n29_xizang.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco29,r_myco15,myco29_b
  8. from rety import retry
  9. r = requests.session()
  10. r.keep_alive = False
  11. @retry(3)
  12. def r1_d(cmpname,url):
  13. # url = 'https://xizang.chinatax.gov.cn/art/2019/6/26/art_2371_382.html'
  14. headers = {
  15. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  16. }
  17. response = r.get(url=url,headers=headers,proxies=proxies)
  18. response.encoding = 'UTF-8'
  19. html = response.text
  20. if "附件下载" in html:
  21. return {}
  22. if "市局频道" in html:
  23. return {}
  24. selector = etree.HTML(html)
  25. dts = selector.xpath('//div[@class="main"]//div[@class="main_content"]//span/text()')
  26. dt = ''
  27. for i in dts:
  28. if '发布时间' in i:
  29. i1 = i.split(' ')[0]
  30. dt = i2 = i1.split(':')[1].replace('-','/')
  31. print(dt)
  32. dict1 = {}
  33. dict1['纳税人名称'] = cmpname
  34. result = selector.xpath('//div[@id="zoom"]')
  35. result1 = result[0].xpath('string(.)').strip()
  36. # print(result1)
  37. reu1 = result1.split('注册地址:')
  38. # print(reu1[1])
  39. dict1['注册地址'] = reu1[1]
  40. reu2 = reu1[0].split('主要违法事实:')
  41. # print(reu2[1])
  42. dict1['主要违法事实'] =reu2[1]
  43. reu3 = reu2[0].split('违法案件性质:')
  44. # print(reu3[1])
  45. dict1['违法案件性质'] =reu3[1]
  46. reu4 = reu3[0].split('组织机构代码:')
  47. # print(reu4[1])
  48. dict1['组织机构代码'] =reu4[1]
  49. reu5 = reu4[0].split('法人信息:')
  50. # print(reu5[1])
  51. dict1['法人信息'] =reu5[1]
  52. reu6 = reu5[0].split('纳税人识别号:')
  53. # print(reu6[1])
  54. dict1['纳税人识别号'] =reu6[1]
  55. dict1['url'] = url
  56. dict1['date'] = dt
  57. print(dict1)
  58. return dict1
# Manual smoke test (disabled): r1_d('<taxpayer name>', '<detail page url>')
  60. @retry(3)
  61. def r1(pg):
  62. url = 'https://xizang.chinatax.gov.cn/module/search/index.jsp'
  63. headers = {
  64. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  65. }
  66. params = {
  67. "field":"vc_name:1,field_406:1,field_407:1,field_408:1",
  68. "i_columnid": "style_63",
  69. "vc_name": "",
  70. "field_406": "",
  71. "field_407": "",
  72. "field_408": "",
  73. "currpage": pg,
  74. }
  75. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  76. html = response.text
  77. selector = etree.HTML(html)
  78. a = selector.xpath('//td//a/@href')
  79. list1 = []
  80. list2 = []
  81. for i in a:
  82. # print(i)
  83. if "art" in i:
  84. url1 = "https://xizang.chinatax.gov.cn" + i.replace('../..','')
  85. print(url1)
  86. utf = r_myco15.sismember('n29', url1) ##更改
  87. if not utf:
  88. rsd = r1_d(url1)
  89. if rsd:
  90. list1.append(rsd)
  91. list2.append(url1)
  92. else:
  93. print('已存在,>>>n29')
  94. # if list1:
  95. # myco29.insert_many(list1)
  96. # print('已存入原始库')
  97. # if list2:
  98. # myco29_b.insert_many(list1)
  99. # print('已存入备份原始库')
  100. # for mis in list2:
  101. # r_myco15.sadd('n29', mis) ##更改
  102. # if list1:
  103. # myco29.insert_many(list1)
  104. @retry(3)
  105. def r2(pg):
  106. url = 'https://xizang.chinatax.gov.cn/module/search/index.jsp'
  107. headers = {
  108. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  109. }
  110. params = {
  111. "field":"vc_name:1,field_406:1,field_407:1,field_408:1",
  112. "i_columnid": "style_63",
  113. "vc_name": "",
  114. "field_406": "",
  115. "field_407": "",
  116. "field_408": "",
  117. "currpage": pg,
  118. }
  119. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  120. html = response.text
  121. selector = etree.HTML(html)
  122. a = selector.xpath('//tr[@class="form-list"]')
  123. list1 = []
  124. list2 = []
  125. for i in a:
  126. cmpname = i.xpath('td[2]/text()')[0]
  127. # print(cmpname)
  128. urlz = i.xpath('td[4]//a/@href')[0]
  129. # print(urlz)
  130. url1 = "https://xizang.chinatax.gov.cn" + urlz.replace('../..', '')
  131. utf = r_myco15.sismember('n29', url1) ##更改
  132. if not utf:
  133. rsd = r1_d(cmpname,url1)
  134. if rsd:
  135. list1.append(rsd)
  136. list2.append(url1)
  137. else:
  138. print('已存在,>>>n29')
  139. if list1:
  140. myco29.insert_many(list1)
  141. print('已存入原始库')
  142. if list2:
  143. myco29_b.insert_many(list1)
  144. print('已存入备份原始库')
  145. for mis in list2:
  146. r_myco15.sadd('n29', mis) ##更改
  147. def runs():
  148. for pg in range(1,4):
  149. print(pg,'================')
  150. r2(pg)
  151. runs()