# n09_sd.py (6.3 KB)
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco9,r_myco15,myco9_b
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. import urllib3
  13. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  14. @retry(3)
  15. def r2(ny,cid,dicts):
  16. url = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMx&nsrmc=&nsrsbh={cid}&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz='.format(cid=cid,ny=ny)
  17. headers = {
  18. "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
  19. }
  20. data = {
  21. "s_nsrsbh":"",
  22. "nsrmc": "",
  23. "zcdz": "",
  24. "zzjgdm": "",
  25. "fddbrxm": "",
  26. "fddbrsfzhm": "",
  27. "cwfzrxm": "",
  28. "cwfzrsfzhm": "",
  29. "cxdq": "",
  30. "ajxz": "",
  31. "cxnd": "{}月".format(ny),
  32. }
  33. response = r.post(url=url,data=data,headers=headers,verify=False,proxies=proxies)
  34. html = response.text
  35. # print(html)
  36. try:
  37. dict1 = {}
  38. NSRMC = re.findall(r'<NSRMC>(.*?)</NSRMC>',html)
  39. dict1['纳税人名称'] = NSRMC[0]
  40. NSRSBH = re.findall(r'<NSRSBH>(.*?)</NSRSBH>', html)
  41. dict1['纳税人识别号或社会信用代码'] = NSRSBH[0]
  42. ZZJGDM = re.findall(r'<ZZJGDM>(.*?)</ZZJGDM>', html)
  43. dict1['组织机构代码'] = ZZJGDM[0]
  44. ZCDZ = re.findall(r'<ZCDZ>(.*?)</ZCDZ>', html)
  45. dict1['注册地址'] = ZCDZ[0]
  46. FDDBRHFZRXM = re.findall(r'<FDDBRHFZRXM>(.*?)</FDDBRHFZRXM>', html)
  47. dict1['法定代表人或者负责人姓名'] = FDDBRHFZRXM[0]
  48. FDDBRHFZRXB = re.findall(r'<FDDBRHFZRXB>(.*?)</FDDBRHFZRXB>', html)
  49. dict1['性别'] = FDDBRHFZRXB[0]
  50. FDDBRHFZRZJHM = re.findall(r'<FDDBRHFZRZJHM>(.*?)</FDDBRHFZRZJHM>', html)
  51. dict1['证件号码1'] = FDDBRHFZRZJHM[0]
  52. FDRZJHM = re.findall(r'<FDRZJHM>(.*?)</FDRZJHM>', html)
  53. dict1['证件号码2'] = FDRZJHM[0]
  54. AJXZ = re.findall(r'<AJXZ>(.*?)</AJXZ>', html)
  55. dict1['案件性质'] = AJXZ[0]
  56. ZYWFSS = re.findall(r'<ZYWFSS>(.*?)</ZYWFSS>', html)
  57. dict1['主要违法事实'] = ZYWFSS[0]
  58. XGFLYJJSWCLCFQK = re.findall(r'<XGFLYJJSWCLCFQK>(.*?)</XGFLYJJSWCLCFQK>', html)
  59. dict1['相关法律依据及税务处理处罚情况 '] = XGFLYJJSWCLCFQK[0]
  60. dict1['date'] = ny[:4] +'/'+ ny[4:]
  61. dict1['uid'] = cid
  62. # print(dict1)
  63. return dict1
  64. except:
  65. return dicts
  66. # r2()
  67. @retry(3)
  68. def r1(ny,pg):
  69. url = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMxFh&nsrmc=&nsrsbh=&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz=&page={pg}'.format(ny=ny,pg=pg)
  70. # url = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryBynd&nsrmc=&nsrsbh=&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd=201911%D4%C2'
  71. headers = {
  72. "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
  73. }
  74. data = {
  75. "s_nsrsbh":"",
  76. "nsrmc": "",
  77. "zcdz": "",
  78. "zzjgdm": "",
  79. "fddbrxm": "",
  80. "fddbrsfzhm": "",
  81. "cwfzrxm": "",
  82. "cwfzrsfzhm": "",
  83. "cxdq": "",
  84. "ajxz": "",
  85. "cxnd": "20201年度",
  86. }
  87. params = {
  88. "method":"queryMxFh",
  89. "nsrmc": "",
  90. "nsrsbh": "",
  91. "zcdz": "",
  92. "zzjgdm": "",
  93. "fddbrxm": "",
  94. "fddbrsfzhm": "",
  95. "cwfzrxm": "",
  96. "cwfzrsfzhm": "",
  97. "cxnd": "20195%D4%C2",
  98. "cxdq": "",
  99. "ajxz": "",
  100. "page": "2",
  101. }
  102. response = r.get(url=url,headers=headers,verify=False,proxies=proxies)
  103. html = response.text
  104. # print(html)
  105. selector = etree.HTML(html)
  106. b = selector.xpath('//tr')
  107. try:
  108. num = 0
  109. list1 = []
  110. list2 = []
  111. for i in b:
  112. num += 1
  113. if num > 1:
  114. dict1 = {}
  115. td2 = i.xpath('td[2]/text()')
  116. td22 = td2[0].replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '').replace('\xa0', '')
  117. dict1['纳税人名称'] = td22
  118. td3 = i.xpath('td[3]/text()')
  119. td33 = td3[0].replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '').replace('\xa0', '')
  120. dict1['纳税人识别号或社会信用代码'] = td33
  121. td4 = i.xpath('td[4]/text()')
  122. td44 = td4[0].replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '').replace('\xa0', '')
  123. dict1['案件性质'] = td44
  124. td5 = i.xpath('td[5]/input[@id="xxxx"]/@onclick')
  125. td55 = td5[0].replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '').replace('\xa0', '')
  126. cid = re.findall(r"\('(.*?)'\)",td55)[0]
  127. dict1['组织机构代码'] = ''
  128. dict1['注册地址'] = ''
  129. dict1['法定代表人或者负责人姓名'] = ''
  130. dict1['性别'] = ''
  131. dict1['证件号码1'] = ''
  132. dict1['证件号码2'] = ''
  133. dict1['主要违法事实'] = ''
  134. dict1['相关法律依据及税务处理处罚情况 '] = ''
  135. dict1['date'] = ny[:4] + '/' + ny[4:]
  136. dict1['uid'] = cid
  137. utf = r_myco15.sismember('n09', cid) ##更改
  138. if not utf:
  139. rsd = r2(ny, cid, dict1)
  140. print(rsd)
  141. list1.append(rsd)
  142. list2.append(cid)
  143. else:
  144. print('已存在,>>>n09')
  145. pass
  146. if list1:
  147. myco9.insert_many(list1)
  148. if list2:
  149. myco9_b.insert_many(list1)
  150. for mis in list2:
  151. r_myco15.sadd('n09', mis) ##更改
  152. return '1'
  153. except:
  154. return '2'
  155. # myco9.insert_many(list1)
  156. # break
  157. # print(dict1)
  158. # print(cid)
  159. def runs(ny1,ny2):
  160. ny = str(ny1) + str(ny2)
  161. tpg = 100
  162. for pg in range(1,tpg):
  163. print(pg,'===================')
  164. btf = r1(ny,pg)
  165. if btf == "2":
  166. break
  167. runs('2023','11')