n31_tianjin.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco31,r_myco15,myco31_b
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r1(ny1,ny2,pg):
  14. url = 'http://tianjin.chinatax.gov.cn/wzcx/sjcx_cxqyxx.action'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. data = {
  19. "szsf":"11200000000",
  20. "nfjd": str(ny1) + '0' + str(ny2),
  21. "page":pg,
  22. "pageCount":"15"
  23. }
  24. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  25. html = response.text
  26. # print(html)
  27. html1 = html.replace(' ','').replace('\r','').replace('\t','').replace('\n','')
  28. rpg1 = re.findall(r"pageCount='(.*?)'", html1)[0]
  29. # print(rpg1)
  30. selector = etree.HTML(html)
  31. a = selector.xpath('//span[@class="mxxx"]')
  32. dt = str(ny1) + '/0' + str(ny2)
  33. list1 = []
  34. list2 = []
  35. for i in a:
  36. dict1 = {}
  37. nsrm = i.xpath('@data-nsrmc')
  38. dict1['纳税人名称'] = nsrm[0]
  39. nsrsbh = i.xpath('@data-nsrsbh')
  40. dict1['纳税人识别号'] =nsrsbh[0]
  41. zzjgdm = i.xpath('@data-zzjgdm')
  42. dict1['组织机构代码'] =zzjgdm[0]
  43. zcjydz = i.xpath('@data-zcjydz')
  44. dict1['注册地址'] =zcjydz[0]
  45. fddbrxm = i.xpath('@data-fddbrxm')
  46. dict1['姓名'] =fddbrxm[0]
  47. fddbrxb = i.xpath('@data-fddbrxb')
  48. dict1['性别'] =fddbrxb[0]
  49. fddbrzjmc = i.xpath('@data-fddbrzjmc')
  50. dict1['证件名称'] =fddbrzjmc[0]
  51. fddbrzjhm = i.xpath('@data-fddbrzjhm')
  52. dict1['证件号码'] =fddbrzjhm[0]
  53. ajlxmc = i.xpath('@data-ajlxmc')
  54. dict1['案件性质'] =ajlxmc[0]
  55. zywfss = i.xpath('@data-zywfss')
  56. dict1['主要违法事实'] =zywfss[0]
  57. clqk = i.xpath('@data-clqk')
  58. dict1['相关法律依据及税务处理处罚情况'] =clqk[0]
  59. dict1['date'] = dt
  60. print(dict1)
  61. list1.append(dict1)
  62. # return int(rpg1)
  63. if list1:
  64. myco31.insert_many(list1)
  65. # http://tianjin.chinatax.gov.cn/wzcx/cx_zdwfaj.action?szsf=11200000000
# This data cannot be de-duplicated, so it is updated separately; check the page count and the quarter before each run.
  67. def runs():
  68. for pg in range(1,2):
  69. print(pg,'==========')
  70. r1(2023,5,pg) ##中间2为季度,每次更新前务必加1季度
  71. r1()