n10_guangdong.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco10,r_myco15,myco10_b
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r1_d(cid,dt):
  14. url = 'http://guangdong.chinatax.gov.cn/siteapps/webpage/gdtax/zdsswfaj/service.jsp'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. data = {
  19. "manuscriptId": cid,
  20. }
  21. response = r.post(url=url, headers=headers, data=data,proxies=proxies)
  22. html = response.text
  23. selector = etree.HTML(html)
  24. a = selector.xpath('//tr')
  25. dict1 = {}
  26. for i in a:
  27. k1 = i.xpath('td[1]/text()')
  28. v1 = i.xpath('td[2]/text()')
  29. if k1:
  30. k2 = k1[0].replace(' ','').replace('\r','').replace('\t','').replace('\n','')
  31. # print(k2)
  32. if v1:
  33. v2 = v1[0].replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '')
  34. # print(v2)
  35. else:
  36. v2 = ''
  37. if k2:
  38. dict1[k2] = v2
  39. dict1["uid"] = cid
  40. dict1['date'] = dt
  41. print(dict1)
  42. return dict1
  43. # r1_d('42da48b512b046d488189ce36a833fa8','9')
  44. @retry(3)
  45. def r1(ny1,ny2,pg):
  46. url = 'http://guangdong.chinatax.gov.cn/siteapps/webpage/gdtax/zdsswfaj/query.jsp'
  47. headers = {
  48. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  49. }
  50. data = {
  51. "yf": "{ny1}_{ny2}".format(ny1=ny1,ny2=ny2),
  52. "pageSize":"20",
  53. "pageNo": pg,
  54. "channelId": "",
  55. "taxNature": "",
  56. "quarter": "",
  57. "nsr_mc": "",
  58. "nsr_sbh": "",
  59. "fddbr_xm": "",
  60. "zcdz": "",
  61. "zzjgdm": "",
  62. "fddbrzjhm": "",
  63. "cwfzrxm": "",
  64. "cwfzrzjhm": "",
  65. }
  66. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  67. html = response.text
  68. # print(html)
  69. selector = etree.HTML(html)
  70. rpg = re.findall('共(.*?)页',html)[0].replace(' ','')
  71. # a1 = selector.xpath('//*[@id="zdss_tb"]/tbody/tr[2]/td[5]/text()')
  72. # print(a1)
  73. # for i in a:
  74. # print(i.xpath('a/@onclick'))
  75. a = selector.xpath('//a/@onclick')
  76. list1 = []
  77. list2 = []
  78. for i in a:
  79. i1 = re.findall("'(.*?)'",i)
  80. if i1:
  81. cid = i1[0]
  82. dt = str(ny1) + '/' + str(ny2+1)
  83. if cid == '#pageIndex':
  84. pass
  85. else:
  86. utf = r_myco15.sismember('n10', cid) ##更改
  87. if not utf:
  88. print(cid)
  89. rsd = r1_d(cid, dt)
  90. list1.append(rsd)
  91. list2.append(cid)
  92. else:
  93. print('已存在,>>>n10')
  94. pass
  95. # if list1:
  96. if list1:
  97. myco10.insert_many(list1)
  98. print('已存入原始库')
  99. if list2:
  100. myco10_b.insert_many(list1)
  101. print('已存入备份原始库')
  102. for mis in list2:
  103. r_myco15.sadd('n10', mis) ##更改
  104. # myco10.insert_many(list1)
  105. return rpg
  106. def runs(ny1,ny2):
  107. # for ny1 in range(2021,2022):
  108. # for ny2 in range(0,4):
  109. # print(ny1,ny2,'===========')
  110. rpg = r1(ny1,ny2,pg=1)
  111. print(ny1,ny2)
  112. if int(rpg) > 1:
  113. for pg in range(2,int(rpg)+1):
  114. print(pg,'==============')
  115. r1(ny1,ny2,pg)
  116. runs(2023,11)