n18_hn.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco18,myco18_b,r_myco15
  8. from rety import retry
  9. r = requests.session()
  10. r.keep_alive = False
  11. @retry(3)
  12. def r1_d(url,dt):
  13. # url = 'https://henan.chinatax.gov.cn/henanchinatax/xxgk/zdsswfsxaj/2021060109153715435/index.html'
  14. headers = {
  15. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  16. }
  17. response = r.get(url=url,headers=headers,proxies=proxies)
  18. response.encoding = 'UTF-8'
  19. html = response.text
  20. seletor = etree.HTML(html)
  21. a = seletor.xpath('//table[@class="zhongdatable"]//th')
  22. list1 = []
  23. for i in a:
  24. text = i.xpath('string(.)').strip()
  25. t1 = text.replace(' ','').replace('\r','').replace('\t','').replace('\n','')
  26. list1.append(t1)
  27. b = seletor.xpath('//table[@class="zhongdatable"]//td')
  28. list2 = []
  29. for i in b:
  30. text = i.xpath('string(.)').strip()
  31. t2 = text.replace(' ','').replace('\r','').replace('\t','').replace('\n','')
  32. list2.append(t2)
  33. dict1 = {}
  34. # print(list1)
  35. # print(list2)
  36. for i in range(len(a)):
  37. # print(i)
  38. k1 = list1[i]
  39. v1 = list2[i]
  40. # print(k1,v1)
  41. dict1[k1] = v1
  42. # print(dict1)
  43. dict1['url'] = url
  44. dict1['date'] = dt
  45. # print(dict1)
  46. return dict1
  47. # print(len(b))
  48. # for i1 in b:
  49. # print(i1.replace(' ','').replace('\r','').replace('\t','').replace('\n',''))
  50. # print(response.text)
  51. # r1_d()
  52. @retry(3)
  53. def r1(pg,dt):
  54. url = 'https://henan.chinatax.gov.cn/eportal/ui?pageId=bdfef9dfa679454c86d68f2203a69e84&currentPage={}&moduleId=143e1aeaa3b6405ea0fe04142c021d5b&staticRequest=yes'.format(pg)
  55. headers = {
  56. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  57. }
  58. # data = {
  59. # "filter_LIKE_EXT_STR15":dt
  60. # }
  61. data = {
  62. "filter_LIKE_EXT_STR6":"",
  63. "filter_LIKE_main.TITLE": "",
  64. "filter_LIKE_EXT_STR2": "",
  65. "filter_LIKE_EXT_STR4": "",
  66. "filter_LIKE_EXT_STR3": "",
  67. "filter_LIKE_EXT_STR8": "",
  68. "filter_LIKE_EXT_STR19": "",
  69. "filter_LIKE_EXT_STR10": "",
  70. "filter_LIKE_EXT_STR23": "",
  71. }
  72. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  73. html = response.text
  74. # print(html)
  75. seletor = etree.HTML(html)
  76. a = seletor.xpath('//a[@istitle="true"]')
  77. list1 = []
  78. list2 = []
  79. print(a)
  80. for i in a:
  81. # print(i.xpath('@title'))
  82. url1 = 'https://henan.chinatax.gov.cn' + i.xpath('@href')[0]
  83. print(url1)
  84. utf = r_myco15.sismember('n18', url1) ##更改
  85. if not utf:
  86. rsd = r1_d(url1, dt)
  87. print(rsd)
  88. list1.append(rsd)
  89. list2.append(url1)
  90. else:
  91. print('已存在,>>>n18')
  92. if list1:
  93. myco18.insert_many(list1)
  94. print('已存入原始库')
  95. if list2:
  96. myco18_b.insert_many(list1)
  97. print('已存入备份原始库')
  98. for mis in list2:
  99. r_myco15.sadd('n18', mis) ##更改
  100. # if list1:
  101. # myco18.insert_many(list1)
  102. def runs():
  103. dt = '2023'
  104. for pg in range(1,2):
  105. print(pg,'========')
  106. r1(pg,dt)
# Start the crawl. This is a bare top-level call, so it also executes whenever
# the module is imported, not only when the file is run as a script.
runs()