# n14_hlj.py
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco14,myco14_b,r_myco15
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r1_d(url,dt):
  14. # url = 'http://heilongjiang.chinatax.gov.cn/art/2021/4/10/art_6410_962.html'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. response = r.get(url=url,headers=headers,proxies=proxies)
  19. response.encoding = 'UTF-8'
  20. html = response.text
  21. selector = etree.HTML(html)
  22. a = selector.xpath('//tr')
  23. dict1 = {}
  24. for i in a:
  25. k1 = i.xpath('td[1]')
  26. if k1:
  27. k2 = k1[0].xpath('string(.)').strip()
  28. k3 = k2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  29. # print(k2)
  30. v1 = i.xpath('td[2]')
  31. if v1:
  32. v2 = v1[0].xpath('string(.)').strip()
  33. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  34. # print(v3)
  35. else:
  36. v3 = ''
  37. if k3:
  38. dict1[k3] = v3
  39. # print(dict1)
  40. dict1['url'] = url
  41. dict1['date'] = dt
  42. # print(dict1)
  43. return dict1
  44. # r1_d()
  45. @retry(3)
  46. def r1(ny1,ny2):
  47. # url = 'http://heilongjiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp?startrecord=1&endrecord=2&perpage=11'
  48. url = 'http://heilongjiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp'
  49. headers = {
  50. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  51. }
  52. data = {
  53. "searhvalue":ny2,
  54. "searchkey": "jd",
  55. "year": ny1,
  56. }
  57. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  58. html = response.text
  59. hfs = re.findall("href='(.*?)'",html)
  60. if ny1 == 1:
  61. y1 = '2019'
  62. elif ny1 == 2:
  63. y1 = '2020'
  64. elif ny1 == 3:
  65. y1 = '2021'
  66. dt = y1 + '/' +str(ny2+1)
  67. list1 = []
  68. list2 = []
  69. for url1 in hfs:
  70. print(url1)
  71. utf = r_myco15.sismember('n14', url1) ##更改
  72. if not utf:
  73. rsd = r1_d(url1, dt)
  74. list1.append(rsd)
  75. list2.append(url1)
  76. else:
  77. print('已存在,>>>n14')
  78. if list1:
  79. myco14.insert_many(list1)
  80. print('已存入原始库')
  81. if list2:
  82. myco14_b.insert_many(list1)
  83. print('已存入备份原始库')
  84. for mis in list2:
  85. r_myco15.sadd('n14', mis) ##更改
  86. # if list1:
  87. # myco14.insert_many(list1)
  88. def runs(ny2):
  89. ny1 = 3
  90. ny3 = int(ny2) - 1
  91. print('2023',ny2,'=========')
  92. r1(ny1, ny2)
  93. runs(10)