n01_ah.py

import requests

from lxml import etree
from urllib import parse

from setting import proxies                       # requests-style proxy dict
from mongo_cho import myco1, r_myco15, myco1_b    # MongoDB collections + Redis dedup client
from rety import retry                            # retry decorator

# Shared HTTP session for all requests in this script.
r = requests.session()
r.keep_alive = False
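# The three imports above are project-local modules, not shown in this file.
# Assumed interfaces, inferred from how they are used below:
#   setting.proxies  -- dict passed straight to requests, e.g. {"http": "...", "https": "..."}
#   rety.retry(n)    -- decorator that re-runs the wrapped function up to n times on failure
#   mongo_cho        -- myco1 / myco1_b are pymongo collections (raw and backup);
#                       r_myco15 is a Redis client holding the set of already-scraped URLs
# (A possible sketch of these modules follows at the end of this file.)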

def zh1(list1):
    """Join a list of text nodes, stripping spaces, tabs, and line breaks."""
    str1 = ''
    for i in list1:
        str1 += i.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
    return str1

@retry(3)
def r1_d(url, dt1):
    """Fetch one bulletin detail page and return its key/value table as a dict."""
    # Example detail page:
    # url = 'http://anhui.chinatax.gov.cn/art/2021/3/3/art_20155_6021.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    html = response.text
    selector = etree.HTML(html)
    a = selector.xpath('//tr[@class="rlbbox"]')
    dict1 = {}
    for i in a:
        # Column 1 holds the field name, column 2 the value.
        k1 = i.xpath('td[1]/div/text()')
        k2 = zh1(k1)
        v1 = i.xpath('td[2]/div/text()')
        v2 = zh1(v1)
        dict1[k2] = v2
    dict1['date'] = dt1
    dict1['url'] = url
    return dict1

@retry(3)
def r1(searhvalue, year):
    """Query the bulletin list for one month, scrape any new detail pages, and store them."""
    url = 'http://anhui.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "searhvalue": parse.quote(searhvalue),  # month, e.g. '1月' ("searhvalue" is the site's own field name)
        "searchkey": "jd1",
        "year": parse.quote(year),              # e.g. '2024年'
    }
    response = r.post(url=url, headers=headers, data=data, proxies=proxies)
    html = response.text
    selector = etree.HTML(html)
    a = selector.xpath('//tr[@class="rlbbox"]')
    list1 = []  # scraped records
    list2 = []  # their detail-page URLs
    for i in a:
        url1 = i.xpath('td[5]/div/a/@href')[0]
        print(url1)
        # Skip URLs already recorded in the Redis dedup set 'n01' (set name is per-scraper).
        utf = r_myco15.sismember('n01', url1)
        if not utf:
            dt1 = year.replace('年', '/') + searhvalue.replace('月', '')
            rsd = r1_d(url1, dt1)
            if rsd:
                list1.append(rsd)
                list2.append(url1)
        else:
            print('already exists, >>> n01')
    print(list1)
    if list1:
        myco1.insert_many(list1)
        print('saved to the raw collection')
    if list2:
        myco1_b.insert_many(list1)
        print('saved to the backup raw collection')
        # Mark the URLs as scraped only after both inserts have succeeded.
        for mis in list2:
            r_myco15.sadd('n01', mis)

@retry(3)
def get_ny():
    """Return the most recent month listed on the bulletin page's month selector."""
    url = 'http://anhui.chinatax.gov.cn/module/jslib/bulletin/bullenleft.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    html = response.text
    selector = etree.HTML(html)
    len1 = selector.xpath('//tr[@id="jiduonclick1"]/td/span//a')
    list1 = []
    for i in len1:
        ny = i.xpath('text()')[0].replace('月', '')
        list1.append(ny)
    eny = list1[-1]
    print(eny, '>>> from n01_ah***')
    return eny
# get_ny()

def runs():
    # Currently scrapes January 2024 only; widen the range to cover more months.
    year = '2024年'
    for i in range(1, 2):
        searhvalue = '{}月'.format(i)
        r1(searhvalue, year)


if __name__ == '__main__':
    runs()
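
The script depends on three project-local modules (setting, rety, mongo_cho) that are not part of this file. Below is a minimal sketch of what they might look like, assuming pymongo and redis-py; the proxy address, host names, database/collection names, and retry behaviour are placeholders inferred from usage, not the project's actual configuration.

# setting.py -- proxy configuration handed to requests (placeholder address)
proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"}

# rety.py -- naive retry decorator: re-run the wrapped call up to n times,
# returning None if every attempt fails (which matches the `if rsd:` check above)
import functools
import time

def retry(n, delay=1):
    def wrapper(func):
        @functools.wraps(func)
        def inner(*args, **kwargs):
            for _ in range(n):
                try:
                    return func(*args, **kwargs)
                except Exception:   # broad on purpose for a scraper sketch
                    time.sleep(delay)
            return None
        return inner
    return wrapper

# mongo_cho.py -- MongoDB collections and the Redis client used for URL de-duplication
import pymongo
import redis

_client = pymongo.MongoClient("mongodb://localhost:27017/")
myco1 = _client["tax"]["n01"]        # raw collection (names are guesses)
myco1_b = _client["tax_b"]["n01"]    # backup copy of the same records
r_myco15 = redis.Redis(host="localhost", port=6379, db=15)   # holds the 'n01' URL set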