n22_yn.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco22,r_myco15,myco22_b
  9. from rety import retry
# Shared HTTP session reused by every request made in this module.
r = requests.session()
# NOTE(review): requests.Session documents no `keep_alive` attribute, so this
# assignment presumably has no effect on connection reuse -- confirm.
r.keep_alive = False
import urllib3
# The requests in this module pass verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  14. @retry(3)
  15. def r1_d(url,dt):
  16. # url = 'https://yunnan.chinatax.gov.cn/art/2021/2/9/art_8101_588.html'
  17. headers = {
  18. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  19. }
  20. response = r.get(url=url,headers=headers,proxies=proxies,verify=False)
  21. response.encoding = 'UTF-8'
  22. html = response.text
  23. selector = etree.HTML(html)
  24. a = selector.xpath('//tr')
  25. dict1 = {}
  26. for i in a:
  27. k1 = i.xpath('td[1]/div/text()')
  28. v1 = i.xpath('td[2]/div/text()')
  29. # print(k1,v1)
  30. k2 = ''
  31. if k1:
  32. for i1 in k1:
  33. k2 += i1
  34. if v1:
  35. v2 = ''
  36. for i2 in v1:
  37. v2+=i2
  38. else:
  39. v2 = ''
  40. dict1[k2] = v2
  41. dict1['url'] = url
  42. dict1['date'] = dt
  43. print(dict1)
  44. return dict1
  45. # r1_d()
  46. @retry(3)
  47. def r1(ny1,ny2):
  48. url = 'https://yunnan.chinatax.gov.cn/bulletin/ajaxdata.jsp?startrecord=1&endrecord=8&perpage=11&rowpage=1'
  49. headers = {
  50. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  51. }
  52. data = {
  53. "searhvalue":"{}%E6%9C%88".format(ny2),
  54. "searchkey": "jd1",
  55. "year": "{}%E5%B9%B4%E5%BA%A6".format(ny1),
  56. }
  57. response = r.post(url=url,data=data,headers=headers,proxies=proxies,verify=False)
  58. html = response.text
  59. a = re.findall("href='(.*?)'", html)
  60. list1 = []
  61. list2 = []
  62. for i in a:
  63. print(i)
  64. dt = str(ny1) + '/' + str(ny2)
  65. utf = r_myco15.sismember('n22', i) ##更改
  66. if not utf:
  67. rsl = r1_d(i, dt)
  68. list1.append(rsl)
  69. list2.append(i)
  70. else:
  71. print('已存在,>>>n22')
  72. if list1:
  73. myco22.insert_many(list1)
  74. print('已存入原始库')
  75. if list2:
  76. myco22_b.insert_many(list1)
  77. print('已存入备份原始库')
  78. for mis in list2:
  79. r_myco15.sadd('n22', mis) ##更改
  80. # if list1:
  81. # myco22.insert_many(list1)
  82. def runs(ny1,ny2):
  83. print(ny1,ny2,'=========')
  84. r1(ny1, ny2)
# Module-level call: scrapes year '2023', month '5' whenever this file is
# executed or imported.
runs('2023','5')