n27_nmg.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco27,r_myco15,myco27_b
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. def r1_d(url,dt):
  13. # url = 'http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj/hlbeszdwfaj/202106/t20210609_751387.html'
  14. headers = {
  15. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  16. }
  17. response = r.get(url=url,headers=headers,proxies=proxies)
  18. response.encoding = 'UTF-8'
  19. html = response.text
  20. selector = etree.HTML(html)
  21. a = selector.xpath('//table//tr')
  22. dict1 = {}
  23. for i in a:
  24. k1 = i.xpath('td[1]')
  25. if k1:
  26. k2 = k1[0].xpath('string(.)').strip()
  27. k3 = k2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  28. # print(k2)
  29. v1 = i.xpath('td[2]')
  30. if v1:
  31. v2 = v1[0].xpath('string(.)').strip()
  32. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  33. # print(v3)
  34. else:
  35. v3 = ''
  36. if k3:
  37. dict1[k3] = v3
  38. dict1['url'] = url
  39. dict1['date'] = dt
  40. print(dict1)
  41. return dict1
  42. # r1_d('1')
  43. @retry(3)
  44. def r1():
  45. url = 'http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj/sj/2023/' ##查看时间
  46. headers = {
  47. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  48. }
  49. response = r.get(url=url,headers=headers,proxies=proxies)
  50. response.encoding = 'UTF-8'
  51. html = response.text
  52. # print(html)
  53. selector = etree.HTML(html)
  54. a = re.findall('href="(.*?)"',html)
  55. for i in a:
  56. # print(i)
  57. if "html" in i:
  58. url1 = 'http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj' + i.replace('../..','')
  59. print(url1)
  60. u1 = url1.split('/')
  61. u2 = u1[-1].split('_')[0]
  62. # print(u2)
  63. dt = u2[1:5] + '/' + u2[5:7] + '/' + u2[7:9]
  64. print(dt)
  65. utf = r_myco15.sismember('n27', url1) ##更改
  66. if not utf:
  67. rsd = r1_d(url1, dt)
  68. myco27.insert_one(rsd)
  69. myco27_b.insert_one(rsd)
  70. r_myco15.sadd('n27', url1)
  71. print('存入主备库,>>>n27')
  72. else:
  73. print('已存在,>>>n27')
  74. # time.sleep(10)
# Start the crawl. The call is not guarded by ``if __name__ == "__main__"``,
# so it runs whenever this module is executed or imported.
r1()