n06_js.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from rety import retry
  # One shared HTTP session reused by every request in this module.
  r = requests.session()
  # NOTE: requests does not read a ``keep_alive`` attribute on Session, so this
  # assignment by itself has no effect on connection reuse. It is retained only
  # so that the attribute still exists for anything that may look at it.
  r.keep_alive = False
  # Asking the peer to close the connection after each response is what
  # actually prevents connection reuse.
  r.headers['Connection'] = 'close'
  10. from mongo_cho import myco6,r_myco15,myco6_b
  11. #https://jiangsu.chinatax.gov.cn/col/col16916/index.html
  @retry(3)
  def r1_d(url, dt):
      """Fetch one detail page and flatten its two-column table into a dict.

      Every ``<tr>`` matched by ``//body/table/tbody/tr`` contributes one
      entry: the stripped text of its first ``<td>`` is the key and the text
      of its second ``<td>``, with spaces, CR, TAB and LF removed, is the
      value. A later row with the same key overwrites an earlier one. Rows
      with fewer than two ``<td>`` cells are skipped.

      Args:
          url: Address of the detail page; also stored under ``'url'``.
          dt: Value stored unchanged under ``'date'``.

      Returns:
          dict: The collected table entries plus ``'url'`` and ``'date'``.

      Raises:
          requests.HTTPError: If the server answers with an error status.
      """
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      # Without a timeout a stalled proxy would block this call indefinitely.
      response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
      # Fail loudly on 4xx/5xx instead of parsing an error page as a record.
      response.raise_for_status()
      response.encoding = 'UTF-8'
      selector = etree.HTML(response.text)

      # Removes exactly the four characters the value clean-up targets.
      drop_chars = str.maketrans('', '', ' \r\t\n')

      dict1 = {}
      for row in selector.xpath('//body/table/tbody/tr'):
          cells = row.xpath('td')
          if len(cells) < 2:
              # Rows without both a label cell and a value cell carry no pair.
              continue
          key = cells[0].xpath('string(.)').strip()
          value = cells[1].xpath('string(.)').strip().translate(drop_chars)
          dict1[key] = value

      dict1['url'] = url
      dict1['date'] = dt
      print(dict1)
      return dict1
  35. # r1_d()
  @retry(3)
  def r1(ny1, ny2, dt):
      """Query the bulletin listing and store every detail page not seen before.

      Posts the search form, extracts every ``<a href>`` from the response and,
      for each link that is not yet a member of the ``'n06'`` set in
      ``r_myco15``, fetches the detail record with ``r1_d``. All new records
      are inserted into ``myco6`` and ``myco6_b``, and only afterwards are
      their links added to the ``'n06'`` set.

      Args:
          ny1: Value sent as the ``year`` form field.
          ny2: Value sent as the ``searhvalue`` form field.
          dt: Passed through to ``r1_d`` for each detail page.

      Raises:
          requests.HTTPError: If the listing request returns an error status.
      """
      url = 'https://jiangsu.chinatax.gov.cn/module/jslib/bulletin/lpajaxdata.jsp?startrecord=1&endrecord=36&perpage=11&rowpage=1'
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      data = {
          # NOTE(review): the spelling "searhvalue" is kept as-is; presumably
          # it is the field name the endpoint expects - confirm before changing.
          "searhvalue": ny2,
          "searchkey": "jd",
          "year": ny1,
      }
      # Without a timeout a stalled proxy would block this call indefinitely.
      response = r.post(url=url, data=data, headers=headers, proxies=proxies, timeout=30)
      # Fail loudly on 4xx/5xx instead of parsing an error page for links.
      response.raise_for_status()
      html = response.text
      print(html)
      selector = etree.HTML(html)
      hrefs = selector.xpath('//a/@href')

      list1 = []  # new detail records, in page order
      list2 = []  # the links those records came from
      queued = set()  # links already handled during this call
      for href in hrefs:
          print(href)
          if href in queued:
              # The same link can occur more than once on the page; the 'n06'
              # set is only updated after the loop, so without this check the
              # record would be fetched and inserted twice.
              continue
          if r_myco15.sismember('n06', href):
              print('已存在,>>>n06')
              continue
          queued.add(href)
          rsd = r1_d(href, dt)
          if rsd is None:
              # NOTE(review): guards against the retry wrapper handing back
              # None after exhausting its attempts - confirm its behaviour.
              # The link is left unmarked so it is tried again on a later run.
              continue
          list1.append(rsd)
          list2.append(href)

      if list1:
          myco6.insert_many(list1)
          myco6_b.insert_many(list1)
          # Mark links as seen only after both inserts have succeeded.
          for mis in list2:
              r_myco15.sadd('n06', mis)
  72. # https://jiangsu.chinatax.gov.cn/col/col16916/index.html
  def runs(ny1='2021', ny2='8'):
      """Run one listing crawl for the given year and search value.

      Args:
          ny1: Year passed to ``r1``; defaults to ``'2021'``.
          ny2: Search value passed to ``r1``; defaults to ``'8'``.

      The date label handed to ``r1`` is ``"<ny1>/<ny2>"``.
      """
      dt = str(ny1) + '/' + str(ny2)
      r1(ny1, ny2, dt)


  # The crawl starts as soon as this module is executed or imported.
  runs()