n07_zhejiang.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco7,r_myco15,myco7_b
  9. from rety import retry
  # Module-wide HTTP session shared by every request helper in this file.
  r = requests.session()
  # NOTE(review): requests.Session has no documented ``keep_alive`` switch, so
  # this assignment probably does not affect connection reuse -- confirm.
  r.keep_alive = False
  def zh1(list1):
      """Join text fragments into one string with layout whitespace removed.

      Only the space, carriage-return, line-feed and tab characters are
      dropped; every other character (including other Unicode whitespace) is
      kept unchanged.

      Args:
          list1: Iterable of strings, e.g. the result of an XPath ``text()``
              query.

      Returns:
          The concatenation of all fragments without those four characters.
          An empty iterable yields ``''``.
      """
      # One translate() pass per fragment replaces four chained replace() calls.
      strip_table = str.maketrans('', '', ' \r\n\t')
      return ''.join(fragment.translate(strip_table) for fragment in list1)
  @retry(3)
  def r1_d(url, dt):
      """Fetch one bulletin detail page and flatten its table into a dict.

      Every ``<tr class="rlbbox">`` row contributes one entry: the text nodes
      of ``td[1]/div`` form the key and those of ``td[2]/div`` form the value,
      each cleaned with ``zh1``.

      Args:
          url: Address of the detail page to download.
          dt: Date label stored unchanged under the ``'date'`` key.

      Returns:
          dict holding the scraped key/value pairs plus ``'url'`` and
          ``'date'`` entries.
      """
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      # Without a timeout a stalled proxy or server would block the crawl
      # forever; with one the request raises instead of hanging.
      response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
      selector = etree.HTML(response.text)

      dict1 = {}
      for row in selector.xpath('//tr[@class="rlbbox"]'):
          key = zh1(row.xpath('td[1]/div/text()'))
          value = zh1(row.xpath('td[2]/div/text()'))
          # Rows that produce the same key overwrite each other; last one wins.
          dict1[key] = value

      dict1['url'] = url
      dict1['date'] = dt
      return dict1
  @retry(3)
  def r1(searhvalue, year, pg, dt):
      """Crawl one page of the bulletin list and store the records not seen yet.

      The search form is POSTed to ``ajaxdata.jsp`` for records
      ``pg*10-9 .. pg*10``. Every article URL found in the response is checked
      against the ``'n07'`` set of ``r_myco15``; unseen URLs are scraped with
      ``r1_d``, the resulting documents are inserted into ``myco7`` and
      ``myco7_b``, and only then are the URLs added to the ``'n07'`` set.

      Args:
          searhvalue: Search value sent in the form (spelling matches the
              remote field name).
          year: Year value sent in the form.
          pg: 1-based page number; each page covers 10 records.
          dt: Date label forwarded to ``r1_d``.

      Returns:
          None.
      """
      start_record = str(pg * 10 - 9)
      end_record = str(pg * 10)
      url = 'http://zhejiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp?startrecord={pg1}&endrecord={pg2}'.format(pg1=start_record, pg2=end_record)
      headers = {
          "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      # NOTE(review): the values are percent-encoded here and form-encoded
      # again by requests when sent; presumably the endpoint expects this
      # double encoding -- confirm before changing.
      data = {
          "searhvalue": parse.quote(searhvalue),
          "searchkey": "jd1",
          "year": parse.quote(year),
      }
      # The timeout stops a stalled proxy or server from hanging the crawl.
      response = r.post(url=url, headers=headers, data=data, proxies=proxies, timeout=30)
      html = response.text

      # Dots are escaped so they only match literal dots in the host / suffix.
      article_ids = re.findall(r"http://zhejiang\.chinatax\.gov\.cn/art/(.*?)\.html", html)

      list1 = []  # scraped documents for the Mongo collections
      list2 = []  # URLs of those documents, recorded as seen afterwards
      seen_on_page = set()
      for article_id in article_ids:
          # The 'n07' set is only updated after the loop, so a link repeated
          # within this page must be filtered here to avoid duplicate inserts.
          if article_id in seen_on_page:
              continue
          seen_on_page.add(article_id)

          url1 = "http://zhejiang.chinatax.gov.cn/art/" + article_id + ".html"
          print(url1)
          # 'n07' is the per-site set name (the original note marks it as the
          # value to change when adapting this script).
          if r_myco15.sismember('n07', url1):
              print('已存在,>>>n07')
              continue
          list1.append(r1_d(url1, dt))
          list2.append(url1)

      # list1 and list2 are always filled together, so one guard covers both.
      if list1:
          myco7.insert_many(list1)
          myco7_b.insert_many(list1)
          # Mark URLs as seen only after both inserts went through.
          for stored_url in list2:
              r_myco15.sadd('n07', stored_url)
  @retry(3)
  def get_pg(ny1, ny2):
      """Return the exclusive upper page bound for one year/month listing.

      Downloads ``bullenright.jsp`` for the given year and month and reads the
      ``var totalRecord = '...'`` value embedded in the page.

      Args:
          ny1: Year placed in the ``year`` query parameter.
          ny2: Month placed in the ``searhvalue`` query parameter.

      Returns:
          ``ceil(totalRecord / 10) + 1`` so that ``range(1, result)`` visits
          every 10-record page exactly once, or ``None`` when the page does
          not contain a ``totalRecord`` value.

      Raises:
          ValueError: If the captured ``totalRecord`` text is not an integer.
      """
      url ='http://zhejiang.chinatax.gov.cn/module/jslib/bulletin/bullenright.jsp?searhvalue={ny2}%E6%9C%88&searchkey=jd1&year={ny1}%E5%B9%B4%E5%BA%A6'.format(ny1=ny1,ny2=ny2)
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      # The timeout stops a stalled proxy or server from hanging the crawl.
      response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)

      found = re.search(r"var totalRecord = '(.*?)'", response.text)
      if found is None:
          return None

      total_records = int(found.group(1))
      # Ceiling division: the former ``total // 10 + 2`` asked for one extra,
      # empty page whenever the total was a multiple of 10 (including 0).
      return (total_records + 9) // 10 + 1
  def runs(ny1, ny2):
      """Crawl every list page of one year/month.

      Args:
          ny1: Year, e.g. ``2023``.
          ny2: Month number, e.g. ``11``.

      Returns:
          None.
      """
      searhvalue = str(ny2) + '月'
      year = str(ny1) + '年度'
      dt = str(ny1) + '/' + str(ny2)

      pg = get_pg(ny1, ny2)
      if pg is None:
          # get_pg yields None when no totalRecord value was found; passing
          # that to range() would fail with an unhelpful TypeError.
          print('no totalRecord found for', dt)
          return

      for i in range(1, pg):
          print(i,'页===========')
          r1(searhvalue, year, i, dt)
  # Script entry point: crawl the listing for November 2023.
  # NOTE(review): this call also runs when the module is imported; consider an
  # ``if __name__ == "__main__":`` guard if the file is ever used as a library.
  runs(2023, 11)