n05_sichuan.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json,re
  4. from rety import retry
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco5
  9. r = requests.session()
  10. r.keep_alive = False
  11. import urllib3
  12. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  13. @retry(3)
  14. def r1_d(url,dt):
  15. # url = 'https://sichuan.chinatax.gov.cn/art/2021/3/23/art_15873_10537.html'
  16. headers = {
  17. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  18. }
  19. response = r.get(url=url,headers=headers,verify=False,proxies=proxies)
  20. response.encoding = 'UTF-8'
  21. html= response.text
  22. # print(html)
  23. selector = etree.HTML(html)
  24. a = selector.xpath('//tbody//tr')
  25. dict1 = {}
  26. try:
  27. for i in a:
  28. k1 = i.xpath('td[1]')
  29. text = k1[0].xpath('string(.)').strip()
  30. # print(text)
  31. v1 = i.xpath('td[2]')
  32. text1 = v1[0].xpath('string(.)').strip()
  33. v2 = text1.replace(' ','').replace('\r','').replace('\t','').replace('\n','')
  34. dict1[text] = v2
  35. except:
  36. pass
  37. print(dict1)
  38. dict1['url'] = url
  39. dict1['date'] = dt
  40. return dict1
  41. # r1_d()
  42. @retry(3)
  43. def r1(icid,cpg,dt):
  44. url = 'https://sichuan.chinatax.gov.cn/module/search/index.jsp'
  45. headers = {
  46. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  47. }
  48. params = {
  49. "vc_name":"",
  50. "field_439": "",
  51. "field_440": "",
  52. "field_441": "",
  53. "field_442": "",
  54. "field_443": "",
  55. "strSelectID": "390,439,440,441,442,443",
  56. "i_columnid": icid,
  57. "field": "vc_name:1:0,field_439:1:0,field_440:1:0,field_441:1:0,field_442:1:0,field_443:1:0",
  58. "currpage": cpg,
  59. }
  60. response = r.get(url=url,headers=headers,params=params,verify=False,proxies=proxies)
  61. html = response.text
  62. # print(html)
  63. selector = etree.HTML(html)
  64. a = selector.xpath('//tr/td[5]/a/@href')
  65. list1 = []
  66. for i in a:
  67. # print(i)
  68. url1 = i.replace('../..','https://sichuan.chinatax.gov.cn')
  69. print(url1)
  70. rsd = r1_d(url1,dt)
  71. list1.append(rsd)
  72. # print(list1)
  73. myco5.insert_many(list1)
  74. # 'https://sichuan.chinatax.gov.cn'
  75. # https://sichuan.chinatax.gov.cn/col/col15873/index.html
  76. ##季度更新,对比上次页数,
  77. def runs():
  78. icid='15873' #季度id
  79. tpg=9 #总共几页
  80. dt='2023/10' #时限
  81. for i in range(1,tpg+1):
  82. print(i,'==================')
  83. r1(icid,i,dt)
# Script entry point: start the crawl. The call is unconditional (there is no
# __main__ guard), so importing this file also starts the crawl.
runs()