n16_gs.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco16,r_myco15,myco16_b
  8. from rety import retry
  # One HTTP session shared by every request this script makes.
  r = requests.session()
  # NOTE: requests.Session does not read a `keep_alive` attribute, so this
  # assignment alone has no effect on connection reuse. It is kept so any code
  # that inspects it still sees the same value.
  r.keep_alive = False
  # Ask the server/proxy to close the connection after each response, which is
  # what the assignment above was evidently meant to achieve.
  r.headers['Connection'] = 'close'
  @retry(3)
  def r1_d(url, dt):
      """Download one detail page and return its field table as a flat dict.

      Every ``<tr>`` under ``<table class="zdsc_con">`` that has a first
      ``<td>`` contributes one entry: the text of the first cell is the key and
      the text of the second cell is the value (``''`` when the row has no
      second cell). Both texts are stripped and then have carriage returns,
      tabs, newlines and ASCII spaces removed. If two rows yield the same key,
      the later row wins.

      Args:
          url: Address of the detail page, e.g.
              ``http://gansu.chinatax.gov.cn/art/2020/3/10/art_8350_65.html``.
          dt: Label stored unchanged under the ``'date'`` key.

      Returns:
          dict: the scraped key/value pairs plus ``'url'`` and ``'date'``.

      NOTE(review): ``retry`` comes from the project-local ``rety`` module;
      presumably it re-invokes this function when it raises -- confirm there.
      """
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      # Without a timeout a stalled proxy/server blocks this call forever and
      # the retry decorator never gets a chance to run.
      response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
      # Decode the body as UTF-8 regardless of what the response headers claim.
      response.encoding = 'UTF-8'
      selector = etree.HTML(response.text)

      # One translation table replaces the chained .replace() calls; it deletes
      # exactly the same four characters: CR, TAB, LF and ASCII space.
      drop_whitespace = str.maketrans('', '', '\r\t\n ')

      def cell_text(cell):
          # string(.) concatenates all descendant text nodes of the cell.
          return cell.xpath('string(.)').strip().translate(drop_whitespace)

      record = {}
      for row in selector.xpath('//table[@class="zdsc_con"]//tr'):
          key_cells = row.xpath('td[1]')
          if not key_cells:
              # A row without any <td> has no field name to store.
              continue
          value_cells = row.xpath('td[2]')
          record[cell_text(key_cells[0])] = (
              cell_text(value_cells[0]) if value_cells else ''
          )

      record['url'] = url
      record['date'] = dt
      return record
  41. # r1_d()
  @retry(3)
  def r1(f86, f85):
      """Scrape the first search-result page and store every unseen detail page.

      Sends the search form to ``/module/search/index.jsp`` and collects every
      ``<a href>`` containing ``"art"``. Each link that is not yet a member of
      the Redis set ``'n16'`` is fetched with ``r1_d``. The new records are
      inserted into ``myco16`` and ``myco16_b``, and only then are their URLs
      added to the Redis set.

      Args:
          f86: Value sent as form field ``field_865`` (``runs`` passes the year).
          f85: Value sent as form field ``field_855`` (``runs`` passes the
              quarter).

      Returns:
          None. Progress messages are printed to stdout.

      NOTE(review): only ``currpage`` "1" is requested; whether the site has
      further result pages cannot be told from this file.
      NOTE(review): ``retry`` comes from the project-local ``rety`` module;
      presumably it re-invokes this function when it raises -- confirm there.
      """
      url = 'http://gansu.chinatax.gov.cn/module/search/index.jsp'
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      params = {
          "field_849": "",
          "field_850": "",
          "field_857": "",
          "field_868": "",
          "field_867": "",
          "field_860": "",
          "field_851": "",
          "field_852": "",
          "field_855": f85,
          "field_866": "",
          "field_856": "",
          "field_865": f86,
          "strSelectID": "849,850,868,857,867,860,851,852,855,866,865,856",
          "i_columnid": "8350",
          "field": "field_849:1:1,field_850:1:1,field_851:1:1,field_852:1:1,field_857:1:1,field_860:1:1,field_867:1:1,field_868:1:1,field_855:1:1,field_866:1:1,field_865:1:1,field_856:1:1",
          "initKind": "FieldForm",
          "type": "1,1,1,1,1,1,1,1,1,1,1,1",
          "currentplace": "",
          "splitflag": "",
          "fullpath": "0",
          "currpage": "1",
      }
      # Without a timeout a stalled proxy/server blocks this call forever and
      # the retry decorator never gets a chance to run.
      response = r.get(url=url, headers=headers, params=params,
                       proxies=proxies, timeout=30)
      selector = etree.HTML(response.text)

      # Same label for every record of this run, so build it once.
      dt = str(f86) + '/' + str(f85)

      records = []
      new_urls = []
      seen = set()
      for href in selector.xpath('//a/@href'):
          if "art" not in href:
              continue
          # Listing hrefs are relative ("../../art/..."); make them absolute.
          url1 = 'http://gansu.chinatax.gov.cn' + href.replace('../..', '')
          if url1 in seen:
              # The Redis marker is only written after the loop, so a link that
              # repeats on the page would otherwise be fetched and stored twice.
              continue
          seen.add(url1)

          # 'n16' is this site's Redis set key (the original comment marked it
          # as the value to change).
          if r_myco15.sismember('n16', url1):
              print('已存在,>>>n16')
              continue

          rsd = r1_d(url1, dt)
          print(rsd)
          if not rsd:
              # A successful r1_d always returns a non-empty dict. An empty
              # result presumably means the retry wrapper gave up -- leave the
              # URL unmarked so a later run can try it again.
              continue
          records.append(rsd)
          new_urls.append(url1)

      if records:
          myco16.insert_many(records)
          print('已存入原始库')
          myco16_b.insert_many(records)
          print('已存入备份原始库')
          # Mark URLs as done only after both inserts have succeeded.
          for mis in new_urls:
              r_myco15.sadd('n16', mis)
  99. # if list1:
  100. # print(list1)
  101. # myco16.insert_many(list1)
  def runs(year='2023', quarter='1'):
      """Run one scrape for the given year and quarter.

      Args:
          year: Year passed to ``r1`` as its first argument. Defaults to
              ``'2023'``, the value that used to be hard-coded here.
          quarter: Quarter passed to ``r1`` as its second argument. Defaults
              to ``'1'``, the value that used to be hard-coded here.
      """
      r1(year, quarter)


  # The script still starts a scrape with the default period as soon as the
  # module is executed, exactly as before.
  runs()
  106. runs()