n25_shanxi.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco25,r_myco15,myco25_b
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r1_d(url,dt):
  14. # url = 'http://shaanxi.chinatax.gov.cn/art/2021/4/15/art_15616_7502.html'
  15. headers = {
  16. "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0; .NET CLR 2.0.50727; SLCC2; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729)",
  17. }
  18. response = r.get(url=url,headers=headers,proxies=proxies)
  19. response.encoding = 'UTF-8'
  20. html = response.text
  21. selector = etree.HTML(html)
  22. a = selector.xpath('//table[@class="zdsc_con"]//tr')
  23. dict1 = {}
  24. for i in a:
  25. k1 = i.xpath('td[1]')
  26. if k1:
  27. k2 = k1[0].xpath('string(.)').strip()
  28. k3 = k2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  29. # print(k2)
  30. v1 = i.xpath('td[2]')
  31. if v1:
  32. v2 = v1[0].xpath('string(.)').strip()
  33. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  34. # print(v3)
  35. else:
  36. v3 = ''
  37. dict1[k3] = v3
  38. dict1['url'] = url
  39. # dt = ''
  40. dict1['date'] = dt
  41. # print(dict1)
  42. return dict1
  43. # r1_d()
  44. @retry(3)
  45. def r1(ny1,ny2,pg):
  46. url = 'http://shaanxi.chinatax.gov.cn/module/search/index.jsp'
  47. headers = {
  48. "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0; .NET CLR 2.0.50727; SLCC2; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729)",
  49. }
  50. params = {
  51. "field_2166": "",
  52. "field_1656": "",
  53. "field_1652": "",
  54. "field_1663": "",
  55. "field_1653": "",
  56. "field_2390": "",
  57. "field_2213": "",
  58. "field_2391": "",
  59. "field_2410": ny2,
  60. "field_1670": "",
  61. "field_1672": ny1,
  62. "currpage": pg,
  63. "field_1651": "",
  64. "strSelectID": "style_2166,1656,1663,1652,1653,2390,2213,2391,1651,1672,2410,1670",
  65. "i_columnid": "style_3",
  66. "field": "field_2166:1:0,field_2213:1:0,field_1656:1:0,field_2391:1:0,field_2410:12:0,field_1651:12:0,field_1652:1:0,field_2390:1:0,field_1672:12:0,field_1670:12:0,field_1653:1:0,field_1663:1:0",
  67. "initKind": "FieldFormMetadata",
  68. "type": "0,0,0,0,0,0,0,0,0,0,0,0",
  69. "currentplace": "",
  70. "splitflag": "",
  71. "fullpath": "0",
  72. }
  73. response = r.get(url=url, headers=headers, params=params,proxies=proxies)
  74. html = response.text
  75. # print(html)
  76. rpg = re.findall("<font color='red'>(.*?)</font>",html)[0]
  77. # print(rpg)
  78. selector = etree.HTML(html)
  79. a = selector.xpath('//li//a/@href')
  80. list1 = []
  81. list2 = []
  82. for i in a:
  83. # print(i)
  84. url1 = 'http://shaanxi.chinatax.gov.cn' + i.replace('../..','')
  85. dt = str(ny1) + '/' + str(ny2+1)
  86. utf = r_myco15.sismember('n25', url1) ##更改
  87. if not utf:
  88. rsd = r1_d(url1, dt)
  89. print(rsd)
  90. list1.append(rsd)
  91. list2.append(url1)
  92. else:
  93. print('已存在,>>>n25')
  94. if list1:
  95. myco25.insert_many(list1)
  96. print('已存入原始库')
  97. if list2:
  98. myco25_b.insert_many(list1)
  99. print('已存入备份原始库')
  100. for mis in list2:
  101. r_myco15.sadd('n25', mis) ##更改
  102. # if list1:
  103. # myco25.insert_many(list1)
  104. return int(rpg)
  105. def runs(ny1,ny2):
  106. # for ny1 in range(2021,2022):
  107. # for ny2 in range(1,2):
  108. # print(ny1,ny2,'========')
  109. ny3 = int(ny2) - 1
  110. rpg = r1(ny1,ny3,pg=1)
  111. # print(rpg,'----------------')
  112. tpg = rpg//20
  113. if tpg >1:
  114. for pg in range(2,tpg +2):
  115. print(pg,'=======')
  116. r1(ny1,ny3,pg)
  117. runs(2023,11)