n28_ningxia.py

#!/usr/bin/env python
# coding:utf-8
import re
import requests, json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco28, r_myco15, myco28_b
from rety import retry

r = requests.session()
r.keep_alive = False
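
# The three project-local imports above (setting, mongo_cho, rety) are not part
# of this file. A minimal sketch of what they are assumed to provide, with
# placeholder connection details, for anyone running the script standalone:
#
#   # setting.py -- proxy dict passed straight to requests
#   proxies = {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"}
#
#   # mongo_cho.py -- MongoDB collections and the Redis handle used for dedup
#   import pymongo, redis
#   _client = pymongo.MongoClient("mongodb://localhost:27017/")
#   myco28 = _client["tax"]["n28_ningxia"]        # raw collection
#   myco28_b = _client["tax"]["n28_ningxia_bak"]  # backup raw collection
#   r_myco15 = redis.Redis(host="localhost", port=6379, db=15)
#
#   # rety.py -- simple retry decorator
#   import functools, time
#   def retry(times):
#       def deco(func):
#           @functools.wraps(func)
#           def wrapper(*args, **kwargs):
#               for attempt in range(times):
#                   try:
#                       return func(*args, **kwargs)
#                   except Exception:
#                       if attempt == times - 1:
#                           raise
#                       time.sleep(1)
#           return wrapper
#       return deco
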
@retry(3)
def r1_d(url, dt):
    # Parse the key/value table on an article detail page into a dict.
    # url example: 'http://ningxia.chinatax.gov.cn/art/2021/3/3/art_14329_8626.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    response.encoding = 'UTF-8'
    html = response.text
    selector = etree.HTML(html)
    a = selector.xpath('//table[@class="color"]//tr')
    dict1 = {}
    for i in a:
        k1 = i.xpath('td[1]')
        if k1:
            k2 = k1[0].xpath('string(.)').strip()
            k3 = k2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
            # print(k2)
            v1 = i.xpath('td[2]')
            if v1:
                v2 = v1[0].xpath('string(.)').strip()
                v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
                # print(v3)
            else:
                v3 = ''
            dict1[k3] = v3
    dict1['url'] = url
    dict1['date'] = dt
    # print(dict1)
    return dict1
  42. # r1_d('1')
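# Example call (the URL is the sample article page from the comment above, and
# the second argument is its yyyy/m/d path segment, as built by r1 below):
# r1_d('http://ningxia.chinatax.gov.cn/art/2021/3/3/art_14329_8626.html', '2021/3/3')
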

@retry(3)
def r1(ny, pg):
    # Fetch one result page of the site's search module for column `ny` and
    # store any article rows that have not been seen before.
    url = 'http://ningxia.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "vc_name": "",
        "field_147": "",
        "field_149": "",
        "field_148": "",
        "field_150": "",
        "field_151": "",
        "strSelectID": "104,147,148,149,150,151",
        "i_columnid": ny,  # column id for the selected month, e.g. 202103
        "currpage": pg,
        "field": "field_148:1,field_149:1,vc_name:1,field_147:1,field_150:1,field_151:1",
        "initKind": "FieldForm",
        "currentplace": "",
        "splitflag": "",
        "fullpath": "0",
    }
    response = r.get(url=url, headers=headers, params=params, proxies=proxies)
    html = response.text
    # print(html)
    rpg = re.findall(r'共 (.*?) 页', html)  # total page count ("共 N 页")
    # print(rpg)
    selector = etree.HTML(html)
    a = selector.xpath('//a/@href')
    list1 = []
    list2 = []
    for i in a:
        # print(i)
        if "art" in i:
            url1 = 'http://ningxia.chinatax.gov.cn' + i.replace('../..', '')
            print(url1)
            dt1 = i.split('/')
            # print(dt1)
            dt = dt1[3] + '/' + dt1[4] + '/' + dt1[5]
            print(dt)
            utf = r_myco15.sismember('n28', url1)  ## change
            if not utf:
                rsd = r1_d(url1, dt)
                print(rsd)
                list1.append(rsd)
                list2.append(url1)
            else:
                print('already exists, >>>n28')
    if list1:
        myco28.insert_many(list1)
        print('saved to the raw collection')
    if list2:
        myco28_b.insert_many(list1)
        print('saved to the backup raw collection')
        for mis in list2:
            r_myco15.sadd('n28', mis)  ## change
    return rpg[0]
    # if list1:
    #     myco28.insert_many(list1)
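
# get_pg(ny): resolve month number `ny` (1-12) to the site column id that the
# search module expects, by scraping the year menu on the column index page.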
def get_pg(ny):
    url = 'http://ningxia.chinatax.gov.cn/col/col14330/index.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    response.encoding = 'UTF-8'
    html = response.text
    # print(html)
    selector = etree.HTML(html)
    ### the dl with id "niandu3" corresponds to the year 2021
    a = selector.xpath('//dl[@id="niandu3"]//dt[@class="open"]//a')
    for i in a:
        yf = i.xpath('text()')[0]
        if yf == '{}月'.format(ny):  # link text is "<month>月"
            href = i.xpath('@href')[0]
            print(href)
            h1 = re.findall(r'col/col(.*?)/i', href)
            return h1[0]
# yf = get_pg(3)
# print(yf)
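
# runs(ny): drive the scrape for month `ny` of the year behind the "niandu3"
# menu (2021): resolve the month's column id, fetch page 1 to learn the total
# page count, then walk the remaining pages.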
def runs(ny):
    yf = get_pg(ny)
    rpg = r1(yf, pg=1)
    for pg in range(2, int(rpg) + 1):
        print(pg, '=============')
        r1(yf, pg)  # pass the column id (yf), not the month number


runs(1)
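
# The unconditional runs(1) above kicks off a scrape of January as soon as the
# file is executed. If the module is also imported elsewhere, a guarded entry
# point is assumed to be preferable:
# if __name__ == '__main__':
#     runs(1)   # or runs(3) for March, etc.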