n13_ln.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import time
  5. import requests,json
  6. from setting import proxies
  7. from urllib import parse
  8. from lxml import etree
  9. from mongo_cho import myco13,r_myco15,myco13_b
  10. from rety import retry
  11. r = requests.session()
  12. r.keep_alive = False
  13. @retry(3)
  14. def r1_d(url,dt):
  15. # url = 'http://liaoning.chinatax.gov.cn/art/2020/12/8/art_5883_1808.html'
  16. headers = {
  17. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  18. }
  19. response = r.get(url=url,headers=headers,proxies=proxies)
  20. response.encoding = 'UTF-8'
  21. html = response.text
  22. selector = etree.HTML(html)
  23. a = selector.xpath('//table[@class="contentTable"]//tr')
  24. dict1 = {}
  25. for i in a:
  26. k1 = i.xpath('td[1]')
  27. if k1:
  28. k2 = k1[0].xpath('string(.)').strip()
  29. k3 = k2.replace('\r','').replace('\t','').replace('\n','').replace(' ','')
  30. # print(k2)
  31. v1 = i.xpath('td[2]')
  32. if v1:
  33. v2 = v1[0].xpath('string(.)').strip()
  34. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  35. # print(v3)
  36. else:
  37. v3 = ''
  38. if k3:
  39. dict1[k3] = v3
  40. dict1['url'] = url
  41. dict1['date'] = dt
  42. # print(dict1)
  43. return dict1
  44. # r1_d()
  45. @retry(3)
  46. def r1(ny,pg,dt):
  47. url = 'http://liaoning.chinatax.gov.cn/module/search/index.jsp'
  48. headers = {
  49. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  50. }
  51. params= {
  52. "a":"",
  53. "b": "",
  54. "d": "",
  55. "c": "",
  56. "e": "",
  57. "h": "",
  58. "k": "",
  59. "n": "",
  60. "t": "",
  61. "x_large": "",
  62. "x_small": "",
  63. "y_large": "",
  64. "y_small": "",
  65. "z_large": ny,
  66. "z_small": ny,
  67. "strSelectID": "1754,1755,1756,1757,1758,1761,1764,1767,1777,1778,1779",
  68. "i_columnid": "5883",
  69. "field": "a:1:0,b:1:1,c:1:1,d:1:0,e:1:1,h:1:0,k:1:1,n:1:0,t:1:0,x:0:1,y:0:1,z:0:1",
  70. "initKind": "FieldForm",
  71. "type": "0,1,1,0,1,0,1,0,0,1,1,1",
  72. "currpage":pg,
  73. "currentplace": "",
  74. "splitflag": "",
  75. "fullpath": "0",
  76. }
  77. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  78. html = response.text
  79. # print(html)
  80. rpg = re.findall(r'共 (.*?)&nbsp',html)[0]
  81. # print(rpg)
  82. # time.sleep(9)
  83. selector = etree.HTML(html)
  84. rsl = selector.xpath('//a[@class="xxxx"]/@href')
  85. list1 = []
  86. list2 = []
  87. for i in rsl:
  88. # print(i)
  89. url1 = 'http://liaoning.chinatax.gov.cn' + i.replace('../..','')
  90. print(url1)
  91. utf = r_myco15.sismember('n13', url1) ##更改
  92. if not utf:
  93. rsd = r1_d(url1, dt)
  94. print(rsd)
  95. list1.append(rsd)
  96. list2.append(url1)
  97. else:
  98. print('已存在,>>>n13')
  99. if list1:
  100. myco13.insert_many(list1)
  101. print('已存入原始库')
  102. if list2:
  103. myco13_b.insert_many(list1)
  104. print('已存入备份原始库')
  105. for mis in list2:
  106. r_myco15.sadd('n13', mis) ##更改
  107. return int(rpg)
# Column page matching the "i_columnid" 5883 used by r1():
# http://liaoning.chinatax.gov.cn/col/col5883/index.html
  109. def runs(ny1,ny2):
  110. if len(str(ny2)) == 1:
  111. ny = str(ny1) + '0' + str(ny2)
  112. else:
  113. ny = str(ny1) + str(ny2)
  114. pg = 1
  115. dt = str(ny1) + '/' + str(ny2)
  116. tpg = r1(ny,pg,dt)
  117. for pg in range(1,tpg+1):
  118. print(pg,'=====')
  119. r1(ny,pg,dt)
  120. runs(2023,4)