n15_jl.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco15,r_myco15,myco15_b
  9. from rety import retry
# One HTTP session shared by every request made in this module.
r = requests.session()
# Sets a ``keep_alive`` attribute on the session object.
# NOTE(review): presumably intended to disable connection reuse — confirm
# that requests actually reads this attribute; it may have no effect.
r.keep_alive = False
  12. @retry(3)
  13. def r1_d(url,dt):
  14. # url = 'http://jilin.chinatax.gov.cn/art/2021/3/3/art_19972_7390.html'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. response = r.get(url=url,headers=headers,proxies=proxies)
  19. response.encoding = 'UTF-8'
  20. html = response.text
  21. selector = etree.HTML(html)
  22. a = selector.xpath('//table[@class="zdwf"]//tr')
  23. dict1 = {}
  24. for i in a:
  25. k1 = i.xpath('td[1]/text()')[0]
  26. v1 = i.xpath('td[2]/text()')
  27. # print(k1,v1)
  28. v3 = ''
  29. for v2 in v1:
  30. v3 += v2
  31. k2 = k1.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
  32. v4 = v3.replace(' ','').replace('\r','').replace('\n','').replace('\t','')
  33. if k2:
  34. dict1[k2] = v4
  35. dict1['url'] = url
  36. dict1['date'] = dt
  37. # print(dict1)
  38. return dict1
  39. # r1_d('1','2')
  40. @retry(3)
  41. def r1(ny, dt, pg):
  42. url = 'http://jilin.chinatax.gov.cn/module/search/index.jsp'
  43. headers = {
  44. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  45. }
  46. params = {
  47. "field_1136_large":"",
  48. "field_1136_small": "",
  49. "field_1137_large": "",
  50. "field_1137_small": "",
  51. "field_1138_large": ny,
  52. "field_1138_small": ny,
  53. "field_1113": "",
  54. "field_1114": "",
  55. "field_1115": "",
  56. "field_1116": "",
  57. "field_1117": "",
  58. "field_1120": "",
  59. "field_1123": "",
  60. "field_1126": "",
  61. "strSelectID": "1113,1114,1115,1116,1117,1120,1123,1126,1136,1137,1138",
  62. "i_columnid": "19972",
  63. "field": "field_1113:1:1,field_1114:1:1,field_1115:1:1,field_1116:1:1,field_1117:1:1,field_1120:1:1,field_1123:1:1,field_1126:1:1,field_1136:0:1,field_1137:0:1,field_1138:0:1",
  64. "initKind": "FieldForm",
  65. "type": "1,1,1,1,1,1,1,1,1,1,1",
  66. "currentplace": "",
  67. "currpage": pg,
  68. "splitflag": "",
  69. "fullpath": "0",
  70. }
  71. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  72. # print(response.text)
  73. html = response.text
  74. tpg = re.findall('共(.*?)页',html)[0].replace(' ','')
  75. selector = etree.HTML(html)
  76. rsl = selector.xpath('//a[@class="xxxx"]/@href')
  77. list1 = []
  78. list2 = []
  79. for i in rsl:
  80. # print(i)
  81. url1 = 'http://jilin.chinatax.gov.cn/' + i.replace('../..', '')
  82. # print(url1)
  83. utf = r_myco15.sismember('n15', url1) ##更改
  84. if not utf:
  85. rsd = r1_d(url1, dt)
  86. print(rsd)
  87. list2.append(url1)
  88. list1.append(rsd)
  89. else:
  90. print('已存在,>>>n15')
  91. if list1:
  92. myco15.insert_many(list1)
  93. print('已存入原始库')
  94. if list2:
  95. myco15_b.insert_many(list1)
  96. print('已存入备份原始库')
  97. for mis in list2:
  98. r_myco15.sadd('n15', mis) ##更改
  99. # if list1:
  100. # myco15.insert_many(list1)
  101. return tpg
  102. # ny= '202001'
  103. # pg = '1'
  104. # dt = '0'
  105. def runs(ny1,ny2):
  106. # for ny1 in range(2019,2021): #2021
  107. # for ny2 in range(1,13): # 1 4
  108. if len(str(ny2)) ==1:
  109. ny2 = '0' +str(ny2)
  110. ny = str(ny1) + str(ny2)
  111. dt = str(ny1) + '/' +str(ny2)
  112. print(ny,dt,'======')
  113. rpg = r1(ny, dt, pg=1)
  114. if int(rpg) >1:
  115. for pg1 in range(2,int(rpg)+1):
  116. r1(ny, dt, pg1)
# Script entry: crawl a single period. There is no ``__main__`` guard, so
# this also runs whenever the module is imported.
ny1 = '2023'  # first part of the period (presumably the year — confirm)
ny2 = '10'  # second part of the period (presumably the month — confirm)
runs(ny1,ny2)