n23_hn.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco23,r_myco15,myco23_b
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r1_d(uid,dt):
  14. url = 'http://hainan.chinatax.gov.cn/weifaCase/weifa_case_list.htm'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. data = {
  19. "id":uid
  20. }
  21. response = r.post(url=url, headers=headers, data=data,proxies=proxies)
  22. html = response.text
  23. selector = etree.HTML(html)
  24. a = selector.xpath('//table[@class="div2-table3"]//tr')
  25. dict1 = {}
  26. for i in a:
  27. k1 = i.xpath('th')
  28. if k1:
  29. k2 = k1[0].xpath('string(.)').strip()
  30. k3 = k2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  31. # print(k2)
  32. v1 = i.xpath('td')
  33. if v1:
  34. v2 = v1[0].xpath('string(.)').strip()
  35. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  36. # print(v3)
  37. else:
  38. v3 = ''
  39. dict1[k3] = v3
  40. dict1['url'] = uid
  41. # dt = ''
  42. dict1['date'] = dt
  43. # print(dict1)
  44. return dict1
  45. # r1_d('1')
  46. @retry(3)
  47. def r1(ny1,ny2,pg):
  48. url = 'http://hainan.chinatax.gov.cn/weifaCase/weifa_case_list.htm?pageNo={}'.format(pg)
  49. headers = {
  50. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  51. }
  52. dt = str(ny1) + '/' + str(ny2)
  53. data = {
  54. "area":"",
  55. "ajinformation": "",
  56. "startDate": str(ny1) + '-' + str(ny2),
  57. "month": "1",
  58. "nsrname": "",
  59. "nsridentify": "",
  60. "regaddress": "",
  61. "organization": "",
  62. "legal": "",
  63. "legalId": "",
  64. "finance": "",
  65. "financeId": "",
  66. }
  67. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  68. html = response.text
  69. # print(html)
  70. tpg = re.findall(r'共<em>(.*?)</em>条',html)
  71. # print(tpg)
  72. if tpg:
  73. rpg = int(tpg[0])
  74. else:
  75. rpg = 0
  76. selector = etree.HTML(html)
  77. a = selector.xpath('//input/@onclick')
  78. list1 = []
  79. list2 = []
  80. for i in a:
  81. uid = i.replace('weifaCaseDetail(','').replace(')','')
  82. # print(uid)
  83. utf = r_myco15.sismember('n23', uid) ##更改
  84. if not utf:
  85. rsd = r1_d(uid, dt)
  86. print(rsd)
  87. list1.append(rsd)
  88. if list1:
  89. myco23.insert_many(list1)
  90. print('已存入原始库')
  91. if list2:
  92. myco23_b.insert_many(list1)
  93. print('已存入备份原始库')
  94. for mis in list2:
  95. r_myco15.sadd('n23', mis) ##更改
  96. return rpg
  97. # if list1:
  98. # myco23.insert_many(list1)
  99. def runs():
  100. ny1= '2023'
  101. ny2 = '11'
  102. # pg = 2
  103. rpg = r1(ny1,ny2,pg=1)
  104. tpg = rpg//15 +1
  105. for pg in range(2,tpg):
  106. r1(ny1,ny2,pg)
  107. runs()