n11_guangxi.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json,re
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco11,r_myco15,myco11_b
  8. r = requests.session()
  9. r.keep_alive = False
  10. from rety import retry
  11. import urllib3
  12. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  13. @retry(3)
  14. def r1(pg):
  15. url = 'http://guangxi.chinatax.gov.cn/restSearch'
  16. headers = {
  17. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  18. }
  19. params = {
  20. "channelid":"290909",
  21. "searchword": "",
  22. "orderby": "RELEVANCE",
  23. "page": pg,
  24. "pageSize": "10",
  25. }
  26. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  27. # print(response.text)
  28. rsd = response.json()
  29. print(rsd['pager'])
  30. rsl = rsd['datas']
  31. # myco11.insert_many(rsl)
  32. # for i in rsl:
  33. # print(i)
  34. # for pg in range(109,310):
  35. # print(pg,'============')
  36. # r1(pg)
  37. def r2(ny1,ny2,pg):
  38. url = 'http://guangxi.chinatax.gov.cn/restSearch'
  39. headers = {
  40. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  41. }
  42. params = {
  43. "channelid": "290909",
  44. "searchword": "(NF={ny1} and YF={ny2}月)".format(ny1=ny1,ny2=ny2),
  45. "orderby": "RELEVANCE",
  46. "page": pg,
  47. "pageSize": "10",
  48. }
  49. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  50. # print(response.text)
  51. rsd = response.json()
  52. rpg = rsd['pager']['pageCount']
  53. rsl = rsd['datas']
  54. list1 = []
  55. list2 = []
  56. for i in rsl:
  57. url1 = i['DOCPUBURL']
  58. utf = r_myco15.sismember('n11', url1) ##更改
  59. if not utf:
  60. list2.append(url1)
  61. list1.append(rsd)
  62. else:
  63. print('已存在,>>>n11')
  64. if list1:
  65. myco11.insert_many(list1)
  66. print('已存入原始库')
  67. if list2:
  68. myco11_b.insert_many(list1)
  69. print('已存入备份原始库')
  70. for mis in list2:
  71. r_myco15.sadd('n11', mis) ##更改
  72. # print(response.text)
  73. return int(rpg)
  74. # r2(2)
  75. def runs(ny1, ny2):
  76. rpg = r2(ny1, ny2, pg=1)
  77. if rpg > 1:
  78. for pg in range(2,rpg+1):
  79. print(pg,'==========')
  80. r2(ny1, ny2, pg)
  81. ny1 = 2023
  82. ny2 = 11
  83. runs(ny1, ny2)