n24_shanxi.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco24,r_myco15,myco24_b
  8. from rety import retry
  # One shared HTTP session, reused by every request this module makes.
  r = requests.Session()
  # NOTE(review): `keep_alive` is not a documented requests.Session attribute,
  # so this assignment may have no effect on connection reuse -- confirm the
  # intent (a "Connection: close" header is the usual way to disable it).
  r.keep_alive = False
  @retry(3)
  def r1(ny1, ny2, pg):
      """Fetch one result page from the extQuery endpoint and store unseen records.

      The request asks for page ``pg`` of the ``web_zdsswf`` query with
      ``cx_xsrq`` set to ``"<ny1>-<ny2>"``, where ``ny2`` is zero-padded to two
      characters (presumably year and month -- confirm against the caller).

      Each returned record is identified by its ``'ajbh'`` field.  Records
      whose key is not yet a member of the ``'n24'`` set in ``r_myco15`` are
      inserted into ``myco24`` and ``myco24_b``; afterwards their keys are
      added to that set so later calls skip them.

      Args:
          ny1: First component of the ``cx_xsrq`` query value.
          ny2: Second component of the ``cx_xsrq`` query value; padded to two
              characters with a leading zero.
          pg: Page number to request.

      Returns:
          The ``totalPage`` value reported by the response, as an ``int``.

      Raises:
          requests.RequestException: on timeout, connection failure, or a
              4xx/5xx response.
          KeyError: if the JSON payload lacks the expected keys.
          How ``@retry`` treats these exceptions is defined outside this block.
      """
      # Zero-pad so the query value reads e.g. "2023-01" rather than "2023-1".
      padded = str(ny2).zfill(2)
      url = 'http://shanxi.chinatax.gov.cn/common/extQuery?sqlid=web_zdsswf&limit=10&cx_lx=0&cx_xsrq={ny1}-{ny2}&page={pg}'.format(ny1=ny1, ny2=padded, pg=pg)
      headers = {
          "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
      }
      data = {
          "start": "0",
      }
      # Without a timeout requests waits indefinitely, so a stalled proxy or
      # server would hang the run and the retry decorator would never fire.
      response = r.post(url=url, headers=headers, data=data, proxies=proxies, timeout=30)
      # Fail fast on HTTP errors instead of trying to parse an error page.
      response.raise_for_status()
      payload = response.json()
      records = payload['message']['list']
      total_pages = payload['message']['totalPage']

      # Keep the records and their keys in step: both are needed below.
      new_records = []
      new_keys = []
      for record in records:
          record_key = record['ajbh']
          if r_myco15.sismember('n24', record_key):
              print('已存在,>>>n24')
              continue
          new_records.append(record)
          new_keys.append(record_key)

      if new_records:
          myco24.insert_many(new_records)
          print('已存入原始库')
          myco24_b.insert_many(new_records)
          print('已存入备份原始库')
          # Mark keys as seen only after both inserts have succeeded.
          for record_key in new_keys:
              r_myco15.sadd('n24', record_key)
      return int(total_pages)
  50. # if rsl:
  51. # print('1')
  52. # myco24.insert_many(rsl)
  53. # ny1 = 2020
  54. # ny2 = 12
  def runs(ny1, ny2):
      """Fetch and store every result page for the given ``ny1``/``ny2`` pair.

      The first call to ``r1`` both stores page 1 and reports the total page
      count; the remaining pages are then fetched in order.

      Args:
          ny1: Passed through to ``r1`` as its first argument.
          ny2: Passed through to ``r1`` as its second argument.
      """
      print(ny1, ny2, '---------')
      print(1, '==================')
      # Page 1 is already stored by this call, so the loop starts at page 2
      # rather than requesting the first page a second time.
      total_pages = r1(ny1, ny2, pg=1)
      for pg in range(2, total_pages + 1):
          print(pg, '==================')
          r1(ny1, ny2, pg)
  # Script entry: run the scraper for the hard-coded query values below.
  ny1, ny2 = 2023, 12
  runs(ny1, ny2)