n33_shanghai.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import requests,json
  4. from setting import proxies
  5. from urllib import parse
  6. from lxml import etree
  7. from mongo_cho import myco33,r_myco15,myco33_b
  8. from rety import retry
  9. r = requests.session()
  10. r.keep_alive = False
  11. @retry(3)
  12. def r1(ny1,ny2,pg):
  13. url = 'http://shanghai.chinatax.gov.cn/newxbwz/tycx/TYCXzdsswfajgblCtrl-getxxsByTj.pfv'
  14. headers = {
  15. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  16. }
  17. params = {
  18. "nd":str(ny1)+str(ny2),
  19. "qjswjgdm": "",
  20. "curPage":pg,
  21. "time": "Tue Jun 08 2021 08:11:49 GMT 0800 (中国标准时间)",
  22. }
  23. response = r.get(url=url,headers=headers,params=params,proxies=proxies)
  24. # print(response.text)
  25. rsd =response.json()
  26. rpg = rsd['pageCount']
  27. rsl = rsd['pageData']
  28. list1 = []
  29. list2 = []
  30. for i in rsl:
  31. i['date'] = ny1 + '/' + ny2
  32. i.pop('toChar(t2.ajDm)')
  33. print(i)
  34. url1 = i['djxh']
  35. utf = r_myco15.sismember('n33', url1) ##更改
  36. if not utf:
  37. list1.append(i)
  38. list2.append(url1)
  39. else:
  40. print('已存在,>>>n33')
  41. if list1:
  42. myco33.insert_many(list1)
  43. print('已存入原始库')
  44. if list2:
  45. myco33_b.insert_many(list1)
  46. print('已存入备份原始库')
  47. for mis in list2:
  48. r_myco15.sadd('n33', mis) ##更改
  49. return int(rpg)
  50. # l1=['2019','2020']
  51. # l2 = ['01','02','03','04','05','06','07','08','09','10','11','12']
  52. def runs(ny1,ny2):
  53. ny11 = str(ny1)
  54. ny22 = str(ny2)
  55. if len(ny22) ==1:
  56. ny33 = '0' + ny22
  57. else:
  58. ny33 = ny22
  59. rpg = r1(ny11,ny33,pg=1)
  60. print(ny11,ny33,'===')
  61. if rpg >1:
  62. for pg in range(2,rpg+1):
  63. print(pg,'===')
  64. r1(ny11,ny33,pg)
  65. runs(2020,1)