123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco33,r_myco15,myco33_b
- from rety import retry
# One HTTP session shared by every request issued from this module.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented attribute of
# requests.Session, so this assignment is probably a no-op. If the intent is
# to disable connection reuse, confirm and consider sending a
# "Connection: close" header instead.
r.keep_alive = False
# Seconds to wait on the remote endpoint before giving up. Without a timeout
# a stalled proxy/connection blocks forever and @retry never gets a chance
# to run again.
_REQUEST_TIMEOUT = 30


@retry(3)
def r1(ny1, ny2, pg):
    """Fetch one result page for a period and store the records not seen before.

    Queries the Shanghai tax bureau listing endpoint for page ``pg`` of the
    period ``str(ny1) + str(ny2)``, tags every returned record with a
    ``date`` field (``"<ny1>/<ny2>"``), drops the ``toChar(t2.ajDm)`` field,
    and de-duplicates on the record's ``djxh`` value against the ``n33`` set
    held by ``r_myco15``. New records are written to ``myco33`` and
    ``myco33_b``, then their ``djxh`` values are added to ``n33``.

    Args:
        ny1: First part of the period (the caller passes a year such as
            ``"2020"``); converted with ``str()``.
        ny2: Second part of the period (the caller passes a zero-padded
            month such as ``"01"``); converted with ``str()``.
        pg: Page number sent as ``curPage``.

    Returns:
        The ``pageCount`` reported by the endpoint, as an ``int``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If the response has no ``pageCount`` or a record has no
            ``djxh``.

    NOTE(review): what happens to these exceptions depends on the project's
    ``retry`` decorator, which is not visible here. If a retry happens after
    the first insert succeeded but before ``n33`` was updated, the same
    records may be inserted again — confirm whether that matters.
    """
    url = 'http://shanghai.chinatax.gov.cn/newxbwz/tycx/TYCXzdsswfajgblCtrl-getxxsByTj.pfv'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Normalise once so the query parameter and the stored date agree even
    # when the caller passes integers.
    period_first = str(ny1)
    period_second = str(ny2)
    params = {
        "nd": period_first + period_second,
        "qjswjgdm": "",
        "curPage": pg,
        # Fixed timestamp string captured from a browser request; presumably
        # only a cache-buster — TODO confirm the server ignores its value.
        "time": "Tue Jun 08 2021 08:11:49 GMT 0800 (中国标准时间)",
    }
    response = r.get(
        url=url,
        headers=headers,
        params=params,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT,
    )
    # Fail on HTTP errors explicitly instead of as an opaque JSON decode error.
    response.raise_for_status()
    payload = response.json()
    page_count = payload['pageCount']
    # A page without rows may come back as null/absent; treat it as empty.
    records = payload.get('pageData') or []

    new_records = []
    new_keys = []
    # Keys accepted from this page so far: the shared set is only updated
    # after the inserts, so it cannot catch duplicates inside one page.
    page_keys = set()
    for record in records:
        record['date'] = period_first + '/' + period_second
        # Drop the unwanted field; tolerate records that do not carry it.
        record.pop('toChar(t2.ajDm)', None)
        print(record)
        key = record['djxh']
        if key in page_keys or r_myco15.sismember('n33', key):  # change set name here
            print('已存在,>>>n33')
            continue
        page_keys.add(key)
        new_records.append(record)
        new_keys.append(key)

    if new_records:
        myco33.insert_many(new_records)
        print('已存入原始库')
        # The backup store receives the very same dict objects.
        # NOTE(review): if these are pymongo collections, the first
        # insert_many adds `_id` to each dict, so both stores share ids.
        myco33_b.insert_many(new_records)
        print('已存入备份原始库')
        # Mark keys as seen only after both stores accepted the records.
        for key in new_keys:
            r_myco15.sadd('n33', key)  # change set name here
    return int(page_count)
- # l1=['2019','2020']
- # l2 = ['01','02','03','04','05','06','07','08','09','10','11','12']
def runs(ny1, ny2):
    """Crawl every result page of one period.

    Converts both arguments to strings, left-pads a one-character ``ny2``
    with ``'0'``, fetches page 1 through ``r1`` to learn the page count, and
    then fetches pages 2 through that count in order.

    Args:
        ny1: First part of the period (the script passes a year, e.g. 2020).
        ny2: Second part of the period (the script passes a month, e.g. 1).
    """
    year = str(ny1)
    month = str(ny2)
    if len(month) == 1:
        month = '0' + month

    # The first page is always requested; its return value is the page count.
    total_pages = r1(year, month, pg=1)
    print(year, month, '===')
    if total_pages <= 1:
        return

    for page in range(2, total_pages + 1):
        print(page, '===')
        r1(year, month, page)
# Script entry point: crawl all pages for the period 2020/01.
runs(ny1=2020, ny2=1)
|