#!/usr/bin/env python
# coding:utf-8
"""Page through a query endpoint on shanghai.chinatax.gov.cn and store new records.

Each page of results is fetched over HTTP (through the configured proxies).
Records whose ``djxh`` value is not yet a member of the ``n33`` set are
inserted into ``myco33`` and ``myco33_b`` and their ``djxh`` is then added to
that set, so later pages/runs skip them.

NOTE: importing this module runs ``runs(2020, 1)`` immediately (see bottom).
"""
import json
from urllib import parse

import requests
from lxml import etree

from setting import proxies
from mongo_cho import myco33, r_myco15, myco33_b
from rety import retry

# Seconds to wait for the server before giving up on a request.  Without a
# timeout a stalled proxy/server would block the scraper indefinitely.
REQUEST_TIMEOUT = 30

# Name of the set (accessed through ``r_myco15``) holding already-stored
# ``djxh`` values.  Original author's note on this key: "change" -- presumably
# it is the value to edit when adapting this script to another source.
SEEN_SET_KEY = 'n33'

r = requests.session()
# NOTE(review): requests' Session does not document a ``keep_alive``
# attribute, so this assignment most likely has no effect.  Kept as-is.
r.keep_alive = False


@retry(3)
def r1(ny1, ny2, pg) -> int:
    """Fetch one result page for a year/month and persist the unseen records.

    Args:
        ny1: Year (str or int); combined with ``ny2`` for the ``nd`` query
            parameter and for each record's ``date`` field.
        ny2: Month, expected to be already zero-padded (e.g. ``'01'``).
        pg: Page number to request (sent as ``curPage``).

    Returns:
        The ``pageCount`` value reported by the server, converted to ``int``.

    Raises:
        KeyError: if the JSON response lacks ``pageCount``/``pageData`` or a
            record lacks ``djxh``.
        requests.RequestException: on network failure or timeout.
        (How ``@retry(3)`` treats these exceptions is defined in ``rety``,
        which is not visible here.)
    """
    url = 'http://shanghai.chinatax.gov.cn/newxbwz/tycx/TYCXzdsswfajgblCtrl-getxxsByTj.pfv'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "nd": str(ny1) + str(ny2),
        "qjswjgdm": "",
        "curPage": pg,
        "time": "Tue Jun 08 2021 08:11:49 GMT 0800 (中国标准时间)",
    }
    response = r.get(
        url=url,
        headers=headers,
        params=params,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    # print(response.text)
    payload = response.json()
    page_count = payload['pageCount']
    records = payload['pageData']

    # Formatted rather than concatenated so int arguments work here just as
    # they already do for the ``nd`` parameter above.
    date_label = f'{ny1}/{ny2}'

    new_records = []
    new_ids = []
    for record in records:
        record['date'] = date_label
        # Drop this column before storing; tolerate records that lack it.
        record.pop('toChar(t2.ajDm)', None)
        print(record)
        record_id = record['djxh']
        if r_myco15.sismember(SEEN_SET_KEY, record_id):
            print('已存在,>>>n33')
            continue
        new_records.append(record)
        new_ids.append(record_id)

    if new_records:
        myco33.insert_many(new_records)
        print('已存入原始库')
        myco33_b.insert_many(new_records)
        print('已存入备份原始库')
        # Mark ids as seen only after both inserts have gone through.
        for record_id in new_ids:
            r_myco15.sadd(SEEN_SET_KEY, record_id)

    return int(page_count)


# l1=['2019','2020']
# l2 = ['01','02','03','04','05','06','07','08','09','10','11','12']
def runs(ny1, ny2) -> None:
    """Fetch every result page for the given year and month.

    Args:
        ny1: Year (str or int).
        ny2: Month (str or int); a single-digit month is zero-padded to two
            characters before being sent.
    """
    year = str(ny1)
    month = str(ny2).zfill(2)

    # The first page tells us how many pages exist in total.
    total_pages = r1(year, month, pg=1)
    print(year, month, '===')

    # Empty range when there is only one page.
    for pg in range(2, total_pages + 1):
        print(pg, '===')
        r1(year, month, pg)


# Runs on import/execution, exactly as in the original script.
runs(2020, 1)