#!/usr/bin/env python
# coding:utf-8
"""Page through the Hubei e-tax ``gzcxAction.do`` query and archive new rows.

For one period string (for example ``'2023年11月'``) the script requests the
``zdsswfajcx`` listing page by page, skips rows whose ``LSH`` value is already
recorded in the Redis set, and writes the remaining rows to the primary and
backup MongoDB collections.

NOTE: the crawl for the hard-coded period at the bottom of this file starts as
soon as the module is executed or imported.
"""
import json
from urllib import parse

import requests
from lxml import etree

from setting import proxies
from mongo_cho import myco19, r_myco15, myco19_b
from rety import retry

# Rows requested per page; also used to derive the number of pages.
PAGE_SIZE = 15
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30
# Redis set holding the LSH values that have already been stored.
# Change this key when pointing the script at a different dataset.
DEDUP_SET_KEY = 'n19'

r = requests.session()
# NOTE(review): requests.Session documents no ``keep_alive`` attribute, so this
# assignment probably has no effect on connection reuse -- confirm before
# relying on it.
r.keep_alive = False


@retry(3)
def r1(ny, pg):
    """Fetch page *pg* for period *ny* and store the rows not seen before.

    Each row's ``LSH`` value is checked against the Redis set
    ``DEDUP_SET_KEY``. Unseen rows are inserted into ``myco19`` and
    ``myco19_b`` and their ``LSH`` values are then added to the set.

    Args:
        ny: Period filter sent as the ``ssnd`` parameter, e.g. ``'2023年11月'``.
        pg: 1-based page number to request.

    Returns:
        The exclusive upper bound of the page range, i.e. the caller should
        iterate ``range(2, bound)`` to visit every remaining page. It is
        derived from the ``count`` field of the response, which is assumed to
        be the total number of matching rows -- TODO confirm with the API.

    Raises:
        Whatever ``requests``, the JSON decoder, pymongo or redis raise; the
        ``@retry(3)`` decorator decides how those are handled.
    """
    url = 'https://etax.hubei.chinatax.gov.cn/webroot/gzcxAction.do'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "method": "zdsswfajcx",
        "page": pg,
        "limit": str(PAGE_SIZE),
        "nsrsbh": "",
        "zzjgdm": "",
        "nsrmc": "",
        "fddbrmc": "",
        "fddbrzjh": "",
        "cwfzrmc": "",
        "cwfzrzjh": "",
        "nsrlx": "",
        "ds": "",
        "zcdz": "",
        "ajxz": "",
        # NOTE(review): requests percent-encodes params itself, so this value
        # ends up double-encoded on the wire. Kept as-is because the server may
        # expect that -- verify before changing.
        "ssnd": parse.quote(ny),
    }
    # Without a timeout a stalled proxy would block forever and the retry
    # decorator would never get a chance to run again.
    response = r.get(
        url=url,
        headers=headers,
        params=params,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    payload = response.json()
    total = payload['count']
    # Treat a null "data" field the same as an empty page.
    rows = payload['data'] or []

    new_rows = []
    new_ids = []
    for row in rows:
        lsh = row['LSH']
        if r_myco15.sismember(DEDUP_SET_KEY, lsh):
            print('已存在,>>>n01')
        else:
            new_rows.append(row)
            new_ids.append(lsh)

    if new_rows:
        myco19.insert_many(new_rows)
        print('已存入原始库')
        myco19_b.insert_many(new_rows)
        print('已存入备份原始库')
        # Mark rows as seen only after both inserts succeeded, so a failed
        # insert leaves them eligible for the next attempt.
        for lsh in new_ids:
            r_myco15.sadd(DEDUP_SET_KEY, lsh)

    # Ceil division gives the real page count; +1 turns it into an exclusive
    # bound. The previous ``total // 15 + 2`` requested one extra, empty page
    # whenever ``total`` was an exact multiple of the page size.
    page_count = -(-total // PAGE_SIZE)
    return page_count + 1


def runs(ny1, ny2):
    """Crawl every result page for the given year and month.

    Args:
        ny1: Year, as a string or int (e.g. ``'2023'``).
        ny2: Month, as a string or int (e.g. ``'11'``).
    """
    ny = str(ny1) + '年' + str(ny2) + '月'
    # The first page also tells us how many pages there are in total.
    rpg = r1(ny, pg=1)
    print(ny, '======')
    for pg in range(2, rpg):
        print(pg, '==')
        r1(ny, pg)


# Runs at module level (not behind a __main__ guard), exactly as before.
runs('2023', '11')