#!/usr/bin/env python
# coding:utf-8
"""Crawl the restSearch endpoint of guangxi.chinatax.gov.cn.

Each result page is fetched as JSON. Records whose ``DOCPUBURL`` has not
been seen before are written to the raw collection and its backup, and
their URLs are recorded in the ``n11`` set so later runs skip them.

NOTE(review): importing this module starts a crawl immediately (see the
bottom of the file).
"""
import json
import re
from urllib import parse

import requests
import urllib3
from lxml import etree

from setting import proxies
from mongo_cho import myco11, r_myco15, myco11_b
from rety import retry

r = requests.session()
r.keep_alive = False

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

_SEARCH_URL = 'http://guangxi.chinatax.gov.cn/restSearch'
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}
# Name of the set (accessed through ``r_myco15``) holding already-stored URLs.
_SEEN_KEY = 'n11'


def _search(searchword, pg):
    """Request one result page and return the decoded JSON body."""
    params = {
        "channelid": "290909",
        "searchword": searchword,
        "orderby": "RELEVANCE",
        "page": pg,
        "pageSize": "10",
    }
    # NOTE(review): no timeout is set, so a stalled proxy blocks forever.
    response = r.get(url=_SEARCH_URL, headers=_HEADERS, params=params, proxies=proxies)
    return response.json()


@retry(3)
def r1(pg):
    """Fetch page ``pg`` of the unfiltered search and print its pager info.

    Nothing is stored. This function is not called by the script itself.
    ``retry`` comes from the project-local ``rety`` module; presumably it
    re-invokes the call on failure up to 3 times -- confirm there.

    Args:
        pg: Page number to request.
    """
    rsd = _search("", pg)
    print(rsd['pager'])
    # Kept so a response without 'datas' still fails loudly, as before.
    rsl = rsd['datas']


def r2(ny1, ny2, pg):
    """Fetch one page of results for a year/month filter and store new records.

    A record counts as new when its ``DOCPUBURL`` is not a member of the
    ``n11`` set. New records are inserted into ``myco11`` and ``myco11_b``,
    then their URLs are added to the set.

    Args:
        ny1: Value for the ``NF`` search field (presumably the year).
        ny2: Value for the ``YF`` search field (presumably the month number).
        pg: Page number to request.

    Returns:
        The total page count reported by the response pager, as an int.
    """
    rsd = _search("(NF={ny1} and YF={ny2}月)".format(ny1=ny1, ny2=ny2), pg)
    page_count = rsd['pager']['pageCount']

    new_docs = []
    new_urls = []
    for item in rsd['datas']:
        doc_url = item['DOCPUBURL']
        if r_myco15.sismember(_SEEN_KEY, doc_url):
            print('已存在,>>>n11')
            continue
        new_urls.append(doc_url)
        # Store the record itself. The original appended the whole response
        # envelope here, once per record, instead of the record.
        new_docs.append(item)

    if new_docs:
        myco11.insert_many(new_docs)
        print('已存入原始库')
        myco11_b.insert_many(new_docs)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts succeeded.
        for doc_url in new_urls:
            r_myco15.sadd(_SEEN_KEY, doc_url)

    return int(page_count)


def runs(ny1, ny2):
    """Crawl every result page for the given year/month filter.

    Page 1 is fetched first to learn the page count; the remaining pages
    are then fetched in order.

    Args:
        ny1: Value for the ``NF`` search field, passed through to ``r2``.
        ny2: Value for the ``YF`` search field, passed through to ``r2``.
    """
    rpg = r2(ny1, ny2, pg=1)
    # The range is empty when there is only one page (or none).
    for pg in range(2, rpg + 1):
        print(pg,'==========')
        r2(ny1, ny2, pg)


ny1 = 2023
ny2 = 11
runs(ny1, ny2)