123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json,re
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco11,r_myco15,myco11_b
- r = requests.session()
- r.keep_alive = False
- from rety import retry
- import urllib3
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@retry(3)
def r1(pg):
    """Request page *pg* of the unfiltered search listing and print its pager.

    The call goes through the module-level session ``r`` using the configured
    ``proxies``. The JSON reply must contain both a ``pager`` and a ``datas``
    entry; a missing key raises ``KeyError``.

    Args:
        pg: Page number sent as the ``page`` query parameter.

    Returns:
        None. The pager object is only printed.
    """
    endpoint = 'http://guangxi.chinatax.gov.cn/restSearch'
    user_agent = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # An empty search word asks for the whole channel, ten entries per page.
    query = dict(
        channelid="290909",
        searchword="",
        orderby="RELEVANCE",
        page=pg,
        pageSize="10",
    )
    reply = r.get(url=endpoint, headers=user_agent, params=query, proxies=proxies)
    payload = reply.json()
    print(payload['pager'])
    # The records are looked up but not stored: the bulk insert into myco11
    # that used to follow this line is currently disabled.
    records = payload['datas']
- # for pg in range(109,310):
- # print(pg,'============')
- # r1(pg)
def r2(ny1, ny2, pg):
    """Fetch one result page for a year/month query and store unseen records.

    Each record's ``DOCPUBURL`` is checked against the ``'n11'`` set held by
    ``r_myco15``. Records whose URL is not yet in the set are inserted into
    ``myco11`` and ``myco11_b``, and their URLs are then added to the set.

    Args:
        ny1: Value substituted for ``NF`` in the search expression
            (presumably the year -- confirm against the site's query syntax).
        ny2: Value substituted for ``YF`` in the search expression; the
            literal month suffix is appended by the format string.
        pg: Page number sent as the ``page`` query parameter.

    Returns:
        The ``pageCount`` reported by the response's pager, as an ``int``.

    Raises:
        KeyError: If the JSON reply lacks ``pager``/``pageCount``/``datas``
            or a record lacks ``DOCPUBURL``.
    """
    url = 'http://guangxi.chinatax.gov.cn/restSearch'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "channelid": "290909",
        "searchword": "(NF={ny1} and YF={ny2}月)".format(ny1=ny1, ny2=ny2),
        "orderby": "RELEVANCE",
        "page": pg,
        "pageSize": "10",
    }
    response = r.get(url=url, headers=headers, params=params, proxies=proxies)
    rsd = response.json()
    rpg = rsd['pager']['pageCount']

    new_docs = []
    new_urls = []
    for item in rsd['datas']:
        doc_url = item['DOCPUBURL']
        # NOTE: the set key 'n11' was flagged in the original as a value to
        # change when this script is adapted.
        if r_myco15.sismember('n11', doc_url):
            print('已存在,>>>n11')
            continue
        new_urls.append(doc_url)
        # Store the individual record. The previous code appended the whole
        # response dict here, so insert_many received the same object once
        # per record instead of the records themselves.
        new_docs.append(item)

    if new_docs:
        myco11.insert_many(new_docs)
        print('已存入原始库')
        # NOTE(review): assuming these are PyMongo collections, the first
        # insert_many adds an `_id` to each dict in place, so the backup copy
        # receives the same ids -- confirm that is intended.
        myco11_b.insert_many(new_docs)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts succeeded, so a failed
        # insert does not cause records to be skipped on the next run.
        for doc_url in new_urls:
            r_myco15.sadd('n11', doc_url)
    return int(rpg)
- # r2(2)
def runs(ny1, ny2):
    """Process every result page for the given ``ny1``/``ny2`` query.

    Page 1 is fetched first because its return value is the total page
    count; pages 2 through that count are then fetched in order.

    Args:
        ny1: Passed through to ``r2`` unchanged.
        ny2: Passed through to ``r2`` unchanged.
    """
    total_pages = r2(ny1, ny2, pg=1)
    # range() is already empty when there is at most one page, so no
    # separate "more than one page" check is required.
    for page_no in range(2, total_pages + 1):
        print(page_no, '==========')
        r2(ny1, ny2, page_no)
# Script driver: runs at import/execution time and walks all result pages
# for the query values below (substituted for NF and YF in the search word).
ny1, ny2 = 2023, 11
runs(ny1, ny2)
|