#!/usr/bin/env python
# coding:utf-8
"""Crawl paged search results from fujian.chinatax.gov.cn.

Each page is fetched with ``r1``; documents whose URL is not yet a member of
the Redis set ``n08`` are collected.  Writing the collected documents to
MongoDB / Redis is currently disabled (see the commented block in ``r1``).

NOTE: importing or running this module immediately fetches pages 3-6 for
('2023', '10') -- see the loop at the bottom of the file.
"""
import json
import re
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco8, myco8_b, r_myco15
from rety import retry
from setting import proxies

r = requests.session()
# NOTE(review): kept from the original; requests.Session does not document a
# ``keep_alive`` attribute, so this assignment may have no effect -- confirm.
r.keep_alive = False

_SEARCH_URL = 'http://fujian.chinatax.gov.cn/was5/web/search'
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}
# Seconds to wait for the server before giving up; without a timeout a
# stalled proxy/server would block the crawl forever.
_REQUEST_TIMEOUT = 30

# Patterns are compiled once and written as raw strings so the backslashes
# are regex escapes rather than (invalid) Python string escapes.
_PAGENUM_RE = re.compile(r'"pagenum":"(.*?)"')
_DOCS_RE = re.compile(r'"docs":\[(.*?)\]')
_OBJECT_RE = re.compile(r'\{(.*?)\}')
# Deletes spaces, CR, LF and TAB in a single pass.
_STRIP_WHITESPACE = str.maketrans("", "", " \r\n\t")


@retry(3)
def r1(ny1, ny2, pg):
    """Fetch one page of search results and collect the unseen documents.

    Args:
        ny1: First component of the ``datefor`` filter (the driver loop
            passes a year such as ``'2023'``).
        ny2: Second component of the ``datefor`` filter (the driver loop
            passes a month such as ``'10'``).
        pg: Page number sent as the ``page`` query parameter.

    Returns:
        The first ``"pagenum"`` value found in the response, as a string.
        ``runs`` treats it as the total number of pages.

    Raises:
        IndexError: If the response contains no ``"pagenum"`` or ``"docs"``
            field.
        KeyError: If a document has no ``url`` key.
        json.JSONDecodeError: If a document fragment is not valid JSON.
        requests.exceptions.RequestException: On network failure or timeout.

    How these exceptions interact with ``@retry(3)`` depends on the project's
    ``rety.retry`` implementation, which is not visible here.
    """
    params = {
        "channelid": "291316",
        "templet": "zdaj.jsp",
        "sortfield": "-datefor",
        # The server receives a literal backslash before the hyphen,
        # e.g. ``datefor=2023\-10``.
        "classsql": r"datefor={ny1}\-{ny2}".format(ny1=ny1, ny2=ny2),
        "r": "0.31052286956801844",
        "prepage": "8",
        "page": pg,
    }
    response = r.get(
        url=_SEARCH_URL,
        headers=_HEADERS,
        params=params,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT,
    )
    html = response.text
    tpg = _PAGENUM_RE.findall(html)[0]
    print(tpg, '===========')
    print(html)

    # The payload is picked apart with regexes instead of one json.loads();
    # all whitespace is removed first, which also removes spaces that occur
    # inside field values.
    compact = html.translate(_STRIP_WHITESPACE)
    docs_blob = _DOCS_RE.findall(compact)[0]

    new_docs = []
    new_urls = []
    for fragment in _OBJECT_RE.findall(docs_blob):
        doc = json.loads("{" + fragment + "}")
        print(doc)
        doc_url = doc['url']
        # Original author's note on this key: "change".
        if r_myco15.sismember('n08', doc_url):
            print('已存在,>>>n08')
            continue
        new_urls.append(doc_url)
        new_docs.append(doc)

    # Persistence is currently disabled.  The original (commented-out) logic
    # dropped the last collected document before inserting:
    # to_store = new_docs[:-1]
    # if to_store:
    #     myco8.insert_many(to_store)
    # if new_urls:
    #     myco8_b.insert_many(to_store)
    #     for seen_url in new_urls:
    #         r_myco15.sadd('n08', seen_url)  # original note: "change"
    return tpg


def runs(ny1, ny2):
    """Fetch every result page for the given ``datefor`` filter.

    Page 1 is fetched first to learn the page count; pages 2..count are then
    fetched in order.

    Args:
        ny1: First component of the ``datefor`` filter, forwarded to ``r1``.
        ny2: Second component of the ``datefor`` filter, forwarded to ``r1``.

    Returns:
        ``'er1'`` when the first page reports a page count of ``'0'``;
        otherwise ``None``.
    """
    rpg = r1(ny1, ny2, pg=1)
    if rpg == '0':
        print('122')
        return 'er1'
    for pg2 in range(2, int(rpg) + 1):
        r1(ny1, ny2, pg2)


# Driver: runs at import time, fetching pages 3-6 for the hard-coded filter.
for pg2 in range(3, 7):
    r1('2023', '10', pg2)