123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json,re
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco8,myco8_b,r_myco15
- from rety import retry
# Module-wide HTTP session shared by every request issued from this script.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented requests.Session attribute,
# so this assignment probably has no effect on connection reuse - confirm.
r.keep_alive = False
@retry(3)
def r1(ny1, ny2, pg):
    """Fetch one page of search results and return the reported page count.

    Sends a GET request to the ``was5/web/search`` endpoint with a
    ``classsql`` filter built from *ny1* and *ny2*, prints the raw response,
    parses every object found inside the ``"docs"`` array and checks each
    record's ``url`` against the ``n08`` set in ``r_myco15``.

    Args:
        ny1: First value substituted into the ``datefor`` filter (the script
            passes ``'2023'``; presumably a year - confirm).
        ny2: Second value substituted into the ``datefor`` filter (the script
            passes ``'10'``; presumably a month - confirm).
        pg: Page number sent as the ``page`` query parameter.

    Returns:
        The string captured from the first ``"pagenum"`` field of the
        response (for example ``'0'`` when nothing matched).

    Raises:
        IndexError: If the response has no ``"pagenum"`` field or no
            ``"docs"`` array.
        json.JSONDecodeError: If a fragment of the ``"docs"`` array is not
            valid JSON.
    """
    url = 'http://fujian.chinatax.gov.cn/was5/web/search'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "channelid": "291316",
        "templet": "zdaj.jsp",
        "sortfield": "-datefor",
        # Raw string: the backslash is sent literally as part of the query.
        "classsql": r"datefor={ny1}\-{ny2}".format(ny1=ny1, ny2=ny2),
        "r": "0.31052286956801844",
        "prepage": "8",
        "page": pg,
    }
    # Without a timeout a stalled proxy would block forever and the retry
    # decorator would never get a chance to run again.
    response = r.get(
        url=url,
        headers=headers,
        params=params,
        proxies=proxies,
        timeout=30,
    )
    html = response.text

    page_counts = re.findall(r'"pagenum":"(.*?)"', html)
    if not page_counts:
        raise IndexError('no "pagenum" field found in search response')
    tpg = page_counts[0]
    print(tpg, '===========')
    print(html)

    # Drop spaces, CR, LF and tabs in a single pass so the non-greedy
    # patterns below can match across what were originally several lines.
    compact = html.translate(str.maketrans("", "", " \r\n\t"))
    docs_matches = re.findall(r'"docs":\[(.*?)\]', compact)
    if not docs_matches:
        raise IndexError('no "docs" array found in search response')

    new_docs = []
    new_urls = []
    for fragment in re.findall(r'\{(.*?)\}', docs_matches[0]):
        doc = json.loads("{" + fragment + "}")
        print(doc)
        doc_url = doc['url']
        if r_myco15.sismember('n08', doc_url):
            print('已存在,>>>n08')
            continue
        new_urls.append(doc_url)
        new_docs.append(doc)

    # NOTE(review): persistence is currently disabled. The disabled code
    # stored all but the last new record and then marked the new URLs as
    # seen; why the last record is dropped is not visible here - confirm
    # before re-enabling.
    # docs_to_store = new_docs[:-1]
    # if docs_to_store:
    #     myco8.insert_many(docs_to_store)
    # if new_urls:
    #     myco8_b.insert_many(docs_to_store)
    # for seen_url in new_urls:
    #     r_myco15.sadd('n08', seen_url)
    return tpg
def runs(ny1, ny2):
    """Fetch every result page for the given filter values.

    Page 1 is requested first to learn the total page count; the remaining
    pages (2 up to and including that count) are then requested in order.

    Args:
        ny1: First filter value, forwarded unchanged to ``r1``.
        ny2: Second filter value, forwarded unchanged to ``r1``.

    Returns:
        ``'er1'`` when the first page reports a page count of ``'0'``;
        otherwise ``None``.
    """
    total_pages = r1(ny1, ny2, pg=1)

    # Guard clause: nothing to crawl when the site reports zero pages.
    if total_pages == '0':
        print('122')
        return 'er1'

    last_page = int(total_pages)
    page = 2
    while page <= last_page:
        r1(ny1, ny2, page)
        page += 1
# Script entry: request result pages 3 through 6 for the filter values
# '2023' and '10'. This runs at module level, so it also executes on import.
for pg2 in range(3, 7):
    r1('2023', '10', pg=pg2)
|