123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco7,r_myco15,myco7_b
- from rety import retry
# Shared HTTP session reused by every request in this module.
# requests.session() is the deprecated factory kept for backwards
# compatibility; requests.Session() is the documented constructor and
# returns the same kind of object.
r = requests.Session()
# NOTE(review): requests.Session does not document a `keep_alive` attribute,
# so this assignment probably has no effect on connection reuse. If the
# intent is to disable keep-alive, sending a "Connection: close" header is
# the usual way -- confirm before changing behavior.
r.keep_alive = False
def zh1(list1):
    """Join text fragments into one string with whitespace removed.

    Only the space, carriage-return, line-feed and tab characters are
    removed; any other character (including full-width spaces) is kept.

    Args:
        list1: Iterable of strings, e.g. the result of an XPath ``text()``
            query.

    Returns:
        The concatenation of all fragments after stripping those four
        characters. An empty iterable yields ``''``.
    """
    # One translate pass per fragment replaces four chained .replace() calls,
    # and join avoids repeated string concatenation in a loop.
    strip_table = str.maketrans('', '', ' \r\n\t')
    return ''.join(fragment.translate(strip_table) for fragment in list1)
@retry(3)
def r1_d(url, dt, timeout=30):
    """Fetch one detail page and return its two-column table as a dict.

    Every ``<tr class="rlbbox">`` row contributes one entry: the text of the
    first cell's ``div`` is the key and the text of the second cell's ``div``
    is the value, both with spaces, tabs and line breaks removed by ``zh1``.
    Rows that produce the same key overwrite each other (last one wins).

    Args:
        url: Address of the detail page.
        dt: Date label stored unchanged under the ``'date'`` key.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled proxy or server would block the run forever.

    Returns:
        dict with the scraped key/value pairs plus ``'url'`` and ``'date'``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status. What the caller finally observes depends on
            the ``retry`` decorator, which is defined outside this file.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    # Fail loudly on 4xx/5xx so an error page is retried instead of being
    # stored as a record that contains only 'url' and 'date'.
    response.raise_for_status()
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//tr[@class="rlbbox"]'):
        key = zh1(row.xpath('td[1]/div/text()'))
        value = zh1(row.xpath('td[2]/div/text()'))
        record[key] = value
    record['url'] = url
    record['date'] = dt
    return record
- # r1_d()
@retry(3)
def r1(searhvalue, year, pg, dt, timeout=30):
    """Scrape one listing page and store every detail page not seen before.

    The listing endpoint is queried for records ``pg*10-9`` .. ``pg*10``.
    Each article URL found in the response is checked against the ``'n07'``
    set in ``r_myco15``; unseen URLs are fetched with ``r1_d``. The resulting
    records are inserted into ``myco7`` and ``myco7_b`` and only then are the
    URLs added to the ``'n07'`` set.

    Args:
        searhvalue: Search value sent to the endpoint (the spelling matches
            the form field name the endpoint expects).
        year: Year filter sent to the endpoint.
        pg: 1-based page number; each page covers 10 records.
        dt: Date label passed through to ``r1_d`` and stored on each record.
        timeout: Seconds to wait for the listing request.

    Returns:
        None. Results are written to the external stores.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status for the listing request (subject to the
            ``retry`` decorator defined outside this file).
    """
    first_record = str(pg * 10 - 9)
    last_record = str(pg * 10)
    url = 'http://zhejiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp?startrecord={pg1}&endrecord={pg2}'.format(pg1=first_record, pg2=last_record)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # The values are percent-encoded here and form-encoded again by requests;
    # this double encoding is kept exactly as in the original request.
    data = {
        "searhvalue": parse.quote(searhvalue),
        "searchkey": "jd1",
        "year": parse.quote(year),
    }
    response = r.post(url=url, headers=headers, data=data, proxies=proxies, timeout=timeout)
    # An error page contains no article links, which would silently skip the
    # whole listing page; raise instead so the failure is visible/retried.
    response.raise_for_status()
    html = response.text

    article_paths = re.findall(r"http://zhejiang.chinatax.gov.cn/art/(.*?).html", html)
    records = []
    new_urls = []
    seen_on_page = set()
    for article_path in article_paths:
        url1 = "http://zhejiang.chinatax.gov.cn/art/" + article_path + ".html"
        # The 'n07' set is only updated after the loop, so a link repeated in
        # the same response would otherwise be fetched and inserted twice.
        if url1 in seen_on_page:
            continue
        seen_on_page.add(url1)
        print(url1)
        # 'n07' is the key of the "already scraped" set for this site.
        if r_myco15.sismember('n07', url1):
            print('已存在,>>>n07')
            continue
        rsd = r1_d(url1, dt)
        if not rsd:
            # No usable record came back (for example if the retry wrapper
            # gave up and returned nothing). Do not store it and do not mark
            # the URL as seen, so a later run can pick it up again.
            continue
        records.append(rsd)
        new_urls.append(url1)

    if records:
        myco7.insert_many(records)
        myco7_b.insert_many(records)
        # Mark URLs as seen only after both inserts succeeded.
        for stored_url in new_urls:
            r_myco15.sadd('n07', stored_url)
@retry(3)
def get_pg(ny1, ny2, timeout=30):
    """Return the exclusive upper bound of listing page numbers for a month.

    The listing frame for year ``ny1`` and month ``ny2`` is requested and the
    JavaScript variable ``totalRecord`` is read from it. With 10 records per
    page, the number of pages is ``ceil(totalRecord / 10)``; the value
    returned is that count plus one so it can be used directly as
    ``range(1, get_pg(...))``.

    Args:
        ny1: Year, interpolated into the query string.
        ny2: Month, interpolated into the query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        int page bound, or ``None`` when the response does not contain a
        ``totalRecord`` assignment.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (subject to the ``retry`` decorator defined
            outside this file).
        ValueError: if the captured ``totalRecord`` text is not an integer.
    """
    url = 'http://zhejiang.chinatax.gov.cn/module/jslib/bulletin/bullenright.jsp?searhvalue={ny2}%E6%9C%88&searchkey=jd1&year={ny1}%E5%B9%B4%E5%BA%A6'.format(ny1=ny1, ny2=ny2)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    response.raise_for_status()

    found = re.search(r"var totalRecord = '(.*?)'", response.text)
    if found is None:
        # Explicit so callers can tell "no total found" from a page count.
        return None
    total_records = int(found.group(1))
    # Ceiling division: the previous `total // 10 + 2` produced one extra,
    # empty page whenever the total was an exact multiple of 10.
    page_count = -(-total_records // 10)
    return page_count + 1
- # get_pg('2021','1')
def runs(ny1, ny2):
    """Scrape every listing page for the given year and month.

    Args:
        ny1: Year (e.g. ``2023``); sent to the site as ``'<ny1>年度'``.
        ny2: Month (e.g. ``11``); sent to the site as ``'<ny2>月'``.

    Returns:
        None. Each page is handed to ``r1``, which stores the results.
    """
    searhvalue = str(ny2) + '月'
    year = str(ny1) + '年度'
    dt = str(ny1) + '/' + str(ny2)

    pg = get_pg(ny1, ny2)
    if pg is None:
        # get_pg yields None when the total record count cannot be read;
        # range(1, None) would raise TypeError, so stop with a clear message.
        print('could not determine page count for', dt)
        return

    for i in range(1, pg):
        print(i, '页===========')
        r1(searhvalue, year, i, dt)
# Run the scrape only when executed as a script, so importing this module
# (for example to reuse zh1 or r1_d) does not trigger network requests.
if __name__ == "__main__":
    runs(2023, 11)
|