#!/usr/bin/env python
# coding:utf-8
"""Scrape bulletin articles from zhejiang.chinatax.gov.cn into MongoDB.

For a given year and month the script reads the total record count, walks the
paginated listing endpoint, downloads every article that is not yet recorded
in the Redis set ``n07``, stores the parsed key/value table in two MongoDB
collections and finally marks the article URLs as seen in Redis.
"""
import json
import re
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco7, myco7_b, r_myco15
from rety import retry
from setting import proxies

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Seconds to wait for the server before giving up, so a stalled connection
# raises (and can be retried) instead of hanging the script indefinitely.
REQUEST_TIMEOUT = 30

# Redis set holding the article URLs that were already stored.
# Change this key when adapting the script to another site.
SEEN_KEY = 'n07'

# Full article URLs embedded in a listing response.
_ARTICLE_URL_RE = re.compile(r"http://zhejiang\.chinatax\.gov\.cn/art/.*?\.html")
# JavaScript variable carrying the total number of matching records.
_TOTAL_RECORD_RE = re.compile(r"var totalRecord = '(.*?)'")
# Deletes spaces, carriage returns, newlines and tabs in one pass.
_STRIP_TABLE = str.maketrans('', '', ' \r\n\t')

r = requests.session()
# NOTE(review): kept from the original script; ``keep_alive`` is not a
# documented Session attribute, so this assignment may have no effect.
r.keep_alive = False


def zh1(list1):
    """Join the strings in *list1*, dropping spaces, CR, LF and tab characters."""
    return ''.join(part.translate(_STRIP_TABLE) for part in list1)


@retry(3)
def r1_d(url, dt):
    """Download one article page and return its table as a dict.

    Every ``<tr class="rlbbox">`` row contributes one entry: the cleaned text
    of the first cell is the key and the cleaned text of the second cell is
    the value. The keys ``'url'`` and ``'date'`` are then set to *url* and *dt*.
    """
    response = r.get(url=url, headers=HEADERS, proxies=proxies, timeout=REQUEST_TIMEOUT)
    selector = etree.HTML(response.text)
    record = {}
    for row in selector.xpath('//tr[@class="rlbbox"]'):
        key = zh1(row.xpath('td[1]/div/text()'))
        value = zh1(row.xpath('td[2]/div/text()'))
        record[key] = value
    record['url'] = url
    record['date'] = dt
    return record


@retry(3)
def r1(searhvalue, year, pg, dt):
    """Process listing page *pg* (1-based, ten records per page).

    Article URLs found on the page that are not yet members of the Redis set
    are fetched with ``r1_d``, inserted into ``myco7`` and ``myco7_b`` and then
    added to the Redis set. *searhvalue* and *year* are the search form values;
    *dt* is stored with every record.
    """
    last = pg * 10
    first = last - 9
    url = 'http://zhejiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp?startrecord={pg1}&endrecord={pg2}'.format(pg1=str(first), pg2=str(last))
    # NOTE(review): the values are percent-quoted before requests form-encodes
    # them, as in the original request; presumably the endpoint expects this.
    data = {
        "searhvalue": parse.quote(searhvalue),
        "searchkey": "jd1",
        "year": parse.quote(year),
    }
    response = r.post(url=url, headers=HEADERS, data=data, proxies=proxies, timeout=REQUEST_TIMEOUT)

    # dict.fromkeys de-duplicates while keeping page order, so a URL that
    # appears twice in the markup is fetched and stored only once.
    article_urls = dict.fromkeys(_ARTICLE_URL_RE.findall(response.text))

    records = []
    new_urls = []
    for url1 in article_urls:
        print(url1)
        if r_myco15.sismember(SEEN_KEY, url1):
            print('已存在,>>>n07')
            continue
        records.append(r1_d(url1, dt))
        new_urls.append(url1)

    if not records:
        return
    myco7.insert_many(records)
    myco7_b.insert_many(records)
    # Mark URLs as seen only after both inserts succeeded.
    for url1 in new_urls:
        r_myco15.sadd(SEEN_KEY, url1)


@retry(3)
def get_pg(ny1, ny2):
    """Return the exclusive upper page bound for year *ny1*, month *ny2*.

    The value is ``totalRecord // 10 + 2`` and is meant to be used as
    ``range(1, get_pg(...))``; when the record count is a multiple of ten this
    includes one trailing empty page. Returns ``None`` when the response does
    not contain a ``totalRecord`` variable.
    """
    url = 'http://zhejiang.chinatax.gov.cn/module/jslib/bulletin/bullenright.jsp?searhvalue={ny2}%E6%9C%88&searchkey=jd1&year={ny1}%E5%B9%B4%E5%BA%A6'.format(ny1=ny1, ny2=ny2)
    response = r.get(url=url, headers=HEADERS, proxies=proxies, timeout=REQUEST_TIMEOUT)
    found = _TOTAL_RECORD_RE.search(response.text)
    if found is None:
        return None
    return int(found.group(1)) // 10 + 2


def runs(ny1, ny2):
    """Crawl every listing page for year *ny1* and month *ny2*."""
    searhvalue = str(ny2) + '月'
    year = str(ny1) + '年度'
    dt = str(ny1) + '/' + str(ny2)
    pg = get_pg(ny1, ny2)
    if pg is None:
        # Without a page count range() would raise TypeError; stop cleanly.
        print('totalRecord not found for', dt)
        return
    for i in range(1, pg):
        print(i, '页===========')
        r1(searhvalue, year, i, dt)


# NOTE: executed on import as well as when run as a script (unchanged).
runs(2023, 11)