"""Scrape bulletin records from anhui.chinatax.gov.cn into MongoDB.

For each requested month the listing endpoint is queried, every linked
detail page that is not yet recorded in the Redis set is fetched and parsed
into a dict, and the new dicts are written to the primary and backup
collections before their URLs are added to the Redis set.
"""
import json
import time
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco1, r_myco15, myco1_b
from rety import retry
from setting import proxies

r = requests.session()
# NOTE(review): requests.Session does not appear to consult a `keep_alive`
# attribute, so this assignment may have no effect -- confirm before relying
# on it.
r.keep_alive = False

# Browser-like headers sent with every request.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Seconds to wait for the server; without a timeout a stalled proxy or
# server would block the whole run indefinitely.
_REQUEST_TIMEOUT = 30

# Name of the Redis set holding URLs that were already stored.  The original
# code flagged this value as the one to change (presumably per scraper).
_SEEN_KEY = 'n01'

# Deletes spaces, carriage returns, newlines and tabs in one pass.
_STRIP_WHITESPACE = str.maketrans('', '', ' \r\n\t')


def zh1(list1):
    """Join text fragments and strip spaces, CR, LF and tab characters.

    Args:
        list1: iterable of strings (e.g. the result of an XPath ``text()``).

    Returns:
        A single string containing every fragment, in order, with the four
        whitespace characters removed.  Empty input yields ``''``.
    """
    return ''.join(list1).translate(_STRIP_WHITESPACE)


@retry(3)
def r1_d(url, dt1):
    """Fetch one detail page and parse its key/value table.

    Example page:
    http://anhui.chinatax.gov.cn/art/2021/3/3/art_20155_6021.html

    Args:
        url: address of the detail page.
        dt1: date label stored with the record under the ``'date'`` key.

    Returns:
        A dict mapping each row's first-cell text to its second-cell text
        (both whitespace-stripped via ``zh1``), plus ``'date'`` and ``'url'``.
    """
    response = r.get(
        url=url, headers=_HEADERS, proxies=proxies, timeout=_REQUEST_TIMEOUT
    )
    selector = etree.HTML(response.text)
    dict1 = {}
    for row in selector.xpath('//tr[@class="rlbbox"]'):
        key = zh1(row.xpath('td[1]/div/text()'))
        value = zh1(row.xpath('td[2]/div/text()'))
        dict1[key] = value
    dict1['date'] = dt1
    dict1['url'] = url
    return dict1


@retry(3)
def r1(searhvalue, year):
    """Collect and store all new records listed for one month.

    Args:
        searhvalue: month label such as ``'1月'``.
        year: year label such as ``'2024年'``.

    Side effects:
        Inserts the newly parsed records into ``myco1`` and ``myco1_b`` and
        then adds their URLs to the Redis set so they are skipped next time.
    """
    url = 'http://anhui.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp'
    data = {
        "searhvalue": parse.quote(searhvalue),
        "searchkey": "jd1",
        "year": parse.quote(year),
    }
    response = r.post(
        url=url,
        headers=_HEADERS,
        data=data,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT,
    )
    selector = etree.HTML(response.text)

    # Same for every row of this listing, e.g. '2024年' + '1月' -> '2024/1'.
    dt1 = year.replace('年', '/') + searhvalue.replace('月', '')

    list1 = []  # parsed records to insert
    list2 = []  # URLs of those records, marked as seen after the inserts
    for row in selector.xpath('//tr[@class="rlbbox"]'):
        hrefs = row.xpath('td[5]/div/a/@href')
        if not hrefs:
            # A row without a link has no detail page; skip it instead of
            # aborting the whole month with an IndexError.
            continue
        url1 = hrefs[0]
        print(url1)
        if r_myco15.sismember(_SEEN_KEY, url1):
            print('已存在,>>>n01')
            continue
        rsd = r1_d(url1, dt1)
        if rsd:
            list1.append(rsd)
            list2.append(url1)

    print(list1)
    # list1 and list2 always grow together, so one emptiness check suffices.
    if list1:
        myco1.insert_many(list1)
        print('已存入原始库')
        myco1_b.insert_many(list1)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts were issued.
        for mis in list2:
            r_myco15.sadd(_SEEN_KEY, mis)


@retry(3)
def get_ny():
    """Return the last month label shown in the site's month selector.

    Returns:
        The text of the last matching link with ``'月'`` removed.

    Raises:
        IndexError: if no month link (or no link text) is found.  This is
            left to propagate so the ``retry`` decorator sees the failure.
    """
    url = 'http://anhui.chinatax.gov.cn//module/jslib/bulletin/bullenleft.jsp'
    response = r.get(
        url=url, headers=_HEADERS, proxies=proxies, timeout=_REQUEST_TIMEOUT
    )
    selector = etree.HTML(response.text)
    links = selector.xpath('//tr[@id="jiduonclick1"]/td/span//a')
    list1 = [link.xpath('text()')[0].replace('月', '') for link in links]
    eny = list1[-1]
    print(eny, '>>>from n01_ah***')
    return eny


def runs(year='2024年', months=range(1, 2)):
    """Scrape the given months of one year.

    Args:
        year: year label passed to ``r1``; defaults to ``'2024年'``.
        months: iterable of month numbers; defaults to January only.
    """
    for i in months:
        searhvalue = '{}月'.format(i)
        r1(searhvalue, year)


# NOTE(review): this runs on import as well as when executed as a script;
# kept unguarded to preserve existing behaviour.
runs()