#!/usr/bin/env python
# coding:utf-8
"""Scraper for the yearly listing pages on qinghai.chinatax.gov.cn.

Listing pages (``/web/<year>nd/iframe[_<page>].shtml``) are walked page by
page.  Every linked detail page that is not yet recorded in the Redis set
``n17`` is fetched, its table rows are flattened into a dict, and the
resulting records are written to two MongoDB collections.

NOTE: the crawl starts at import time; there is no ``__main__`` guard.
"""
import json
from urllib import parse

import requests
from lxml import etree

from setting import proxies
from mongo_cho import myco17, r_myco15, myco17_b
from rety import retry

r = requests.session()
# NOTE(review): requests' Session does not document a ``keep_alive``
# attribute, so this assignment probably has no effect -- confirm.
r.keep_alive = False

# Entry page of the listing:
# http://qinghai.chinatax.gov.cn/web/zdsswfsxaj/zdaj.shtml
_BASE_URL = 'http://qinghai.chinatax.gov.cn'

# Browser-like request headers shared by every request.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/90.0.4430.85 Safari/537.36"
    ),
}

# Seconds to wait for a response.  Without a timeout a stalled proxy or
# server would block the crawl indefinitely.
_TIMEOUT = 30

# Redis set holding the detail URLs that were already stored.  The original
# code carried a "change" note next to this key -- presumably it is the value
# to edit when adapting the script to another source.
_REDIS_KEY = 'n17'

# Deletion table for the characters removed from table-cell text.
_CELL_WHITESPACE = str.maketrans('', '', '\r\t\n ')


def _cell_text(cell):
    """Return the text content of *cell* with CR, TAB, LF and spaces removed."""
    return cell.xpath('string(.)').strip().translate(_CELL_WHITESPACE)


@retry(3)
def r1_d(url):
    """Fetch one detail page and flatten its table into a dict.

    Example detail page:
    http://qinghai.chinatax.gov.cn/web/2020nd/202007/e4856c576fa04e059eff6762dc47bf0c.shtml

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A dict mapping the cleaned text of each row's first cell to the
        cleaned text of its second cell ('' when the row has no second
        cell), plus the keys ``'url'`` and ``'date'``.

    Raises:
        IndexError: If the publication-date element is not found.  Request
            errors from ``requests`` (including timeouts) also propagate;
            how ``retry`` treats them is defined outside this file.
    """
    response = r.get(url=url, headers=_HEADERS, proxies=proxies,
                     timeout=_TIMEOUT)
    selector = etree.HTML(response.text)

    date_nodes = selector.xpath(
        '//*[@id="page-newContent"]/div[2]/div/div[1]/div/div[1]/div/span[1]/text()'
    )
    date_text = (
        date_nodes[0]
        .replace('发布时间:', '')
        .replace('\r', '')
        .replace('\n', '')
        .replace(' ', '')
        .replace('-', '/')
    )
    # Drop the last five characters -- presumably an "HH:MM" time suffix;
    # TODO confirm against a live page.
    publish_date = date_text[:-5]
    print(publish_date)

    record = {}
    for row in selector.xpath('//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Rows without any <td> (e.g. header rows) carry no data.
            continue
        value_cells = row.xpath('td[2]')
        record[_cell_text(key_cells[0])] = (
            _cell_text(value_cells[0]) if value_cells else ''
        )

    record['url'] = url
    record['date'] = publish_date
    print(record)
    return record


@retry(3)
def r1(ny, pg):
    """Scrape one listing page and store every detail page not seen before.

    Args:
        ny: Year used in the listing path (``/web/<ny>nd/``).
        pg: 1-based page number; page 1 has no numeric suffix in its URL.

    Returns:
        ``'zz'`` when the response body contains ``"404 Not Found"`` (used by
        ``runs`` as the stop signal), otherwise ``None``.
    """
    if pg == 1:
        url = (_BASE_URL + '/web/{}nd/iframe.shtml').format(ny)
    else:
        url = (_BASE_URL + '/web/{ny}nd/iframe_{pg}.shtml').format(ny=ny, pg=pg)

    response = r.get(url=url, headers=_HEADERS, proxies=proxies,
                     timeout=_TIMEOUT)
    html = response.text
    if "404 Not Found" in html:
        print('zz')
        return 'zz'

    selector = etree.HTML(html)
    records = []
    new_urls = []
    for href in selector.xpath('//a/@href'):
        print(href)
        detail_url = _BASE_URL + href
        if r_myco15.sismember(_REDIS_KEY, detail_url):
            print('已存在,>>>n17')
            continue
        records.append(r1_d(detail_url))
        new_urls.append(detail_url)

    # ``records`` and ``new_urls`` always grow together, so one emptiness
    # check covers both the inserts and the Redis bookkeeping.
    if records:
        myco17.insert_many(records)
        print('已存入原始库')
        myco17_b.insert_many(records)
        print('已存入备份原始库')
        # URLs are marked as seen only after both inserts have succeeded.
        for detail_url in new_urls:
            r_myco15.sadd(_REDIS_KEY, detail_url)


def runs(ny):
    """Walk listing pages 1..99 of year *ny*, stopping at the first 404 page."""
    for pg in range(1, 100):
        print(pg, '===========')
        tf = r1(ny, pg)
        if tf == "zz":
            break


runs(2021)

# NOTE(review): this re-fetches page 54 of 2021 after the full run above;
# it looks like a leftover manual re-run -- confirm whether it is still needed.
for pg in range(54, 55):
    print(pg, '===========')
    r1(2021, pg)