#!/usr/bin/env python
# coding:utf-8
"""Scrape paginated search listings from sichuan.chinatax.gov.cn.

For every result page of one site column, each linked detail page is fetched,
its two-column table is flattened into a dict, and the dicts for that page are
bulk-inserted through ``myco5`` (imported from ``mongo_cho``; presumably a
MongoDB collection -- confirm against that module).
"""
import json
import re
from urllib import parse

import requests
import urllib3
from lxml import etree

from mongo_cho import myco5
from rety import retry
from setting import proxies

r = requests.session()
# NOTE(review): requests.Session does not document a ``keep_alive`` attribute,
# so this assignment likely has no effect -- kept as-is, confirm before removing.
r.keep_alive = False

# Every request below is sent with verify=False, so silence the resulting
# per-request InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

BASE_URL = 'https://sichuan.chinatax.gov.cn'
SEARCH_URL = BASE_URL + '/module/search/index.jsp'
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}
# Seconds to wait on a request before giving up; without a timeout a stalled
# proxy or server would block the whole run indefinitely.
REQUEST_TIMEOUT = 30


@retry(3)
def r1_d(url, dt):
    """Fetch one detail page and return its table as a flat dict.

    Each ``<tr>`` under a ``<tbody>`` contributes one entry: the text of its
    first ``<td>`` is the key and the text of its second ``<td>`` (with
    spaces, tabs, CR and LF removed) is the value.  Rows that do not have both
    cells are skipped individually, so one malformed row no longer discards
    every row after it.

    Args:
        url: Absolute URL of the detail page, e.g.
            'https://sichuan.chinatax.gov.cn/art/2021/3/23/art_15873_10537.html'.
        dt: Period label stored unchanged under the ``'date'`` key.

    Returns:
        dict: The parsed key/value pairs plus ``'url'`` and ``'date'``.
    """
    response = r.get(
        url=url,
        headers=HEADERS,
        verify=False,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    dict1 = {}
    for row in selector.xpath('//tbody//tr'):
        key_cells = row.xpath('td[1]')
        value_cells = row.xpath('td[2]')
        if not key_cells or not value_cells:
            # Not a key/value row (e.g. a header or single-cell row).
            continue
        key = key_cells[0].xpath('string(.)').strip()
        raw_value = value_cells[0].xpath('string(.)').strip()
        dict1[key] = raw_value.replace(' ','').replace('\r','').replace('\t','').replace('\n','')

    # Printed before the bookkeeping keys are added, as in the original output.
    print(dict1)
    dict1['url'] = url
    dict1['date'] = dt
    return dict1


@retry(3)
def r1(icid, cpg, dt):
    """Scrape one search-result page and store all of its detail records.

    Args:
        icid: Column id sent as the ``i_columnid`` query parameter.
        cpg: 1-based page number sent as ``currpage``.
        dt: Period label forwarded to :func:`r1_d` for every record.

    Returns:
        None.  Records are written via ``myco5.insert_many``.
    """
    params = {
        "vc_name": "",
        "field_439": "",
        "field_440": "",
        "field_441": "",
        "field_442": "",
        "field_443": "",
        "strSelectID": "390,439,440,441,442,443",
        "i_columnid": icid,
        "field": "vc_name:1:0,field_439:1:0,field_440:1:0,field_441:1:0,field_442:1:0,field_443:1:0",
        "currpage": cpg,
    }
    response = r.get(
        url=SEARCH_URL,
        headers=HEADERS,
        params=params,
        verify=False,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    selector = etree.HTML(response.text)

    list1 = []
    # The detail link sits in the fifth cell of each result row and is
    # site-relative ('../..' prefix), so rewrite it to an absolute URL.
    for href in selector.xpath('//tr/td[5]/a/@href'):
        url1 = href.replace('../..', BASE_URL)
        print(url1)
        list1.append(r1_d(url1, dt))

    if list1:
        myco5.insert_many(list1)
    else:
        # Assuming myco5 is a pymongo collection, insert_many() rejects an
        # empty document list; skip the write and make the empty page visible.
        print('no detail links found on page', cpg)


# Reference URLs left by the original author:
#   https://sichuan.chinatax.gov.cn
#   https://sichuan.chinatax.gov.cn/col/col15873/index.html
# The listing is updated quarterly; compare the page count with the previous
# run before choosing ``tpg``.
def runs(icid='15873', tpg=9, dt='2023/10'):
    """Scrape pages 1..tpg of one column.

    Args:
        icid: Column id for the quarter being scraped.
        tpg: Total number of result pages to walk.
        dt: Period label stored on every record.
    """
    for page in range(1, tpg + 1):
        print(page, '==================')
        r1(icid, page, dt)


# Kept at module level (no __main__ guard) so that importing or executing this
# file behaves exactly as before.
runs()