#!/usr/bin/env python
# coding:utf-8
"""Scrape a two-column table from a Ningbo tax-bureau article page into MongoDB.

For each candidate day of a month, the article URL is built, the page is
fetched through the configured proxies, every ``<tr>`` inside
``<div id="zoom">`` is turned into a ``{first cell: second cell}`` entry, and
the resulting record is inserted into the ``myco2_b`` collection.
"""
import json
import time

import requests
from lxml import etree

from mongo_cho import myco2, myco2_b
from setting import proxies

# Legacy inline setup kept for reference (collections now come from mongo_cho):
# from urllib import parse
# from pymongo import MongoClient
# myclient = MongoClient("mongodb://127.0.0.1:27017/")
# myco2 = myclient['shuiwu06']['02_nb']
# myco2_b = myclient['shuiwu06']['02_nb']

# Module-level session kept for backward compatibility; r1() itself issues
# its request through requests.get().
r = requests.session()
r.keep_alive = False

# Original note (translated): "see the one in 30".
# Related page noted by the original author:
# http://ningbo.chinatax.gov.cn/col/col6300/index.html

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30

# Deletes the same four characters the original chained .replace() calls removed.
_STRIP_TABLE = str.maketrans('', '', '\r\t\n ')


def _cell_text(cells):
    """Return the whitespace-free text of the first element in *cells*."""
    return cells[0].xpath('string(.)').strip().translate(_STRIP_TABLE)


def r1(year, mon, day):
    """Fetch one article page and store its table as a MongoDB document.

    Args:
        year: Year component of the article URL.
        mon: Month component of the article URL (not zero-padded in the URL).
        day: Day component of the article URL.

    Returns:
        None. On success the parsed record is printed and inserted into
        ``myco2_b``; when the page is missing or contains no table rows,
        nothing is inserted.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Fix: the original passed ``day=mon``, so the day argument was ignored.
    url = 'http://ningbo.chinatax.gov.cn/art/{year}/{mon}/{day}/art_6166_7114.html'.format(
        year=year, mon=mon, day=day)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = requests.get(url=url, headers=headers, proxies=proxies,
                            timeout=_REQUEST_TIMEOUT)
    # Most probed days have no article; do not store records for them.
    if response.status_code != 200 or not response.content:
        print('skip {} (HTTP {})'.format(url, response.status_code))
        return

    # Fix: parse the response body, not the Response object. Raw bytes let
    # lxml pick the charset declared in the page itself.
    selector = etree.HTML(response.content)
    if selector is None:
        print('skip {} (unparseable page)'.format(url))
        return

    record = {}
    for row in selector.xpath('//div[@id="zoom"]//table//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Fix: rows without a first <td> previously reused a stale (or
            # undefined) key and clobbered the preceding entry.
            continue
        value_cells = row.xpath('td[2]')
        record[_cell_text(key_cells)] = _cell_text(value_cells) if value_cells else ''

    if not record:
        print('skip {} (no table rows found)'.format(url))
        return

    record['url'] = url
    # Fix: derive the month from the argument instead of a hard-coded '04'.
    record['date'] = str(year) + '/' + str(mon).zfill(2)
    print(record)
    myco2_b.insert_one(record)


# April has 30 days; the original range(1, 30) stopped at the 29th.
for i in range(1, 31):
    r1('2023', '4', i)