#!/usr/bin/env python
# coding:utf-8
"""Crawl the 2023 listing of the Inner Mongolia tax bureau's case pages.

Each detail page linked from the listing is parsed as a two-column table and
stored in the primary and backup collections; a Redis set ('n27') records the
URLs that have already been stored so they are not fetched again.
"""
import re
import requests,json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco27,r_myco15,myco27_b
from rety import retry

r = requests.session()
# NOTE(review): requests.Session does not document a `keep_alive` attribute,
# so this assignment probably has no effect; kept to avoid changing behavior.
r.keep_alive = False

# Browser-like headers shared by every request in this script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Seconds to wait for the server before giving up. Without a timeout,
# requests blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Deletes carriage returns, tabs, newlines and ASCII spaces in one pass.
_WHITESPACE_TABLE = str.maketrans('', '', '\r\t\n ')


def _cell_text(cells):
    """Return the cleaned text of the first element in *cells*, or ''."""
    if not cells:
        return ''
    return cells[0].xpath('string(.)').strip().translate(_WHITESPACE_TABLE)


def r1_d(url, dt):
    """Fetch one detail page and turn its table rows into a dict.

    Every ``<tr>`` inside a ``<table>`` contributes one entry: the cleaned
    text of its first ``<td>`` is the key and the cleaned text of its second
    ``<td>`` (or '' when there is none) is the value. Rows whose key is empty
    are skipped.

    Example page:
    http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj/hlbeszdwfaj/202106/t20210609_751387.html

    Args:
        url: Address of the detail page.
        dt: Date string stored under the 'date' key.

    Returns:
        The parsed key/value pairs plus 'url' and 'date' entries.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not answer within REQUEST_TIMEOUT seconds.
    """
    response = r.get(url=url, headers=HEADERS, proxies=proxies,
                     timeout=REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    dict1 = {}
    for row in selector.xpath('//table//tr'):
        key = _cell_text(row.xpath('td[1]'))
        if not key:
            continue
        dict1[key] = _cell_text(row.xpath('td[2]'))

    dict1['url'] = url
    dict1['date'] = dt
    print(dict1)
    return dict1


@retry(3)
def r1():
    """Scan the listing page and store every detail page not yet seen.

    Every href containing "html" is rewritten into an absolute URL. The
    record date is sliced from the page's file name, which is assumed to look
    like ``t20210609_751387.html`` (characters 1-8 being YYYYMMDD). URLs
    already in the Redis set 'n27' are skipped; new ones are fetched with
    r1_d, inserted into both collections and then added to the set.

    Raises:
        requests.exceptions.RequestException: if a request fails or times
            out. NOTE(review): presumably the ``retry`` decorator re-runs the
            function in that case; confirm against ``rety.retry``.
    """
    # The trailing path segment selects the year being crawled.
    url = 'http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj/sj/2023/'
    response = r.get(url=url, headers=HEADERS, proxies=proxies,
                     timeout=REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    html = response.text

    for href in re.findall(r'href="(.*?)"', html):
        if "html" not in href:
            continue

        # Listing links are relative ('../../<section>/...'); drop the
        # parent-directory prefix and anchor them at the section root.
        url1 = 'http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj' + href.replace('../..', '')
        print(url1)

        # File name up to the first underscore, e.g. 't20210609'.
        stem = url1.split('/')[-1].split('_')[0]
        dt = stem[1:5] + '/' + stem[5:7] + '/' + stem[7:9]
        print(dt)

        # Redis set of URLs already stored; change the key to reuse this
        # script for another source.
        if r_myco15.sismember('n27', url1):
            print('已存在,>>>n27')
            continue

        rsd = r1_d(url1, dt)
        myco27.insert_one(rsd)
        myco27_b.insert_one(rsd)
        # Mark as seen only after both inserts succeeded.
        r_myco15.sadd('n27', url1)
        print('存入主备库,>>>n27')


r1()