#!/usr/bin/env python
# coding:utf-8
"""Collect detail pages linked from the yunnan.chinatax.gov.cn bulletin listing.

Flow: ``runs`` -> ``r1`` (POST the listing query, extract links, skip links
already recorded in the Redis set ``n22``) -> ``r1_d`` (download one detail
page and flatten its table rows into a dict).  New records are written to the
``myco22`` collection and to the ``myco22_b`` backup collection.
"""
import re
import requests
import json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco22, r_myco15, myco22_b
from rety import retry

r = requests.session()
# NOTE(review): requests.Session does not document a ``keep_alive`` attribute,
# so this assignment probably has no effect; kept as-is to avoid changing
# behaviour.
r.keep_alive = False
import urllib3
# Requests below use verify=False; silence the resulting per-request warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Browser-like headers shared by every request in this module.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Seconds to wait for connect/read before giving up.  Without a timeout a
# stalled proxy or server would block the script indefinitely and the
# @retry wrapper would never get a chance to run again.
REQUEST_TIMEOUT = 30


@retry(3)
def r1_d(url, dt):
    """Download one detail page and return its table as a flat dict.

    Every ``<tr>`` in the page contributes one entry: the concatenated text
    of ``td[1]/div`` is the key and the concatenated text of ``td[2]/div`` is
    the value.  Rows without such text yield an empty-string key/value (later
    rows with the same key overwrite earlier ones).

    Args:
        url: Address of the detail page, e.g.
            'https://yunnan.chinatax.gov.cn/art/2021/2/9/art_8101_588.html'.
        dt: Label stored under the ``date`` key of the returned dict.

    Returns:
        dict with the scraped key/value pairs plus ``url`` and ``date``.

    Raises:
        Whatever ``requests`` raises on network failure or timeout; handling
        of those is left to the ``retry`` decorator.
    """
    response = r.get(
        url=url,
        headers=HEADERS,
        proxies=proxies,
        verify=False,
        timeout=REQUEST_TIMEOUT,
    )
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//tr'):
        key = ''.join(row.xpath('td[1]/div/text()'))
        value = ''.join(row.xpath('td[2]/div/text()'))
        record[key] = value

    record['url'] = url
    record['date'] = dt
    print(record)
    return record


@retry(3)
def r1(ny1, ny2):
    """Query the bulletin listing and store every detail page not seen before.

    Args:
        ny1: Year part of the query (formatted into the ``year`` form field).
        ny2: Month part of the query (formatted into the ``searhvalue`` field).

    Side effects:
        Inserts new records into ``myco22`` and ``myco22_b`` and then adds
        their links to the Redis set ``n22`` so they are skipped next time.
    """
    url = 'https://yunnan.chinatax.gov.cn/bulletin/ajaxdata.jsp?startrecord=1&endrecord=8&perpage=11&rowpage=1'
    # The values are sent exactly as the original script built them
    # (already percent-encoded suffixes); do not "fix" the encoding without
    # checking what the endpoint expects.
    data = {
        "searhvalue": "{}%E6%9C%88".format(ny2),
        "searchkey": "jd1",
        "year": "{}%E5%B9%B4%E5%BA%A6".format(ny1),
    }
    response = r.post(
        url=url,
        data=data,
        headers=HEADERS,
        proxies=proxies,
        verify=False,
        timeout=REQUEST_TIMEOUT,
    )
    links = re.findall("href='(.*?)'", response.text)

    dt = str(ny1) + '/' + str(ny2)  # same for every link, so built once
    records = []
    new_links = []
    for link in links:
        print(link)
        # 'n22' is the Redis set of already-processed links (the original
        # author flagged this key as something to change, presumably per site).
        if r_myco15.sismember('n22', link):
            print('已存在,>>>n22')
            continue

        record = r1_d(link, dt)
        if not record:
            # NOTE(review): what ``retry`` returns after exhausting its
            # attempts is not visible here.  If it yields None, passing that
            # to insert_many would reject the whole batch, so skip the link
            # and leave it unmarked in Redis to be retried on a later run.
            print('detail fetch failed, skipped:', link)
            continue
        records.append(record)
        new_links.append(link)

    if records:
        myco22.insert_many(records)
        print('已存入原始库')
        myco22_b.insert_many(records)
        print('已存入备份原始库')
        # Mark links as done only after both inserts succeeded.
        for link in new_links:
            r_myco15.sadd('n22', link)


def runs(ny1, ny2):
    """Log the requested year/month and run the listing scrape for it."""
    print(ny1, ny2, '=========')
    r1(ny1, ny2)


# Executed at module load, exactly as in the original script.
runs('2023', '5')