#!/usr/bin/env python
# coding:utf-8
"""Scrape bulletin pages from heilongjiang.chinatax.gov.cn into MongoDB.

``r1`` posts a search to the bulletin list endpoint, extracts every
single-quoted ``href`` from the response, and for each link not yet in the
Redis set ``n14`` calls ``r1_d`` to turn the detail page's table rows into a
dict.  New records are written to ``myco14`` and ``myco14_b`` and their URLs
are then added to the Redis set.

NOTE: importing this module runs ``runs(10)`` immediately (see bottom).
"""
import json
import re
from urllib import parse

import requests
from lxml import etree

from setting import proxies
from mongo_cho import myco14, myco14_b, r_myco15
from rety import retry

r = requests.session()
# NOTE(review): requests does not document a ``keep_alive`` attribute on
# Session, so this assignment probably has no effect; kept unchanged.
r.keep_alive = False

# Seconds to wait for the server before giving up; without a timeout a
# stalled proxy or server would block the script forever.
REQUEST_TIMEOUT = 30

LIST_URL = 'http://heilongjiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp'

_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Links in the list response are single-quoted href attributes.
_HREF_RE = re.compile(r"href='(.*?)'")

# Deletes CR, tab, LF and plain spaces in a single pass.
_STRIP_WS = str.maketrans('', '', '\r\t\n ')

# Value of the ``year`` form field -> year label used in the stored date.
_YEAR_LABELS = {1: '2019', 2: '2020', 3: '2021'}

# Redis set holding the URLs that have already been stored.
_SEEN_KEY = 'n14'


def _cell_text(cell):
    """Return the cell's full text with CR, tab, LF and spaces removed."""
    return cell.xpath('string(.)').strip().translate(_STRIP_WS)


@retry(3)
def r1_d(url, dt):
    """Fetch one detail page and return its table rows as a dict.

    For every ``<tr>`` in the page, the text of the first ``<td>`` becomes a
    key and the text of the second ``<td>`` (or ``''`` when there is none)
    becomes its value.  Rows without a first cell, or whose first cell is
    empty after whitespace removal, are skipped.

    Args:
        url: Address of the detail page; stored under ``'url'``.
        dt: Date label supplied by the caller; stored under ``'date'``.

    Returns:
        dict mapping cleaned cell text to cleaned cell text, plus the
        ``'url'`` and ``'date'`` entries.
    """
    response = r.get(url=url, headers=_HEADERS, proxies=proxies,
                     timeout=REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            continue
        key = _cell_text(key_cells[0])
        if not key:
            continue
        value_cells = row.xpath('td[2]')
        record[key] = _cell_text(value_cells[0]) if value_cells else ''

    record['url'] = url
    record['date'] = dt
    return record


@retry(3)
def r1(ny1, ny2):
    """Fetch the bulletin list for one period and store every unseen entry.

    Args:
        ny1: Value of the ``year`` form field; must be 1, 2 or 3
            (labelled '2019', '2020' and '2021' respectively).
        ny2: Integer sent as the ``searhvalue`` form field; ``ny2 + 1`` is
            used as the second part of the stored date label.

    Raises:
        ValueError: If ``ny1`` is not one of the known year codes.  Checked
            before any network traffic instead of failing later with an
            unbound local variable.
    """
    try:
        year = _YEAR_LABELS[ny1]
    except KeyError:
        raise ValueError(
            'unknown year code %r; expected one of %s'
            % (ny1, sorted(_YEAR_LABELS))
        ) from None

    data = {
        "searhvalue": ny2,
        "searchkey": "jd",
        "year": ny1,
    }
    response = r.post(url=LIST_URL, headers=_HEADERS, data=data,
                      proxies=proxies, timeout=REQUEST_TIMEOUT)
    hrefs = _HREF_RE.findall(response.text)
    dt = year + '/' + str(ny2 + 1)

    records = []
    new_urls = []
    for href in hrefs:
        print(href)
        if r_myco15.sismember(_SEEN_KEY, href):
            print('已存在,>>>n14')
            continue
        record = r1_d(href, dt)
        if record is None:
            # What ``retry`` yields once its attempts are exhausted is not
            # visible from this file.  If it is None, skip the link so one
            # failed page cannot break insert_many for the whole batch; the
            # URL stays unmarked and is picked up again on the next run.
            continue
        records.append(record)
        new_urls.append(href)

    if not records:
        return

    myco14.insert_many(records)
    print('已存入原始库')
    myco14_b.insert_many(records)
    print('已存入备份原始库')
    # Mark URLs as seen only after both inserts succeeded.
    for href in new_urls:
        r_myco15.sadd(_SEEN_KEY, href)


def runs(ny2):
    """Run ``r1`` for year code 3 and the given ``ny2``.

    Args:
        ny2: Passed through unchanged to ``r1``.
    """
    ny1 = 3
    # NOTE(review): the banner prints '2023' while year code 3 is labelled
    # '2021' in the stored date, and the original also computed an unused
    # ``int(ny2) - 1``.  The intended year/month mapping should be confirmed.
    print('2023', ny2, '=========')
    r1(ny1, ny2)


runs(10)