#!/usr/bin/env python
# coding:utf-8
"""Scrape records from liaoning.chinatax.gov.cn (column 5883) for one month.

Flow: ``runs`` walks the paginated search results for a year/month,
``r1`` processes one result page (de-duplicates links, stores new records),
and ``r1_d`` scrapes the key/value table of a single detail page.

NOTE: the module calls ``runs(2023, 4)`` at import time (see bottom of file).
"""
import re
import time
import requests
import json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco13, r_myco15, myco13_b
from rety import retry

r = requests.session()
# NOTE(review): requests' Session does not document a ``keep_alive``
# attribute, so this assignment probably has no effect -- confirm before
# relying on it.
r.keep_alive = False

# Seconds to wait for the server/proxy before giving up on a request.
# Without a timeout a stalled connection blocks forever and @retry never runs.
REQUEST_TIMEOUT = 30

# Browser-like headers sent with every request.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Characters removed from every scraped table cell.
_CELL_WHITESPACE = str.maketrans('', '', '\r\t\n ')


def _squash(text):
    """Strip *text* and drop every CR, tab, newline and space inside it."""
    return text.strip().translate(_CELL_WHITESPACE)


@retry(3)
def r1_d(url, dt):
    """Scrape one detail page into a flat dict.

    Each ``<tr>`` of ``table.contentTable`` contributes one entry: the text
    of its first ``<td>`` is the key and the text of its second ``<td>``
    (or ``''`` when absent) is the value, both with whitespace removed.
    Rows without a first ``<td>`` or with an empty key are skipped.

    Example URL:
    http://liaoning.chinatax.gov.cn/art/2020/12/8/art_5883_1808.html

    :param url: absolute URL of the detail page.
    :param dt: date label stored verbatim under the ``'date'`` key.
    :return: dict of table entries plus ``'url'`` and ``'date'``.
    """
    response = r.get(url=url, headers=_HEADERS, proxies=proxies,
                     timeout=REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//table[@class="contentTable"]//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            continue
        key = _squash(key_cells[0].xpath('string(.)'))
        if not key:
            continue
        value_cells = row.xpath('td[2]')
        if value_cells:
            record[key] = _squash(value_cells[0].xpath('string(.)'))
        else:
            record[key] = ''

    record['url'] = url
    record['date'] = dt
    return record


@retry(3)
def r1(ny, pg, dt):
    """Process one page of search results and store the new records.

    Every result link not yet in the ``'n13'`` set of ``r_myco15`` is
    scraped with ``r1_d``; the scraped records are inserted into ``myco13``
    and ``myco13_b``, and only afterwards are their URLs added to ``'n13'``.

    :param ny: value sent as both ``z_large`` and ``z_small`` (``runs``
        passes a ``YYYYMM`` string).
    :param pg: page number sent as ``currpage``.
    :param dt: date label forwarded to ``r1_d``.
    :return: the page count parsed from the text following ``'共 '``.
    :raises IndexError: when the page-count marker is missing from the
        response.
    """
    url = 'http://liaoning.chinatax.gov.cn/module/search/index.jsp'
    params = {
        "a": "",
        "b": "",
        "d": "",
        "c": "",
        "e": "",
        "h": "",
        "k": "",
        "n": "",
        "t": "",
        "x_large": "",
        "x_small": "",
        "y_large": "",
        "y_small": "",
        "z_large": ny,
        "z_small": ny,
        "strSelectID": "1754,1755,1756,1757,1758,1761,1764,1767,1777,1778,1779",
        "i_columnid": "5883",
        "field": "a:1:0,b:1:1,c:1:1,d:1:0,e:1:1,h:1:0,k:1:1,n:1:0,t:1:0,x:0:1,y:0:1,z:0:1",
        "initKind": "FieldForm",
        "type": "0,1,1,0,1,0,1,0,0,1,1,1",
        "currpage": pg,
        "currentplace": "",
        "splitflag": "",
        "fullpath": "0",
    }
    response = r.get(url=url, headers=_HEADERS, params=params,
                     proxies=proxies, timeout=REQUEST_TIMEOUT)
    html = response.text

    # Fail with a readable message (same exception type as the former
    # ``findall(...)[0]``) when the response carries no page counter.
    page_count_match = re.search(r'共 (.*?) ', html)
    if page_count_match is None:
        raise IndexError('page-count marker not found in search response')
    rpg = page_count_match.group(1)

    selector = etree.HTML(html)
    new_records = []
    new_urls = []
    for href in selector.xpath('//a[@class="xxxx"]/@href'):
        # Links are relative ('../..'-prefixed); rebuild the absolute URL.
        url1 = 'http://liaoning.chinatax.gov.cn' + href.replace('../..', '')
        print(url1)
        # Original author's marker here: "change" -- presumably the set name
        # to edit when this script is adapted for another source.
        if r_myco15.sismember('n13', url1):
            print('已存在,>>>n13')  # "already exists"
            continue
        rsd = r1_d(url1, dt)
        print(rsd)
        new_records.append(rsd)
        new_urls.append(url1)

    # new_records and new_urls always grow together, so one check covers both.
    if new_records:
        myco13.insert_many(new_records)
        print('已存入原始库')  # "stored in the raw collection"
        myco13_b.insert_many(new_records)
        print('已存入备份原始库')  # "stored in the backup raw collection"
        # Mark URLs as seen only after both inserts went through.
        for seen_url in new_urls:
            r_myco15.sadd('n13', seen_url)

    return int(rpg)


# Related page: http://liaoning.chinatax.gov.cn/col/col5883/index.html
def runs(ny1, ny2):
    """Scrape every result page for year *ny1*, month *ny2*.

    The search key is ``str(ny1)`` followed by the month, zero-padded when
    it is a single character; the date label passed down is ``'ny1/ny2'``.
    Page 1 is fetched once: that call both stores its records and reports
    the total page count, so the loop continues from page 2.
    """
    month = str(ny2)
    if len(month) == 1:
        month = '0' + month
    ny = str(ny1) + month
    dt = str(ny1) + '/' + str(ny2)

    print(1, '=====')
    tpg = r1(ny, 1, dt)
    for pg in range(2, tpg + 1):
        print(pg, '=====')
        r1(ny, pg, dt)


runs(2023, 4)