#!/usr/bin/env python
# coding:utf-8
"""Scrape records from shaanxi.chinatax.gov.cn into MongoDB.

`runs` walks the paginated search results for one year/month, `r1` handles a
single result page (de-duplicating detail URLs through a Redis set and
storing new records in two Mongo collections), and `r1_d` parses one detail
page into a flat dict.
"""
import json
import re
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco25, r_myco15, myco25_b
from rety import retry
from setting import proxies

r = requests.session()
# NOTE(review): requests.Session documents no `keep_alive` attribute, so this
# assignment probably has no effect; kept to preserve the original behavior.
r.keep_alive = False

BASE_URL = 'http://shaanxi.chinatax.gov.cn'
SEARCH_URL = BASE_URL + '/module/search/index.jsp'
HEADERS = {
    "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0; .NET CLR 2.0.50727; SLCC2; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729)",
}
# Seconds to wait for the server; without a timeout a stalled proxy or server
# would block the run indefinitely.
REQUEST_TIMEOUT = 30
# Page size used by `runs` to turn the value returned by `r1` into a page count.
PAGE_SIZE = 20
# Redis set holding the detail URLs that were already stored.
SEEN_KEY = 'n25'

# Deletes CR, TAB, LF and plain spaces in a single pass.
_WHITESPACE_TABLE = str.maketrans('', '', '\r\t\n ')


def _squash(text):
    """Return *text* stripped, with CR, TAB, LF and spaces removed."""
    return text.strip().translate(_WHITESPACE_TABLE)


@retry(3)
def r1_d(url, dt):
    """Fetch one detail page and return its table as a dict.

    Every row of the ``zdsc_con`` table that has a first cell contributes one
    entry: the first cell's text is the key and the second cell's text (or
    ``''`` when the row has no second cell) is the value, both with
    whitespace removed.

    Args:
        url: Absolute URL of the detail page.
        dt: Date label stored verbatim under the ``'date'`` key.

    Returns:
        The parsed key/value pairs plus ``'url'`` and ``'date'`` entries.
    """
    response = r.get(url=url, headers=HEADERS, proxies=proxies,
                     timeout=REQUEST_TIMEOUT)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//table[@class="zdsc_con"]//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Rows without a first cell carry no key; skip them.
            continue
        value_cells = row.xpath('td[2]')
        key = _squash(key_cells[0].xpath('string(.)'))
        value = _squash(value_cells[0].xpath('string(.)')) if value_cells else ''
        record[key] = value

    record['url'] = url
    record['date'] = dt
    return record


@retry(3)
def r1(ny1, ny2, pg):
    """Process one page of search results and store the new records.

    Each link on the page is turned into an absolute detail URL. URLs that
    are not yet members of the Redis set are fetched with `r1_d`, inserted
    into both Mongo collections and then added to the Redis set.

    Args:
        ny1: Year, sent as ``field_1672``.
        ny2: Month value sent as ``field_2410``; the stored date label uses
            ``ny2 + 1``.
        pg: Page number, sent as ``currpage``.

    Returns:
        The first regex capture from the response converted to ``int``
        (`runs` divides it by the page size).
    """
    params = {
        "field_2166": "",
        "field_1656": "",
        "field_1652": "",
        "field_1663": "",
        "field_1653": "",
        "field_2390": "",
        "field_2213": "",
        "field_2391": "",
        "field_2410": ny2,
        "field_1670": "",
        "field_1672": ny1,
        "currpage": pg,
        "field_1651": "",
        "strSelectID": "style_2166,1656,1663,1652,1653,2390,2213,2391,1651,1672,2410,1670",
        "i_columnid": "style_3",
        "field": "field_2166:1:0,field_2213:1:0,field_1656:1:0,field_2391:1:0,field_2410:12:0,field_1651:12:0,field_1652:1:0,field_2390:1:0,field_1672:12:0,field_1670:12:0,field_1653:1:0,field_1663:1:0",
        "initKind": "FieldFormMetadata",
        "type": "0,0,0,0,0,0,0,0,0,0,0,0",
        "currentplace": "",
        "splitflag": "",
        "fullpath": "0",
    }
    response = r.get(url=SEARCH_URL, headers=HEADERS, params=params,
                     proxies=proxies, timeout=REQUEST_TIMEOUT)
    html = response.text

    # NOTE(review): as written, the first match of this pattern is always the
    # empty string, so the int() conversion below raises ValueError. The
    # pattern looks truncated (presumably markup around the group was lost);
    # restore the real pattern before relying on the return value.
    rpg = re.findall("(.*?)", html)[0]

    selector = etree.HTML(html)
    # The date label is the same for every record on the page.
    dt = str(ny1) + '/' + str(ny2 + 1)

    records = []
    new_urls = []
    for href in selector.xpath('//li//a/@href'):
        detail_url = BASE_URL + href.replace('../..', '')
        if r_myco15.sismember(SEEN_KEY, detail_url):
            print('已存在,>>>n25')
            continue
        record = r1_d(detail_url, dt)
        print(record)
        records.append(record)
        new_urls.append(detail_url)

    # `records` and `new_urls` always have the same length, so a single check
    # covers both inserts and the Redis bookkeeping.
    if records:
        myco25.insert_many(records)
        print('已存入原始库')
        myco25_b.insert_many(records)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts succeeded.
        for seen_url in new_urls:
            r_myco15.sadd(SEEN_KEY, seen_url)

    return int(rpg)


def runs(ny1, ny2):
    """Scrape every result page for the given year and month.

    Args:
        ny1: Year.
        ny2: Month, 1-based; `r1` is called with ``ny2 - 1``.
    """
    ny3 = int(ny2) - 1
    rpg = r1(ny1, ny3, pg=1)

    # Ceiling division. The previous `rpg // 20` followed by `> 1` skipped
    # page 2 whenever there were 21-39 results and requested an extra empty
    # page on exact multiples of 20.
    # NOTE(review): assumes `rpg` is the total number of results — confirm.
    total_pages = -(-rpg // PAGE_SIZE)
    for pg in range(2, total_pages + 1):
        print(pg, '=======')
        r1(ny1, ny3, pg)


runs(2023, 11)