#!/usr/bin/env python
# coding:utf-8
import re
import requests
from setting import proxies
from lxml import etree
from mongo_cho import myco15, r_myco15, myco15_b
from rety import retry

r = requests.session()
r.keep_alive = False  # note: requests.Session never reads a keep_alive attribute, so this is effectively a no-op


@retry(3)
def r1_d(url, dt):
    """Fetch a detail page and parse the <table class="zdwf"> rows into a dict."""
    # example detail page:
    # url = 'http://jilin.chinatax.gov.cn/art/2021/3/3/art_19972_7390.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    response.encoding = 'UTF-8'
    html = response.text
    selector = etree.HTML(html)
    rows = selector.xpath('//table[@class="zdwf"]//tr')
    dict1 = {}
    for row in rows:
        k1 = row.xpath('td[1]/text()')[0]   # field name in the first cell
        v1 = row.xpath('td[2]/text()')      # field value text nodes in the second cell
        v3 = ''.join(v1)
        # strip spaces and line breaks from both key and value
        k2 = k1.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
        v4 = v3.replace(' ', '').replace('\r', '').replace('\n', '').replace('\t', '')
        if k2:
            dict1[k2] = v4
    dict1['url'] = url
    dict1['date'] = dt
    return dict1


@retry(3)
def r1(ny, dt, pg):
    """Query one result page of the list search for month `ny`, crawl every new
    detail link on it, store the parsed records, and return the total page count."""
    url = 'http://jilin.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "field_1136_large": "",
        "field_1136_small": "",
        "field_1137_large": "",
        "field_1137_small": "",
        "field_1138_large": ny,
        "field_1138_small": ny,
        "field_1113": "",
        "field_1114": "",
        "field_1115": "",
        "field_1116": "",
        "field_1117": "",
        "field_1120": "",
        "field_1123": "",
        "field_1126": "",
        "strSelectID": "1113,1114,1115,1116,1117,1120,1123,1126,1136,1137,1138",
        "i_columnid": "19972",
        "field": "field_1113:1:1,field_1114:1:1,field_1115:1:1,field_1116:1:1,field_1117:1:1,field_1120:1:1,field_1123:1:1,field_1126:1:1,field_1136:0:1,field_1137:0:1,field_1138:0:1",
        "initKind": "FieldForm",
        "type": "1,1,1,1,1,1,1,1,1,1,1",
        "currentplace": "",
        "currpage": pg,
        "splitflag": "",
        "fullpath": "0",
    }
    response = r.get(url=url, headers=headers, params=params, proxies=proxies)
    html = response.text
    # total page count, taken from the "共(.*?)页" pagination text on the page
    tpg = re.findall('共(.*?)页', html)[0].replace(' ', '')
    selector = etree.HTML(html)
    rsl = selector.xpath('//a[@class="xxxx"]/@href')
    list1 = []  # parsed detail records
    list2 = []  # URLs not yet present in the redis set "n15"
    for i in rsl:
        url1 = 'http://jilin.chinatax.gov.cn/' + i.replace('../..', '')
        seen = r_myco15.sismember('n15', url1)  # already crawled?
        if not seen:
            rsd = r1_d(url1, dt)
            print(rsd)
            list2.append(url1)
            list1.append(rsd)
        else:
            print('already exists >>> n15')
    if list1:
        myco15.insert_many(list1)
        print('saved to the primary collection')
    if list2:
        myco15_b.insert_many(list1)  # the same records go into the backup collection
        print('saved to the backup collection')
        for mis in list2:
            r_myco15.sadd('n15', mis)  # mark the URL as crawled
    return tpg


def runs(ny1, ny2):
    """Crawl every result page for the month given by ny1 (year) and ny2 (month)."""
    # for ny1 in range(2019, 2021):   # loop over years, e.g. up to 2021
    #     for ny2 in range(1, 13):    # loop over months 1-12
    if len(str(ny2)) == 1:
        ny2 = '0' + str(ny2)          # zero-pad the month
    ny = str(ny1) + str(ny2)          # query value, e.g. "202310"
    dt = str(ny1) + '/' + str(ny2)    # date string stored with each record, e.g. "2023/10"
    print(ny, dt, '======')
    rpg = r1(ny, dt, pg=1)
    if int(rpg) > 1:
        for pg1 in range(2, int(rpg) + 1):
            r1(ny, dt, pg1)


ny1 = '2023'
ny2 = '10'
runs(ny1, ny2)
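
# ---------------------------------------------------------------------------
# Local modules assumed by this script (not shown here; inferred from usage):
#   setting.proxies      - requests-style proxy dict, e.g. {"http": "...", "https": "..."}
#   mongo_cho.myco15     - MongoDB collection for the primary store
#   mongo_cho.myco15_b   - MongoDB collection for the backup store
#   mongo_cho.r_myco15   - redis client whose set "n15" tracks crawled URLs
#   rety.retry(n)        - decorator that re-runs the wrapped function up to n
#                          times on exception; a minimal sketch (assumption):
#
#       def retry(times):
#           def deco(fn):
#               def wrapper(*args, **kwargs):
#                   last = None
#                   for _ in range(times):
#                       try:
#                           return fn(*args, **kwargs)
#                       except Exception as exc:
#                           last = exc
#                   raise last
#               return wrapper
#           return deco
#
# Usage sketch: to crawl every month of a year range instead of a single
# month (mirroring the commented-out loop inside runs()), replace the two
# calls above with something like:
#
#       for y in range(2019, 2024):
#           for m in range(1, 13):
#               runs(y, m)
# ---------------------------------------------------------------------------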