#!/usr/bin/env python
# coding:utf-8
"""Scrape listing and detail pages from henan.chinatax.gov.cn.

For each listing page, every detail link that has not been recorded yet is
fetched, its ``zhongdatable`` table is turned into a dict, and the resulting
records are written to the ``myco18`` / ``myco18_b`` collections. URLs that
were stored are remembered in the ``n18`` set so later runs skip them.
"""
import json
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco18, myco18_b, r_myco15
from rety import retry
from setting import proxies

BASE_URL = 'https://henan.chinatax.gov.cn'

# The page-number parameter is "currentPage". The previous literal read
# "¤tPage" because "&curren" had been decoded as the HTML entity for "¤",
# which meant the requested page number was never sent correctly.
LIST_URL = (
    BASE_URL
    + '/eportal/ui?pageId=bdfef9dfa679454c86d68f2203a69e84'
    + '&currentPage={}'
    + '&moduleId=143e1aeaa3b6405ea0fe04142c021d5b&staticRequest=yes'
)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Seconds to wait for the server before giving up. Without a timeout a
# stalled connection blocks forever and the retry decorator never fires.
REQUEST_TIMEOUT = 30

# Name of the set (accessed through r_myco15) holding already-stored URLs.
SEEN_KEY = 'n18'

# Deletes the same four characters the original chained .replace() calls did.
_STRIP_TABLE = str.maketrans('', '', ' \r\t\n')

r = requests.session()
# NOTE(review): current versions of requests do not appear to read this
# attribute; kept so the session object is unchanged for other users of `r`.
r.keep_alive = False


def _cell_texts(nodes):
    """Return the text of each element with spaces, CR, tab and LF removed."""
    return [
        node.xpath('string(.)').strip().translate(_STRIP_TABLE)
        for node in nodes
    ]


@retry(3)
def r1_d(url, dt):
    """Fetch one detail page and return its table as a dict.

    The ``<th>`` cells of the ``zhongdatable`` table become keys and the
    ``<td>`` cells become values, paired by position. ``'url'`` and
    ``'date'`` entries are added on top.

    Args:
        url: Absolute URL of the detail page.
        dt: Value stored unchanged under the ``'date'`` key.

    Returns:
        dict mapping header text to cell text, plus ``'url'`` and ``'date'``.

    Raises:
        IndexError: If the table has fewer ``<td>`` cells than ``<th>``
            cells (same exception type the index-based pairing raised).
    """
    # presumably @retry(3) re-invokes this function on failure; verify in rety
    response = r.get(
        url=url,
        headers=HEADERS,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    keys = _cell_texts(selector.xpath('//table[@class="zhongdatable"]//th'))
    values = _cell_texts(selector.xpath('//table[@class="zhongdatable"]//td'))

    # Fail loudly instead of storing a truncated record: a stored URL is
    # added to the seen-set and would never be scraped again.
    if len(values) < len(keys):
        raise IndexError(
            'expected at least {} <td> cells, found {} at {}'.format(
                len(keys), len(values), url
            )
        )

    # Surplus <td> cells beyond the number of headers are ignored.
    record = dict(zip(keys, values))
    record['url'] = url
    record['date'] = dt
    return record


@retry(3)
def r1(pg, dt):
    """Scrape one listing page and store every detail record not yet seen.

    Args:
        pg: Page number substituted into the listing URL.
        dt: Passed through to ``r1_d`` and stored on each record. It is not
            sent to the server as a filter.

    Returns:
        None. New records are inserted into ``myco18`` and ``myco18_b`` and
        their URLs are added to the ``n18`` set.
    """
    url = LIST_URL.format(pg)
    # All search filters are submitted empty, i.e. the listing is unfiltered.
    data = {
        "filter_LIKE_EXT_STR6": "",
        "filter_LIKE_main.TITLE": "",
        "filter_LIKE_EXT_STR2": "",
        "filter_LIKE_EXT_STR4": "",
        "filter_LIKE_EXT_STR3": "",
        "filter_LIKE_EXT_STR8": "",
        "filter_LIKE_EXT_STR19": "",
        "filter_LIKE_EXT_STR10": "",
        "filter_LIKE_EXT_STR23": "",
    }
    response = r.post(
        url=url,
        headers=HEADERS,
        data=data,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    selector = etree.HTML(response.text)
    anchors = selector.xpath('//a[@istitle="true"]')

    records = []
    new_urls = []
    print(anchors)
    for anchor in anchors:
        detail_url = BASE_URL + anchor.xpath('@href')[0]
        print(detail_url)
        already_seen = r_myco15.sismember(SEEN_KEY, detail_url)  # changed
        if already_seen:
            print('已存在,>>>n18')
            continue
        record = r1_d(detail_url, dt)
        print(record)
        records.append(record)
        new_urls.append(detail_url)

    # records and new_urls are always appended together, so both are empty
    # or both are non-empty.
    if records:
        myco18.insert_many(records)
        print('已存入原始库')
    if new_urls:
        # The backup collection receives the same record objects.
        myco18_b.insert_many(records)
        print('已存入备份原始库')
    # Mark URLs as seen only after both inserts have gone through.
    for stored_url in new_urls:
        r_myco15.sadd(SEEN_KEY, stored_url)  # changed


def runs():
    """Scrape listing page 1 with the date tag '2023'."""
    dt = '2023'
    for pg in range(1, 2):
        print(pg, '========')
        r1(pg, dt)


# Kept as an unguarded module-level call so that running (or importing)
# this script behaves exactly as before.
runs()