#!/usr/bin/env python
# coding:utf-8
"""Scrape detail records from jiangxi.chinatax.gov.cn/taxmap.

The listing endpoint (``result2.do``) is fetched page by page; every
``getDetail(<id>)`` reference found in a page is looked up in the ``n21``
set held by ``r_myco15``.  Ids that are not yet in the set are fetched
from the detail endpoint (``getdetail.do``), stored in ``myco21`` and
``myco21_b`` and finally added to the set.
"""
import json
import re
import time
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco21, myco21_b, r_myco15
from rety import retry
from setting import proxies

r = requests.session()
# NOTE(review): this only sets an attribute on the session object; whether
# requests honours it is not visible from this file -- confirm.
r.keep_alive = False

_DETAIL_URL = 'http://jiangxi.chinatax.gov.cn/taxmap/front/getdetail.do'
_LIST_URL = 'http://jiangxi.chinatax.gov.cn/taxmap/front/result2.do'

# NOTE(review): the User-Agent value has no closing parenthesis.  It is kept
# exactly as it was because the server's reaction to a change is unknown.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
}

# Seconds to wait for the server before giving up, so that a stalled
# proxy/server raises instead of blocking the run forever.
_REQUEST_TIMEOUT = 30

# Key of the set (in r_myco15) holding the ids that were already stored.
# The original code marked every use of this key with a "change" note --
# presumably it has to be adapted per scraping task; confirm.
_SEEN_KEY = 'n21'

# Raw string: "\(" is not a valid escape sequence in a normal str literal.
_DETAIL_ID_RE = re.compile(r'getDetail\((.*?)\)')


@retry(3)
def r2(dt, uid):
    """Fetch one detail page and return its table as a dict.

    Args:
        dt: Value stored unchanged under the ``'date'`` key of the result.
        uid: Detail id, sent as the ``iid`` query parameter and stored
            unchanged under the ``'uid'`` key of the result.

    Returns:
        dict: one entry per ``<tr>`` of the ``xxTable`` table that has
        ``<th>`` text (key: concatenated ``<th>`` text nodes, value:
        concatenated ``<td>`` text nodes, ``''`` if there are none), plus
        the ``'date'`` and ``'uid'`` entries.
    """
    response = r.get(
        url=_DETAIL_URL,
        params={"iid": uid},
        headers=_HEADERS,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT,
    )
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//table[@class="xxTable"]//tr'):
        header_parts = row.xpath('th/text()')
        if not header_parts:
            # Rows without header text carry no usable key.
            continue
        record[''.join(header_parts)] = ''.join(row.xpath('td/text()'))

    # Written last, so they override table rows with the same key.
    record['date'] = dt
    record['uid'] = uid
    return record


@retry(3)
def r1(pg):
    """Process one listing page: fetch and store every not-yet-seen detail.

    Args:
        pg: Page number, sent as the ``pageno`` query parameter.

    Returns:
        None.  New records are inserted into ``myco21`` and ``myco21_b``
        and their ids are added to the ``n21`` set of ``r_myco15``.
    """
    dt = '2021/05'
    params = {
        "region": "",
        "nature": "",
        "year": "",
        "pageno": pg,
        # Current time in milliseconds.
        "_": int(round(time.time() * 1000)),
    }
    response = r.get(
        url=_LIST_URL,
        headers=_HEADERS,
        params=params,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT,
    )

    new_records = []
    new_uids = []
    for uid in _DETAIL_ID_RE.findall(response.text):
        if uid == 'iid':
            # Literal argument name, not an id (presumably the page's own
            # JavaScript definition of getDetail -- confirm).
            continue
        print(uid)
        if r_myco15.sismember(_SEEN_KEY, uid):
            print('已存在,>>>n21')
            continue
        new_records.append(r2(dt, uid))
        new_uids.append(uid)

    # Both lists always grow together, so a single emptiness check suffices.
    if new_records:
        myco21.insert_many(new_records)
        print('已存入原始库')
        myco21_b.insert_many(new_records)
        print('已存入备份原始库')
        # Ids are marked as seen only after both inserts went through.
        for seen_uid in new_uids:
            r_myco15.sadd(_SEEN_KEY, seen_uid)


def runs():
    """Process listing pages 1 and 2 in order."""
    for pg in range(1, 3):
        print(pg, '===============================')
        r1(pg)


# Runs on import as well as on direct execution (kept as in the original).
runs()