#!/usr/bin/env python
# coding:utf-8
"""Scrape tax-violation case records from guangdong.chinatax.gov.cn.

The list endpoint (``query.jsp``) is paged through for a given ``yf`` period;
every case id found there that is not yet in the ``n10`` dedup set is fetched
from the detail endpoint (``service.jsp``), stored in two collections and then
added to the dedup set.
"""
import re
import requests,json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco10,r_myco15,myco10_b
from rety import retry

r = requests.session()
# NOTE(review): ``keep_alive`` is not a documented attribute of
# ``requests.Session``; this assignment probably has no effect. Kept as-is.
r.keep_alive = False

# Seconds to wait for the server before giving up, so that a stalled
# connection raises (and can be retried) instead of hanging forever.
REQUEST_TIMEOUT = 30

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Deletes exactly the characters the cell text is cleaned of:
# ASCII space, CR, TAB and LF.
_WHITESPACE_TABLE = str.maketrans('', '', ' \r\t\n')


def _strip_ws(text):
    """Return *text* with every space, CR, TAB and LF character removed."""
    return text.translate(_WHITESPACE_TABLE)


@retry(3)
def r1_d(cid, dt):
    """Fetch the detail page of one case and return it as a flat dict.

    Each ``<tr>`` of the response contributes one entry: the first text node
    of its first ``<td>`` is the key, the first text node of its second
    ``<td>`` is the value (``''`` when absent). Rows whose key is empty after
    whitespace removal are skipped.

    Args:
        cid: Case id, posted as ``manuscriptId``.
        dt: Date label stored unchanged under the ``date`` key.

    Returns:
        dict: The parsed key/value pairs plus ``uid`` (= ``cid``) and
        ``date`` (= ``dt``).

    Raises:
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
    """
    url = 'http://guangdong.chinatax.gov.cn/siteapps/webpage/gdtax/zdsswfaj/service.jsp'
    data = {
        "manuscriptId": cid,
    }
    response = r.post(url=url, headers=HEADERS, data=data, proxies=proxies,
                      timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx: an error page would otherwise parse to an
    # (almost) empty record that gets stored and marked as done.
    response.raise_for_status()
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//tr'):
        key_nodes = row.xpath('td[1]/text()')
        if not key_nodes:
            continue
        key = _strip_ws(key_nodes[0])
        if not key:
            continue
        value_nodes = row.xpath('td[2]/text()')
        record[key] = _strip_ws(value_nodes[0]) if value_nodes else ''

    record["uid"] = cid
    record['date'] = dt
    print(record)
    return record


@retry(3)
def r1(ny1, ny2, pg):
    """Process one page of the case list and return the total page count.

    For every ``onclick`` attribute on the page, the first single-quoted
    argument is taken as a case id. Ids that are not already a member of the
    ``n10`` set are fetched with :func:`r1_d`; the resulting records are
    inserted into ``myco10`` and ``myco10_b`` and the ids are then added to
    ``n10``.

    Args:
        ny1: First part of the period; sent as ``"{ny1}_{ny2}"`` in ``yf``.
        ny2: Second part of the period. The stored date label uses
            ``ny2 + 1`` (presumably a zero-based month index - confirm).
        pg: Page number to request.

    Returns:
        str: The text found between ``共`` and ``页`` in the response, with
        spaces removed (the page count as a string).

    Raises:
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
        IndexError: If the page-count marker is not present in the response.
    """
    url = 'http://guangdong.chinatax.gov.cn/siteapps/webpage/gdtax/zdsswfaj/query.jsp'
    data = {
        "yf": "{ny1}_{ny2}".format(ny1=ny1,ny2=ny2),
        "pageSize":"20",
        "pageNo": pg,
        "channelId": "",
        "taxNature": "",
        "quarter": "",
        "nsr_mc": "",
        "nsr_sbh": "",
        "fddbr_xm": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrzjhm": "",
        "cwfzrxm": "",
        "cwfzrzjhm": "",
    }
    response = r.post(url=url, headers=HEADERS, data=data, proxies=proxies,
                      timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    html = response.text
    selector = etree.HTML(html)
    rpg = re.findall('共(.*?)页',html)[0].replace(' ','')

    # The date label is the same for every record on the page.
    dt = str(ny1) + '/' + str(ny2+1)

    records = []
    new_ids = []
    # Ids already handled on this page; the shared dedup set is only updated
    # after the inserts, so it cannot catch repeats within a single page.
    seen_on_page = set()
    for onclick in selector.xpath('//a/@onclick'):
        quoted = re.findall("'(.*?)'", onclick)
        if not quoted:
            continue
        cid = quoted[0]
        # '#pageIndex' belongs to the pagination links, not to a case.
        if cid == '#pageIndex' or cid in seen_on_page:
            continue
        seen_on_page.add(cid)
        if r_myco15.sismember('n10', cid):  # dedup-set key; change per source
            print('已存在,>>>n10')
            continue
        print(cid)
        records.append(r1_d(cid, dt))
        new_ids.append(cid)

    # ``records`` and ``new_ids`` always have the same length.
    if records:
        myco10.insert_many(records)
        print('已存入原始库')
        myco10_b.insert_many(records)
        print('已存入备份原始库')
        # Mark ids as done only after both inserts succeeded.
        for done_id in new_ids:
            r_myco15.sadd('n10', done_id)  # dedup-set key; change per source
    return rpg


def runs(ny1, ny2):
    """Scrape every list page for the period ``ny1``/``ny2``.

    Page 1 is fetched first to learn the total page count; pages 2..N are
    then processed in order.

    Args:
        ny1: First part of the period, passed through to :func:`r1`.
        ny2: Second part of the period, passed through to :func:`r1`.
    """
    rpg = r1(ny1,ny2,pg=1)
    print(ny1,ny2)
    total_pages = int(rpg)
    for pg in range(2, total_pages + 1):
        print(pg,'==============')
        r1(ny1,ny2,pg)


# Runs on import as well as on direct execution, as before.
runs(2023,11)