#!/usr/bin/env python
# coding:utf-8
"""Crawl listing pages on shenzhen.chinatax.gov.cn and store their detail rows.

For one year/month the script posts to the paginated listing endpoint, follows
every detail link that is not yet a member of the ``n04`` set, turns the
detail page's table rows into a dict and stores the new dicts through the
``myco4`` / ``myco4_b`` handles imported from ``mongo_cho``.
"""
import json
import re
from urllib import parse

import requests
import urllib3
from lxml import etree

from mongo_cho import myco4, r_myco15, myco4_b
from rety import retry
from setting import proxies

# One shared session for every request made by this module.
r = requests.session()
# NOTE(review): this only sets an attribute on the session object; confirm
# that it actually has the intended effect on connection reuse.
r.keep_alive = False

# Requests below are sent with verify=False, so silence the resulting warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

BASE_URL = 'https://shenzhen.chinatax.gov.cn'
LIST_URL_TEMPLATE = (
    BASE_URL + '/mhsofpro/otherproject/page/page.jsp?type=w_date&msg={}'
)
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Name of the set (on ``r_myco15``) holding detail URLs already processed.
SEEN_SET_KEY = 'n04'

# Deletes spaces, carriage returns, tabs and newlines in a single pass.
_STRIP_TABLE = str.maketrans('', '', ' \r\t\n')
# Captures the text between a '/' and the following '页' ("page") character.
_PAGE_COUNT_RE = re.compile(r'\/(.*?)页')
# Captures the first single-quoted string inside an ``onclick`` attribute.
_QUOTED_RE = re.compile("'(.*?)'")


@retry(3)
def r1_d(url, dt):
    """Fetch a detail page and return its two-column table rows as a dict.

    Every ``<tr>`` that has both a first and a second ``<td>`` contributes one
    entry: the stripped text of the first cell is the key, and the text of the
    second cell with spaces, ``\\r``, ``\\t`` and ``\\n`` removed is the value.
    Rows missing either cell are skipped.

    Args:
        url: Detail page URL, e.g.
            ``https://shenzhen.chinatax.gov.cn/mhsofpro/otherproject/wgtg/data.jsp?tags=ps_18756&fh=true``.
        dt: Date string stored unchanged under the ``'date'`` key.

    Returns:
        The parsed dict, with ``'url'`` and ``'date'`` entries added last.
    """
    response = r.get(url=url, headers=HEADERS, verify=False, proxies=proxies)
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//tr'):
        key_cells = row.xpath('td[1]')
        value_cells = row.xpath('td[2]')
        # Header or layout rows without two cells carry no key/value pair.
        if not key_cells or not value_cells:
            continue
        key = key_cells[0].xpath('string(.)').strip()
        value = value_cells[0].xpath('string(.)').strip()
        record[key] = value.translate(_STRIP_TABLE)

    record['url'] = url
    record['date'] = dt
    print(record)
    return record


@retry(3)
def r1(ny, cpg, tpg, dt):
    """Process one listing page and return the page count it reports.

    Detail URLs are taken from the ``onclick`` attribute of every
    ``<input id="button2">``. URLs already in the ``n04`` set are skipped; the
    others are fetched with :func:`r1_d`. The new records are inserted through
    both ``myco4`` and ``myco4_b``, and only then are their URLs added to the
    ``n04`` set.

    Args:
        ny: Month/year token placed in the ``msg`` query parameter
            (``runs`` builds it as ``'<month>_<year>'``).
        cpg: Value posted as ``curPage``.
        tpg: Value posted as ``totalPages``.
        dt: Date string forwarded to :func:`r1_d`.

    Returns:
        ``int`` of the second ``/…页`` match in the response HTML, which the
        caller uses as the total number of pages.

    Raises:
        IndexError: If the response has fewer than two ``/…页`` matches, or an
            ``onclick`` value contains no single-quoted string.
    """
    data = {
        "curPage": cpg,
        "totalPages": tpg,
        "pageNum": "1",
    }
    response = r.post(
        url=LIST_URL_TEMPLATE.format(ny),
        headers=HEADERS,
        verify=False,
        data=data,
        proxies=proxies,
    )
    html = response.text
    selector = etree.HTML(html)

    # The second match is the one the original code treats as the page total.
    total_pages = _PAGE_COUNT_RE.findall(html)[1]

    new_records = []
    new_urls = []
    for onclick in selector.xpath('//input[@id="button2"]/@onclick'):
        detail_url = BASE_URL + _QUOTED_RE.findall(onclick)[0]
        print(detail_url, dt)
        if r_myco15.sismember(SEEN_SET_KEY, detail_url):
            print('已存在,>>>n04')
            continue
        new_records.append(r1_d(detail_url, dt))
        new_urls.append(detail_url)

    # Both lists grow together, so a single emptiness check covers them.
    if new_records:
        myco4.insert_many(new_records)
        myco4_b.insert_many(new_records)
        # Mark URLs as seen only after the inserts above have succeeded.
        for seen_url in new_urls:
            r_myco15.sadd(SEEN_SET_KEY, seen_url)

    return int(total_pages)


def runs(ny1='2021', ny2='9'):
    """Crawl every listing page for one year/month.

    Args:
        ny1: Year (converted with ``str``).
        ny2: Month (converted with ``str``).
    """
    ny = str(ny2) + '_' + str(ny1)
    dt = str(ny1) + '/' + str(ny2) + '/1'

    # The first request doubles as the way to learn how many pages exist.
    tpg = r1(ny, '1', '1', dt)
    for page in range(2, tpg + 1):
        print(page, '页============')
        r1(ny, page, tpg, dt)


# Kept as an unconditional call so the module behaves exactly as before when
# it is executed or imported.
runs()