#!/usr/bin/env python
# coding:utf-8
import requests
from setting import proxies
from lxml import etree
from mongo_cho import myco16, r_myco15, myco16_b
from rety import retry

# Shared HTTP session. requests.Session has no keep_alive attribute, so ask the
# server to close connections instead of reusing them between requests.
r = requests.Session()
r.headers['Connection'] = 'close'

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}


@retry(3)
def r1_d(url, dt):
    """Fetch a detail page (e.g. http://gansu.chinatax.gov.cn/art/2020/3/10/art_8350_65.html)
    and return its key/value table as a dict."""
    response = r.get(url=url, headers=HEADERS, proxies=proxies)
    response.encoding = 'utf-8'
    selector = etree.HTML(response.text)
    rows = selector.xpath('//table[@class="zdsc_con"]//tr')
    dict1 = {}
    for row in rows:
        # The first cell of each row is the field name, the second is its value.
        k1 = row.xpath('td[1]')
        if not k1:
            continue
        key = k1[0].xpath('string(.)').strip()
        key = key.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
        v1 = row.xpath('td[2]')
        if v1:
            value = v1[0].xpath('string(.)').strip()
            value = value.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
        else:
            value = ''
        dict1[key] = value
    dict1['url'] = url
    dict1['date'] = dt
    return dict1


@retry(3)
def r1(f86, f85):
    """Query the site search module for year f86 / quarter f85, crawl every new
    detail page, and persist the results to MongoDB and the Redis dedup set."""
    url = 'http://gansu.chinatax.gov.cn/module/search/index.jsp'
    params = {
        "field_849": "",
        "field_850": "",
        "field_857": "",
        "field_868": "",
        "field_867": "",
        "field_860": "",
        "field_851": "",
        "field_852": "",
        "field_855": f85,
        "field_866": "",
        "field_856": "",
        "field_865": f86,
        "strSelectID": "849,850,868,857,867,860,851,852,855,866,865,856",
        "i_columnid": "8350",
        "field": "field_849:1:1,field_850:1:1,field_851:1:1,field_852:1:1,field_857:1:1,field_860:1:1,field_867:1:1,field_868:1:1,field_855:1:1,field_866:1:1,field_865:1:1,field_856:1:1",
        "initKind": "FieldForm",
        "type": "1,1,1,1,1,1,1,1,1,1,1,1",
        "currentplace": "",
        "splitflag": "",
        "fullpath": "0",
        "currpage": "1",
    }
    response = r.get(url=url, headers=HEADERS, params=params, proxies=proxies)
    selector = etree.HTML(response.text)
    hrefs = selector.xpath('//a/@href')

    list1 = []  # parsed detail records
    list2 = []  # URLs of the records in list1
    dt = str(f86) + '/' + str(f85)
    for href in hrefs:
        if "art" not in href:
            continue
        url1 = 'http://gansu.chinatax.gov.cn' + href.replace('../..', '')
        # Skip URLs already recorded in the Redis dedup set.
        if r_myco15.sismember('n16', url1):
            print('already exists, >>> n16')
            continue
        rsd = r1_d(url1, dt)
        print(rsd)
        list1.append(rsd)
        list2.append(url1)

    if list1:
        myco16.insert_many(list1)
        print('saved to the primary collection')
        myco16_b.insert_many(list1)
        print('saved to the backup collection')
        # Only mark the crawled URLs as seen after both inserts succeed.
        for mis in list2:
            r_myco15.sadd('n16', mis)


def runs():
    ny1 = '2023'  # year
    ny2 = '1'     # quarter
    r1(ny1, ny2)


if __name__ == '__main__':
    runs()
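
# ----------------------------------------------------------------------------
# Note on local dependencies: `setting`, `mongo_cho` and `rety` are project
# modules that are not included in this file. `setting.proxies` is assumed to
# be a requests-style proxy mapping, `mongo_cho` to expose the MongoDB
# collections and Redis client used above, and `rety.retry(n)` to be a
# decorator that re-runs the wrapped call up to n times before giving up.
# The sketch below is only an assumption of what such a retry decorator could
# look like, not the project's actual implementation:
#
#     def retry(times):
#         def decorator(func):
#             def wrapper(*args, **kwargs):
#                 last_error = None
#                 for _ in range(times):
#                     try:
#                         return func(*args, **kwargs)
#                     except Exception as err:
#                         last_error = err
#                 raise last_error
#             return wrapper
#         return decorator
# ----------------------------------------------------------------------------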