#!/usr/bin/env python
# coding:utf-8
"""Crawl notice pages from ningxia.chinatax.gov.cn.

Flow, as implemented below:

1. ``get_pg`` reads the column index page and resolves a month number to the
   column id embedded in that month's link.
2. ``r1`` queries the site's search endpoint for one result page of that
   column, follows every link containing ``art`` and hands each unseen URL to
   ``r1_d``.
3. ``r1_d`` turns the rows of the ``table.color`` element of a detail page
   into a flat dict.
4. New records are written with ``insert_many`` to ``myco28`` and
   ``myco28_b``, and their URLs are added to the ``n28`` set on ``r_myco15``
   so later runs skip them.

``myco28`` / ``myco28_b`` / ``r_myco15`` come from the project module
``mongo_cho``; presumably the first two are MongoDB collections and the last
is a Redis client -- verify against that module.
"""
import json
import re
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco28, r_myco15, myco28_b
from rety import retry
from setting import proxies

BASE_URL = 'http://ningxia.chinatax.gov.cn'
SEARCH_URL = BASE_URL + '/module/search/index.jsp'
INDEX_URL = BASE_URL + '/col/col14330/index.html'

# Sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Name of the set that records already-crawled detail URLs.
# NOTE(review): the original comment next to this key said "change" --
# presumably it is edited when the script is adapted to another source.
SEEN_KEY = 'n28'

# Deletes exactly the characters the cell cleanup removes: CR, TAB, LF, space.
_WS_TABLE = str.maketrans('', '', '\r\t\n ')

# Pager text "共 <n> 页" (i.e. "<n> pages in total").  The separators are
# matched with \s* because the previous literal contained a raw
# line-separator character that split the pattern across source lines.
_PAGE_COUNT_RE = re.compile(r'共\s*(\d+)\s*页')

# Column id inside a link such as ".../col/col14330/index.html".
_COLUMN_ID_RE = re.compile(r'col/col(.*?)/i')

r = requests.session()
# NOTE(review): requests' Session does not document a ``keep_alive``
# attribute, so this assignment probably has no effect -- confirm before
# relying on it.  Kept so module-level behaviour is unchanged.
r.keep_alive = False


def _clean(text):
    """Strip *text* and drop every CR, TAB, LF and space inside it."""
    return text.strip().translate(_WS_TABLE)


@retry(3)
def r1_d(url, dt):
    """Fetch one detail page and return its table as a dict.

    Example of a detail URL:
    ``http://ningxia.chinatax.gov.cn/art/2021/3/3/art_14329_8626.html``

    Args:
        url: Absolute URL of the detail page.
        dt: Date string stored unchanged under the ``date`` key.

    Returns:
        A dict mapping the cleaned text of each row's first cell to the
        cleaned text of its second cell (``''`` when the row has no second
        cell), plus the keys ``url`` and ``date``.  Rows without a first
        cell are ignored.
    """
    response = r.get(url=url, headers=HEADERS, proxies=proxies)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//table[@class="color"]//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            continue
        key = _clean(key_cells[0].xpath('string(.)'))
        value_cells = row.xpath('td[2]')
        value = _clean(value_cells[0].xpath('string(.)')) if value_cells else ''
        record[key] = value

    record['url'] = url
    record['date'] = dt
    return record


@retry(3)
def r1(ny, pg):
    """Crawl one page of search results and store the new detail records.

    Args:
        ny: Value sent as ``i_columnid`` (the column id returned by
            ``get_pg``).
        pg: Value sent as ``currpage``.

    Returns:
        The total page count captured from the pager text, as a string.

    Raises:
        IndexError: If the pager text is not present in the response.  The
            lookup is deliberately left unguarded so a bad response is not
            mistaken for a one-page result; how ``retry`` reacts to the
            exception is defined in the project module ``rety``.
    """
    params = {
        "vc_name": "",
        "field_147": "",
        "field_149": "",
        "field_148": "",
        "field_150": "",
        "field_151": "",
        "strSelectID": "104,147,148,149,150,151",
        "i_columnid": ny,
        "currpage": pg,
        "field": "field_148:1,field_149:1,vc_name:1,field_147:1,field_150:1,field_151:1",
        "initKind": "FieldForm",
        "currentplace": "",
        "splitflag": "",
        "fullpath": "0",
    }
    # NOTE(review): unlike r1_d and get_pg, the response encoding is not
    # forced to UTF-8 here; the pager regex only matches if requests decodes
    # the Chinese text correctly -- confirm the endpoint declares a charset.
    response = r.get(url=SEARCH_URL, headers=HEADERS, params=params, proxies=proxies)
    html = response.text
    page_counts = _PAGE_COUNT_RE.findall(html)

    selector = etree.HTML(html)
    new_docs = []
    new_urls = []
    for href in selector.xpath('//a/@href'):
        if "art" not in href:
            continue
        parts = href.split('/')
        if len(parts) < 6:
            # Contains "art" but lacks the year/month/day segments read below
            # (expected shape: ../../art/<year>/<month>/<day>/<file>).
            continue
        detail_url = BASE_URL + href.replace('../..', '')
        print(detail_url)
        dt = parts[3] + '/' + parts[4] + '/' + parts[5]
        print(dt)

        if r_myco15.sismember(SEEN_KEY, detail_url):
            print('已存在,>>>n28')
            continue

        detail = r1_d(detail_url, dt)
        print(detail)
        if detail is None:
            # Presumably only possible if ``retry`` returns None after giving
            # up -- verify against rety.  The URL is not marked as seen, so a
            # later run can pick it up again.
            continue
        new_docs.append(detail)
        new_urls.append(detail_url)

    if new_docs:
        myco28.insert_many(new_docs)
        print('已存入原始库')
        myco28_b.insert_many(new_docs)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts returned.
        for seen_url in new_urls:
            r_myco15.sadd(SEEN_KEY, seen_url)

    return page_counts[0]


def get_pg(ny):
    """Resolve a month number to the column id of that month's listing.

    Args:
        ny: Month number; compared against link texts in the form
            ``'{}月'.format(ny)``.

    Returns:
        The column id captured from the matching link's href, or ``None``
        when no link for that month carries a usable href.
    """
    response = r.get(url=INDEX_URL, headers=HEADERS, proxies=proxies)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    wanted = '{}月'.format(ny)
    # niandu3 corresponds to the year 2021.
    for link in selector.xpath('//dl[@id="niandu3"]//dt[@class="open"]//a'):
        texts = link.xpath('text()')
        if not texts or texts[0] != wanted:
            continue
        hrefs = link.xpath('@href')
        if not hrefs:
            continue
        print(hrefs[0])
        match = _COLUMN_ID_RE.search(hrefs[0])
        if match:
            return match.group(1)
    return None


def runs(ny):
    """Crawl every result page of the listing for month *ny*.

    Args:
        ny: Month number passed to ``get_pg``.
    """
    yf = get_pg(ny)
    if yf is None:
        # Without a column id the search would not be restricted to the
        # requested month, so stop instead of crawling unrelated results.
        print('no column found for month', ny)
        return

    rpg = r1(yf, pg=1)
    for pg in range(2, int(rpg) + 1):
        print(pg, '=============')
        # Every page must be requested with the column id; the month number
        # is only meaningful to get_pg.
        r1(yf, pg)


# Runs on import as well as on direct execution, as before.
runs(1)