#!/usr/bin/env python
# coding:utf-8
import requests
from setting import proxies
from lxml import etree
from mongo_cho import myco29, r_myco15, myco29_b
from rety import retry

r = requests.session()
r.keep_alive = False  # intended to disable keep-alive; note requests.Session has no such attribute, so this is a no-op


@retry(3)
def r1_d(cmpname, url):
    """Fetch one detail page and parse the tax-violation record into a dict."""
    # Example detail page:
    # url = 'https://xizang.chinatax.gov.cn/art/2019/6/26/art_2371_382.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    response.encoding = 'UTF-8'
    html = response.text
    # Skip attachment-download pages and city-bureau channel pages.
    if "附件下载" in html:
        return {}
    if "市局频道" in html:
        return {}
    selector = etree.HTML(html)

    # Publication date, e.g. "发布时间:2019-06-26 ..." -> "2019/06/26".
    dts = selector.xpath('//div[@class="main"]//div[@class="main_content"]//span/text()')
    dt = ''
    for i in dts:
        if '发布时间' in i:
            i1 = i.split(' ')[0]
            dt = i1.split(':')[1].replace('-', '/')
    print(dt)

    dict1 = {}
    dict1['纳税人名称'] = cmpname  # taxpayer name, passed in from the list page

    # The article body lists the fields in a fixed order, so peel them off
    # from the last marker ("注册地址") back to the first ("纳税人识别号").
    result = selector.xpath('//div[@id="zoom"]')
    result1 = result[0].xpath('string(.)').strip()
    reu1 = result1.split('注册地址:')
    dict1['注册地址'] = reu1[1]
    reu2 = reu1[0].split('主要违法事实:')
    dict1['主要违法事实'] = reu2[1]
    reu3 = reu2[0].split('违法案件性质:')
    dict1['违法案件性质'] = reu3[1]
    reu4 = reu3[0].split('组织机构代码:')
    dict1['组织机构代码'] = reu4[1]
    reu5 = reu4[0].split('法人信息:')
    dict1['法人信息'] = reu5[1]
    reu6 = reu5[0].split('纳税人识别号:')
    dict1['纳税人识别号'] = reu6[1]

    dict1['url'] = url
    dict1['date'] = dt
    print(dict1)
    return dict1


@retry(3)
def r1(pg):
    """Older list-page crawler: collects detail links only; storage is disabled (r2 below is the active variant)."""
    url = 'https://xizang.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "field": "vc_name:1,field_406:1,field_407:1,field_408:1",
        "i_columnid": "style_63",
        "vc_name": "",
        "field_406": "",
        "field_407": "",
        "field_408": "",
        "currpage": pg,
    }
    response = r.get(url=url, headers=headers, params=params, proxies=proxies)
    html = response.text
    selector = etree.HTML(html)
    a = selector.xpath('//td//a/@href')
    list1 = []
    list2 = []
    for i in a:
        if "art" in i:
            url1 = "https://xizang.chinatax.gov.cn" + i.replace('../..', '')
            print(url1)
            # Skip URLs already recorded in the Redis set 'n29'.  # changed
            utf = r_myco15.sismember('n29', url1)
            if not utf:
                # The bare link list carries no company name, so pass an empty string.
                rsd = r1_d('', url1)
                if rsd:
                    list1.append(rsd)
                    list2.append(url1)
            else:
                print('已存在,>>>n29')  # already crawled
    # if list1:
    #     myco29.insert_many(list1)
    #     print('已存入原始库')
    # if list2:
    #     myco29_b.insert_many(list1)
    #     print('已存入备份原始库')
    # for mis in list2:
    #     r_myco15.sadd('n29', mis)  # changed


@retry(3)
def r2(pg):
    """Crawl one result page, parse new detail pages and persist them."""
    url = 'https://xizang.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "field": "vc_name:1,field_406:1,field_407:1,field_408:1",
        "i_columnid": "style_63",
        "vc_name": "",
        "field_406": "",
        "field_407": "",
        "field_408": "",
        "currpage": pg,
    }
    response = r.get(url=url, headers=headers, params=params, proxies=proxies)
    html = response.text
    selector = etree.HTML(html)
    a = selector.xpath('//tr[@class="form-list"]')
    list1 = []  # parsed records
    list2 = []  # their detail-page URLs
    for i in a:
        cmpname = i.xpath('td[2]/text()')[0]
        urlz = i.xpath('td[4]//a/@href')[0]
        url1 = "https://xizang.chinatax.gov.cn" + urlz.replace('../..', '')
        # Skip URLs already recorded in the Redis set 'n29'.  # changed
        utf = r_myco15.sismember('n29', url1)
        if not utf:
            rsd = r1_d(cmpname, url1)
            if rsd:
                list1.append(rsd)
                list2.append(url1)
        else:
            print('已存在,>>>n29')  # already crawled
    if list1:
        myco29.insert_many(list1)
        print('已存入原始库')  # saved to primary collection
    if list2:
        myco29_b.insert_many(list1)
        print('已存入备份原始库')  # same records saved to backup collection
    for mis in list2:
        r_myco15.sadd('n29', mis)  # changed: mark URL as crawled


def runs():
    for pg in range(1, 4):
        print(pg, '================')
        r2(pg)


if __name__ == '__main__':
    runs()
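# ---------------------------------------------------------------------------
# Note: the local helper modules (setting, mongo_cho, rety) are not shown in
# this file. As an illustration only, a minimal decorator compatible with the
# @retry(3) usage above could look like the sketch below; the real rety.retry
# implementation may differ, so treat this as an assumption, not project code.
#
# import functools
# import time
#
# def retry(times, delay=1):
#     """Re-run the wrapped function up to `times` times before giving up."""
#     def decorator(func):
#         @functools.wraps(func)
#         def wrapper(*args, **kwargs):
#             last_exc = None
#             for _ in range(times):
#                 try:
#                     return func(*args, **kwargs)
#                 except Exception as exc:  # broad on purpose: network errors vary
#                     last_exc = exc
#                     time.sleep(delay)
#             raise last_exc
#         return wrapper
#     return decorator
# ---------------------------------------------------------------------------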