#!/usr/bin/env python
# coding:utf-8
"""Scrape case records from etax.shandong.chinatax.gov.cn for one month.

Flow, as implemented below:

* ``runs`` walks list pages 1..99 for a ``YYYYMM``-style period string.
* ``r1`` parses one list page, skips IDs already present in the ``n09`` set
  of ``r_myco15``, fetches the detail page for each new ID through ``r2``,
  and stores the resulting dicts via ``myco9`` / ``myco9_b``.
* ``r2`` fetches one detail page and extracts its fields with regexes.

``myco9``, ``myco9_b``, ``r_myco15``, ``proxies`` and ``retry`` come from
project modules that are not visible here; they are presumably MongoDB
collections, a Redis client, a proxy mapping and a retry decorator
respectively -- confirm against those modules.
"""
import json
import re
from urllib import parse

import requests
import urllib3
from lxml import etree

from mongo_cho import myco9, r_myco15, myco9_b
from rety import retry
from setting import proxies

r = requests.session()
# NOTE(review): kept from the original; requests does not document a
# ``keep_alive`` attribute on Session, so this assignment likely has no effect.
r.keep_alive = False

# Every request below is sent with verify=False; silence the resulting warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Shared request headers (string reproduced exactly as in the original).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
}

# Detail-page endpoint; ``%D4%C2`` is a pre-encoded suffix appended to the period.
DETAIL_URL = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMx&nsrmc=&nsrsbh={cid}&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz='

# List-page endpoint.
# Commented-out alternative URL carried over from the original source:
# https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryBynd&nsrmc=&nsrsbh=&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd=201911%D4%C2
LIST_URL = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMxFh&nsrmc=&nsrsbh=&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz=&page={pg}'

# Name of the set in ``r_myco15`` that records already-processed IDs.
# The original marked both uses of this key with a "change" comment,
# presumably meaning it must be adjusted per target -- confirm.
SEEN_KEY = 'n09'

# ``runs`` iterates ``range(1, MAX_PAGE)``, i.e. pages 1..99.
MAX_PAGE = 100

# (field code, output key, regex) for every value read from the detail page.
# The field codes are the variable names used by the original code.
# NOTE(review): all patterns are reproduced exactly as found.  As written,
# ``(.*?)`` matches the empty string at position 0, so every extracted value
# is ''.  The patterns look like they lost their surrounding HTML delimiters;
# restore the real pattern for each field code here.
_DETAIL_FIELDS = (
    ("NSRMC", '纳税人名称', r'(.*?)'),
    ("NSRSBH", '纳税人识别号或社会信用代码', r'(.*?)'),
    ("ZZJGDM", '组织机构代码', r'(.*?)'),
    ("ZCDZ", '注册地址', r'(.*?)'),
    ("FDDBRHFZRXM", '法定代表人或者负责人姓名', r'(.*?)'),
    ("FDDBRHFZRXB", '性别', r'(.*?)'),
    ("FDDBRHFZRZJHM", '证件号码1', r'(.*?)'),
    ("FDRZJHM", '证件号码2', r'(.*?)'),
    ("AJXZ", '案件性质', r'(.*?)'),
    ("ZYWFSS", '主要违法事实', r'(.*?)'),
    ("XGFLYJJSWCLCFQK", '相关法律依据及税务处理处罚情况 ', r'(.*?)'),
)

# Deletes space, CR, tab, LF and NBSP -- the same characters the original
# removed with five chained ``.replace`` calls.
_WS_TABLE = str.maketrans('', '', ' \r\t\n\xa0')


def _strip_ws(text):
    """Return *text* with spaces, CR, LF, tabs and NBSP removed."""
    return text.translate(_WS_TABLE)


@retry(3)
def r2(ny, cid, dicts):
    """Fetch the detail page for *cid* and return its fields as a dict.

    Args:
        ny: Period string; ``ny[:4]`` and ``ny[4:]`` are joined with ``/``
            to form the ``date`` value.
        cid: Identifier placed in the ``nsrsbh`` query parameter and stored
            under ``uid``.
        dicts: Record already built from the list page.  It is returned
            unchanged when extraction fails, and supplies the value for any
            field whose detail-page value comes back empty.

    Returns:
        A new dict keyed by the labels in ``_DETAIL_FIELDS`` plus ``date``
        and ``uid``; or *dicts* itself if any pattern has no match or
        another error occurs while building the record.

    The HTTP request is made outside the ``try`` block, so network errors
    propagate to the ``retry`` decorator.
    """
    url = DETAIL_URL.format(cid=cid, ny=ny)
    data = {
        "s_nsrsbh": "",
        "nsrmc": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrxm": "",
        "fddbrsfzhm": "",
        "cwfzrxm": "",
        "cwfzrsfzhm": "",
        "cxdq": "",
        "ajxz": "",
        "cxnd": "{}月".format(ny),
    }
    response = r.post(url=url, data=data, headers=HEADERS, verify=False, proxies=proxies)
    html = response.text
    try:
        fallback = dicts if isinstance(dicts, dict) else {}
        record = {}
        for _code, label, pattern in _DETAIL_FIELDS:
            # [0] raises IndexError when the pattern is absent from the page,
            # which routes to the whole-record fallback below.
            value = re.findall(pattern, html)[0]
            # Do not let an empty detail value erase data that the list page
            # already provided for the same field.
            record[label] = value or fallback.get(label, '')
        record['date'] = ny[:4] + '/' + ny[4:]
        record['uid'] = cid
        return record
    except Exception:
        # Best effort: keep the list-page record rather than lose the row.
        return dicts


@retry(3)
def r1(ny, pg):
    """Process one list page: fetch details for unseen rows and store them.

    Args:
        ny: Period string inserted into the list URL and passed on to ``r2``.
        pg: Page number inserted into the list URL.

    Returns:
        ``'1'`` when the page was processed (including when it held no new
        rows), ``'2'`` when parsing or storing raised an exception.  ``runs``
        stops paging on ``'2'``.

    The HTTP request and HTML parsing happen outside the ``try`` block, so
    errors there propagate to the ``retry`` decorator.
    """
    url = LIST_URL.format(ny=ny, pg=pg)
    response = r.get(url=url, headers=HEADERS, verify=False, proxies=proxies)
    selector = etree.HTML(response.text)
    rows = selector.xpath('//tr')
    try:
        records = []
        new_ids = []
        # The first <tr> is skipped, as in the original (presumably a header).
        for row in rows[1:]:
            name = _strip_ws(row.xpath('td[2]/text()')[0])
            tax_id = _strip_ws(row.xpath('td[3]/text()')[0])
            case_nature = _strip_ws(row.xpath('td[4]/text()')[0])
            onclick = _strip_ws(row.xpath('td[5]/input[@id="xxxx"]/@onclick')[0])
            # The ID is the single-quoted argument of the onclick call.
            cid = re.findall(r"\('(.*?)'\)", onclick)[0]

            if r_myco15.sismember(SEEN_KEY, cid):
                print('已存在,>>>n09')
                continue

            summary = {
                '纳税人名称': name,
                '纳税人识别号或社会信用代码': tax_id,
                '案件性质': case_nature,
                '组织机构代码': '',
                '注册地址': '',
                '法定代表人或者负责人姓名': '',
                '性别': '',
                '证件号码1': '',
                '证件号码2': '',
                '主要违法事实': '',
                '相关法律依据及税务处理处罚情况 ': '',
                'date': ny[:4] + '/' + ny[4:],
                'uid': cid,
            }
            detail = r2(ny, cid, summary)
            print(detail)
            records.append(detail)
            new_ids.append(cid)

        if records:
            myco9.insert_many(records)
        if new_ids:
            # The same records are written to the second collection, as in
            # the original; IDs are marked as seen only after both inserts.
            myco9_b.insert_many(records)
            for seen_id in new_ids:
                r_myco15.sadd(SEEN_KEY, seen_id)
        return '1'
    except Exception as exc:
        # Report instead of silently swallowing; the '2' return is unchanged.
        print('r1 failed on page {}: {!r}'.format(pg, exc))
        return '2'


def runs(ny1, ny2):
    """Crawl list pages for the period ``str(ny1) + str(ny2)``.

    Pages 1 through ``MAX_PAGE - 1`` are requested in order; paging stops
    early as soon as ``r1`` returns ``"2"``.
    """
    ny = str(ny1) + str(ny2)
    for pg in range(1, MAX_PAGE):
        print(pg, '===================')
        btf = r1(ny, pg)
        if btf == "2":
            break


# NOTE: left unguarded so that import-time behaviour matches the original.
runs('2023', '11')