#!/usr/bin/env python
# coding:utf-8
import re
import requests,json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco9,r_myco15,myco9_b
from rety import retry
# Module-wide HTTP session shared by every request issued in this file.
r = requests.session()
# NOTE(review): keep_alive is not a documented attribute of requests.Session,
# so this assignment may have no effect on connection reuse — confirm.
r.keep_alive = False
import urllib3
# The requests in this file pass verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@retry(3)
def r2(ny,cid,dicts):
    """Fetch the detail page for one case and return its fields as a dict.

    Args:
        ny: Query period as a string. The first four characters become the
            year and the remainder the month of the ``'date'`` field
            (e.g. ``'202311'`` -> ``'2023/11'``).
        cid: Identifier sent as the ``nsrsbh`` query parameter and stored
            under ``'uid'`` in the result.
        dicts: Fallback record, returned unchanged when field extraction
            raises.

    Returns:
        A new dict holding the detail fields plus ``'date'`` and ``'uid'``,
        or ``dicts`` if anything in the extraction step raises.
    """
    url = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMx&nsrmc=&nsrsbh={cid}&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz='.format(cid=cid,ny=ny)
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    }
    data = {
        "s_nsrsbh": "",
        "nsrmc": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrxm": "",
        "fddbrsfzhm": "",
        "cwfzrxm": "",
        "cwfzrsfzhm": "",
        "cxdq": "",
        "ajxz": "",
        "cxnd": "{}月".format(ny),
    }
    # The request sits outside the try block on purpose: network errors
    # propagate to the @retry decorator instead of triggering the fallback.
    # NOTE(review): no timeout is passed, so a stalled connection can block
    # this call indefinitely — consider adding one.
    response = r.post(url=url, data=data, headers=headers, verify=False, proxies=proxies)
    html = response.text

    # (output key, regex) pairs, in the order the keys are written.
    # NOTE(review): every pattern is the bare `(.*?)`, whose first match is
    # always the empty string, so each field below is stored as ''. The
    # patterns look like they lost their surrounding markup anchors; restore
    # them against the live page. They are kept per-field so each can be
    # fixed independently.
    field_patterns = (
        ('纳税人名称', r'(.*?)'),
        ('纳税人识别号或社会信用代码', r'(.*?)'),
        ('组织机构代码', r'(.*?)'),
        ('注册地址', r'(.*?)'),
        ('法定代表人或者负责人姓名', r'(.*?)'),
        ('性别', r'(.*?)'),
        ('证件号码1', r'(.*?)'),
        ('证件号码2', r'(.*?)'),
        ('案件性质', r'(.*?)'),
        ('主要违法事实', r'(.*?)'),
        # The trailing space in this key is intentional: r1 writes the same key.
        ('相关法律依据及税务处理处罚情况 ', r'(.*?)'),
    )
    try:
        dict1 = {}
        for key, pattern in field_patterns:
            # [0] raises IndexError when a pattern has no match at all.
            dict1[key] = re.findall(pattern, html)[0]
        dict1['date'] = ny[:4] + '/' + ny[4:]
        dict1['uid'] = cid
        return dict1
    except Exception:
        # Best-effort: fall back to the caller's record. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return dicts
# Deletion table for the characters stripped from scraped cell text:
# space, CR, tab, LF and non-breaking space.
_CELL_WS = str.maketrans('', '', ' \r\t\n\xa0')


def _clean_cell(text):
    """Return *text* with spaces, CR, tab, LF and NBSP characters removed."""
    return text.translate(_CELL_WS)


@retry(3)
def r1(ny,pg):
    """Scrape one listing page, enrich unseen rows via r2, and store them.

    Args:
        ny: Query period as a string; interpolated into the URL and split
            into ``YYYY/rest`` for the ``'date'`` field.
        pg: Page number interpolated into the ``page`` query parameter.

    Returns:
        ``'1'`` when the page was processed (even if it held no new rows),
        ``'2'`` when any exception was raised while parsing or storing.
        Nothing from that page is stored or marked as seen in the ``'2'``
        case.
    """
    url = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMxFh&nsrmc=&nsrsbh=&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz=&page={pg}'.format(ny=ny,pg=pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    }
    # Fetching and parsing sit outside the try block so their failures reach
    # the @retry decorator instead of being reported as '2'.
    # NOTE(review): no timeout is passed, so a stalled connection can block
    # this call indefinitely — consider adding one.
    response = r.get(url=url, headers=headers, verify=False, proxies=proxies)
    html = response.text
    selector = etree.HTML(html)
    rows = selector.xpath('//tr')

    # Redis set holding the ids that were already stored. The original author
    # marked this key as one to change (presumably per target site).
    seen_key = 'n09'
    try:
        list1 = []  # records to insert
        list2 = []  # ids of those records, appended in lockstep with list1
        # The first <tr> is skipped (presumably the table header row).
        for row in rows[1:]:
            name = _clean_cell(row.xpath('td[2]/text()')[0])
            tax_id = _clean_cell(row.xpath('td[3]/text()')[0])
            case_type = _clean_cell(row.xpath('td[4]/text()')[0])
            onclick = _clean_cell(row.xpath('td[5]/input[@id="xxxx"]/@onclick')[0])
            # The detail id is the quoted argument of the onclick handler.
            cid = re.findall(r"\('(.*?)'\)", onclick)[0]
            # Key order matches the original insertion order so stored
            # documents keep the same field layout.
            dict1 = {
                '纳税人名称': name,
                '纳税人识别号或社会信用代码': tax_id,
                '案件性质': case_type,
                '组织机构代码': '',
                '注册地址': '',
                '法定代表人或者负责人姓名': '',
                '性别': '',
                '证件号码1': '',
                '证件号码2': '',
                '主要违法事实': '',
                '相关法律依据及税务处理处罚情况 ': '',
                'date': ny[:4] + '/' + ny[4:],
                'uid': cid,
            }
            if r_myco15.sismember(seen_key, cid):
                print('已存在,>>>n09')
                continue
            rsd = r2(ny, cid, dict1)
            print(rsd)
            list1.append(rsd)
            list2.append(cid)
        # list1 and list2 are either both empty or both non-empty, so a
        # single check covers both inserts and the seen-set update.
        if list1:
            myco9.insert_many(list1)
            myco9_b.insert_many(list1)
            for mis in list2:
                r_myco15.sadd(seen_key, mis)
        return '1'
    except Exception:
        # Any failure (e.g. a row missing an expected cell) is reported as
        # '2'. Catching Exception rather than using a bare except lets
        # KeyboardInterrupt/SystemExit propagate.
        return '2'
def runs(ny1,ny2,tpg=100):
    """Scrape listing pages for one period until r1 reports a failure.

    Args:
        ny1: Year part of the period; converted with ``str``.
        ny2: Month part of the period; converted with ``str`` and appended
            to ``ny1`` without a separator.
        tpg: Exclusive upper bound for the page number; pages
            ``1 .. tpg - 1`` are requested. Defaults to 100, the value that
            was previously hard-coded.
    """
    ny = str(ny1) + str(ny2)
    for pg in range(1, tpg):
        print(pg,'===================')
        # r1 returns "2" when it hit an exception; stop paging at that point.
        if r1(ny, pg) == "2":
            break
# Script entry point: executes as soon as the module is loaded and scrapes
# the period built from year '2023' and month '11'.
runs('2023','11')