123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco9,r_myco15,myco9_b
- from rety import retry
- r = requests.session()  # one shared HTTP session, used by both r1 (GET) and r2 (POST)
- r.keep_alive = False  # NOTE(review): requests sessions do not appear to define a keep_alive attribute, so this likely has no effect - confirm
- import urllib3
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  # r1/r2 send requests with verify=False; this silences the resulting warning
@retry(3)
def r2(ny,cid,dicts):
    """Fetch the detail record of one case and return it as a flat dict.

    Posts the detail query (``method=queryMx``) for the taxpayer ``cid`` and
    the month ``ny`` through the shared session ``r``, then extracts every
    ``<TAG>value</TAG>`` field from the response text with a regex.

    Args:
        ny: Query month as a string whose first four characters are the
            year and the rest the month (e.g. ``'202311'``).
        cid: Taxpayer identifier sent as ``nsrsbh``; stored under ``uid``.
        dicts: Fallback record, returned unchanged when any expected tag is
            missing from the response.

    Returns:
        A new dict holding the eleven detail fields plus ``date`` and
        ``uid``, or ``dicts`` when the response could not be parsed.
    """
    # "%D4%C2" is a pre-encoded suffix after the month; presumably the GBK
    # bytes of the "月" character that the form body below sends literally.
    url = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMx&nsrmc=&nsrsbh={cid}&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz='.format(cid=cid,ny=ny)
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    }
    data = {
        "s_nsrsbh": "",
        "nsrmc": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrxm": "",
        "fddbrsfzhm": "",
        "cwfzrxm": "",
        "cwfzrsfzhm": "",
        "cxdq": "",
        "ajxz": "",
        "cxnd": "{}月".format(ny),
    }
    # The request stays outside the try block so network errors propagate
    # to the @retry decorator instead of being turned into the fallback.
    response = r.post(url=url,data=data,headers=headers,verify=False,proxies=proxies)
    html = response.text

    # (output key, response tag) pairs, in the order the keys are stored.
    # The trailing space in the last key is intentional: r1 uses the same key.
    fields = (
        ('纳税人名称', 'NSRMC'),
        ('纳税人识别号或社会信用代码', 'NSRSBH'),
        ('组织机构代码', 'ZZJGDM'),
        ('注册地址', 'ZCDZ'),
        ('法定代表人或者负责人姓名', 'FDDBRHFZRXM'),
        ('性别', 'FDDBRHFZRXB'),
        ('证件号码1', 'FDDBRHFZRZJHM'),
        ('证件号码2', 'FDRZJHM'),
        ('案件性质', 'AJXZ'),
        ('主要违法事实', 'ZYWFSS'),
        ('相关法律依据及税务处理处罚情况 ', 'XGFLYJJSWCLCFQK'),
    )

    record = {}
    try:
        for key, tag in fields:
            # NOTE(review): "." does not match newlines, so a value that
            # spans several lines makes the tag look missing and triggers
            # the fallback - confirm whether the server ever sends those.
            record[key] = re.findall(r'<{0}>(.*?)</{0}>'.format(tag), html)[0]
        record['date'] = ny[:4] + '/' + ny[4:]
    except (IndexError, TypeError):
        # IndexError: a tag was not found; TypeError: ny is not sliceable.
        # Only these are swallowed, so KeyboardInterrupt and genuine bugs
        # are no longer hidden as they were by the former bare "except:".
        return dicts
    record['uid'] = cid
    return record
- # r2()
def _strip_blanks(text):
    """Return ``text`` without spaces, CR, LF, tabs and non-breaking spaces."""
    for junk in (' ', '\r', '\t', '\n', '\xa0'):
        text = text.replace(junk, '')
    return text


@retry(3)
def r1(ny,pg):
    """Scrape one list page, enrich new cases via ``r2`` and store them.

    Downloads page ``pg`` of the list query (``method=queryMxFh``) for the
    month ``ny``. Every table row after the first is turned into a summary
    record. Rows whose id is not yet a member of the Redis set ``'n09'``
    are completed with ``r2``. The results are inserted into ``myco9`` and
    ``myco9_b``, and their ids are then added to ``'n09'``.

    Args:
        ny: Query month as a string whose first four characters are the
            year and the rest the month (e.g. ``'202311'``).
        pg: Page number placed in the ``page`` query parameter.

    Returns:
        ``'1'`` when the page was processed, ``'2'`` when parsing or
        storing raised an exception. ``runs`` stops paging on ``'2'``.
    """
    url = 'https://etax.shandong.chinatax.gov.cn/DZSWJ/DZSWJ_SSWFSXAJ_CX_NAVIGATE?method=queryMxFh&nsrmc=&nsrsbh=&zcdz=&zzjgdm=&fddbrxm=&fddbrsfzhm=&cwfzrxm=&cwfzrsfzhm=&cxnd={ny}%D4%C2&cxdq=&ajxz=&page={pg}'.format(ny=ny,pg=pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    }
    # The request and HTML parsing stay outside the try block so their
    # errors reach the @retry decorator rather than ending the crawl.
    response = r.get(url=url,headers=headers,verify=False,proxies=proxies)
    html = response.text
    selector = etree.HTML(html)
    rows = selector.xpath('//tr')
    try:
        records = []
        new_ids = []
        # The first <tr> is skipped (presumably the table header).
        for row in rows[1:]:
            name = _strip_blanks(row.xpath('td[2]/text()')[0])
            tax_id = _strip_blanks(row.xpath('td[3]/text()')[0])
            case_kind = _strip_blanks(row.xpath('td[4]/text()')[0])
            onclick = _strip_blanks(row.xpath('td[5]/input[@id="xxxx"]/@onclick')[0])
            # The detail id is the quoted argument of the onclick call.
            cid = re.findall(r"\('(.*?)'\)", onclick)[0]

            # Summary record; the empty fields are only filled when r2
            # succeeds, otherwise r2 hands this dict back unchanged.
            summary = {
                '纳税人名称': name,
                '纳税人识别号或社会信用代码': tax_id,
                '案件性质': case_kind,
                '组织机构代码': '',
                '注册地址': '',
                '法定代表人或者负责人姓名': '',
                '性别': '',
                '证件号码1': '',
                '证件号码2': '',
                '主要违法事实': '',
                '相关法律依据及税务处理处罚情况 ': '',
                'date': ny[:4] + '/' + ny[4:],
                'uid': cid,
            }

            # Set name marked "to change" by the original author.
            if r_myco15.sismember('n09', cid):
                print('已存在,>>>n09')
                continue

            detail = r2(ny, cid, summary)
            print(detail)
            records.append(detail)
            new_ids.append(cid)

        # records and new_ids always grow together, so one check suffices.
        if records:
            myco9.insert_many(records)
            myco9_b.insert_many(records)
            for seen_id in new_ids:
                # Set name marked "to change" by the original author.
                r_myco15.sadd('n09', seen_id)
        return '1'
    except Exception as exc:
        # Still best-effort (any failure ends paging), but the cause is now
        # reported and KeyboardInterrupt/SystemExit are no longer swallowed.
        print('r1 failed on page', pg, '->', repr(exc))
        return '2'
- # myco9.insert_many(list1)
- # break
- # print(dict1)
- # print(cid)
def runs(ny1,ny2):
    """Crawl the list pages of one month, stopping at the first failure.

    ``ny1`` and ``ny2`` are stringified and concatenated into the month key
    passed to ``r1``. Pages 1 through 99 are requested in order, and the
    loop ends early as soon as ``r1`` returns ``"2"``.
    """
    month_key = "".join((str(ny1), str(ny2)))
    page_limit = 100
    page = 1
    while page < page_limit:
        print(page,'===================')
        outcome = r1(month_key, page)
        if outcome == "2":
            return
        page += 1
- runs('2023','11')  # module-level call: starts the crawl for year '2023', month '11' whenever this file is executed or imported
|