123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco31,r_myco15,myco31_b
- from rety import retry
# Shared HTTP session used for every request issued by this module.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented attribute of requests.Session,
# so this assignment probably does not disable connection reuse -- confirm
# whether a "Connection: close" header was the real intent.
r.keep_alive = False
# Pairs of (attribute read from each <span class="mxxx">, key it is stored under).
# The order here is the field order of the documents written to MongoDB.
_R1_FIELDS = (
    ('data-nsrmc', '纳税人名称'),
    ('data-nsrsbh', '纳税人识别号'),
    ('data-zzjgdm', '组织机构代码'),
    ('data-zcjydz', '注册地址'),
    ('data-fddbrxm', '姓名'),
    ('data-fddbrxb', '性别'),
    ('data-fddbrzjmc', '证件名称'),
    ('data-fddbrzjhm', '证件号码'),
    ('data-ajlxmc', '案件性质'),
    ('data-zywfss', '主要违法事实'),
    ('data-clqk', '相关法律依据及税务处理处罚情况'),
)


@retry(3)
def r1(ny1, ny2, pg, timeout=30):
    """Fetch one result page from the Tianjin tax site and store its records.

    Posts the query form, reads the ``data-*`` attributes of every
    ``<span class="mxxx">`` element in the response and bulk-inserts the
    resulting documents into ``myco31``. Each document is also printed.

    Args:
        ny1: First part of the ``nfjd`` form field (the callers pass a year).
        ny2: Second part of ``nfjd``; it is sent as ``'0' + str(ny2)``.
        pg: Value of the ``page`` form field.
        timeout: Seconds to wait for the server before the request fails.
            Without a timeout a stalled server or proxy would block forever.

    Raises:
        IndexError: If the response carries no ``pageCount='...'`` marker or
            a span lacks one of the expected ``data-*`` attributes.
    """
    url = 'http://tianjin.chinatax.gov.cn/wzcx/sjcx_cxqyxx.action'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "szsf": "11200000000",
        "nfjd": str(ny1) + '0' + str(ny2),
        "page": pg,
        "pageCount": "15",
    }
    response = r.post(url=url, headers=headers, data=data,
                      proxies=proxies, timeout=timeout)
    html = response.text

    # The page count itself is not used, but a response without the marker is
    # rejected with IndexError, exactly as the former ``findall(...)[0]`` did.
    # NOTE(review): presumably this lets @retry re-attempt a bad response --
    # confirm against the retry decorator before removing the check.
    compact = html.translate(str.maketrans('', '', ' \r\t\n'))
    if not re.search(r"pageCount='(.*?)'", compact):
        raise IndexError('pageCount marker not found in response')

    dt = str(ny1) + '/0' + str(ny2)
    records = []
    for span in etree.HTML(html).xpath('//span[@class="mxxx"]'):
        # Indexing [0] raises IndexError when an attribute is absent, so an
        # incomplete page is never partially inserted.
        record = {key: span.xpath('@' + attr)[0] for attr, key in _R1_FIELDS}
        record['date'] = dt
        print(record)
        records.append(record)

    # insert_many is skipped for an empty page, as in the original code.
    if records:
        myco31.insert_many(records)
# Source page: http://tianjin.chinatax.gov.cn/wzcx/cx_zdwfaj.action?szsf=11200000000
# These records cannot be de-duplicated, so they are updated separately:
# check the page count and the quarter before each run.
def runs(ny1=2023, ny2=5, pages=range(1, 2)):
    """Scrape the given result pages for one reporting period.

    Args:
        ny1: Year forwarded to ``r1``.
        ny2: Period forwarded to ``r1``. Per the original note the middle
            argument is the quarter and must be increased by one quarter
            before every update run.
        pages: Iterable of page numbers to fetch; defaults to page 1 only.
    """
    for pg in pages:
        print(pg, '==========')
        r1(ny1, ny2, pg)


# The script previously ended with a bare ``r1()`` call, which cannot succeed
# because r1 requires three arguments; run the page loop instead, and only
# when the file is executed as a script.
if __name__ == '__main__':
    runs()
|