#!/usr/bin/env python
# coding:utf-8
"""Fetch one page of records from the Tianjin tax bureau query endpoint.

Each ``<span class="mxxx">`` element in the response is turned into a
document and the page's documents are bulk-inserted into ``myco31``.

Source page:
http://tianjin.chinatax.gov.cn/wzcx/cx_zdwfaj.action?szsf=11200000000

This data cannot be de-duplicated, so it is updated separately: check the
page range and the quarter in ``runs`` before every run.
"""
import json
import re
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco31, r_myco15, myco31_b
from rety import retry
from setting import proxies

# One shared session for all requests made by this script.
r = requests.session()
# NOTE(review): requests.Session does not document a ``keep_alive``
# attribute, so this assignment may have no effect -- confirm.
r.keep_alive = False

# (document key, span data-attribute) pairs, in the order the keys are
# written into each document.
_FIELD_ATTRS = (
    ('纳税人名称', 'data-nsrmc'),
    ('纳税人识别号', 'data-nsrsbh'),
    ('组织机构代码', 'data-zzjgdm'),
    ('注册地址', 'data-zcjydz'),
    ('姓名', 'data-fddbrxm'),
    ('性别', 'data-fddbrxb'),
    ('证件名称', 'data-fddbrzjmc'),
    ('证件号码', 'data-fddbrzjhm'),
    ('案件性质', 'data-ajlxmc'),
    ('主要违法事实', 'data-zywfss'),
    ('相关法律依据及税务处理处罚情况', 'data-clqk'),
)


# NOTE(review): retry semantics live in ``rety``; presumably the call is
# re-attempted when it raises -- confirm.
@retry(3)
def r1(ny1, ny2, pg):
    """Download page ``pg`` of one period and store its records in ``myco31``.

    Args:
        ny1: Year part of the period; sent as ``str(ny1) + '0' + str(ny2)``
            in the ``nfjd`` form field.
        ny2: Second part of the period (the quarter, per the note in
            ``runs``).
        pg: Page number sent in the ``page`` form field.

    Raises:
        IndexError: If the response contains no ``pageCount='...'`` marker
            or a record span lacks one of the expected ``data-*`` attributes.
    """
    url = 'http://tianjin.chinatax.gov.cn/wzcx/sjcx_cxqyxx.action'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "szsf": "11200000000",
        "nfjd": str(ny1) + '0' + str(ny2),
        "page": pg,
        "pageCount": "15",
    }
    response = r.post(url=url, headers=headers, data=data, proxies=proxies)
    html = response.text

    # Strip whitespace so the pageCount marker matches regardless of layout.
    compact = html.replace(' ', '').replace('\r', '').replace('\t', '').replace('\n', '')
    # The value itself is not used, but the lookup is kept on purpose: it
    # raises IndexError for a response without a pageCount marker, exactly
    # as the original code did.
    page_count = re.findall(r"pageCount='(.*?)'", compact)[0]

    period = str(ny1) + '/0' + str(ny2)
    records = []
    for span in etree.HTML(html).xpath('//span[@class="mxxx"]'):
        record = {}
        for key, attr in _FIELD_ATTRS:
            # [0] raises IndexError when the attribute is missing.
            record[key] = span.xpath('@' + attr)[0]
        record['date'] = period
        print(record)
        records.append(record)

    # insert_many is skipped for a page without records.
    if records:
        myco31.insert_many(records)


def runs():
    """Fetch every page in the configured range for the configured period."""
    for pg in range(1, 2):
        print(pg, '==========')
        # The middle argument is the quarter; be sure to advance it by one
        # quarter before each update.
        r1(2023, 5, pg)


if __name__ == '__main__':
    # The original called r1() with no arguments, which cannot succeed
    # because r1 requires (ny1, ny2, pg); runs() is the intended entry point.
    runs()