#!/usr/bin/env python
# coding:utf-8
import json
import re
from urllib import parse

import requests
from lxml import etree

from rety import retry
from setting import proxies
from mongo_cho import myco3, r_myco15, myco3_b

r = requests.session()
# NOTE(review): requests.Session documents no `keep_alive` attribute, so this
# assignment probably has no effect on connection reuse -- confirm.
r.keep_alive = False

# Seconds to wait for the server before giving up; without a timeout a stalled
# proxy or server would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

_DETAIL_URL = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwidquery'
_LIST_URL = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwquery'
_PERIODS_URL = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/ndjd.jsp?orgCode=11100000000'

# User-Agent shared by the list and detail requests.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Deletes exactly the characters the cell cleaning removes: space, CR, LF, tab.
_CELL_WHITESPACE = str.maketrans('', '', ' \r\n\t')


def _query_form(cid, ny, dqy):
    """Build the form payload shared by the list and detail endpoints.

    Key order is kept stable because it determines the order of the fields
    in the encoded request body.
    """
    return {
        "id": cid,
        "dq": "",
        "ajlx": "",
        "ndjd": ny,
        "bz": "ndjd",
        "dqy": dqy,
        "ymdx": "",
        "nsrmc": "",
        "nsrsbh": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrmc": "",
        "fddbrsfzhm": "",
        "cwfzrmc": "",
        "cwfzrsfzhm": "",
        "orgCode": "11100000000",
    }


def r1_d(cid, ny):
    """Fetch the detail page for one record and return it as a flat dict.

    Every table row with at least two text cells contributes one entry
    (first cell -> key, second cell -> value, whitespace removed). Two extra
    keys are always set: ``uid`` (the record id) and ``date`` (``ny`` with
    '年度' turned into '/', and '月' and spaces removed, e.g. '2024/1').

    This function is not wrapped in ``retry`` itself; request or parsing
    errors propagate to the caller.

    Args:
        cid: Record id sent as the ``id`` form field.
        ny: Period label sent as the ``ndjd`` form field.

    Returns:
        dict mapping cleaned cell labels to cleaned cell values, plus
        ``uid`` and ``date``.
    """
    # The detail endpoint is always queried with dqy "2", as a string.
    response = r.post(
        url=_DETAIL_URL,
        headers=_HEADERS,
        data=_query_form(cid, ny, "2"),
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    selector = etree.HTML(response.text)
    rows = selector.xpath('/html/body/table/tbody/tr/td/table/tbody//tr')

    record = {}
    for row in rows:
        try:
            key = row.xpath('td[1]/text()')[0].translate(_CELL_WHITESPACE)
            value = row.xpath('td[2]/text()')[0].translate(_CELL_WHITESPACE)
        except IndexError:
            # Rows without two text cells (spacers, headers) carry no data.
            continue
        record[key] = value

    record['uid'] = cid
    record['date'] = ny.replace('年度', '/').replace('月', '').replace(' ', '')
    return record


@retry(3)
def r1(ny, dqy):
    """Crawl one result page of the listing and store the unseen records.

    For every row exposing an ``onclick`` handler in its fifth cell, the
    first single-quoted token of that handler is taken as the record id.
    Ids already present in the 'n03' set of ``r_myco15`` are skipped; for
    the rest the detail page is fetched via ``r1_d``. New records are
    inserted into both ``myco3`` and ``myco3_b``, and only then are their
    ids added to the 'n03' set.

    Args:
        ny: Period label sent as the ``ndjd`` form field.
        dqy: Page number to request.

    Returns:
        int: total page count parsed from the page text, or 0 when the page
        count marker is absent.

    Raises:
        ValueError: if the text captured as page count is not an integer.
        IndexError: if an ``onclick`` handler contains no quoted id.
    """
    response = r.post(
        url=_LIST_URL,
        headers=_HEADERS,
        data=_query_form('', ny, dqy),
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    html = response.text

    page_counts = re.findall(r'果(.*?)页', html)
    tpg = page_counts[0].replace(' ', '') if page_counts else 0
    print(tpg)

    selector = etree.HTML(html)
    rows = selector.xpath('/html/body/table/tbody/tr/td/table[2]/tbody//tr')

    new_records = []
    new_ids = []
    for row in rows:
        onclick = row.xpath('td[5]/input/@onclick')
        if not onclick:
            continue
        cd1 = re.findall(r"'(.*?)'", onclick[0])[0]
        print(cd1)
        if r_myco15.sismember('n03', cd1):
            print('已存在,>>>n03')
            continue
        new_records.append(r1_d(cd1, ny))
        new_ids.append(cd1)

    # Records and ids are appended together, so one emptiness check covers both.
    if new_records:
        myco3.insert_many(new_records)
        myco3_b.insert_many(new_records)
        # Mark ids as seen only after both inserts succeeded.
        for seen_id in new_ids:
            r_myco15.sadd('n03', seen_id)
    return int(tpg)


@retry(3)
def get_ny(year=2021):
    """Return the last month listed for ``year`` on the period index page.

    The page is scanned for ``cx('<year>年度<month> 月')`` calls; spaces are
    removed from each captured month and the last one in page order is
    returned.

    Args:
        year: Year whose months are looked up. Defaults to 2021, the value
            that used to be hard-coded in the pattern.

    Returns:
        str: the last captured month with spaces removed.

    Raises:
        IndexError: if the page lists no month for ``year``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
    }
    response = r.get(
        url=_PERIODS_URL,
        headers=headers,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    html = response.text

    pattern = r"cx\('" + re.escape(str(year)) + r"年度(.*?) 月'\)"
    months = [month.replace(' ', '') for month in re.findall(pattern, html)]
    if not months:
        # Same exception type the empty-list lookup used to raise, with context.
        raise IndexError('no month entries found for year %s' % year)

    eny = months[-1]
    print(eny, '>>>from n03_ah***')
    return eny


def runs(year, month):
    """Crawl every result page of the listing for one year/month period.

    Page 1 is fetched first to learn the total page count; pages 2..N are
    then fetched in order.

    Args:
        year: Year of the period, e.g. 2024.
        month: Month of the period, e.g. 1.
    """
    # NOTE(review): one- and two-digit months share this label format (one
    # space before '月'); confirm the site does not expect different padding.
    Year_Month = str(year) + "年度" + str(month) + " 月"

    print(year, month, 1, '页=================')
    tpg = r1(Year_Month, 1)
    if tpg > 1:
        for page in range(2, tpg + 1):
            print(year, month, page, '页=================')
            r1(Year_Month, page)


# NOTE(review): this runs on import as well as when executed as a script;
# consider an `if __name__ == "__main__":` guard if other modules import
# from this file.
runs(2024, 1)