123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- from rety import retry
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco3,r_myco15,myco3_b
# One HTTP session shared by every request function in this module.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented attribute of requests'
# Session, so this assignment may have no effect on connection reuse.
# Confirm before relying on it.
r.keep_alive = False
# Translation table deleting exactly the characters the scraper strips from
# table cells: space, carriage return, line feed and tab.
_WS_DELETE_TABLE = str.maketrans('', '', ' \r\n\t')


def _strip_ws(text):
    """Return *text* with space, CR, LF and tab characters removed."""
    return text.translate(_WS_DELETE_TABLE)


@retry(3)
def r1_d(cid, ny):
    """Fetch the detail page of one case and return it as a flat dict.

    Posts the query form to the ``wwidquery`` endpoint, then reads every
    table row of the response: the text of the first cell becomes a key and
    the text of the second cell becomes its value (both with whitespace
    stripped). Rows lacking text in either cell are skipped.

    Args:
        cid: Case identifier, sent as the ``id`` form field and stored in
            the result under ``'uid'``.
        ny: Period label sent as the ``ndjd`` form field (``runs`` builds it
            as ``"<year>年度<month> 月"``). A compacted form of it is stored
            under ``'date'``.

    Returns:
        dict: The scraped key/value pairs plus ``'uid'`` and ``'date'``.

    Raises:
        Whatever ``r.post`` or the HTML parser raises; how ``@retry(3)``
        treats those exceptions depends on the project's ``rety`` helper.
    """
    url = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwidquery'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "id": cid,
        "dq": "",
        "ajlx": "",
        "ndjd": ny,
        "bz": "ndjd",
        "dqy": "2",
        "ymdx": "",
        "nsrmc": "",
        "nsrsbh": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrmc": "",
        "fddbrsfzhm": "",
        "cwfzrmc": "",
        "cwfzrsfzhm": "",
        "orgCode": "11100000000",
    }
    # Without a timeout requests waits indefinitely on a stalled server or
    # proxy; with one, the failure surfaces as an exception instead.
    response = r.post(url=url, headers=headers, data=data, proxies=proxies,
                      timeout=30)
    selector = etree.HTML(response.text)
    rows = selector.xpath('/html/body/table/tbody/tr/td/table/tbody//tr')

    record = {}
    for row in rows:
        key_cells = row.xpath('td[1]/text()')
        value_cells = row.xpath('td[2]/text()')
        # Layout/spacer rows have no text in one of the two cells; skip them
        # explicitly rather than swallowing every exception.
        if not key_cells or not value_cells:
            continue
        record[_strip_ws(key_cells[0])] = _strip_ws(value_cells[0])

    record['uid'] = cid
    record['date'] = ny.replace('年度', '/').replace('月', '').replace(' ', '')
    return record
- # r3_d()
@retry(3)
def r1(ny, dqy):
    """Scrape one result-list page and persist the cases not seen before.

    Posts the list query to the ``wwquery`` endpoint, extracts the case id
    from each row's ``onclick`` attribute, and for every id that is not yet a
    member of the Redis set ``'n03'`` fetches its detail record via
    ``r1_d``. New records are inserted into ``myco3`` and ``myco3_b`` and
    their ids are then added to the ``'n03'`` set.

    Args:
        ny: Period label sent as the ``ndjd`` form field.
        dqy: Page number to request, sent as the ``dqy`` form field.

    Returns:
        int: The page count parsed from the text between ``果`` and ``页`` in
        the response, or 0 when that text is absent.

    Raises:
        ValueError: If the captured page-count text is not an integer.
        IndexError: If a row's ``onclick`` value contains no quoted argument.
    """
    url = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/wwquery'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "id": '',
        "dq": "",
        "ajlx": "",
        "ndjd": ny,
        "bz": "ndjd",
        "dqy": dqy,
        "ymdx": "",
        "nsrmc": "",
        "nsrsbh": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrmc": "",
        "fddbrsfzhm": "",
        "cwfzrmc": "",
        "cwfzrsfzhm": "",
        "orgCode": "11100000000",
    }
    # Without a timeout requests waits indefinitely on a stalled server or
    # proxy; with one, the failure surfaces as an exception instead.
    response = r.post(url=url, headers=headers, data=data, proxies=proxies,
                      timeout=30)
    html = response.text

    page_hits = re.findall(r'果(.*?)页', html)
    tpg = page_hits[0].replace(' ', '') if page_hits else 0
    print(tpg)

    selector = etree.HTML(html)
    rows = selector.xpath('/html/body/table/tbody/tr/td/table[2]/tbody//tr')

    new_records = []
    new_ids = []
    for row in rows:
        onclick = row.xpath('td[5]/input/@onclick')
        if not onclick:
            # Header or filler row without a detail button.
            continue
        # The case id is the first single-quoted argument of the handler.
        cd1 = re.findall(r"'(.*?)'", onclick[0])[0]
        print(cd1)
        if r_myco15.sismember('n03', cd1):
            print('已存在,>>>n03')
            continue
        new_records.append(r1_d(cd1, ny))
        new_ids.append(cd1)

    # Records and ids are collected in lockstep, so one emptiness check
    # covers both. Ids are marked as seen only after both inserts succeed.
    if new_records:
        myco3.insert_many(new_records)
        myco3_b.insert_many(new_records)
        for case_id in new_ids:
            r_myco15.sadd('n03', case_id)
    return int(tpg)
@retry(3)
def get_ny(year=2021):
    """Return the last month listed for *year* on the period index page.

    Downloads ``ndjd.jsp`` and collects every ``cx('<year>年度<month> 月')``
    occurrence; the month of the final occurrence, with spaces removed, is
    printed and returned.

    Args:
        year: Year whose months are searched for. Defaults to 2021, the
            value that used to be hard-coded in the pattern.

    Returns:
        str: The month text of the last match.

    Raises:
        IndexError: If the page contains no entry for *year*.
    """
    url = 'http://beijing.chinatax.gov.cn/bjsat/office/jsp/zdsswfaj/ndjd.jsp?orgCode=11100000000'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout requests waits indefinitely on a stalled server or
    # proxy; with one, the failure surfaces as an exception instead.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)

    pattern = r"cx\('" + re.escape(str(year)) + r"年度(.*?) 月'\)"
    months = [hit.replace(' ', '') for hit in re.findall(pattern, response.text)]
    if not months:
        # Same exception type as the former bare ``list1[-1]``, but with a
        # message that says what was missing.
        raise IndexError('no month entries found for year %s' % year)

    eny = months[-1]
    print(eny,'>>>from n03_ah***')
    return eny
- # get_ny()
def runs(ny1, ny2):
    """Scrape every result page for one year/month period.

    Builds the period label ``"<ny1>年度<ny2> 月"``, requests page 1 through
    ``r1`` to learn the total page count, then requests pages 2..N.

    Args:
        ny1: Year part of the period label.
        ny2: Month part of the period label.
    """
    # NOTE(review): the former one-digit and two-digit month branches built
    # the same string, so they are merged here. If the site expects different
    # padding for single-digit months, restore the distinction.
    ny = str(ny1) + "年度" + str(ny2) + " 月"

    print(ny1, ny2, 1, '页=================')
    total_pages = r1(ny, 1)
    # range() is empty when there is only one page, so no extra check is
    # needed before the loop.
    for page in range(2, total_pages + 1):
        print(ny1, ny2, page, '页=================')
        r1(ny, page)
# Script entry: scrape the December 2021 period as soon as the module runs.
runs(ny1=2021, ny2=12)
|