123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco10,r_myco15,myco10_b
- from rety import retry
# Shared HTTP session used by every request in this module.
r = requests.Session()
# Kept so the attribute still exists for anything that reads it; on its own
# this assignment does not change how the session handles connections.
r.keep_alive = False
# Actually request a non-persistent connection: ask the server to close the
# connection after each response instead of keeping it alive.
r.headers["Connection"] = "close"
@retry(3)
def r1_d(cid, dt, timeout=30):
    """Fetch the detail page of one record and return its table as a dict.

    POSTs ``manuscriptId=cid`` to the detail endpoint and walks every
    ``<tr>`` of the response. The first text node of the first cell is the
    key and the first text node of the second cell is the value; spaces,
    tabs, CR and LF are removed from both. Rows without a first-cell text
    node, or whose key is empty after cleaning, are skipped. A row without
    a second-cell text node is stored with the value ``''``.

    Args:
        cid: Record id sent as ``manuscriptId``; stored under ``"uid"``.
        dt: Value stored unchanged under ``"date"``.
        timeout: Seconds passed to the HTTP call as its timeout, so a
            stalled connection raises instead of blocking forever.

    Returns:
        dict: cleaned key/value pairs from the table plus ``"uid"`` and
        ``"date"``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    url = 'http://guangdong.chinatax.gov.cn/siteapps/webpage/gdtax/zdsswfaj/service.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "manuscriptId": cid,
    }
    response = r.post(
        url=url,
        headers=headers,
        data=data,
        proxies=proxies,
        timeout=timeout,
    )
    # Fail loudly on an error page rather than parsing it into a record
    # that contains nothing but "uid" and "date".
    response.raise_for_status()
    selector = etree.HTML(response.text)

    # One pass that deletes the same four characters the cells are
    # cleaned of: space, CR, tab, LF.
    drop_ws = str.maketrans('', '', ' \r\t\n')

    record = {}
    for row in selector.xpath('//tr'):
        key_nodes = row.xpath('td[1]/text()')
        if not key_nodes:
            continue
        key = key_nodes[0].translate(drop_ws)
        if not key:
            continue
        value_nodes = row.xpath('td[2]/text()')
        record[key] = value_nodes[0].translate(drop_ws) if value_nodes else ''

    record["uid"] = cid
    record['date'] = dt
    print(record)
    return record
- # r1_d('42da48b512b046d488189ce36a833fa8','9')
@retry(3)
def r1(ny1, ny2, pg, timeout=30):
    """Crawl one listing page, store unseen records, return the page count.

    POSTs the query form for period ``"{ny1}_{ny2}"`` and page ``pg``, pulls
    the first single-quoted argument out of every ``<a onclick=...>`` and
    treats it as a record id. Ids that are not yet members of the ``'n10'``
    set in ``r_myco15`` are fetched with ``r1_d``. The fetched documents are
    inserted into ``myco10`` and ``myco10_b``, and only afterwards are their
    ids added to the ``'n10'`` set.

    Args:
        ny1: First part of the ``yf`` period value; also the left side of
            the date label passed to ``r1_d``.
        ny2: Second part of the ``yf`` period value; ``ny2 + 1`` is the
            right side of the date label, so it must support ``+ 1``.
        pg: Page number sent as ``pageNo``.
        timeout: Seconds passed to the HTTP call as its timeout, so a
            stalled connection raises instead of blocking forever.

    Returns:
        str: the page count captured from the response text, with spaces
        removed.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        IndexError: if the page-count marker is absent from the response.
    """
    url = 'http://guangdong.chinatax.gov.cn/siteapps/webpage/gdtax/zdsswfaj/query.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "yf": "{ny1}_{ny2}".format(ny1=ny1, ny2=ny2),
        "pageSize": "20",
        "pageNo": pg,
        "channelId": "",
        "taxNature": "",
        "quarter": "",
        "nsr_mc": "",
        "nsr_sbh": "",
        "fddbr_xm": "",
        "zcdz": "",
        "zzjgdm": "",
        "fddbrzjhm": "",
        "cwfzrxm": "",
        "cwfzrzjhm": "",
    }
    response = r.post(
        url=url,
        headers=headers,
        data=data,
        proxies=proxies,
        timeout=timeout,
    )
    # An error page carries neither links nor a page count; surface the
    # HTTP status instead of failing later on the missing marker.
    response.raise_for_status()
    html = response.text
    selector = etree.HTML(html)
    # Deliberately unguarded: a response without the marker raises
    # IndexError, exactly as before.
    rpg = re.findall('共(.*?)页', html)[0].replace(' ', '')

    # Same label for every record on the page, so build it once.
    dt = str(ny1) + '/' + str(ny2 + 1)

    records = []
    new_ids = []
    seen_on_page = set()
    for onclick in selector.xpath('//a/@onclick'):
        quoted = re.findall("'(.*?)'", onclick)
        if not quoted:
            continue
        cid = quoted[0]
        # '#pageIndex' is the argument of the pager links, not a record id.
        if cid == '#pageIndex':
            continue
        # The same id can appear in more than one link of a page; fetching
        # it again would store a duplicate document.
        if cid in seen_on_page:
            continue
        seen_on_page.add(cid)

        # Set key 'n10' was flagged by the original author as a value to
        # change (presumably when adapting this script to another source).
        if r_myco15.sismember('n10', cid):
            print('已存在,>>>n10')
            continue

        print(cid)
        detail = r1_d(cid, dt)
        if not detail:
            # NOTE(review): only reachable if the retry wrapper returns a
            # falsy value on failure -- confirm. The id is left unmarked so
            # a later run can pick it up again.
            continue
        records.append(detail)
        new_ids.append(cid)

    if records:
        myco10.insert_many(records)
        print('已存入原始库')
        myco10_b.insert_many(records)
        print('已存入备份原始库')
        # Mark ids as seen only after both inserts succeeded.
        for cid in new_ids:
            r_myco15.sadd('n10', cid)
    return rpg
def runs(ny1, ny2):
    """Crawl every listing page for the period ``ny1`` / ``ny2``.

    Page 1 is requested first; its return value is the total page count.
    Pages 2 through that count are then requested in order.

    Args:
        ny1: Forwarded to ``r1`` as its first argument.
        ny2: Forwarded to ``r1`` as its second argument.
    """
    page_total = r1(ny1, ny2, pg=1)
    print(ny1, ny2)
    last_page = int(page_total)
    # The range is empty when there is only one page (or none), so no
    # separate "more than one page" check is needed.
    for page_no in range(2, last_page + 1):
        print(page_no, '==============')
        r1(ny1, ny2, page_no)
# Start the crawl only when this file is executed as a script, so that
# importing the module does not fire network requests and database writes.
if __name__ == "__main__":
    runs(2023, 11)
|