123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco18,myco18_b,r_myco15
- from rety import retry
# One shared session used by every request in this module.
r = requests.session()
# NOTE(review): ``keep_alive`` is not a documented requests.Session setting,
# so this assignment on its own does not stop connection reuse. It is kept
# only in case other code reads the attribute.
r.keep_alive = False
# Send "Connection: close" with every request so connections are not kept
# alive between calls, which is what the assignment above was meant to do.
r.headers["Connection"] = "close"
# Translation table deleting exactly the characters removed from cell text:
# space, carriage return, tab and newline.
_CELL_NOISE = str.maketrans("", "", " \r\t\n")


def _cell_text(node):
    """Return the string value of *node*, stripped, with ' ', CR, TAB, LF removed."""
    return node.xpath('string(.)').strip().translate(_CELL_NOISE)


@retry(3)
def r1_d(url, dt):
    """Fetch one detail page and return its ``zhongdatable`` table as a dict.

    The i-th ``<th>`` of the table is used as the key for the i-th ``<td>``;
    surplus ``<td>`` cells are ignored.  Two bookkeeping entries are added
    afterwards: ``'url'`` (the page address) and ``'date'`` (``dt`` verbatim).

    The function is wrapped by ``retry(3)`` from the project's ``rety``
    module; its exact retry behaviour is not visible from this file.

    Args:
        url: Address of the detail page.
        dt: Value stored unchanged under the ``'date'`` key.

    Returns:
        dict mapping header text to cell text, plus ``'url'`` and ``'date'``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        IndexError: The table has fewer ``<td>`` cells than ``<th>`` cells.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Bounded wait: without a timeout a stalled proxy blocks the crawl forever.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
    # Raise on error statuses so an error page is not parsed into a record
    # that contains nothing but 'url' and 'date'.
    response.raise_for_status()
    response.encoding = 'UTF-8'
    tree = etree.HTML(response.text)

    labels = [_cell_text(th) for th in tree.xpath('//table[@class="zhongdatable"]//th')]
    values = [_cell_text(td) for td in tree.xpath('//table[@class="zhongdatable"]//td')]

    # Positional pairing needs a value for every label.  Same exception type
    # as the former index loop, but with a message that names the page.
    if len(values) < len(labels):
        raise IndexError(
            'zhongdatable has {} th but only {} td: {}'.format(
                len(labels), len(values), url
            )
        )

    record = dict(zip(labels, values))
    # Metadata is written last, so it wins over a header with the same name.
    record['url'] = url
    record['date'] = dt
    return record
- # print(len(b))
- # for i1 in b:
- # print(i1.replace(' ','').replace('\r','').replace('\t','').replace('\n',''))
- # print(response.text)
- # r1_d()
@retry(3)
def r1(pg, dt):
    """Crawl one listing page and store every detail record not seen before.

    Posts the (empty) search form for page ``pg``, collects the
    ``<a istitle="true">`` links and, for each link that is not yet a member
    of the ``'n18'`` set in ``r_myco15``, fetches the detail record through
    ``r1_d``.  New records are inserted into ``myco18`` and ``myco18_b``, and
    only then are their URLs added to the ``'n18'`` set.

    The function is wrapped by ``retry(3)`` from the project's ``rety``
    module; its exact retry behaviour is not visible from this file.

    Args:
        pg: Page number substituted into the ``currentPage`` query parameter.
        dt: Passed unchanged to ``r1_d`` for every detail page.

    Returns:
        None.

    Raises:
        requests.HTTPError: The listing request answered with a 4xx/5xx status.
    """
    site_root = 'https://henan.chinatax.gov.cn'
    # The parameter used to read "¤tPage": "&curren" had been decoded as an
    # HTML entity, so the request carried no "currentPage" parameter at all.
    url = (
        'https://henan.chinatax.gov.cn/eportal/ui'
        '?pageId=bdfef9dfa679454c86d68f2203a69e84'
        '&currentPage={}'
        '&moduleId=143e1aeaa3b6405ea0fe04142c021d5b'
        '&staticRequest=yes'
    ).format(pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Every search filter is submitted empty, i.e. no filtering is requested.
    data = dict.fromkeys(
        (
            "filter_LIKE_EXT_STR6",
            "filter_LIKE_main.TITLE",
            "filter_LIKE_EXT_STR2",
            "filter_LIKE_EXT_STR4",
            "filter_LIKE_EXT_STR3",
            "filter_LIKE_EXT_STR8",
            "filter_LIKE_EXT_STR19",
            "filter_LIKE_EXT_STR10",
            "filter_LIKE_EXT_STR23",
        ),
        "",
    )
    # Bounded wait so a stalled proxy cannot hang the crawl.
    response = r.post(url=url, headers=headers, data=data, proxies=proxies, timeout=30)
    # Do not parse an error page as if it were an (empty) listing.
    response.raise_for_status()

    anchors = etree.HTML(response.text).xpath('//a[@istitle="true"]')
    print(anchors)

    new_records = []
    new_urls = []
    for anchor in anchors:
        hrefs = anchor.xpath('@href')
        if not hrefs:
            # An anchor without href has no detail page to fetch.
            continue
        # urljoin gives the same result as concatenation for root-relative
        # hrefs and also copes with absolute or non-root-relative ones.
        url1 = parse.urljoin(site_root, hrefs[0])
        print(url1)
        # 'n18' is the Redis-style set of already stored URLs
        # (original note on this key: "change").
        if r_myco15.sismember('n18', url1):
            print('已存在,>>>n18')
            continue
        rsd = r1_d(url1, dt)
        print(rsd)
        if not rsd:
            # NOTE(review): rety.retry is not visible here; if it returns
            # None once retries are exhausted, skip the link so insert_many
            # never receives a non-document.  The URL stays unmarked and is
            # retried on the next run.
            continue
        new_records.append(rsd)
        new_urls.append(url1)

    # Both lists always grow together, so one guard covers all three steps
    # (the backup insert used to test the URL list but insert the records).
    if new_records:
        myco18.insert_many(new_records)
        print('已存入原始库')
        myco18_b.insert_many(new_records)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts went through.
        for stored_url in new_urls:
            r_myco15.sadd('n18', stored_url)
- # if list1:
- # myco18.insert_many(list1)
def runs(dt='2023', pages=range(1, 2)):
    """Crawl the given listing pages one after another.

    Called without arguments this behaves as before: ``dt`` is ``'2023'``
    and only page 1 is crawled.

    Args:
        dt: Value handed to ``r1`` for every page.
        pages: Iterable of page numbers to crawl, in order.
    """
    for pg in pages:
        print(pg, '========')
        r1(pg, dt)
# Start the crawl only when executed as a script, so importing this module
# (e.g. to reuse r1 / r1_d) does not trigger network and database activity.
if __name__ == '__main__':
    runs()
|