#!/usr/bin/env python
# coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco22,r_myco15,myco22_b
- from rety import retry
# Shared HTTP session used for every request made by this module.
r = requests.session()
# requests.Session has no ``keep_alive`` switch, so this assignment alone has
# no effect; it is kept only so existing readers of the attribute still work.
r.keep_alive = False
# Sending ``Connection: close`` is what actually asks the server not to keep
# the connection open for reuse.
r.headers['Connection'] = 'close'
import urllib3
# Requests in this module are sent with verify=False; silence the warning
# urllib3 would otherwise emit for each unverified HTTPS request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@retry(3)
def r1_d(url, dt, timeout=30):
    """Download one detail page and flatten its table rows into a dict.

    Every ``<tr>`` of the page is read as a key/value pair: the text nodes
    under ``td[1]/div`` are concatenated into the key and the text nodes
    under ``td[2]/div`` into the value (an empty string when there are none).
    The finished record is printed before it is returned.

    Args:
        url: Address of the detail page; also stored under ``'url'``.
        dt: Stored unchanged under the ``'date'`` key of the record.
        timeout: Seconds ``requests`` waits on the server before raising.

    Returns:
        dict: The scraped key/value pairs plus the ``'url'`` and ``'date'``
        entries.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout requests waits indefinitely on a stalled connection,
    # so the call would never fail and could never be retried.
    response = r.get(
        url=url,
        headers=headers,
        proxies=proxies,
        verify=False,
        timeout=timeout,
    )
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//tr'):
        key = ''.join(row.xpath('td[1]/div/text()'))
        if not key:
            # NOTE(review): rows without key text are skipped so the record
            # never gets an empty field name. The original indentation was
            # lost, so confirm this matches the intended nesting.
            continue
        record[key] = ''.join(row.xpath('td[2]/div/text()'))

    record['url'] = url
    record['date'] = dt
    print(record)
    return record
@retry(3)
def r1(ny1, ny2, timeout=30):
    """Crawl the bulletin listing for one period and store the new records.

    Posts the search form to the listing endpoint, extracts every
    single-quoted ``href`` from the response, and for each link that is not
    yet a member of the ``'n22'`` set fetches the detail record with
    ``r1_d``. New records are inserted into ``myco22`` and ``myco22_b``, and
    only afterwards are their links added to the ``'n22'`` set.

    Args:
        ny1: Value formatted into the ``year`` form field (followed by the
            percent-encoded form of "年度").
        ny2: Value formatted into the ``searhvalue`` form field (followed by
            the percent-encoded form of "月").
        timeout: Seconds ``requests`` waits on the server before raising.
    """
    url = 'https://yunnan.chinatax.gov.cn/bulletin/ajaxdata.jsp?startrecord=1&endrecord=8&perpage=11&rowpage=1'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # NOTE(review): these values are already percent-encoded and requests
    # form-encodes dict data again, so each '%' goes out as '%25'. Confirm the
    # endpoint really expects the double-encoded form before changing this.
    data = {
        "searhvalue": "{}%E6%9C%88".format(ny2),
        "searchkey": "jd1",
        "year": "{}%E5%B9%B4%E5%BA%A6".format(ny1),
    }
    # Without a timeout requests waits indefinitely on a stalled connection,
    # so the call would never fail and could never be retried.
    response = r.post(
        url=url,
        data=data,
        headers=headers,
        proxies=proxies,
        verify=False,
        timeout=timeout,
    )

    # The seen-set is only updated after the loop, so a link listed twice in
    # the same response would be fetched and inserted twice; drop repeats
    # while keeping first-seen order.
    links = dict.fromkeys(re.findall("href='(.*?)'", response.text))
    dt = str(ny1) + '/' + str(ny2)

    records = []
    new_links = []
    for link in links:
        print(link)
        # 'n22' is the set name; the original author tagged it "change".
        if r_myco15.sismember('n22', link):
            print('已存在,>>>n22')
            continue
        records.append(r1_d(link, dt))
        new_links.append(link)

    if records:
        myco22.insert_many(records)
        print('已存入原始库')
        myco22_b.insert_many(records)
        print('已存入备份原始库')
        # Mark links as seen only after both inserts succeeded, so a failed
        # insert leaves them eligible for the next run.
        for link in new_links:
            r_myco15.sadd('n22', link)
def runs(ny1, ny2):
    """Print a banner line for the given period, then crawl it with ``r1``.

    Args:
        ny1: First period value, forwarded unchanged to ``r1``.
        ny2: Second period value, forwarded unchanged to ``r1``.
    """
    banner = '========='
    print(ny1, ny2, banner)
    r1(ny1, ny2)
# Runs at module load: crawl the listing for the hard-coded period.
runs('2023', '5')