12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco27,r_myco15,myco27_b
- from rety import retry
- r = requests.session()  # module-level HTTP session; both r1 and r1_d issue their GETs through r.get
- r.keep_alive = False  # NOTE(review): presumably meant to disable connection reuse - confirm that requests.Session actually honours a `keep_alive` attribute
def r1_d(url, dt, timeout=30):
    """Fetch one detail page and flatten its table rows into a dict.

    Every ``<tr>`` found under any ``<table>`` contributes one entry: the text
    of its first ``<td>`` is the key and the text of its second ``<td>`` (or
    ``''`` when there is none) is the value.  Both texts are stripped and have
    all CR, LF, tab and space characters removed.  Rows that have no first
    cell, or whose key is empty after cleaning, are skipped.

    The request goes through the module-level session ``r`` using the
    module-level ``proxies`` setting.

    Args:
        url: Address of the detail page; also stored under the ``'url'`` key.
        dt: Date string supplied by the caller; stored under ``'date'``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict: cleaned key text -> cleaned value text, plus ``'url'`` and
        ``'date'``.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout a stalled proxy or server would block the crawl forever.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    # Fail loudly on an error status so that an error page is never parsed
    # into a (nearly empty) record and handed back to the caller for storage.
    response.raise_for_status()
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    # Single pass that deletes exactly the four characters the original
    # chained .replace() calls removed: CR, tab, LF and space.
    drop_blanks = str.maketrans('', '', '\r\t\n ')

    record = {}
    for row in selector.xpath('//table//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # No <td> in this row (e.g. a row made only of <th> cells):
            # there is nothing to use as a key, so do not touch the record.
            continue
        key = key_cells[0].xpath('string(.)').strip().translate(drop_blanks)
        if not key:
            continue

        value_cells = row.xpath('td[2]')
        if value_cells:
            value = value_cells[0].xpath('string(.)').strip().translate(drop_blanks)
        else:
            value = ''
        record[key] = value

    record['url'] = url
    record['date'] = dt
    print(record)
    return record
- # r1_d('1')
@retry(3)
def r1(list_url='http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj/sj/2023/', timeout=30):
    """Crawl one listing page and store every detail page not stored before.

    The listing HTML is scanned for ``href="..."`` values containing
    ``"html"``.  Each one is turned into a detail URL, a date is derived from
    the file name, and the URL is looked up in the Redis set ``'n27'``.  URLs
    not yet in the set are fetched with ``r1_d``, inserted into both
    ``myco27`` and ``myco27_b``, and then added to the set.

    Args:
        list_url: Listing page to crawl.  Defaults to the 2023 page that was
            previously hard-coded; detail links are still resolved against
            the fixed ``.../nsfw/sscx/zdaj`` prefix, so other values are
            expected to be sibling year pages under ``.../zdaj/sj/``.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response for the listing page (subject to whatever the
            ``retry`` decorator does with exceptions).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout a stalled proxy or server would block the crawl forever.
    response = r.get(url=list_url, headers=headers, proxies=proxies, timeout=timeout)
    # An error page contains no matching links, which would otherwise look
    # like a successful crawl that simply found nothing new.
    response.raise_for_status()
    response.encoding = 'UTF-8'
    html = response.text

    for href in re.findall(r'href="(.*?)"', html):
        if "html" not in href:
            continue

        # Links are written relative to the listing page ("../../<dir>/...");
        # dropping the "../.." part and prefixing the section root yields the
        # absolute address.  Kept exactly as before so that the URLs already
        # recorded in the Redis set keep matching.
        url1 = 'http://neimenggu.chinatax.gov.cn/nsfw/sscx/zdaj' + href.replace('../..', '')
        print(url1)

        # File names look like "t20210609_751387.html": characters 1-8 of the
        # part before the underscore are sliced into YYYY/MM/DD.
        stamp = url1.split('/')[-1].split('_')[0]
        dt = stamp[1:5] + '/' + stamp[5:7] + '/' + stamp[7:9]
        print(dt)

        if r_myco15.sismember('n27', url1):
            print('已存在,>>>n27')
            continue

        rsd = r1_d(url1, dt)
        myco27.insert_one(rsd)
        myco27_b.insert_one(rsd)
        # Mark the URL as done only after both inserts went through.
        r_myco15.sadd('n27', url1)
        print('存入主备库,>>>n27')
- # time.sleep(10)
- r1()  # entry call: starts one crawl pass when this file runs (no __main__ guard is visible, so presumably on import as well)
|