123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco17,r_myco15,myco17_b
- from rety import retry
# Shared HTTP session; r1_d() and r1() issue all of their GET requests through it.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented attribute of requests.Session,
# so this assignment probably has no effect on connection reuse -- confirm
# before relying on it (a "Connection: close" header is the usual approach).
r.keep_alive = False
# Reference URL noted by the original author:
# http://qinghai.chinatax.gov.cn/web/zdsswfsxaj/zdaj.shtml
# Translation table that deletes CR, TAB, LF and plain spaces from cell text.
_CELL_JUNK = str.maketrans('', '', '\r\t\n ')


def _cell_text(node):
    """Return the full text of ``node`` with CR/TAB/LF/space characters removed."""
    return node.xpath('string(.)').strip().translate(_CELL_JUNK)


@retry(3)
def r1_d(url):
    """Fetch one detail page and flatten its table rows into a dict.

    Every ``<tr>`` that has a first ``<td>`` contributes one entry: the
    cleaned text of ``td[1]`` is the key and the cleaned text of ``td[2]``
    (or ``''`` when there is no second cell) is the value.  Two extra keys
    are always set afterwards: ``'url'`` (the page address) and ``'date'``
    (the publish date taken from the page header).

    Args:
        url: Absolute URL of the detail page.

    Returns:
        dict mapping cleaned cell labels to cleaned cell values, plus
        ``'url'`` and ``'date'``.

    Raises:
        IndexError: The publish-date element was not found on the page.
        Exception: Whatever the HTTP session raises (including a timeout).
            NOTE(review): how ``retry(3)`` treats these is defined in the
            project's ``rety`` module -- confirm.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout a stalled proxy would block the whole crawl forever.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
    selector = etree.HTML(response.text)

    dt = selector.xpath('//*[@id="page-newContent"]/div[2]/div/div[1]/div/div[1]/div/span[1]/text()')
    if not dt:
        # Fail with the offending URL instead of an anonymous "list index
        # out of range"; the exception type is unchanged for callers.
        raise IndexError('publish date not found on page: {}'.format(url))
    dt1 = (
        dt[0]
        .replace('发布时间:', '')
        .replace('\r', '')
        .replace('\n', '')
        .replace(' ', '')
        .replace('-', '/')
    )
    # Drop the last five characters (presumably a trailing "HH:MM" -- confirm).
    dt2 = dt1[:-5]
    print(dt2)

    record = {}
    for row in selector.xpath('//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Rows without a first <td> carry no key and are ignored.
            continue
        value_cells = row.xpath('td[2]')
        record[_cell_text(key_cells[0])] = _cell_text(value_cells[0]) if value_cells else ''

    record['url'] = url
    record['date'] = dt2
    print(record)
    return record
@retry(3)
def r1(ny, pg):
    """Scrape one list page and store every detail page not seen before.

    The list page URL is built from ``ny`` and ``pg`` (page 1 has no numeric
    suffix).  Every ``<a href>`` on the page is turned into a detail URL;
    URLs already present in the Redis set ``'n17'`` are skipped, the rest
    are fetched with ``r1_d``.  New records are inserted into ``myco17`` and
    ``myco17_b``, and only afterwards are their URLs added to ``'n17'``.

    Args:
        ny: Value substituted into the ``{ny}nd`` path segment
            (callers in this file pass a year such as ``2021``).
        pg: 1-based list page number.

    Returns:
        ``'zz'`` when the response body contains ``"404 Not Found"``
        (used by callers as the "no more pages" signal); otherwise ``None``.
    """
    if pg == 1:
        url = 'http://qinghai.chinatax.gov.cn/web/{}nd/iframe.shtml'.format(ny)
    else:
        url = 'http://qinghai.chinatax.gov.cn/web/{ny}nd/iframe_{pg}.shtml'.format(ny=ny, pg=pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout a stalled proxy would block the whole crawl forever.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
    html = response.text
    if "404 Not Found" in html:
        print('zz')
        return 'zz'

    selector = etree.HTML(html)
    records = []
    new_urls = []
    seen = set()  # detail URLs already handled on this page
    for href in selector.xpath('//a/@href'):
        print(href)
        detail_url = 'http://qinghai.chinatax.gov.cn' + href
        if detail_url in seen:
            # The same link repeated on one page would otherwise be scraped
            # and inserted twice, because Redis is only updated at the end.
            continue
        seen.add(detail_url)
        if r_myco15.sismember('n17', detail_url):
            print('已存在,>>>n17')
            continue
        record = r1_d(detail_url)
        if not record:
            # A falsy result cannot be stored; skipping it keeps one failed
            # detail page from making insert_many() reject the whole batch.
            # The URL is not marked as done, so a later run will retry it.
            print('skipped, no data:', detail_url)
            continue
        records.append(record)
        new_urls.append(detail_url)

    if records:
        myco17.insert_many(records)
        print('已存入原始库')
        myco17_b.insert_many(records)
        print('已存入备份原始库')
        # Mark URLs as done only after both inserts went through.
        for done_url in new_urls:
            r_myco15.sadd('n17', done_url)
def runs(ny, last_page=99):
    """Crawl list pages 1..``last_page`` for ``ny``, stopping at the first missing page.

    Args:
        ny: Passed through to ``r1`` as its first argument.
        last_page: Highest page number to try (inclusive).  Defaults to 99,
            the limit that was previously hard-coded.

    Returns:
        None.
    """
    for pg in range(1, last_page + 1):
        print(pg, '===========')
        # r1 returns 'zz' when the list page does not exist.
        if r1(ny, pg) == "zz":
            break
# Script entry: runs at import time, exactly as the original top-level code did.
# First walk the 2021 list pages until r1 reports a missing page.
runs(2021)

# Then explicitly process list page 54 of 2021 (a one-element page range).
pg = 54
print(pg, '===========')
r1(2021, pg)
|