12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json,time
- from setting import proxies
- # from urllib import parse
- # from pymongo import MongoClient
- # myclient = MongoClient("mongodb://127.0.0.1:27017/")
- # myco2 = myclient['shuiwu06']['02_nb']
- # myco2_b = myclient['shuiwu06']['02_nb']
- from lxml import etree
- from mongo_cho import myco2,myco2_b
- r = requests.session()  # module-level HTTP session; NOTE(review): r1 below calls requests.get directly, so `r` is unused in the code visible here
- r.keep_alive = False  # NOTE(review): keep_alive is not a documented Session attribute, so this assignment presumably has no effect - confirm before relying on it
- #########见30的
- # http://ningbo.chinatax.gov.cn/col/col6300/index.html
def _cell_text(cell):
    """Return the text content of a table cell with all layout whitespace removed."""
    text = cell.xpath('string(.)').strip()
    for junk in ('\r', '\t', '\n', ' '):
        text = text.replace(junk, '')
    return text


def r1(year, mon, day):
    """Scrape one article page into a key/value record and store it.

    The page URL is built from ``year``/``mon``/``day``. Every table row under
    ``<div id="zoom">`` contributes one entry: the cleaned text of its first
    ``<td>`` is the key and the cleaned text of its second ``<td>`` (or ``''``
    when the row has none) is the value. The record also gets ``url`` and
    ``date`` (``"<year>/<two-digit month>"``) entries, is printed, and is
    inserted into ``myco2_b``.

    Args:
        year: Year segment of the article URL, e.g. ``'2023'``.
        mon: Month segment of the article URL, not zero-padded, e.g. ``'4'``.
        day: Day segment of the article URL (str or int).

    Returns:
        None. Pages that do not answer with HTTP 200 or have an empty body are
        skipped without inserting anything.
    """
    # The day segment must come from ``day``; it used to be filled with ``mon``,
    # which made every call request the same URL.
    url = 'http://ningbo.chinatax.gov.cn/art/{year}/{mon}/{day}/art_6166_7114.html'.format(
        year=year, mon=mon, day=day)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # A timeout keeps one unresponsive request from hanging the whole run.
    response = requests.get(url=url, headers=headers, proxies=proxies, timeout=30)
    if response.status_code != 200 or not response.content:
        # Nothing to parse; do not store a record that only holds url/date.
        print('skip {} (status {})'.format(url, response.status_code))
        return

    # Parse the raw bytes (not the Response object) so lxml can pick up the
    # charset declared inside the document itself.
    selector = etree.HTML(response.content)

    record = {}
    for row in selector.xpath('//div[@id="zoom"]//table//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Rows without any <td> carry no key, so there is nothing to record.
            continue
        value_cells = row.xpath('td[2]')
        record[_cell_text(key_cells[0])] = _cell_text(value_cells[0]) if value_cells else ''

    record['url'] = url
    # Zero-pad the month so '4' is stored as '04', the value that was
    # previously hard-coded here.
    record['date'] = '{}/{}'.format(year, str(mon).zfill(2))
    print(record)
    # NOTE(review): myco2_b is imported from mongo_cho; presumably a pymongo
    # collection (the commented-out setup in this file suggests so) - confirm.
    myco2_b.insert_one(record)
# Call r1 once for every day of April 2023. April has 30 days, so the range
# must end at 31 (exclusive); the previous bound of 30 never tried April 30.
for day in range(1, 31):
    r1('2023', '4', day)
|