1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- #!/usr/bin/env python
- # coding:utf-8
- import requests,json,re
- from rety import retry
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco5
import urllib3

# Shared HTTP session reused by every request made from this module.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented requests.Session attribute,
# so this assignment probably has no effect on connection reuse -- confirm.
r.keep_alive = False

# Requests in this file are sent with verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@retry(3)
def r1_d(url, dt):
    """Download one detail page and return its table rows as a dict.

    Every ``<tr>`` found under a ``<tbody>`` contributes one entry: the
    stripped text of its first ``<td>`` is the key, and the stripped text of
    its second ``<td>`` -- with spaces, carriage returns, tabs and newlines
    removed -- is the value. Rows that do not have both cells are skipped.

    Args:
        url: Address of the detail page, e.g.
            'https://sichuan.chinatax.gov.cn/art/2021/3/23/art_15873_10537.html'.
        dt: Value stored unchanged under the ``'date'`` key of the result.

    Returns:
        dict: The key/value pairs parsed from the table, plus ``'url'`` and
        ``'date'`` entries.

    NOTE(review): wrapped in ``@retry(3)`` from the project's ``rety`` module;
    presumably failures are retried -- confirm its exact semantics.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Certificate verification is disabled for this request (verify=False).
    response = r.get(url=url, headers=headers, verify=False, proxies=proxies)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//tbody//tr'):
        key_cells = row.xpath('td[1]')
        value_cells = row.xpath('td[2]')
        if not key_cells or not value_cells:
            # Skip only this row. The previous blanket try/except around the
            # whole loop dropped every remaining row as soon as one row
            # lacked a cell, and hid any other error as well.
            continue
        key = key_cells[0].xpath('string(.)').strip()
        value = value_cells[0].xpath('string(.)').strip()
        for unwanted in (' ', '\r', '\t', '\n'):
            value = value.replace(unwanted, '')
        record[key] = value

    print(record)
    record['url'] = url
    record['date'] = dt
    return record
- # r1_d()
@retry(3)
def r1(icid, cpg, dt):
    """Scrape one page of the search listing and store its detail records.

    Requests the search page for column ``icid`` at page ``cpg``, takes the
    link in the fifth cell of each table row, fetches every linked detail
    page through ``r1_d`` and inserts the resulting dicts into ``myco5`` in
    one batch.

    Args:
        icid: Column id sent as the ``i_columnid`` query parameter.
        cpg: Page number sent as the ``currpage`` query parameter.
        dt: Date tag forwarded to ``r1_d`` for every record.

    Returns:
        None.

    NOTE(review): wrapped in ``@retry(3)`` from the project's ``rety`` module;
    presumably a failure re-runs the whole page -- confirm its semantics.
    """
    url = 'https://sichuan.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "vc_name": "",
        "field_439": "",
        "field_440": "",
        "field_441": "",
        "field_442": "",
        "field_443": "",
        "strSelectID": "390,439,440,441,442,443",
        "i_columnid": icid,
        "field": "vc_name:1:0,field_439:1:0,field_440:1:0,field_441:1:0,field_442:1:0,field_443:1:0",
        "currpage": cpg,
    }
    # Certificate verification is disabled for this request (verify=False).
    response = r.get(url=url, headers=headers, params=params, verify=False, proxies=proxies)
    selector = etree.HTML(response.text)

    records = []
    for href in selector.xpath('//tr/td[5]/a/@href'):
        # Listing links are relative ('../../art/...'); anchor them to the host.
        detail_url = href.replace('../..', 'https://sichuan.chinatax.gov.cn')
        print(detail_url)
        records.append(r1_d(detail_url, dt))

    # A page with no result links yields an empty batch. Assuming myco5 is a
    # pymongo collection (confirm), insert_many rejects an empty list, which
    # would make an empty page fail and be retried for nothing.
    if records:
        myco5.insert_many(records)
- # 'https://sichuan.chinatax.gov.cn'
- # https://sichuan.chinatax.gov.cn/col/col15873/index.html
## Quarterly update: compare against the page count from the previous run.
def runs(icid='15873', tpg=9, dt='2023/10'):
    """Scrape every listing page of one column, from page 1 to ``tpg``.

    The defaults reproduce the values that were previously hard-coded, so a
    bare ``runs()`` behaves exactly as before; pass new values for a later
    update instead of editing the function body.

    Args:
        icid: Column id forwarded to ``r1`` (original note: "quarter id").
        tpg: Total number of listing pages to walk (original note: "how many
            pages in total").
        dt: Date tag forwarded to ``r1`` and stored with every record
            (original note: "time limit").

    Returns:
        None.
    """
    for page in range(1, tpg + 1):
        print(page, '==================')
        r1(icid, page, dt)


# Executed unconditionally, so the scrape starts whenever this file is run
# or imported.
runs()
|