123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco28,r_myco15,myco28_b
- from rety import retry
# One HTTP session shared by every request made in this module.
r = requests.Session()
# NOTE(review): requests' Session does not appear to read a `keep_alive`
# attribute, so this assignment likely has no effect on connection reuse;
# sending a "Connection: close" header is the usual way to disable it.
# Left as-is pending confirmation.
r.keep_alive = False
@retry(3)
def r1_d(url, dt):
    """Fetch one article page and return its detail table as a dict.

    Every row of the ``<table class="color">`` element that has a first cell
    contributes one entry: the text of the first cell is the key and the text
    of the second cell (or ``''`` when the row has no second cell) is the
    value. Both texts are stripped and then have carriage returns, tabs,
    newlines and spaces removed.

    The function is wrapped by the project's ``retry(3)`` decorator
    (presumably re-invoking it on failure -- confirm in ``rety``).

    Args:
        url: Address of the article page, for example
            ``http://ningxia.chinatax.gov.cn/art/2021/3/3/art_14329_8626.html``.
        dt: Value stored unchanged under the ``'date'`` key.

    Returns:
        dict: The table entries plus the ``'url'`` and ``'date'`` keys.
    """

    def _squash(cell):
        # Flatten the cell to its text content, trim it, then drop every
        # CR, TAB, LF and plain space that remains inside the text.
        text = cell.xpath('string(.)').strip()
        return ''.join(ch for ch in text if ch not in '\r\t\n ')

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    response.encoding = 'UTF-8'
    rows = etree.HTML(response.text).xpath('//table[@class="color"]//tr')

    record = {}
    for row in rows:
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Rows without any <td> (e.g. header rows) carry no key.
            continue
        key = _squash(key_cells[0])
        value_cells = row.xpath('td[2]')
        if value_cells:
            value = _squash(value_cells[0])
        else:
            value = ''
        record[key] = value

    record['url'] = url
    record['date'] = dt
    return record
- # r1_d('1')
@retry(3)
def r1(ny, pg):
    """Crawl one page of search results and store the articles not yet seen.

    Requests the search endpoint with ``ny`` as the column id and ``pg`` as
    the page number, then walks every ``<a href>`` whose value contains
    ``"art"``. For each article URL that is not a member of the Redis set
    ``'n28'`` the detail table is fetched with ``r1_d``. The new documents
    are inserted into ``myco28`` and ``myco28_b``, and only afterwards are
    their URLs added to the Redis set.

    A URL that occurs several times on the same page is processed once.
    Because the Redis set is updated only after the inserts, a repeated link
    would otherwise pass the membership test each time and be fetched and
    inserted again.

    Args:
        ny: Value sent as the ``i_columnid`` query parameter.
        pg: Value sent as the ``currpage`` query parameter.

    Returns:
        str: The first value captured by ``共 (.*?) 页`` in the response text
        (the page count as printed on the page).

    Raises:
        IndexError: If the response text contains no page count, or if an
            "art" href has fewer than six ``/``-separated parts.
    """
    url = 'http://ningxia.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "vc_name": "",
        "field_147": "",
        "field_149": "",
        "field_148": "",
        "field_150": "",
        "field_151": "",
        "strSelectID": "104,147,148,149,150,151",
        "i_columnid": ny,  # original author's note: 202103
        "currpage": pg,
        "field": "field_148:1,field_149:1,vc_name:1,field_147:1,field_150:1,field_151:1",
        "initKind": "FieldForm",
        "currentplace": "",
        "splitflag": "",
        "fullpath": "0",
    }
    response = r.get(url=url, headers=headers, params=params, proxies=proxies)
    html = response.text
    rpg = re.findall(r'共 (.*?) 页', html)
    selector = etree.HTML(html)

    list1 = []    # detail documents fetched from this page
    list2 = []    # URLs of those documents, in the same order
    seen = set()  # article URLs already handled on this page
    for i in selector.xpath('//a/@href'):
        if "art" not in i:
            continue
        url1 = 'http://ningxia.chinatax.gov.cn' + i.replace('../..', '')
        if url1 in seen:
            # Same article linked twice on the page: handle it only once.
            continue
        seen.add(url1)
        print(url1)
        # Parts 3, 4 and 5 of the href are joined into the stored date
        # (for '../../art/2021/3/3/...' this gives '2021/3/3').
        dt1 = i.split('/')
        dt = dt1[3] + '/' + dt1[4] + '/' + dt1[5]
        print(dt)
        # NOTE(review): the original marks this line "change" -- presumably
        # the Redis key is edited per crawler; confirm.
        utf = r_myco15.sismember('n28', url1)
        if not utf:
            rsd = r1_d(url1, dt)
            print(rsd)
            list1.append(rsd)
            list2.append(url1)
        else:
            print('已存在,>>>n28')

    # list1 and list2 always have the same length, so one guard covers both
    # stores and the Redis bookkeeping.
    if list1:
        myco28.insert_many(list1)
        print('已存入原始库')
        myco28_b.insert_many(list1)
        print('已存入备份原始库')
        # Mark the URLs as seen only after both inserts succeeded.
        for mis in list2:
            # NOTE(review): same "change" marker as on the sismember call.
            r_myco15.sadd('n28', mis)
    return rpg[0]
- # if list1:
- # myco28.insert_many(list1)
def get_pg(ny, year_dl_id='niandu3'):
    """Look up the column id of a month's listing on the index page.

    Downloads the column index page, scans the anchors found under
    ``<dl id=year_dl_id>`` / ``<dt class="open">`` and returns the id taken
    from the href of the first anchor whose first text node equals
    ``'<ny>月'``.

    Anchors without a direct text node are skipped. Indexing their empty
    ``text()`` result would raise ``IndexError`` and abort the scan before
    the wanted month is reached.

    Args:
        ny: Month number; it is formatted into ``'{}月'`` and compared with
            the anchor text.
        year_dl_id: ``id`` attribute of the ``<dl>`` element to search. The
            default ``'niandu3'`` corresponds to the year 2021 according to
            the original author's note.

    Returns:
        str | None: The text captured by ``col/col(.*?)/i`` in the matching
        href, or ``None`` when no anchor matches the month.

    Raises:
        IndexError: If the matching anchor has no ``href`` attribute or its
            href does not contain a ``col/col<id>/i`` segment.
    """
    url = 'http://ningxia.chinatax.gov.cn/col/col14330/index.html'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies)
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    wanted = '{}月'.format(ny)
    anchors = selector.xpath(
        '//dl[@id="{}"]//dt[@class="open"]//a'.format(year_dl_id)
    )
    for anchor in anchors:
        labels = anchor.xpath('text()')
        if not labels or labels[0] != wanted:
            continue
        href = anchor.xpath('@href')[0]
        print(href)
        h1 = re.findall(r'col/col(.*?)/i', href)
        return h1[0]
    # No anchor carried the requested month label.
    return None
- # yf = get_pg(3)
- # print(yf)
def runs(ny):
    """Crawl every result page of the listing for month ``ny``.

    Resolves the month to a column id with ``get_pg``, crawls page 1 with
    ``r1`` to learn the page count, then crawls pages 2..N.

    Every page is requested with the column id ``yf``. The follow-up pages
    used to be requested with the month number ``ny`` instead, so they
    queried a different ``i_columnid`` than page 1.

    Args:
        ny: Month number handed to ``get_pg``.
    """
    yf = get_pg(ny)
    # NOTE(review): get_pg yields None when no month link matches; consider
    # failing fast here instead of searching without a column id.
    rpg = r1(yf, pg=1)
    for pg in range(2, int(rpg) + 1):
        print(pg, '=============')
        r1(yf, pg)
# Script entry: start the crawl for month 1 as soon as the file is executed.
runs(ny=1)
|