123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco15,r_myco15,myco15_b
- from rety import retry
# One HTTP session shared by every request this module makes.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented requests.Session attribute,
# so this assignment probably has no effect on connection reuse -- confirm.
# It is kept so the module-level state stays exactly as before.
r.keep_alive = False
# Deletes exactly the four characters the scraper has always stripped
# (space, CR, LF, tab); other whitespace is deliberately left alone.
_R1_D_STRIP = str.maketrans('', '', ' \r\n\t')


def _r1_d_clean(fragments):
    """Join text fragments and remove spaces, CR, LF and tabs."""
    return ''.join(fragments).translate(_R1_D_STRIP)


@retry(3)
def r1_d(url, dt, timeout=30):
    """Fetch one detail page and flatten its ``zdwf`` table into a dict.

    Every ``<tr>`` under ``<table class="zdwf">`` contributes one entry:
    the first text node of the first ``<td>`` is the key and all text
    nodes of the second ``<td>`` concatenated are the value. Spaces,
    CR, LF and tabs are removed from both. Rows whose key is missing or
    empty after cleaning are skipped.

    Args:
        url: Detail page address, e.g.
            ``http://jilin.chinatax.gov.cn/art/2021/3/3/art_19972_7390.html``.
        dt: Value stored unchanged under the ``'date'`` key.
        timeout: Seconds handed to ``requests`` so a stalled proxy or
            server raises instead of blocking forever. Optional; existing
            two-argument calls are unaffected.

    Returns:
        dict: The table's key/value pairs plus ``'url'`` and ``'date'``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    # Decode as UTF-8 regardless of what the response headers declare.
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//table[@class="zdwf"]//tr'):
        # [:1] keeps "first text node only" while tolerating rows that
        # have no text in their first <td> (previously an IndexError
        # that failed the whole page).
        key = _r1_d_clean(row.xpath('td[1]/text()')[:1])
        if not key:
            continue
        record[key] = _r1_d_clean(row.xpath('td[2]/text()'))

    record['url'] = url
    record['date'] = dt
    return record
- # r1_d('1','2')
@retry(3)
def r1(ny, dt, pg, timeout=30):
    """Fetch one page of search results and store every unseen detail record.

    For each result link, the detail page is scraped with ``r1_d`` unless
    its URL is already a member of the ``'n15'`` set in ``r_myco15``. New
    records are inserted into ``myco15`` and then ``myco15_b``; only after
    both inserts are their URLs added to ``'n15'``.

    Args:
        ny: Period value sent as both ``field_1138_large`` and
            ``field_1138_small`` (``runs`` passes year + two-digit month).
        dt: Value forwarded to ``r1_d`` and stored as each record's date.
        pg: Page number sent as ``currpage``.
        timeout: Seconds handed to ``requests`` so a stalled proxy or
            server raises instead of blocking forever. Optional; existing
            three-argument calls are unaffected.

    Returns:
        str: The text found between ``共`` and ``页`` in the response
        (the total page count) with spaces removed.

    Raises:
        IndexError: If the response contains no ``共…页`` pager text.
    """
    url = 'http://jilin.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    params = {
        "field_1136_large": "",
        "field_1136_small": "",
        "field_1137_large": "",
        "field_1137_small": "",
        "field_1138_large": ny,
        "field_1138_small": ny,
        "field_1113": "",
        "field_1114": "",
        "field_1115": "",
        "field_1116": "",
        "field_1117": "",
        "field_1120": "",
        "field_1123": "",
        "field_1126": "",
        "strSelectID": "1113,1114,1115,1116,1117,1120,1123,1126,1136,1137,1138",
        "i_columnid": "19972",
        "field": "field_1113:1:1,field_1114:1:1,field_1115:1:1,field_1116:1:1,field_1117:1:1,field_1120:1:1,field_1123:1:1,field_1126:1:1,field_1136:0:1,field_1137:0:1,field_1138:0:1",
        "initKind": "FieldForm",
        "type": "1,1,1,1,1,1,1,1,1,1,1",
        "currentplace": "",
        "currpage": pg,
        "splitflag": "",
        "fullpath": "0",
    }
    response = r.get(url=url, headers=headers, params=params,
                     proxies=proxies, timeout=timeout)
    html = response.text

    pager = re.findall('共(.*?)页', html)
    if not pager:
        # Same exception type the bare [0] used to raise, so anything that
        # already reacts to it behaves as before -- but now it says why.
        raise IndexError(
            'no pager text found in search response (ny=%s, pg=%s)' % (ny, pg)
        )
    tpg = pager[0].replace(' ', '')

    selector = etree.HTML(html)
    new_docs = []
    new_urls = []
    for href in selector.xpath('//a[@class="xxxx"]/@href'):
        # Built exactly as before: this string is the dedup key in the
        # 'n15' set, so normalising it would make stored URLs look new.
        url1 = 'http://jilin.chinatax.gov.cn/' + href.replace('../..', '')
        # 'n15' is this scraper's dedup set key (the original comment
        # marked it as the value to change per scraper).
        if r_myco15.sismember('n15', url1):
            print('已存在,>>>n15')
            continue
        rsd = r1_d(url1, dt)
        print(rsd)
        if not rsd:
            # NOTE(review): the retry decorator's behaviour after its last
            # attempt is not visible here; if it yields a falsy value, skip
            # the link so it is neither inserted nor marked as seen.
            continue
        new_urls.append(url1)
        new_docs.append(rsd)

    if new_docs:
        myco15.insert_many(new_docs)
        print('已存入原始库')
        # new_docs and new_urls always grow together, so one guard covers
        # the primary insert, the backup insert and the dedup update.
        myco15_b.insert_many(new_docs)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts have succeeded.
        for seen_url in new_urls:
            r_myco15.sadd('n15', seen_url)
    return tpg
- # ny= '202001'
- # pg = '1'
- # dt = '0'
def runs(ny1, ny2):
    """Scrape every result page for one year/month.

    Args:
        ny1: Year (anything ``str()`` renders as the year text).
        ny2: Month; a one-character value gets a leading ``'0'``.

    The first page is fetched with ``r1`` to learn the total page count,
    then pages 2..total are fetched in order.
    """
    month = str(ny2)
    if len(month) == 1:
        month = '0' + month
    year = str(ny1)

    ny = year + month
    dt = year + '/' + month
    print(ny, dt, '======')

    last_page = int(r1(ny, dt, pg=1))
    # range() is empty when last_page <= 1, so no explicit guard is needed.
    for page in range(2, last_page + 1):
        r1(ny, dt, page)
# Script entry: scrape a single period, given as year and month strings.
ny1, ny2 = '2023', '10'
runs(ny1, ny2)
|