1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495 |
- #!/usr/bin/env python
- # coding:utf-8
- import re,time
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco21,myco21_b,r_myco15
- from rety import retry
# Module-wide HTTP session shared by every request issued below.
r = requests.Session()
# NOTE(review): `keep_alive` is not a documented attribute of requests.Session,
# so this assignment probably does not disable connection reuse -- confirm
# before relying on it.
r.keep_alive = False
@retry(3)
def r2(dt, uid, timeout=30):
    """Fetch the detail page for one record and flatten its table into a dict.

    Calls the ``getdetail.do`` endpoint with ``uid`` as the ``iid`` query
    parameter and walks every ``<tr>`` of the table whose class is
    ``xxTable``. For each row that has header text, the concatenated
    ``<th>`` text becomes the key and the concatenated ``<td>`` text (an
    empty string when the row has none) becomes the value.

    Args:
        dt: Value stored unchanged under the ``'date'`` key of the result.
        uid: Record identifier; sent as ``iid`` and stored under ``'uid'``.
        timeout: Seconds to wait on the server before requests gives up.
            Without it a stalled connection would block the crawl forever.

    Returns:
        dict: One entry per table row with header text, plus ``'date'`` and
        ``'uid'``.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout (what the surrounding ``retry`` decorator does with it is
            defined elsewhere).
    """
    url = 'http://jiangxi.chinatax.gov.cn/taxmap/front/getdetail.do'
    # NOTE(review): the User-Agent value lacks its closing parenthesis; it is
    # kept byte-for-byte because the server has only been exercised with it.
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    }
    response = r.get(
        url=url,
        params={"iid": uid},
        headers=headers,
        proxies=proxies,
        timeout=timeout,
    )
    # Decode explicitly as UTF-8 instead of relying on requests' detection.
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    # NOTE(review): the HTTP status is not checked, so an error page without
    # the table yields a dict holding only 'date' and 'uid'.
    record = {}
    for row in selector.xpath('//table[@class="xxTable"]//tr'):
        header_parts = row.xpath('th/text()')
        if not header_parts:
            # Rows without header text carry no key and are skipped.
            continue
        # NOTE(review): nesting was reconstructed from a listing that had lost
        # its indentation; a header row with no <td> text is stored as ''.
        record[''.join(header_parts)] = ''.join(row.xpath('td/text()'))
    record['date'] = dt
    record['uid'] = uid
    return record
- # r2()
@retry(3)
def r1(pg, timeout=30):
    """Crawl one listing page and store every record not stored before.

    Requests ``result2.do`` for page ``pg``, extracts the argument of every
    ``getDetail(...)`` occurrence in the response, and for each id that is
    not yet a member of the ``'n21'`` set in ``r_myco15`` fetches the detail
    dict through ``r2``. The collected dicts are inserted into ``myco21`` and
    then ``myco21_b``, after which the ids are added to the ``'n21'`` set.

    Args:
        pg: Page number sent as the ``pageno`` query parameter.
        timeout: Seconds to wait on the server before requests gives up.
            Without it a stalled connection would block the crawl forever.

    Returns:
        None.
    """
    # Stamp handed to r2 and stored with every record of this run.
    dt = '2021/05'
    url = 'http://jiangxi.chinatax.gov.cn/taxmap/front/result2.do'
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
    }
    params = {
        "region": "",
        "nature": "",
        "year": "",
        "pageno": pg,
        # Current time in milliseconds (presumably a cache-buster -- confirm).
        "_": int(round(time.time() * 1000)),
    }
    response = r.get(
        url=url,
        headers=headers,
        params=params,
        proxies=proxies,
        timeout=timeout,
    )
    # Raw string: "\(" inside a normal literal is an invalid escape sequence
    # that newer Python versions warn about; the pattern itself is unchanged.
    found = re.findall(r'getDetail\((.*?)\)', response.text)

    docs = []
    new_uids = []
    queued = set()
    for uid in found:
        # The literal 'iid' is excluded -- presumably it matches the script's
        # own function definition rather than a record id.
        if uid == 'iid':
            continue
        print(uid)
        # The Redis set is only updated after the inserts, so an id repeated
        # within this page must be caught here to avoid a duplicate record.
        if uid in queued:
            continue
        # NOTE: this key was tagged "change" in the original -- presumably it
        # has to be adjusted when the script is adapted to another source.
        if r_myco15.sismember('n21', uid):
            print('已存在,>>>n21')
            continue
        docs.append(r2(dt, uid))
        new_uids.append(uid)
        queued.add(uid)

    # docs and new_uids are always filled together, so one guard covers both.
    if docs:
        myco21.insert_many(docs)
        print('已存入原始库')
        myco21_b.insert_many(docs)
        print('已存入备份原始库')
        # Mark ids as done only after both inserts went through.
        for uid in new_uids:
            # NOTE: same "change" tag as on the membership check above.
            r_myco15.sadd('n21', uid)
- # pg = '1'
def runs():
    """Process listing pages 1 and 2 in order.

    For each page, prints the page number followed by a separator and then
    passes the page number to ``r1``.
    """
    banner = '==============================='
    page_no = 1
    while page_no < 3:
        print(page_no, banner)
        r1(page_no)
        page_no += 1


# Start the crawl as soon as this file is loaded.
# NOTE(review): there is no ``if __name__ == "__main__"`` guard, so importing
# the module triggers the crawl as well.
runs()
|