123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- #!/usr/bin/env python
- # coding:utf-8
- import re
- import requests,json
- from setting import proxies
- from urllib import parse
- from lxml import etree
- from mongo_cho import myco23,r_myco15,myco23_b
- from rety import retry
# Shared HTTP session reused by every request in this module.
r = requests.Session()
# NOTE(review): presumably meant to disable connection reuse; requests does
# not document a ``keep_alive`` attribute on Session -- confirm this has any
# effect before relying on it.
r.keep_alive = False
def _strip_all_whitespace(text):
    """Return *text* stripped, with every CR, tab, LF and space removed."""
    return text.strip().translate(str.maketrans('', '', '\r\t\n '))


@retry(3)
def r1_d(uid, dt):
    """Fetch one case detail and return it as a flat dict.

    Posts ``uid`` as the ``id`` form field, then reads every row of the
    ``div2-table3`` table: the text of the row's first ``<th>`` becomes the
    key and the text of its first ``<td>`` (or ``''`` when the row has no
    ``<td>``) becomes the value. Rows without a ``<th>`` are skipped.

    Args:
        uid: Case identifier sent to the server; also stored under ``'url'``.
        dt: Period label stored unchanged under ``'date'``.

    Returns:
        dict mapping the cleaned header text to the cleaned cell text, plus
        the ``'url'`` and ``'date'`` entries.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx response (left to the ``retry`` decorator to handle).
    """
    url = 'http://hainan.chinatax.gov.cn/weifaCase/weifa_case_list.htm'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "id": uid
    }
    # Without a timeout a stalled proxy would block forever and the retry
    # decorator would never get a chance to run again.
    response = r.post(url=url, headers=headers, data=data, proxies=proxies,
                      timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page into an
    # (almost) empty record.
    response.raise_for_status()
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//table[@class="div2-table3"]//tr'):
        header_cells = row.xpath('th')
        if not header_cells:
            continue
        key = _strip_all_whitespace(header_cells[0].xpath('string(.)'))
        value_cells = row.xpath('td')
        if value_cells:
            value = _strip_all_whitespace(value_cells[0].xpath('string(.)'))
        else:
            value = ''
        record[key] = value

    # The 'url' field holds the case id, not an actual URL.
    record['url'] = uid
    record['date'] = dt
    return record
- # r1_d('1')
@retry(3)
def r1(ny1, ny2, pg):
    """Scrape one page of the case list and store every not-yet-seen case.

    Posts the search form for the period ``ny1``-``ny2`` to page ``pg``,
    extracts the case ids from the ``onclick`` handlers of the ``<input>``
    elements, and for each id that is not already a member of the Redis set
    ``'n23'`` fetches the detail record via ``r1_d``. New records are
    inserted into ``myco23`` and ``myco23_b``, and only afterwards are their
    ids added to the Redis set.

    Args:
        ny1: Year part of the period (used as ``str(ny1)``).
        ny2: Month part of the period (used as ``str(ny2)``).
        pg: Page number placed in the ``pageNo`` query parameter.

    Returns:
        int: the total result count parsed from the page, or 0 when the
        counter is not present in the HTML.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx response (left to the ``retry`` decorator to handle).
    """
    url = 'http://hainan.chinatax.gov.cn/weifaCase/weifa_case_list.htm?pageNo={}'.format(pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    dt = str(ny1) + '/' + str(ny2)
    data = {
        "area": "",
        "ajinformation": "",
        "startDate": str(ny1) + '-' + str(ny2),
        "month": "1",
        "nsrname": "",
        "nsridentify": "",
        "regaddress": "",
        "organization": "",
        "legal": "",
        "legalId": "",
        "finance": "",
        "financeId": "",
    }
    # A timeout keeps a dead proxy from hanging the crawl indefinitely.
    response = r.post(url=url, headers=headers, data=data, proxies=proxies,
                      timeout=30)
    response.raise_for_status()
    html = response.text

    count_match = re.findall(r'共<em>(.*?)</em>条', html)
    rpg = int(count_match[0]) if count_match else 0

    selector = etree.HTML(html)
    new_records = []
    # Ids of the records in new_records, kept in lockstep. The original
    # never filled this list, so the backup insert and the Redis
    # bookkeeping below were dead code and every run re-scraped everything.
    new_uids = []
    for handler in selector.xpath('//input/@onclick'):
        # Ignore inputs whose handler is not a case-detail link (e.g. form
        # buttons); they would otherwise be treated as case ids.
        if 'weifaCaseDetail(' not in handler:
            continue
        uid = handler.replace('weifaCaseDetail(', '').replace(')', '')
        if r_myco15.sismember('n23', uid):  # changed
            continue
        rsd = r1_d(uid, dt)
        print(rsd)
        # NOTE(review): r1_d itself always returns a non-empty dict; this
        # guard only covers a retry wrapper that returns None after giving
        # up -- confirm against the `rety` implementation.
        if not rsd:
            continue
        new_records.append(rsd)
        new_uids.append(uid)

    if new_records:
        myco23.insert_many(new_records)
        print('已存入原始库')
    if new_uids:
        myco23_b.insert_many(new_records)
        print('已存入备份原始库')
        # Mark ids as seen only after both inserts succeeded, so a failed
        # insert is retried on the next run.
        for seen_uid in new_uids:
            r_myco15.sadd('n23', seen_uid)  # changed
    return rpg
- # if list1:
- # myco23.insert_many(list1)
def runs():
    """Crawl every result page for the hard-coded period 2023-11.

    Fetches page 1 via ``r1`` to learn the total result count, derives the
    number of pages from it, and then fetches pages 2..last.
    """
    ny1 = '2023'
    ny2 = '11'
    # Page size implied by the original ``rpg // 15`` computation.
    page_size = 15
    # NOTE(review): ``or 0`` covers a retry wrapper that returns None after
    # exhausting its attempts -- confirm against the `rety` implementation.
    rpg = r1(ny1, ny2, pg=1) or 0
    # Ceiling division: the original ``range(2, rpg // 15 + 1)`` skipped the
    # last, partially filled page (e.g. 16 results never fetched page 2).
    total_pages = -(-rpg // page_size)
    for pg in range(2, total_pages + 1):
        r1(ny1, ny2, pg)


# Script entry: the crawl starts as soon as the module is executed.
runs()
|