#!/usr/bin/env python
# coding:utf-8
"""Scrape case records from hainan.chinatax.gov.cn into MongoDB.

For a given year/month the script walks the paginated case list, fetches the
detail table of every case id not yet present in the Redis "seen" set, stores
the parsed records in the primary and backup Mongo collections, and then
marks the ids as seen.
"""
import json
import re
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco23, r_myco15, myco23_b
from rety import retry
from setting import proxies

r = requests.session()
r.keep_alive = False

# Endpoint used for both the list query and the detail lookup.
LIST_URL = 'http://hainan.chinatax.gov.cn/weifaCase/weifa_case_list.htm'

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Number of records per list page, as assumed by the pagination arithmetic.
PAGE_SIZE = 15

# Redis set holding ids that were already stored.  The original source flags
# this key as something to change (presumably per target site) - confirm.
SEEN_KEY = 'n23'

# Deletes CR, TAB, LF and plain spaces in a single pass.
_WS_TABLE = str.maketrans('', '', '\r\t\n ')


def _clean(node):
    """Return the text content of *node* with surrounding/inner blanks removed."""
    return node.xpath('string(.)').strip().translate(_WS_TABLE)


@retry(3)
def r1_d(uid, dt):
    """Fetch one case detail and return it as a flat dict.

    Posts ``uid`` as the ``id`` form field, then reads every row of the
    ``div2-table3`` table: the first ``<th>`` is the key and the first
    ``<td>`` (or ``''`` when absent) is the value.

    Args:
        uid: Case id as extracted from the list page.
        dt: Period label stored under the ``'date'`` key.

    Returns:
        dict of cleaned header -> cleaned value, plus ``'url'`` (the uid)
        and ``'date'`` (``dt``).
    """
    response = r.post(url=LIST_URL, headers=HEADERS, data={"id": uid},
                      proxies=proxies)
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//table[@class="div2-table3"]//tr'):
        header_cells = row.xpath('th')
        if not header_cells:
            # A row without a header has no key to store its value under.
            continue
        value_cells = row.xpath('td')
        record[_clean(header_cells[0])] = _clean(value_cells[0]) if value_cells else ''

    record['url'] = uid
    record['date'] = dt
    return record


@retry(3)
def r1(ny1, ny2, pg):
    """Process one page of the case list and store every unseen case.

    Args:
        ny1: Year, e.g. ``'2023'``.
        ny2: Month, e.g. ``'11'``.
        pg: 1-based page number, sent as the ``pageNo`` query parameter.

    Returns:
        int: the number captured from the page text by ``共(.*?)条``, or
        ``0`` when the page does not contain such text.
    """
    url = '{}?pageNo={}'.format(LIST_URL, pg)
    dt = str(ny1) + '/' + str(ny2)
    data = {
        "area": "",
        "ajinformation": "",
        "startDate": str(ny1) + '-' + str(ny2),
        "month": "1",
        "nsrname": "",
        "nsridentify": "",
        "regaddress": "",
        "organization": "",
        "legal": "",
        "legalId": "",
        "finance": "",
        "financeId": "",
    }
    response = r.post(url=url, headers=HEADERS, data=data, proxies=proxies)
    html = response.text

    total_match = re.findall(r'共(.*?)条', html)
    rpg = int(total_match[0]) if total_match else 0

    selector = etree.HTML(html)
    records = []
    new_uids = []
    # NOTE(review): every <input onclick=...> is treated as a detail link;
    # confirm the page has no other onclick inputs.
    for onclick in selector.xpath('//input/@onclick'):
        uid = onclick.replace('weifaCaseDetail(', '').replace(')', '')
        if r_myco15.sismember(SEEN_KEY, uid):
            continue
        record = r1_d(uid, dt)
        print(record)
        records.append(record)
        # Remember the id so it can be marked as seen once it is stored;
        # without this the backup insert and the Redis update never ran.
        new_uids.append(uid)

    if records:
        myco23.insert_many(records)
        print('已存入原始库')
    if new_uids:
        myco23_b.insert_many(records)
        print('已存入备份原始库')
        for seen_uid in new_uids:
            r_myco15.sadd(SEEN_KEY, seen_uid)
    return rpg


def runs(ny1='2023', ny2='11'):
    """Scrape every list page for the given year/month.

    Page 1 is fetched first to learn the total record count; the remaining
    pages follow, including the final partially filled one.

    Args:
        ny1: Year as a string (default ``'2023'``).
        ny2: Month as a string (default ``'11'``).
    """
    rpg = r1(ny1, ny2, pg=1)
    # Ceiling division: 16 records at 15 per page means 2 pages, not 1.
    total_pages = -(-rpg // PAGE_SIZE)
    for pg in range(2, total_pages + 1):
        r1(ny1, ny2, pg)


runs()