# czyc / Shuiwu
							#!/usr/bin/env python
# coding:utf-8

import re
import requests,json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco23,r_myco15,myco23_b
from rety import retry
# Module-wide HTTP session; every request issued in this file goes through it.
r = requests.session()
# NOTE(review): this only sets a plain attribute on the Session object.
# Whether requests actually honours `keep_alive` is not visible here — confirm,
# or send a "Connection: close" header if closing connections is the intent.
r.keep_alive = False

@retry(3)
def r1_d(uid,dt):
    """Fetch the detail page of one case and flatten its table into a dict.

    ``uid`` is POSTed as the ``id`` form field to the case-list endpoint.
    Every ``<tr>`` of the ``div2-table3`` table that has a ``<th>`` becomes
    one entry: the first ``<th>`` is the key and the first ``<td>`` the
    value (an empty string when the row has no ``<td>``).  Key and value
    text is stripped, then every CR, tab, LF and plain space is removed.

    Args:
        uid: case identifier sent to the site; also stored under ``'url'``.
        dt: value stored unchanged under ``'date'``.

    Returns:
        dict with one entry per table row plus the ``'url'`` and ``'date'``
        keys.
    """
    endpoint = 'http://hainan.chinatax.gov.cn/weifaCase/weifa_case_list.htm'
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    form = {"id": uid}
    page = r.post(url=endpoint, headers=request_headers, data=form, proxies=proxies)
    tree = etree.HTML(page.text)

    # Only these four characters are removed from inside the text; other
    # whitespace (e.g. full-width spaces) is kept unless it is at the edges.
    removed_chars = {ord(ch): None for ch in '\r\t\n '}

    def flat_text(node):
        # Full text content of the node, trimmed and with line noise removed.
        return node.xpath('string(.)').strip().translate(removed_chars)

    record = {}
    for row in tree.xpath('//table[@class="div2-table3"]//tr'):
        header_cells = row.xpath('th')
        if not header_cells:
            # Rows without a header cell carry no key, so they are skipped.
            continue
        key = flat_text(header_cells[0])
        value_cells = row.xpath('td')
        value = flat_text(value_cells[0]) if value_cells else ''
        record[key] = value

    record['url'] = uid
    record['date'] = dt
    return record

# r1_d('1')

@retry(3)
def r1(ny1,ny2,pg):
    """Scrape one page of the case list for a year/month and store new cases.

    POSTs the search form for ``ny1``-``ny2`` to page ``pg``, reads the
    total record count from the page, and for every case whose id is not
    yet in the Redis set ``'n23'`` fetches the detail record via ``r1_d``.
    New records are written to ``myco23`` and ``myco23_b``, and their ids
    are then added to the ``'n23'`` set so later runs skip them.

    Fix over the previous version: the list of new ids was never filled, so
    the backup insert and the ``sadd`` calls never ran and the
    ``sismember`` check could never report a case as already seen.

    Args:
        ny1: year, used as ``str(ny1)``.
        ny2: month, used as ``str(ny2)``.
        pg: page number placed in the ``pageNo`` query parameter.

    Returns:
        int: the total number of records reported by the page, or 0 when
        the counter is not found in the HTML.
    """
    url = 'http://hainan.chinatax.gov.cn/weifaCase/weifa_case_list.htm?pageNo={}'.format(pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Stored on each record as its 'date' field (note '/' here, '-' in the form).
    dt = str(ny1) + '/' + str(ny2)
    data = {
        "area":"",
        "ajinformation": "",
        "startDate": str(ny1) + '-' + str(ny2),
        "month": "1",
        "nsrname": "",
        "nsridentify": "",
        "regaddress": "",
        "organization": "",
        "legal": "",
        "legalId": "",
        "finance": "",
        "financeId": "",
    }
    response = r.post(url=url,headers=headers,data=data,proxies=proxies)
    html = response.text

    # The page prints the total record count between "共<em>" and "</em>条".
    tpg = re.findall(r'共<em>(.*?)</em>条',html)
    rpg = int(tpg[0]) if tpg else 0

    selector = etree.HTML(html)
    new_records = []
    new_uids = []
    for handler in selector.xpath('//input/@onclick'):
        # Only detail buttons carry a case id; any other <input onclick=...>
        # would otherwise be passed to r1_d as a bogus id.
        if 'weifaCaseDetail(' not in handler:
            continue
        uid = handler.replace('weifaCaseDetail(','').replace(')','')
        # 'n23' is the Redis set holding the ids that were already stored.
        if r_myco15.sismember('n23', uid):
            continue
        rsd = r1_d(uid, dt)
        print(rsd)
        new_records.append(rsd)
        new_uids.append(uid)

    if new_records:
        myco23.insert_many(new_records)
        print('已存入原始库')
        # NOTE(review): presumably both targets are pymongo collections, in
        # which case the first insert adds `_id` to each dict and the backup
        # copy reuses the same `_id` — confirm.
        myco23_b.insert_many(new_records)
        print('已存入备份原始库')
        # Mark ids as seen only after both inserts succeeded, so a failed
        # write is retried on the next run instead of being skipped.
        for mis in new_uids:
            r_myco15.sadd('n23', mis)
    return rpg


def runs(ny1='2023', ny2='11', per_page=15):
    """Crawl every result page of the case list for one year/month.

    Page 1 is fetched first because ``r1`` returns the total record count,
    from which the number of pages is derived.  The remaining pages
    (2 .. last) are then fetched in order.

    Fix over the previous version: the page count was computed as
    ``rpg // 15 + 1`` and used as an *exclusive* ``range`` bound, so the
    last, partially filled page was never requested (e.g. 16 records
    fetched only page 1).

    Args:
        ny1: year passed to ``r1`` (default keeps the previous hard-coded
            value).
        ny2: month passed to ``r1`` (default keeps the previous hard-coded
            value).
        per_page: records per result page; 15 is the value the previous
            code assumed.
    """
    rpg = r1(ny1,ny2,pg=1)
    # Ceiling division: a partially filled last page still counts as a page.
    total_pages = (rpg + per_page - 1) // per_page
    for pg in range(2, total_pages + 1):
        r1(ny1,ny2,pg)

# Start the crawl as soon as this module is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also triggers the crawl — confirm that is intended.
runs()