# czyc / Shuiwu
							#!/usr/bin/env python
# coding:utf-8

import requests,json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco18,myco18_b,r_myco15
from rety import retry
# Single HTTP session shared by r1_d() and r1() below.
r = requests.session()
# NOTE(review): `keep_alive` is not an attribute that requests documents for
# Session, so this assignment probably has no effect on connection reuse —
# confirm before relying on it.
r.keep_alive = False

# Characters removed from inside every cell's text (leading/trailing
# whitespace is stripped separately before this table is applied).
_CELL_NOISE = str.maketrans('', '', ' \r\t\n')


def _cell_text(node):
    """Return the text content of *node* with spaces, tabs and line breaks removed."""
    return node.xpath('string(.)').strip().translate(_CELL_NOISE)


@retry(3)
def r1_d(url, dt, timeout=30):
    """Fetch one detail page and return its ``zhongdatable`` table as a dict.

    Header cells (``th``) and value cells (``td``) of every
    ``<table class="zhongdatable">`` are collected in document order and
    paired positionally: the n-th header becomes the key for the n-th value.
    If the two counts differ, the unpaired trailing cells are ignored instead
    of raising ``IndexError``.

    Example page:
    https://henan.chinatax.gov.cn/henanchinatax/xxgk/zdsswfsxaj/2021060109153715435/index.html

    Args:
        url: Absolute URL of the detail page.
        dt: Label stored unchanged under the ``'date'`` key of the result.
        timeout: Seconds passed to ``requests`` as the request timeout, so a
            stalled proxy or server cannot block the crawl indefinitely.

    Returns:
        dict mapping cleaned header text to cleaned cell text, plus the keys
        ``'url'`` and ``'date'`` (which overwrite same-named table headers).

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status. Raising
            here keeps an error page from being returned as an (empty) record.
        requests.RequestException: connection problems or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    response.raise_for_status()
    # Force UTF-8 decoding rather than trusting the encoding requests guesses.
    response.encoding = 'UTF-8'
    seletor = etree.HTML(response.text)

    keys = [_cell_text(th) for th in seletor.xpath('//table[@class="zhongdatable"]//th')]
    values = [_cell_text(td) for td in seletor.xpath('//table[@class="zhongdatable"]//td')]

    # zip() stops at the shorter list, so a header without a matching cell
    # (or the reverse) is dropped rather than crashing the whole fetch.
    dict1 = dict(zip(keys, values))
    dict1['url'] = url
    dict1['date'] = dt
    return dict1
# r1_d()

@retry(3)
def r1(pg, dt, timeout=30):
    """Crawl one listing page and store every detail record not seen before.

    The listing page is requested with an all-empty search form. For each
    ``<a istitle="true">`` link on it, the absolute detail URL is looked up in
    the ``'n18'`` set of ``r_myco15``; unseen URLs are fetched with
    :func:`r1_d`. The collected records are inserted into ``myco18`` and then
    ``myco18_b``, and only afterwards are their URLs added to ``'n18'``.

    Args:
        pg: Page number substituted into the listing URL's ``currentPage``.
        dt: Label forwarded to :func:`r1_d`; it is not sent to the server as
            a filter.
        timeout: Seconds passed to ``requests`` as the request timeout for
            the listing request.

    Raises:
        requests.HTTPError: the listing request returned a 4xx/5xx status
            (previously an error page was parsed and looked like "no links").
        requests.RequestException: connection problems or timeout.
    """
    url = 'https://henan.chinatax.gov.cn/eportal/ui?pageId=bdfef9dfa679454c86d68f2203a69e84&currentPage={}&moduleId=143e1aeaa3b6405ea0fe04142c021d5b&staticRequest=yes'.format(pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Every search filter is left blank, so the listing is unfiltered.
    data = {
        "filter_LIKE_EXT_STR6": "",
        "filter_LIKE_main.TITLE": "",
        "filter_LIKE_EXT_STR2": "",
        "filter_LIKE_EXT_STR4": "",
        "filter_LIKE_EXT_STR3": "",
        "filter_LIKE_EXT_STR8": "",
        "filter_LIKE_EXT_STR19": "",
        "filter_LIKE_EXT_STR10": "",
        "filter_LIKE_EXT_STR23": "",
    }
    response = r.post(url=url, headers=headers, data=data, proxies=proxies, timeout=timeout)
    response.raise_for_status()
    seletor = etree.HTML(response.text)
    a = seletor.xpath('//a[@istitle="true"]')
    print(a)

    records = []   # detail dicts to insert
    new_urls = []  # their URLs, kept parallel to `records`
    for link in a:
        hrefs = link.xpath('@href')
        if not hrefs:
            # Anchor without an href: nothing to fetch.
            continue
        # urljoin gives the same result as plain concatenation for
        # root-relative hrefs and also copes with absolute ones.
        url1 = parse.urljoin('https://henan.chinatax.gov.cn', hrefs[0])
        print(url1)
        # 'n18' is the set of already-crawled URLs (original note: "change this").
        if r_myco15.sismember('n18', url1):
            print('已存在，>>>n18')
            continue
        rsd = r1_d(url1, dt)
        print(rsd)
        if not rsd:
            # NOTE(review): r1_d itself always returns a dict; this only guards
            # against the retry wrapper handing back an empty value after giving
            # up — confirm against rety.retry. The URL stays unmarked so a later
            # run can try again.
            print('detail fetch returned nothing, skipped:', url1)
            continue
        records.append(rsd)
        new_urls.append(url1)

    if records:
        myco18.insert_many(records)
        print('已存入原始库')
        myco18_b.insert_many(records)
        print('已存入备份原始库')
        # Mark URLs as seen only after both inserts have gone through.
        for seen_url in new_urls:
            r_myco15.sadd('n18', seen_url)


def runs():
    """Crawl listing page 1, tagging every stored record with the label '2023'."""
    date_label = '2023'
    first_page, last_page = 1, 1
    for page_no in range(first_page, last_page + 1):
        # Progress marker printed before each listing page is processed.
        print(page_no, '========')
        r1(page_no, date_label)
# Start the crawl. This call sits at module top level without a __main__
# guard, so it also runs whenever the module is imported.
runs()