czyc
/
Shuiwu


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
							#!/usr/bin/env python
# coding:utf-8

import re
import requests,json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco25,r_myco15,myco25_b
from rety import retry
# Shared HTTP session used by r1_d() and r1() for every request.
# NOTE(review): requests' Session does not document a `keep_alive` attribute,
# so the assignment below probably has no effect on connection reuse —
# confirm; sending a "Connection: close" header is the usual way to get that.
r = requests.session()
r.keep_alive = False

@retry(3)
def r1_d(url, dt, timeout=30):
    """Scrape one detail page into a flat dict of its table rows.

    Every ``<tr>`` of the ``<table class="zdsc_con">`` element that has a
    first ``<td>`` contributes one entry: the text of the first cell is the
    key and the text of the second cell (or ``''`` when there is none) is the
    value. Carriage returns, tabs, newlines and spaces are removed from both.

    Example page:
    http://shaanxi.chinatax.gov.cn/art/2021/4/15/art_15616_7502.html

    Args:
        url: Address of the detail page; also stored under the ``'url'`` key.
        dt: Value stored unchanged under the ``'date'`` key.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        dict: The scraped key/value pairs plus ``'url'`` and ``'date'``.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0; .NET CLR 2.0.50727; SLCC2; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729)",
    }
    # Without a timeout a stalled proxy or server would hang the whole crawl.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    # Decode as UTF-8 regardless of what the response headers declare.
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    # One-pass deletion of exactly these characters: CR, tab, LF and space.
    drop_whitespace = str.maketrans('', '', '\r\t\n ')

    def cell_text(cells):
        """Return the cleaned text of the first cell, or '' if there is none."""
        if not cells:
            return ''
        return cells[0].xpath('string(.)').strip().translate(drop_whitespace)

    record = {}
    for row in selector.xpath('//table[@class="zdsc_con"]//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Rows without a first <td> provide no key and are skipped.
            continue
        record[cell_text(key_cells)] = cell_text(row.xpath('td[2]'))

    record['url'] = url
    record['date'] = dt
    return record

# r1_d()
@retry(3)
def r1(ny1, ny2, pg, timeout=30):
    """Fetch one search-result page and store every detail record not yet seen.

    The result page is requested from the site's search endpoint. Each
    ``<li>``-nested link on it is turned into an absolute URL and checked
    against the Redis set ``'n25'``; unseen URLs are scraped with ``r1_d``.
    The new records are inserted into ``myco25`` and ``myco25_b``, after which
    their URLs are added to the Redis set.

    Args:
        ny1: Sent as ``field_1672``; also the first part of the date label
            (presumably the year — confirm).
        ny2: Sent as ``field_2410``. The date label uses ``ny2 + 1``, so this
            looks like a zero-based month — confirm.
        pg: Page number, sent as ``currpage``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        int: The text of the first ``<font color='red'>`` element on the page
        (presumably the total number of matching records — confirm).

    Raises:
        IndexError: If the page contains no ``<font color='red'>`` element.
        ValueError: If that element's text is not an integer.
    """
    url = 'http://shaanxi.chinatax.gov.cn/module/search/index.jsp'
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0; .NET CLR 2.0.50727; SLCC2; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729)",
    }
    params = {
        "field_2166": "",
        "field_1656": "",
        "field_1652": "",
        "field_1663": "",
        "field_1653": "",
        "field_2390": "",
        "field_2213": "",
        "field_2391": "",
        "field_2410": ny2,
        "field_1670": "",
        "field_1672": ny1,
        "currpage": pg,
        "field_1651": "",
        "strSelectID": "style_2166,1656,1663,1652,1653,2390,2213,2391,1651,1672,2410,1670",
        "i_columnid": "style_3",
        "field": "field_2166:1:0,field_2213:1:0,field_1656:1:0,field_2391:1:0,field_2410:12:0,field_1651:12:0,field_1652:1:0,field_2390:1:0,field_1672:12:0,field_1670:12:0,field_1653:1:0,field_1663:1:0",
        "initKind": "FieldFormMetadata",
        "type": "0,0,0,0,0,0,0,0,0,0,0,0",
        "currentplace": "",
        "splitflag": "",
        "fullpath": "0",
    }
    # Without a timeout a stalled proxy or server would hang the whole crawl.
    response = r.get(url=url, headers=headers, params=params,
                     proxies=proxies, timeout=timeout)
    html = response.text
    rpg = re.findall("<font color='red'>(.*?)</font>", html)[0]

    # The date label is the same for every link on the page, so build it once.
    dt = str(ny1) + '/' + str(ny2 + 1)

    new_records = []
    new_urls = []
    for href in etree.HTML(html).xpath('//li//a/@href'):
        url1 = 'http://shaanxi.chinatax.gov.cn' + href.replace('../..', '')
        # Original author's marker on this key: "change".
        if r_myco15.sismember('n25', url1):
            print('已存在，>>>n25')
            continue
        rsd = r1_d(url1, dt)
        print(rsd)
        new_records.append(rsd)
        new_urls.append(url1)

    # Records and URLs are appended together, so one emptiness check covers
    # both the primary and the backup insert.
    if new_records:
        myco25.insert_many(new_records)
        print('已存入原始库')
        myco25_b.insert_many(new_records)
        print('已存入备份原始库')
        # URLs are marked as seen only after both inserts have succeeded.
        for seen_url in new_urls:
            r_myco15.sadd('n25', seen_url)  # original author's marker: "change"
    return int(rpg)

def runs(ny1, ny2):
    """Crawl every result page for one (ny1, ny2) period.

    Page 1 is fetched first; the integer ``r1`` returns is treated as the
    total number of results and determines how many further pages to request.

    Args:
        ny1: Passed to ``r1`` unchanged (presumably the year — confirm).
        ny2: Converted with ``int()`` and decremented by one before being
            passed to ``r1`` (presumably a 1-based month — confirm).
    """
    # assumes the site lists 20 results per page — TODO confirm
    page_size = 20
    ny3 = int(ny2) - 1
    total = r1(ny1, ny3, pg=1)
    # Ceiling division: a trailing partial page (e.g. 25 results -> 2 pages)
    # must still be fetched, and an exact multiple needs no extra page.
    total_pages = -(-total // page_size)
    for pg in range(2, total_pages + 1):
        print(pg, '=======')
        r1(ny1, ny3, pg)


# Script entry point: executed whenever this module is run or imported, with
# ny1=2023 and ny2=11 (presumably year and month — confirm).
runs(2023,11)