czyc
/
Shuiwu


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
							import requests,json,time
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco1,r_myco15,myco1_b
# Shared HTTP session used by every request function in this module.
r = requests.session()
# NOTE(review): ``keep_alive`` is not a documented attribute of requests'
# Session, so this assignment probably does not disable connection reuse —
# confirm before relying on it.
r.keep_alive = False
from rety import retry

def zh1(list1):
    """Concatenate the strings in *list1* with layout whitespace removed.

    Every space, carriage return, newline and tab is dropped from each
    fragment; the cleaned fragments are then joined in their original order.
    An empty input yields an empty string.
    """
    # Translation table mapping each unwanted character to None (= delete).
    drop_table = dict.fromkeys(map(ord, ' \r\n\t'))
    return ''.join(fragment.translate(drop_table) for fragment in list1)

@retry(3)
def r1_d(url, dt1, timeout=30):
    """Fetch one bulletin detail page and return its table as a dict.

    Every ``<tr class="rlbbox">`` row contributes one entry: the text of the
    first cell becomes the key and the text of the second cell the value,
    both with whitespace stripped by ``zh1``. A later row with the same key
    overwrites an earlier one.

    Args:
        url: Address of the detail page, e.g.
            ``http://anhui.chinatax.gov.cn/art/2021/3/3/art_20155_6021.html``.
            It is also stored in the result under ``'url'``.
        dt1: Date label stored in the result under ``'date'``.
        timeout: Seconds to wait for the server before the request fails.

    Returns:
        dict: The scraped key/value pairs plus the ``'date'`` and ``'url'``
        entries.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout a stalled proxy or server would block the crawl
    # forever; with one, the stall surfaces as an exception instead.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    selector = etree.HTML(response.text)

    dict1 = {}
    for row in selector.xpath('//tr[@class="rlbbox"]'):
        key = zh1(row.xpath('td[1]/div/text()'))
        value = zh1(row.xpath('td[2]/div/text()'))
        dict1[key] = value

    dict1['date'] = dt1
    dict1['url'] = url
    return dict1

@retry(3)
def r1(searhvalue, year, timeout=30):
    """Crawl one month of the bulletin list and store the new detail pages.

    Posts the month/year query to the ``ajaxdata.jsp`` endpoint, walks the
    returned ``<tr class="rlbbox">`` rows, and for each detail link that is
    not yet a member of the ``'n01'`` set in ``r_myco15`` fetches the page
    with ``r1_d``. The collected records are inserted into ``myco1`` and
    ``myco1_b``, after which their URLs are added to the ``'n01'`` set.

    Args:
        searhvalue: Month label such as ``'1月'``.
        year: Year label such as ``'2024年'``.
        timeout: Seconds to wait for the list request before it fails.

    Returns:
        None. Results are written to the storage back ends.
    """
    url = 'http://anhui.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp'
    headers = {
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    data = {
        "searhvalue": parse.quote(searhvalue),
        "searchkey": "jd1",
        "year": parse.quote(year),
    }
    # Without a timeout a stalled proxy or server would block the crawl forever.
    response = r.post(url=url, headers=headers, data=data, proxies=proxies, timeout=timeout)
    selector = etree.HTML(response.text)

    # The date label is the same for every row, so build it once:
    # '2024年' + '1月' -> '2024/1'.
    dt1 = year.replace('年', '/') + searhvalue.replace('月', '')

    list1 = []  # scraped detail records
    list2 = []  # URLs of those records, in the same order
    for row in selector.xpath('//tr[@class="rlbbox"]'):
        hrefs = row.xpath('td[5]/div/a/@href')
        if not hrefs:
            # A row without a detail link cannot be crawled; skip it rather
            # than let an IndexError abort the whole page.
            continue
        url1 = hrefs[0]
        print(url1)
        # NOTE: the original author flagged the 'n01' key as something to
        # change (presumably per crawler).
        if r_myco15.sismember('n01', url1):
            print('已存在，>>>n01')
            continue
        rsd = r1_d(url1, dt1)
        if rsd:
            list1.append(rsd)
            list2.append(url1)

    print(list1)
    # list1 and list2 are always filled together, so one guard covers the
    # primary insert, the backup insert and the dedup bookkeeping.
    if list1:
        myco1.insert_many(list1)
        print('已存入原始库')
        myco1_b.insert_many(list1)
        print('已存入备份原始库')
        # Mark the URLs as seen only after both inserts went through.
        for mis in list2:
            r_myco15.sadd('n01', mis)

@retry(3)
def get_ny(timeout=30):
    """Return the last month label listed on the bulletin index page.

    Downloads ``bullenleft.jsp``, collects the text of every ``<a>`` under
    ``<tr id="jiduonclick1">`` with the ``'月'`` suffix removed, prints the
    last one and returns it.

    Args:
        timeout: Seconds to wait for the server before the request fails.

    Returns:
        str: The last collected label without the ``'月'`` suffix.

    Raises:
        IndexError: From the final lookup when the page lists no usable
            month link (same as before this function skipped empty anchors).
    """
    url ='http://anhui.chinatax.gov.cn//module/jslib/bulletin/bullenleft.jsp'
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout a stalled proxy or server would block the caller forever.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=timeout)
    selector = etree.HTML(response.text)

    list1 = []
    for anchor in selector.xpath('//tr[@id="jiduonclick1"]/td/span//a'):
        texts = anchor.xpath('text()')
        if not texts:
            # An anchor without text carries no month label; skip it instead
            # of failing on the [0] lookup.
            continue
        list1.append(texts[0].replace('月', ''))

    eny = list1[-1]
    print(eny,'>>>from n01_ah***')
    return eny
# get_ny()

def runs(year='2024年', months=range(1, 2)):
    """Crawl the bulletin list for each requested month of *year*.

    Calls ``r1`` once per month with the month formatted as ``'<n>月'``.
    The defaults reproduce the previous hard-coded run: month 1 of
    ``'2024年'``.

    Args:
        year: Year label passed through to ``r1``, e.g. ``'2024年'``.
        months: Iterable of month numbers to crawl.
    """
    for month in months:
        searhvalue = '{}月'.format(month)
        r1(searhvalue, year)
# Entry point: the crawl starts as soon as this module is executed or imported.
runs()