czyc
/
Shuiwu


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
							#!/usr/bin/env python
# coding:utf-8

import requests,json
from setting import proxies
from urllib import parse
from lxml import etree
from mongo_cho import myco17,r_myco15,myco17_b
from rety import retry
# One HTTP session object shared by every request made in this module.
r = requests.session()
# NOTE(review): `keep_alive` is not a documented attribute of requests'
# Session, so this assignment presumably has no effect on connection reuse —
# confirm before relying on it.
r.keep_alive = False

# http://qinghai.chinatax.gov.cn/web/zdsswfsxaj/zdaj.shtml

@retry(3)
def r1_d(url):
    """Fetch one detail page and flatten its table rows into a dict.

    Every ``<tr>`` that has a first ``<td>`` contributes one entry: the text
    of the first cell is the key and the text of the second cell (``''`` when
    the row has no second cell) is the value.  Carriage returns, tabs,
    newlines and ASCII spaces are removed from both.  Two more keys are then
    set, overwriting any row with the same name: ``'url'`` (the page address)
    and ``'date'`` (derived from the "发布时间：" span in the page header).

    Args:
        url: Absolute address of the detail page, e.g.
            http://qinghai.chinatax.gov.cn/web/2020nd/202007/e4856c576fa04e059eff6762dc47bf0c.shtml

    Returns:
        dict of cleaned first-cell text -> cleaned second-cell text, plus the
        ``'url'`` and ``'date'`` entries.

    Raises:
        IndexError: if the publish-time span is not present in the page.
        requests.exceptions.RequestException: on network failure or timeout.
        How these interact with the ``retry`` decorator depends on its
        implementation, which is defined outside this file.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout requests waits indefinitely on a stalled proxy/server,
    # which would hang the whole crawl instead of letting the call fail.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
    selector = etree.HTML(response.text)

    # Single-pass equivalents of the per-character clean-ups:
    #   header text: drop \r, \n and spaces, turn '-' into '/'
    #   table cells: drop \r, \t, \n and spaces
    header_table = str.maketrans('-', '/', '\r\n ')
    cell_table = str.maketrans('', '', '\r\t\n ')

    published = selector.xpath('//*[@id="page-newContent"]/div[2]/div/div[1]/div/div[1]/div/span[1]/text()')
    published_text = published[0].replace('发布时间：', '').translate(header_table)
    # Drop the trailing five characters — presumably the "HH:MM" part that
    # follows the date once spaces are removed; confirm against a live page.
    dt2 = published_text[:-5]
    print(dt2)

    record = {}
    for row in selector.xpath('//tr'):
        key_cells = row.xpath('td[1]')
        if not key_cells:
            # Rows without any <td> (e.g. header-only rows) carry no data.
            continue
        key = key_cells[0].xpath('string(.)').strip().translate(cell_table)
        value_cells = row.xpath('td[2]')
        if value_cells:
            value = value_cells[0].xpath('string(.)').strip().translate(cell_table)
        else:
            value = ''
        record[key] = value

    record['url'] = url
    record['date'] = dt2
    print(record)
    return record

# r1_d(url)


@retry(3)
def r1(ny, pg):
    """Crawl one list page and store every detail page not recorded yet.

    The list page for year ``ny`` is ``.../web/{ny}nd/iframe.shtml`` for page
    1 and ``.../web/{ny}nd/iframe_{pg}.shtml`` for later pages.  Every
    ``<a href>`` on it is resolved to an absolute URL; URLs that are not yet
    members of the Redis set ``'n17'`` are fetched with ``r1_d``.  The
    resulting records are inserted into ``myco17`` and then ``myco17_b``, and
    only after both inserts are the URLs added to the Redis set.

    Args:
        ny: Year used in the list-page path (e.g. ``2021``).
        pg: 1-based page number.

    Returns:
        ``'zz'`` when the response body contains "404 Not Found" (callers use
        this as the end-of-pagination marker); otherwise ``None``.
    """
    if pg == 1:
        url = 'http://qinghai.chinatax.gov.cn/web/{}nd/iframe.shtml'.format(ny)
    else:
        url = 'http://qinghai.chinatax.gov.cn/web/{ny}nd/iframe_{pg}.shtml'.format(ny=ny, pg=pg)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    }
    # Without a timeout requests waits indefinitely on a stalled proxy/server.
    response = r.get(url=url, headers=headers, proxies=proxies, timeout=30)
    html = response.text
    if "404 Not Found" in html:
        print('zz')
        return 'zz'

    records = []
    new_urls = []
    queued = set()  # URLs already fetched from this page
    for href in etree.HTML(html).xpath('//a/@href'):
        print(href)
        # urljoin yields the same result as plain host + href for root-relative
        # links, and also resolves absolute or page-relative hrefs correctly.
        detail_url = parse.urljoin(url, href)
        if detail_url in queued:
            # Redis is only updated after the loop, so a link repeated on the
            # same page would otherwise be fetched and stored twice.
            continue
        # 'n17' is the Redis set name; the original author tagged it "change".
        if r_myco15.sismember('n17', detail_url):
            print('已存在，>>>n17')
            continue
        queued.add(detail_url)
        records.append(r1_d(detail_url))
        new_urls.append(detail_url)

    if records:
        myco17.insert_many(records)
        print('已存入原始库')
        myco17_b.insert_many(records)
        print('已存入备份原始库')
        # Mark URLs as done only once both collections hold the records.
        for seen_url in new_urls:
            r_myco15.sadd('n17', seen_url)

# r1(pg=1)
def runs(ny):
    """Crawl list pages 1..99 for year ``ny``, stopping at the first missing page.

    Each page is handed to ``r1``; the loop ends early as soon as ``r1``
    returns the ``"zz"`` marker.
    """
    page = 1
    while page < 100:
        print(page, '===========')
        if r1(ny, page) == "zz":
            break
        page += 1

# Script entry: these statements run whenever the module is executed or imported.
# Crawl all 2021 list pages, stopping at the first page reported as missing.
runs(2021)
# range(54, 55) yields only 54, so this fetches list page 54 of 2021 once more.
for pg in range(54,55):
    print(pg,'===========')
    r1(2021,pg)