# n14_hlj.py
  1. #!/usr/bin/env python
  2. # coding:utf-8
  3. import re
  4. import requests,json
  5. from setting import proxies
  6. from urllib import parse
  7. from lxml import etree
  8. from mongo_cho import myco14,myco14_b,r_myco15
  9. from rety import retry
  10. r = requests.session()
  11. r.keep_alive = False
  12. @retry(3)
  13. def r1_d(url,dt):
  14. # url = 'http://heilongjiang.chinatax.gov.cn/art/2021/4/10/art_6410_962.html'
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  17. }
  18. response = r.get(url=url,headers=headers,proxies=proxies)
  19. response.encoding = 'UTF-8'
  20. html = response.text
  21. selector = etree.HTML(html)
  22. a = selector.xpath('//tr')
  23. dict1 = {}
  24. for i in a:
  25. k1 = i.xpath('td[1]')
  26. if k1:
  27. k2 = k1[0].xpath('string(.)').strip()
  28. k3 = k2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  29. # print(k2)
  30. v1 = i.xpath('td[2]')
  31. if v1:
  32. v2 = v1[0].xpath('string(.)').strip()
  33. v3 = v2.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '')
  34. # print(v3)
  35. else:
  36. v3 = ''
  37. if k3:
  38. dict1[k3] = v3
  39. # print(dict1)
  40. dict1['url'] = url
  41. dict1['date'] = dt
  42. # print(dict1)
  43. return dict1
  44. # r1_d()
  45. @retry(3)
  46. def r1(ny1,ny2):
  47. # url = 'http://heilongjiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp?startrecord=1&endrecord=2&perpage=11'
  48. url = 'http://heilongjiang.chinatax.gov.cn/module/jslib/bulletin/ajaxdata.jsp'
  49. headers = {
  50. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
  51. }
  52. data = {
  53. "searhvalue":ny2,
  54. "searchkey": "jd",
  55. "year": ny1,
  56. }
  57. response = r.post(url=url,headers=headers,data=data,proxies=proxies)
  58. html = response.text
  59. hfs = re.findall("href='(.*?)'",html)
  60. if ny1 == 1:
  61. y1 = '2019'
  62. elif ny1 == 2:
  63. y1 = '2020'
  64. elif ny1 == 3:
  65. y1 = '2021'
  66. dt = y1 + '/' +str(ny2+1)
  67. list1 = []
  68. list2 = []
  69. for url1 in hfs:
  70. print(url1)
  71. utf = r_myco15.sismember('n14', url1) ##更改
  72. if not utf:
  73. rsd = r1_d(url1, dt)
  74. list1.append(rsd)
  75. list2.append(url1)
  76. else:
  77. print('已存在,>>>n14')
  78. if list1:
  79. myco14.insert_many(list1)
  80. print('已存入原始库')
  81. if list2:
  82. myco14_b.insert_many(list1)
  83. print('已存入备份原始库')
  84. for mis in list2:
  85. r_myco15.sadd('n14', mis) ##更改
  86. # if list1:
  87. # myco14.insert_many(list1)
  88. def runs(ny2):
  89. ny1 = 3
  90. ny3 = int(ny2) - 1
  91. print('2023',ny2,'=========')
  92. r1(ny1, ny2)
  93. runs(10)