#!/usr/bin/env python
# coding:utf-8
"""Scrape bulletin entries from jiangsu.chinatax.gov.cn and persist them.

Flow: ``runs`` -> ``r1`` (POSTs the listing query and collects every link in
the response) -> ``r1_d`` (GETs one detail page and turns its two-column table
into a dict). New records are written with ``insert_many`` to ``myco6`` and
``myco6_b``; links already handled are tracked in the ``'n06'`` set on
``r_myco15``.

NOTE(review): ``myco6``/``myco6_b`` look like MongoDB collections and
``r_myco15`` like a Redis client (``sismember``/``sadd``) -- confirm in
``mongo_cho``.
"""
import requests
import json
from setting import proxies
from urllib import parse
from lxml import etree
from rety import retry

r = requests.session()
# NOTE(review): requests does not document a ``keep_alive`` attribute on
# Session, so this assignment is probably a no-op; kept for compatibility.
r.keep_alive = False
from mongo_cho import myco6, r_myco15, myco6_b

# Listing page: https://jiangsu.chinatax.gov.cn/col/col16916/index.html

# Browser-like headers sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Seconds to wait for the server/proxy before giving up. Without a timeout
# requests waits forever, which would stall the scraper instead of letting
# ``@retry`` try again.
REQUEST_TIMEOUT = 30

# Name of the set on ``r_myco15`` that holds links already processed.
SEEN_KEY = 'n06'


@retry(3)
def r1_d(url, dt):
    """Fetch one detail page and return its table as a dict.

    Each ``//body/table/tbody/tr`` row contributes one entry: the text of the
    first ``td`` is the key and the whitespace-stripped text of the second
    ``td`` is the value. Rows that do not have both cells are skipped.

    Args:
        url: Address of the detail page.
        dt: Date label stored under the ``'date'`` key.

    Returns:
        dict with the parsed rows plus ``'url'`` and ``'date'``.
    """
    response = r.get(
        url=url,
        headers=HEADERS,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    response.encoding = 'UTF-8'
    selector = etree.HTML(response.text)

    record = {}
    for row in selector.xpath('//body/table/tbody/tr'):
        key_cells = row.xpath('td[1]')
        value_cells = row.xpath('td[2]')
        # Header/spacer rows may lack one of the two cells; indexing them
        # would raise IndexError and fail the whole page.
        if not key_cells or not value_cells:
            continue
        key = key_cells[0].xpath('string(.)').strip()
        value = value_cells[0].xpath('string(.)').strip()
        record[key] = (
            value.replace(' ', '')
            .replace('\r', '')
            .replace('\t', '')
            .replace('\n', '')
        )

    record['url'] = url
    record['date'] = dt
    print(record)
    return record


@retry(3)
def r1(ny1, ny2, dt):
    """Query the bulletin listing and store every entry not seen before.

    Args:
        ny1: Value sent as the ``year`` form field.
        ny2: Value sent as the ``searhvalue`` form field (``searchkey`` is
            fixed to ``"jd"``).
        dt: Date label passed through to ``r1_d`` for each record.
    """
    url = 'https://jiangsu.chinatax.gov.cn/module/jslib/bulletin/lpajaxdata.jsp?startrecord=1&endrecord=36&perpage=11&rowpage=1'
    data = {
        # Field name is spelled this way in the original request; left as is
        # because it is what gets sent to the server.
        "searhvalue": ny2,
        "searchkey": "jd",
        "year": ny1,
    }
    response = r.post(
        url=url,
        data=data,
        headers=HEADERS,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    html = response.text
    print(html)
    selector = etree.HTML(html)

    records = []
    new_links = []
    for link in selector.xpath('//a/@href'):
        print(link)
        if r_myco15.sismember(SEEN_KEY, link):
            print('已存在,>>>n06')
            continue
        record = r1_d(link, dt)
        # NOTE(review): depending on how ``retry`` reports a final failure,
        # ``r1_d`` may yield nothing; skip such links (without marking them
        # seen) so one bad page cannot break ``insert_many`` for the batch.
        if not record:
            continue
        records.append(record)
        new_links.append(link)

    if records:
        myco6.insert_many(records)
        # The same documents are also written to the second collection.
        myco6_b.insert_many(records)
        # Mark links as seen only after both inserts succeeded.
        for link in new_links:
            r_myco15.sadd(SEEN_KEY, link)


def runs(ny1='2021', ny2='8'):
    """Run one scrape for the given query values.

    Args:
        ny1: Year sent to the listing query (defaults to ``'2021'``).
        ny2: Second query value sent as ``searhvalue`` (defaults to ``'8'``).
    """
    dt = str(ny1) + '/' + str(ny2)
    r1(ny1, ny2, dt)


runs()