#!/usr/bin/env python
# coding:utf-8
"""Collect records from the shanxi.chinatax.gov.cn ``web_zdsswf`` query.

For a given year/month every result page is requested; rows whose ``ajbh``
value is not yet a member of the ``n24`` set are inserted into the primary
and backup collections, and their ``ajbh`` values are then added to that set.
"""
import json
from urllib import parse

import requests
from lxml import etree

from mongo_cho import myco24, r_myco15, myco24_b
from rety import retry
from setting import proxies

# Name of the set (on ``r_myco15``) holding the ``ajbh`` values already stored.
# The original code flagged this key with a "change me" marker -- presumably
# it has to be adjusted when the script is cloned for another source.
SEEN_KEY = 'n24'

# Seconds to wait for the server before giving up, so that a stalled
# connection raises (and can be retried) instead of hanging forever.
REQUEST_TIMEOUT = 30

QUERY_URL = (
    'http://shanxi.chinatax.gov.cn/common/extQuery'
    '?sqlid=web_zdsswf&limit=10&cx_lx=0&cx_xsrq={ny1}-{ny2}&page={pg}'
)

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

r = requests.session()
# NOTE(review): ``keep_alive`` is not a documented attribute of
# ``requests.Session``; this assignment probably has no effect -- confirm.
r.keep_alive = False


# ``retry`` comes from the project-local ``rety`` module; presumably it
# re-invokes the function up to 3 times on failure -- verify there.
@retry(3)
def r1(ny1, ny2, pg):
    """Fetch one result page and store the rows that are not yet known.

    Args:
        ny1: Year used in the ``cx_xsrq`` query parameter.
        ny2: Month used in the ``cx_xsrq`` query parameter; it is
            zero-padded to two characters.
        pg: 1-based page number to request.

    Returns:
        The ``totalPage`` value reported by the server, converted to ``int``.
    """
    month = str(ny2).zfill(2)
    url = QUERY_URL.format(ny1=ny1, ny2=month, pg=pg)
    data = {
        "start": "0",
    }
    response = r.post(
        url=url,
        headers=HEADERS,
        data=data,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    payload = response.json()
    rows = payload['message']['list']
    total_pages = payload['message']['totalPage']

    new_rows = []
    new_ids = []
    for row in rows:
        case_id = row['ajbh']
        if r_myco15.sismember(SEEN_KEY, case_id):
            print('已存在,>>>n24')
            continue
        new_rows.append(row)
        new_ids.append(case_id)

    if new_rows:
        myco24.insert_many(new_rows)
        print('已存入原始库')
        # The same row objects are written to the backup collection.
        myco24_b.insert_many(new_rows)
        print('已存入备份原始库')
        # Mark the ids as seen only after both inserts have succeeded.
        for case_id in new_ids:
            r_myco15.sadd(SEEN_KEY, case_id)

    return int(total_pages)


def runs(ny1, ny2):
    """Fetch and store every result page for the given year and month.

    Page 1 is requested first because its response carries the total page
    count; the remaining pages are then requested in order. Each page is
    requested exactly once.

    Args:
        ny1: Year to query.
        ny2: Month to query.
    """
    print(ny1, ny2, '---------')
    print(1, '==================')
    rpg = r1(ny1, ny2, pg=1)
    # Start at page 2: page 1 was already processed by the call above.
    for pg in range(2, rpg + 1):
        print(pg, '==================')
        r1(ny1, ny2, pg)


# NOTE: the run is triggered at module level (no ``__main__`` guard), so
# importing this module also starts the crawl; kept as in the original.
ny1 = 2023
ny2 = 12
runs(ny1, ny2)