# -*- coding: utf-8 -*-
import logging
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

wangye_name = "中国证券监督管理委员会_时政要闻"

headers = {
    'accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/avif,image/webp,image/apng,*/*;q=0.8,'
               'application/signed-exchange;v=b3;q=0.7'),
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'),
}

urls_template = 'http://www.csrc.gov.cn/csrc/c100027/common_list_{}.shtml'
url = 'http://www.csrc.gov.cn/csrc/c100027/common_list.shtml'
time_now = time.strftime("%Y-%m-%d", time.localtime())


def createLogger(name: str) -> logging.Logger:
    datefmt = '%Y-%m-%d'
    logging.basicConfig(
        level=logging.INFO,
        datefmt=datefmt,
        format='[%(asctime)s]-[%(name)s]-[%(levelname)s]::[%(message)s]',
    )
    return logging.getLogger(name)


# Module-level logger so fetch_news() can log even if this file is imported.
logger = createLogger(__name__)


def pageDeep() -> int:
    """Read the total page count from the pagination script on the first list page."""
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = "utf-8"
    bat_page_index = r"page_div\',([\d]{0,4}),"
    match = re.search(bat_page_index, response.text)
    if match is None:
        raise ValueError('Could not find the page count on the first list page')
    return int(match.group(1))


def fetch_news(urls, headers):
    """Parse each list page's <ul id="list"> into title/link/date records."""
    news_list = []
    for page_url in urls:
        try:
            response = requests.get(page_url, headers=headers, timeout=30)
            response.encoding = "utf-8"  # avoid mojibake in the response body
            response.raise_for_status()  # raise on HTTP error status codes
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", id="list")
            for li in ul.find_all('li'):
                _a = li.find("a")
                _span = li.find("span")
                date = _span.text
                link = _a.get("href")
                title = _a.text
                news_list.append({'title': title, 'link': link, 'date': date})
        except requests.RequestException as e:
            logger.error(f'Request failed for {page_url}: {e}')
        except Exception as e:
            logger.error(f'An error occurred for {page_url}: {e}')
    return news_list


def genTask() -> list:
    """Build the full list of list-page URLs: page 1 has no number suffix."""
    _items = []
    deep = pageDeep()
    for i in range(1, deep + 1):
        if i == 1:
            _items.append(url)
        else:
            _items.append(urls_template.format(i))
    return _items


if __name__ == '__main__':
    all_news = []
    task_urls = genTask()
    for task_url in task_urls:
        news = fetch_news([task_url], headers)
        all_news.extend(news)

    # # Print the collected news items
    # for item in all_news:
    #     print(f"Link: {item['link']}, Date: {item['date']}, Title: {item['title']}")

    # Re-number the rows so the index column starts at 1 instead of 0.
    df = pd.DataFrame(all_news)
    df.reset_index(drop=False, inplace=True)
    df['index'] = df['index'] + 1

    # Export to Excel; a raw f-string keeps the Windows backslashes literal.
    df.to_excel(
        rf'D:\pyp\g-pqxb8807-pachongxiangmu-FinanceInfoCollection-\金融监管局新闻{time_now}.xlsx',
        index=False,
    )
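

# --------------------------------------------------------------------------
# Optional hardening sketch (an assumption, not part of the original script):
# government sites sometimes throttle rapid sequential requests, so wrapping
# requests.get in a small retry-with-delay helper can make a full crawl more
# reliable. The name get_with_retry and the retry/delay values below are
# illustrative; fetch_news() and pageDeep() would need to call it explicitly,
# e.g. response = get_with_retry(page_url, headers).
def get_with_retry(target_url, headers, retries=3, delay=2.0):
    """Fetch a URL, retrying a few times with a polite pause between attempts."""
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(target_url, headers=headers, timeout=30)
            resp.raise_for_status()
            return resp
        except requests.RequestException as e:
            last_error = e
            logger.warning(f'Attempt {attempt}/{retries} failed for {target_url}: {e}')
            time.sleep(delay)  # back off before the next attempt
    raise last_error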