# -*- coding: utf-8 -*-
import logging
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

wangye_name = "中国证券监督管理委员会_时政要闻"  # page label: CSRC "current affairs news" section

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
              'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
}

urls_template = 'http://www.csrc.gov.cn/csrc/c100027/common_list_{}.shtml'  # paginated listing pages
url = 'http://www.csrc.gov.cn/csrc/c100027/common_list.shtml'               # first listing page
time_now = time.strftime("%Y-%m-%d", time.localtime())
def createLogger(name: str) -> logging.Logger:
    """Configure basic logging once and return a named logger."""
    datefmt = '%Y-%m-%d'
    logging.basicConfig(level=logging.INFO, datefmt=datefmt,
                        format='[%(asctime)s]-[%(name)s]-[%(levelname)s]::[%(message)s]')
    return logging.getLogger(name)


def pageDeep() -> int:
    """Read the total number of listing pages from the pagination script on the first page."""
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    bat_page_index = r"page_div\',([\d]{0,4}),"
    _data = re.search(bat_page_index, response.text).group(1)
    return int(_data)
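# A minimal, self-contained illustration of the pattern used in pageDeep(); the embedded
# snippet is a made-up stand-in for the page's pagination script, not copied from csrc.gov.cn:
#   >>> re.search(r"page_div\',([\d]{0,4}),", "foo('page_div',12,0)").group(1)
#   '12'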
def fetch_news(urls, headers):
    """Parse title / link / date from each listing page in `urls`."""
    news_list = []
    for url in urls:
        try:
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"   # avoid mojibake in the response body
            response.raise_for_status()   # raise if the request failed
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", id="list")
            for li in ul.find_all('li'):
                _a = li.find("a")
                _span = li.find("span")
                date = _span.text
                link = _a.get("href")
                title = _a.text
                news_list.append({'title': title, 'link': link, 'date': date})
        except requests.RequestException as e:
            logger.error(f'Request failed for {url}: {e}')  # `logger` is created in __main__
        except Exception as e:
            logger.error(f'An error occurred for {url}: {e}')
    return news_list
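# fetch_news() assumes each listing page contains markup roughly like the following
# (reconstructed from the selectors above, not copied from the live site):
#   <ul id="list">
#     <li><a href="/csrc/c100027/.../content.shtml">标题</a> <span>2024-01-01</span></li>
#   </ul>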
def genTask() -> list:
    """Build the list of listing-page URLs, from page 1 up to the last page."""
    _items = []
    deep = pageDeep()
    for i in range(1, deep + 1):          # page numbers 1..deep
        if i == 1:
            urls = url                    # the first page has no page number in its URL
        else:
            urls = urls_template
        _items.append(urls.format(i))     # .format() is a no-op for the first-page URL
    return _items
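# Example of what genTask() produces when pageDeep() returns 3 (illustrative value):
#   ['http://www.csrc.gov.cn/csrc/c100027/common_list.shtml',
#    'http://www.csrc.gov.cn/csrc/c100027/common_list_2.shtml',
#    'http://www.csrc.gov.cn/csrc/c100027/common_list_3.shtml']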
if __name__ == '__main__':
    logger = createLogger(__name__)
    all_news = []
    task_urls = genTask()
    for url in task_urls:
        news = fetch_news([url], headers)
        all_news.extend(news)

    # # Print the collected news items
    # for item in all_news:
    #     print(f"Link: {item['link']}, Date: {item['date']}, Title: {item['title']}")

    # Re-index the rows so they start from 1 instead of 0
    df = pd.DataFrame(all_news)
    df.reset_index(drop=False, inplace=True)
    df['index'] = df['index'] + 1
    # Rename the index column here if a different header is wanted (currently kept as 'index')
    df = df.rename(columns={'index': 'index'})
    # Write the result to Excel (raw f-string so the backslashes in the Windows path stay literal)
    df.to_excel(
        rf'D:\pyp\g-pqxb8807-pachongxiangmu-FinanceInfoCollection-\金融监管局新闻{time_now}.xlsx',
        index=False
    )
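    # Optional preview (an addition for convenience, not part of the original workflow):
    # uncomment to print the first few rows before opening the Excel file.
    # print(df.head())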