@@ -0,0 +1,58 @@
+# -*- encoding: utf-8 -*-
+'''
+@File : spider.py
+@Time : 2024/08/29 15:37:35
+@Author : Zhangziheng
+'''
+
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+from .conf_spider import *
+from .ext_logger import logger
+
+
+def pageDeep() -> int:
+    """Return the site's total page count, falling back to 10 on failure."""
+    response = requests.get(url, headers=headers, timeout=10)
+    response.encoding = "utf-8"
+    bat_page_index = r"page_div',(\d{1,4}),"
+    try:
+        _data = re.search(bat_page_index, response.text).group(1)
+        return int(_data)
+    except Exception as e:
+        logger.warning(f"Could not parse page depth, defaulting to 10: {e}")
+        return 10
+
+
+def fetch_news(urls):
+    news_list = []
+    for url in urls:
+        try:
+            response = requests.get(url, headers=headers, timeout=10)
+            response.encoding = "utf-8"  # force UTF-8 to avoid mojibake
+            response.raise_for_status()  # raise if the request failed
+            soup = BeautifulSoup(response.text, 'html.parser')
+            ul = soup.find("ul", id="list")
+            if ul is None:  # page layout changed or the list is missing
+                logger.warning(f'No <ul id="list"> found on {url}')
+                continue
+
+            for li in ul.find_all('li'):
+                _a = li.find("a")
+                _span = li.find("span")
+                if _a is None or _span is None:  # skip malformed items
+                    continue
+
+                date = _span.text
+                link = _a.get("href")
+                title = _a.text
+                news_list.append({'title': title, 'link': link, 'date': date})
+
+        except requests.RequestException as e:
+            logger.error(f'Request failed for {url}: {e}')
+        except Exception as e:
+            logger.error(f'An error occurred for {url}: {e}')
+    return news_list
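
A note on the wildcard import: `pageDeep` and `fetch_news` rely on `url` and
`headers` being defined in conf_spider. That module is not part of this diff,
so the following is only a minimal sketch of what it presumably contains; the
concrete values are placeholders, not taken from the source:

# conf_spider.py -- hypothetical sketch, values are assumptions
url = "https://example.com/news/"  # assumed listing-page URL used by pageDeep
headers = {
    # A browser-like User-Agent; many sites reject the default python-requests one
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}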
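
For reference, a hedged sketch of how the two functions might be driven
together. The paging convention (index.html, then index_1.html, index_2.html,
...) is an assumption about the target site, not something this diff
establishes:

# run_spider.py -- hypothetical usage sketch
from spider import pageDeep, fetch_news  # adjust to the actual package layout

def build_page_urls(base_url: str, depth: int) -> list:
    # Assumed convention: first page is index.html, later pages index_N.html
    urls = [base_url + "index.html"]
    urls += [f"{base_url}index_{i}.html" for i in range(1, depth)]
    return urls

if __name__ == "__main__":
    depth = pageDeep()  # total page count, or the fallback of 10
    for item in fetch_news(build_page_urls("https://example.com/news/", depth)):
        print(item["date"], item["title"], item["link"])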
|