# -*- encoding: utf-8 -*-
'''
@File : spider.py
@Time : 2024/08/29 15:37:35
@Author : Zhangziheng
'''
import re

import requests
from bs4 import BeautifulSoup

# `url` and `headers` are expected to be defined in conf_spider.
from .conf_spider import *
from .ext_logger import logger


def pageDeep() -> int:
    """Return the total page count parsed from the listing page, or 10 on failure."""
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    bat_page_index = r"page_div',(\d{0,4}),"
    try:
        _data = re.search(bat_page_index, response.text).group(1)
        return int(_data)
    except Exception as e:
        logger.error(f"Failed to parse total page count ({e}); defaulting to 10")
        return 10


def fetch_news(urls):
    """Collect news entries (title, link, date) from each listing page in `urls`."""
    news_list = []
    for url in urls:
        try:
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"  # fix garbled characters in the response body
            response.raise_for_status()  # raise an exception if the request failed
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", id="list")
            if ul is None:
                logger.warning(f'No <ul id="list"> found on {url}')
                continue
            for li in ul.find_all('li'):
                _a = li.find("a")
                _span = li.find("span")
                date = _span.text
                link = _a.get("href")
                title = _a.text
                news_list.append({'title': title, 'link': link, 'date': date})
        except requests.RequestException as e:
            logger.error(f'Request failed for {url}: {e}')
        except Exception as e:
            logger.error(f'An error occurred for {url}: {e}')
    return news_list
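

# --- Usage sketch (illustrative addition, not part of the original file) ---
# A minimal driver showing how pageDeep() and fetch_news() might be wired
# together. The "index_{n}.html" pagination pattern below is an assumption;
# the real scheme depends on the target site and on what conf_spider defines.
# Because this module uses relative imports, run it as a package module,
# e.g. `python -m <package>.spider`.
if __name__ == "__main__":
    total_pages = pageDeep()
    # Hypothetical page-URL pattern; adjust to the site's actual scheme.
    page_urls = [url] + [f"{url.rstrip('/')}/index_{i}.html" for i in range(1, total_pages)]
    for item in fetch_news(page_urls):
        print(item['date'], item['title'], item['link'])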