# spider.py
  1. # -*- encoding: utf-8 -*-
  2. '''
  3. @File : spider.py
  4. @Time : 2024/08/29 15:37:35
  5. @Author : Zhangziheng
  6. '''
  7. import re
  8. import requests
  9. from bs4 import BeautifulSoup
  10. from .conf_spider import *
  11. from .ext_logger import logger
  12. def pageDeep() -> int:
  13. response = requests.get(url, headers=headers)
  14. response.encoding = "utf-8"
  15. bat_page_index = r"page_div\',([\d]{0,4}),"
  16. try:
  17. _data = re.search(bat_page_index, response.text).group(1)
  18. return int(_data)
  19. except Exception as e:
  20. return 10
  21. def fetch_news(urls):
  22. news_list = []
  23. for url in urls:
  24. try:
  25. response = requests.get(url, headers=headers)
  26. response.encoding = "utf-8" # 解决乱码问题
  27. response.raise_for_status() # 如果请求失败,将抛出异常
  28. soup = BeautifulSoup(response.text, 'html.parser')
  29. ul = soup.find("ul", id="list")
  30. for li in ul.find_all('li'):
  31. _a = li.find("a")
  32. _span = li.find("span")
  33. date = _span.text
  34. link = _a.get("href")
  35. title = _a.text
  36. news_list.append({'title': title, 'link': link, 'date': date})
  37. except requests.RequestException as e:
  38. logger.error(f'Request failed for {url}: {e}')
  39. except Exception as e:
  40. logger.error(f'An error occurred for {url}: {e}')
  41. return news_list