zgzqjdglwyh.py 3.2 KB

# -*- coding: utf-8 -*-
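"""Scrape the CSRC (中国证券监督管理委员会) "时政要闻" listing pages and export
the collected items (title, link, date) to an Excel file."""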
import logging
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

wangye_name = "中国证券监督管理委员会_时政要闻"

# Browser-like request headers used for every request.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}

# First listing page, and the template used for pages 2..N.
urls_template = 'http://www.csrc.gov.cn/csrc/c100027/common_list_{}.shtml'
url = 'http://www.csrc.gov.cn/csrc/c100027/common_list.shtml'

time_now = time.strftime("%Y-%m-%d", time.localtime())
def createLogger(name: str) -> logging.Logger:
    """Configure root logging once and return a named logger."""
    datefmt = '%Y-%m-%d'
    logging.basicConfig(level=logging.INFO, datefmt=datefmt,
                        format='[%(asctime)s]-[%(name)s]-[%(levelname)s]::[%(message)s]')
    return logging.getLogger(name)
def pageDeep() -> int:
    """Fetch the first listing page and extract the total page count
    embedded in its pagination script (the number after "page_div',")."""
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    bat_page_index = r"page_div\',([\d]{0,4}),"
    _data = re.search(bat_page_index, response.text).group(1)
    return int(_data)
def fetch_news(urls, headers):
    news_list = []
    for url in urls:
        try:
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"  # avoid garbled (mojibake) text
            response.raise_for_status()  # raise an exception if the request failed
            soup = BeautifulSoup(response.text, 'html.parser')
            ul = soup.find("ul", id="list")
            for li in ul.find_all('li'):
                _a = li.find("a")
                _span = li.find("span")
                date = _span.text
                link = _a.get("href")
                title = _a.text
                news_list.append({'title': title, 'link': link, 'date': date})
        except requests.RequestException as e:
            logger.error(f'Request failed for {url}: {e}')
        except Exception as e:
            logger.error(f'An error occurred for {url}: {e}')
    return news_list
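# For reference: fetch_news() assumes each listing page contains markup of
# roughly the following shape. This is inferred from the parsing logic above,
# not taken from any documentation of the site, so treat it as an assumption:
#
#   <ul id="list">
#     <li><a href="/csrc/c100027/....shtml">标题</a><span>2024-01-01</span></li>
#     ...
#   </ul>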
def genTask() -> list:
    """Build the full list of page URLs: page 1 uses the base URL,
    pages 2..N use the numbered template."""
    _items = []
    deep = pageDeep()
    for i in range(1, deep + 1):  # page numbers run from 1 to the max page
        if i == 1:
            urls = url
        else:
            urls = urls_template
        _items.append(urls.format(i))  # fill in the page number (no-op for page 1)
    return _items
if __name__ == '__main__':
    logger = createLogger(__name__)
    all_news = []
    task_urls = genTask()
    for url in task_urls:
        news = fetch_news([url], headers)
        all_news.extend(news)
    # # Print the collected news items
    # for item in all_news:
    #     print(f"Link: {item['link']}, Date: {item['date']}, Title: {item['title']}")

    # Re-index the rows starting from 1, keeping the counter as an 'index' column.
    df = pd.DataFrame(all_news)
    df.reset_index(drop=False, inplace=True)
    df['index'] = df['index'] + 1
    # Rename the counter column here if a different label is wanted (currently a no-op).
    df = df.rename(columns={'index': 'index'})
    # Export to Excel (raw f-string so the backslashes in the Windows path stay literal).
    df.to_excel(
        rf'D:\pyp\g-pqxb8807-pachongxiangmu-FinanceInfoCollection-\金融监管局新闻{time_now}.xlsx',
        index=False,
    )