get_message.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. # -*- coding: utf-8 -*-
  2. # @Author : ChenZhaoyuchen
  3. # @Time : 2024/9/26 16:20
  4. # @File : get_message.py
  5. from bs4 import BeautifulSoup
  6. from anjuke.utils.anjuke_response import *
  7. from anjuke.utils.setting import *
  8. from anjuke.utils.mysqlClass import *
  9. import mysql.connector
  10. # 省级
  11. def get_province():
  12. name = 'province'
  13. province_name_list = []
  14. province_url_list = []
  15. response_province = requests.get(url = url_start, headers = headers, proxies=proxies).content.decode('utf8')
  16. soup = BeautifulSoup(response_province, 'html.parser')
  17. filter_area_wrap = soup.find(class_="filter-area-wrap")
  18. for province_ in filter_area_wrap.find_all('a'):
  19. province_name = province_.text
  20. province_url = province_.get('href')
  21. province_name_list.append(province_name)
  22. province_url_list.append('https://www.anjuke.com'+province_url)
  23. del province_name_list[0],province_url_list[0]
  24. # 创建游标对象,用于执行SQL查询
  25. cursor1 = db.cursor()
  26. for i in range(len(province_name_list)):
  27. # 插入一条记录到"anjuke_province"表中
  28. sql = "INSERT INTO anjuke_province (place_name,place_url) VALUES (%s,%s)"
  29. values = (province_name_list[i], province_url_list[i])
  30. cursor1.execute(sql, values)
  31. # 提交更改到数据库
  32. db.commit()
  33. print(f"插入了 {cursor1.rowcount} 条记录")
  34. cursor1.close()
  35. print(f'已获取并添加全部省级单位')
  36. return province_name_list,province_url_list
  37. # 市级
  38. def get_city():
  39. name = 'city'
  40. # 设置空列表
  41. city_name_list = []
  42. city_url_list = []
  43. # 设置游标cursor
  44. cursor2 = db.cursor()
  45. # 传参
  46. query = "SELECT * FROM anjuke_province"
  47. cursor2.execute(query)
  48. results = cursor2.fetchall()
  49. province_list = [list(row) for row in results]
  50. # 记得关掉cursor
  51. cursor2.close()
  52. # 开始循环
  53. print('开始循环省份-城市')
  54. for i in range(len(province_list)):
  55. province_name = province_list[i][0]
  56. province_url = province_list[i][1]
  57. response_city = requests.get(url = province_url, headers = headers, proxies=proxies).content.decode('utf8')
  58. soup = BeautifulSoup(response_city, 'html.parser')
  59. filter_area_wrap = soup.find(class_="sel-content bank")
  60. zhongji_name_list = []
  61. zhongji_url_list = []
  62. for city_ in filter_area_wrap.find_all('a'):
  63. city_name = province_name + city_.text
  64. city_url = city_.get('href')
  65. zhongji_name_list.append(city_name)
  66. zhongji_url_list.append(city_url)
  67. del zhongji_name_list[0], zhongji_url_list[0]
  68. city_name_list += zhongji_name_list
  69. city_url_list += zhongji_url_list
  70. # # 上传代码
  71. # cursor2 = db.cursor()
  72. # for i in range(len(city_name_list)):
  73. # # 插入一条记录到"anjuke_province"表中
  74. # sql = "INSERT INTO anjuke_city (place_name,place_url) VALUES (%s,%s)"
  75. # values = (city_name_list[i], city_url_list[i])
  76. # cursor2.execute(sql, values)
  77. # # 提交更改到数据库
  78. # db.commit()
  79. # print(f"插入了 {cursor2.rowcount} 条记录")
  80. # print(f"插入了 {i} 条记录")
  81. # cursor2.close()
  82. print(f'已循环到第{i}个省级单位:{province_name}')
  83. print(city_name_list)
  84. return city_name_list,city_url_list
  85. # 区级
  86. def get_area():
  87. name = 'area'
  88. city_name_list, city_url_list = get_city()
  89. area_name_list = []
  90. area_url_list = []
  91. print('开始循环城市-区域')
  92. for i in range(len(city_url_list)):
  93. city_url = city_url_list[i]
  94. city_name = city_name_list[i]
  95. response_area = requests.get(url = city_url, headers = headers ,proxies=proxies).content.decode('utf8')
  96. soup = BeautifulSoup(response_area, 'html.parser')
  97. filter_area_wrap = soup.find(class_="sel-content bank")
  98. zhongji_name_list = []
  99. zhongji_url_list = []
  100. for area_ in filter_area_wrap.find_all('a'):
  101. area_name = city_name + area_.text
  102. area_url = area_.get('href')
  103. zhongji_name_list.append(area_name)
  104. zhongji_url_list.append(area_url)
  105. del zhongji_name_list[0],zhongji_url_list[0]
  106. area_name_list += zhongji_name_list
  107. area_url_list += zhongji_url_list
  108. print(f'已循环到第{i}个市级单位:{city_name_list[i]}')
  109. return area_name_list,area_url_list
  110. # 周边
  111. def get_periphery():
  112. name = 'periphery'
  113. area_name_list, area_url_list = get_area()
  114. periphery_name_list = []
  115. periphery_url_list = []
  116. print('开始循环区域-周边')
  117. for i in range(len(area_url_list)):
  118. area_url = area_url_list[i]
  119. area_name = area_name_list[i]
  120. response_periphery = requests.get(url = area_url, headers = headers ,proxies=proxies).content.decode('utf8')
  121. soup = BeautifulSoup(response_periphery, 'html.parser')
  122. filter_area_wrap = soup.find(class_="sel-content bank")
  123. zhongji_name_list = []
  124. zhongji_url_list = []
  125. for periphery_ in filter_area_wrap.find_all('a'):
  126. periphery_name = area_name + periphery_.text
  127. periphery_url = periphery_.get('href')
  128. zhongji_name_list.append(periphery_name)
  129. zhongji_url_list.append(periphery_url)
  130. del periphery_name_list[0], periphery_url_list[0]
  131. periphery_name_list += zhongji_name_list
  132. periphery_url_list += zhongji_url_list
  133. print(f'已循环到第{i}个周边单位:{area_name_list[i]}')
  134. return periphery_name_list, periphery_url_list
  135. # 获取年份
  136. def get_Year():
  137. Years_list = []
  138. soup = BeautifulSoup(response_origion, 'html.parser')
  139. years = soup.find(class_="year-list")
  140. for year_ in years.find_all('a'):
  141. year = year_.text
  142. Years_list.append(year)
  143. return Years_list
  144. # 创建mysql数据库方法
  145. # # 测试函数
  146. if __name__ == '__main__':
  147. # print('时间跨度:',get_Year())
  148. # print('省级单位:',get_province())
  149. print('市级单位:',get_city())
  150. # print('区级单位',get_area())
  151. # print('周边单位',get_periphery())