# -*- coding: utf-8 -*-
# @Author : ChenZhaoyuchen
# @Time : 2024/9/26 16:20
# @File : get_message.py
import time
import random

import requests  # used directly throughout this module
from bs4 import BeautifulSoup

from anjuke.utils.anjuke_response import *
from anjuke.utils.setting import *
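# NOTE (assumption): `headers`, `proxies`, `url_start`, and `response_origion`
# are not defined in this file; they are taken to be provided by the star
# imports from anjuke.utils above.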
# Province level
def get_province():
    province_name_list = []
    province_url_list = []
    response_province = requests.get(url=url_start, headers=headers, proxies=proxies, timeout=10).content.decode('utf8')
    time.sleep(random.uniform(0.5, 1))
    soup = BeautifulSoup(response_province, 'html.parser')
    filter_area_wrap = soup.find(class_="filter-area-wrap")
    for province_ in filter_area_wrap.find_all('a'):
        province_name = province_.text
        province_url = province_.get('href')
        province_name_list.append(province_name)
        province_url_list.append('https://www.anjuke.com' + province_url)
    # Drop the first link (presumably the "all" filter option)
    del province_name_list[0], province_url_list[0]
    return province_name_list, province_url_list
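# Illustrative return shape (placeholders, not real data):
#   (['<province name>', ...], ['https://www.anjuke.com<href>', ...])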
# City level
def get_city():
    province_name_list, province_url_list = get_province()
    city_name_list = []
    city_url_list = []
    for i in range(len(province_url_list)):
        province_url = province_url_list[i]
        province_name = province_name_list[i]
        response_city = requests.get(url=province_url, headers=headers, proxies=proxies, timeout=10).content.decode('utf8')
        time.sleep(random.uniform(3, 4))
        soup = BeautifulSoup(response_city, 'html.parser')
        filter_area_wrap = soup.find(class_="sel-content bank")
        zhongji_name_list = []  # per-province temporary lists
        zhongji_url_list = []
        for city_ in filter_area_wrap.find_all('a'):
            city_name = province_name + city_.text
            city_url = city_.get('href')
            zhongji_name_list.append(city_name)
            zhongji_url_list.append(city_url)
        # Drop the first link (presumably the "all" filter option), then merge
        del zhongji_name_list[0], zhongji_url_list[0]
        city_name_list += zhongji_name_list
        city_url_list += zhongji_url_list
        print(f'Processed province-level unit {i + 1}: {province_name_list[i]}')
    return city_name_list, city_url_list
# District level
def get_area():
    city_name_list, city_url_list = get_city()
    area_name_list = []
    area_url_list = []
    for i in range(len(city_url_list)):
        city_url = city_url_list[i]
        city_name = city_name_list[i]
        response_area = requests.get(url=city_url, headers=headers, proxies=proxies, timeout=10).content.decode('utf8')
        time.sleep(random.uniform(2, 3))
        soup = BeautifulSoup(response_area, 'html.parser')
        filter_area_wrap = soup.find(class_="sel-content bank")
        zhongji_name_list = []  # per-city temporary lists
        zhongji_url_list = []
        for area_ in filter_area_wrap.find_all('a'):
            area_name = city_name + area_.text
            area_url = area_.get('href')
            zhongji_name_list.append(area_name)
            zhongji_url_list.append(area_url)
        # Drop the first link from this page only, then merge, mirroring
        # get_city(); the original deleted from the cumulative lists instead,
        # which removed an already-kept entry on every iteration.
        del zhongji_name_list[0], zhongji_url_list[0]
        area_name_list += zhongji_name_list
        area_url_list += zhongji_url_list
    return area_name_list, area_url_list
# Surrounding areas
def get_periphery():
    area_name_list, area_url_list = get_area()
    periphery_name_list = []
    periphery_url_list = []
    for i in range(len(area_url_list)):
        area_url = area_url_list[i]
        area_name = area_name_list[i]
        response_periphery = requests.get(url=area_url, headers=headers, proxies=proxies, timeout=10).content.decode('utf8')
        time.sleep(random.uniform(3, 5))
        soup = BeautifulSoup(response_periphery, 'html.parser')
        filter_area_wrap = soup.find(class_="sel-content bank")
        zhongji_name_list = []  # per-district temporary lists
        zhongji_url_list = []
        for periphery_ in filter_area_wrap.find_all('a'):
            periphery_name = area_name + periphery_.text
            periphery_url = periphery_.get('href')
            zhongji_name_list.append(periphery_name)
            zhongji_url_list.append(periphery_url)
        # As in get_city()/get_area(): drop the first link per page, then merge
        # (the original deleted from the cumulative lists on every iteration,
        # and slept twice per request).
        del zhongji_name_list[0], zhongji_url_list[0]
        periphery_name_list += zhongji_name_list
        periphery_url_list += zhongji_url_list
    return periphery_name_list, periphery_url_list
# Get the year filter options
def get_Year():
    Years_list = []
    # response_origion is expected to come from the anjuke.utils star imports
    soup = BeautifulSoup(response_origion, 'html.parser')
    years = soup.find(class_="year-list")
    for year_ in years.find_all('a'):
        year = year_.text
        Years_list.append(year)
    return Years_list
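# Possible refactor (sketch only, not used by the functions above): the four
# level scrapers share the same fetch -> throttle -> parse pattern, which
# could be factored into one helper. It assumes the same `headers`/`proxies`
# names used by the code above; the delay bounds are illustrative.
def _fetch_soup(url, low=2.0, high=4.0):
    html = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).content.decode('utf8')
    time.sleep(random.uniform(low, high))  # randomized delay between requests
    return BeautifulSoup(html, 'html.parser')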
# Test harness
if __name__ == '__main__':
    # print('Year range:', get_Year())
    # print('Province-level units:', get_province())
    print('City-level units:', get_city())
    # print('District-level units:', get_area())
    # print('Surrounding units:', get_periphery())