@@ -3,103 +3,97 @@
# @Time : 2024/9/26 16:20
# @File : get_message.py

-from anjuke.utils.setting import *
from bs4 import BeautifulSoup
+import requests  # explicit import for the requests.get calls below
+from anjuke.utils.anjuke_response import *
+from anjuke.utils.setting import *
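+# Assumption: names such as url_start, headers and response_origion are
+# supplied by the star imports above; they are not defined in this module.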

-
-def get_province_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML_province, 'html.parser')
-    div_table = soup.find(class_='sel-content')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    # Earlier bug: this del was mis-indented inside the for loop, so each append was immediately deleted and the output came back empty
-    del Area_list[0]
-    return Area_list
-
-
-def get_city_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML_province, 'html.parser')
-    div_table = soup.find(class_='sel-sec')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    del Area_list[0]
-    return Area_list
-
-def get_qu_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    div_table = soup.find(class_='sel-content')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    del Area_list[0]
-    return Area_list
-
-def get_zhoubian_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    div_table = soup.find(class_='sel-sec')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    del Area_list[0]
-    return Area_list
-

+# Province level
+def get_province():
+    province_name_list = []
+    province_url_list = []
+    response_province = requests.get(url=url_start, headers=headers).content.decode('utf8')
+    soup = BeautifulSoup(response_province, 'html.parser')
+    # print(soup)  # debug output, left commented out
+    filter_area_wrap = soup.find(class_="filter-area-wrap")
+    # print(filter_area_wrap)  # debug output, left commented out
+    for province_ in filter_area_wrap.find_all('a'):
+        province_name = province_.text
+        province_url = province_.get('href')
+        province_name_list.append(province_name)
+        province_url_list.append('https://www.anjuke.com' + province_url)
+    # Drop the first entry (presumably the "all" filter link)
+    del province_name_list[0], province_url_list[0]
+    return province_name_list, province_url_list
+
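+# The functions below repeat the same fetch/parse/collect steps; a minimal
+# sketch of a shared helper they could use (hypothetical, not in this module):
+# def _fetch_soup(url):
+#     html = requests.get(url=url, headers=headers).content.decode('utf8')
+#     return BeautifulSoup(html, 'html.parser')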
+# City level
+def get_city():
+    province_name_list, province_url_list = get_province()
+    city_name_list = []
+    city_url_list = []
+
+    for i in range(len(province_url_list)):
+        province_url = province_url_list[i]
+        province_name = province_name_list[i]
+        response_city = requests.get(url=province_url, headers=headers).content.decode('utf8')
+        soup = BeautifulSoup(response_city, 'html.parser')
+        filter_area_wrap = soup.find(class_="sel-content bank")
+
+        for city_ in filter_area_wrap.find_all('a'):
+            # Prefix each city name with its province name
+            city_name = province_name + city_.text
+            city_url = city_.get('href')
+            city_name_list.append(city_name)
+            city_url_list.append(city_url)
+    del city_name_list[0], city_url_list[0]
+    return city_name_list, city_url_list
+
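+# Note: city hrefs are stored as returned, while province/district/periphery
+# hrefs get 'https://www.anjuke.com' prepended; this assumes the city links
+# already come back absolute.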
+# District level
+def get_area():
+    area_name_list = []
+    area_url_list = []
+    # The listing URL is still an empty placeholder here
+    response_area = requests.get(url='', headers=headers).content.decode('utf8')
+    soup = BeautifulSoup(response_area, 'html.parser')
+    filter_area_wrap = soup.find(class_="filter-area-wrap")
+
+    for area_ in filter_area_wrap.find_all('a'):
+        area_name = area_.text
+        area_url = area_.get('href')
+        area_name_list.append(area_name)
+        area_url_list.append('https://www.anjuke.com' + area_url)
+    del area_name_list[0], area_url_list[0]
+    return area_name_list, area_url_list
+
+# Periphery
+def get_periphery():
+    periphery_name_list = []
+    periphery_url_list = []
+    # Same empty placeholder URL as in get_area()
+    response_periphery = requests.get(url='', headers=headers).content.decode('utf8')
+    soup = BeautifulSoup(response_periphery, 'html.parser')
+    filter_area_wrap = soup.find(class_="filter-area-wrap")
+
+    for periphery_ in filter_area_wrap.find_all('a'):
+        periphery_name = periphery_.text
+        periphery_url = periphery_.get('href')
+        periphery_name_list.append(periphery_name)
+        periphery_url_list.append('https://www.anjuke.com' + periphery_url)
+    del periphery_name_list[0], periphery_url_list[0]
+    return periphery_name_list, periphery_url_list
+
+# Get the list of years
def get_Year():
    Years_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
+    soup = BeautifulSoup(response_origion, 'html.parser')
    years = soup.find(class_="year-list")
    for year_ in years.find_all('a'):
        year = year_.text
        Years_list.append(year)
    return Years_list

-def get_area_url():
-    Urls_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    urls = soup.find(class_='sel-content')
-
-    for url_area in urls.find_all('a'):
-        url_area = url_area.get('href')
-        Urls_list.append('https://www.anjuke.com' + url_area)
-    del Urls_list[0]
-    return Urls_list
-
-def get_zhoubian_url():
-    Urls_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    urls = soup.find(class_='sel-sec')
-
-    for url_area in urls.find_all('a'):
-        url_area = url_area.get('href')
-        Urls_list.append(url_area)
-    del Urls_list[0]
-    return Urls_list
-

+# Test entry point
if __name__ == '__main__':
-    # URL and response values
-    print(urls_template_high, response_200)
-    print(urls_template_low, response_2002)
-    # Time span and unique per-area URLs
-    print('Time span:', get_Year())
-    # Sub-URLs
-    print('Provinces/municipalities:', get_province_area())
-    print('Cities:', get_city_area())
-    # Sub-URLs
-    print('Districts:', get_qu_area())
-    print('District unique URLs:', get_area_url())
-    print('Periphery:', get_zhoubian_area())
-    print('Periphery unique URLs:', get_zhoubian_url())
-    # # HTML content
-    # print(response_HTML)
+    # print('Time span:', get_Year())
+    print('Province-level units:', get_province())
+    # print('City-level units:', get_city())
+    # print('District-level units:', get_area())
+    # print('Periphery units:', get_periphery())
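+    # Running this module prints only the province list; the other calls are
+    # left commented out, presumably because each one triggers a fresh round
+    # of network requests (get_city() fetches every province page).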