# -*- coding: utf-8 -*-
# @Author : ChenZhaoyuchen
# @Time : 2024/9/26 16:20
# @File : get_message.py
from bs4 import BeautifulSoup

# The settings module supplies the fetched pages and URL templates used below
# (response_HTML, response_HTML_province, urls_template_high/low, response_200, response_2002).
from anjuke.utils.setting import *

def get_province_area():
    """Collect province/municipality names from the province page."""
    Area_list = []
    soup = BeautifulSoup(response_HTML_province, 'html.parser')
    div_table = soup.find(class_='sel-content')
    for area_ in div_table.find_all('a'):
        area = area_.text
        Area_list.append(area)
    # Note: an earlier bug indented this del inside the for loop, so every
    # append was immediately undone and the function returned an empty list.
    del Area_list[0]  # drop the leading catch-all link
    return Area_list
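
# For reference, the mis-indentation the note above describes looked like this
# (a sketch of the bug, not live code):
#
#     for area_ in div_table.find_all('a'):
#         Area_list.append(area_.text)
#         del Area_list[0]   # wrong: runs every iteration, list stays empty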

def get_city_area():
    """Collect city names from the secondary block of the province page."""
    Area_list = []
    soup = BeautifulSoup(response_HTML_province, 'html.parser')
    div_table = soup.find(class_='sel-sec')
    for area_ in div_table.find_all('a'):
        area = area_.text
        Area_list.append(area)
    del Area_list[0]  # drop the leading catch-all link
    return Area_list

def get_qu_area():
    """Collect district names from the city page."""
    Area_list = []
    soup = BeautifulSoup(response_HTML, 'html.parser')
    div_table = soup.find(class_='sel-content')
    for area_ in div_table.find_all('a'):
        area = area_.text
        Area_list.append(area)
    del Area_list[0]  # drop the leading catch-all link
    return Area_list

def get_zhoubian_area():
    """Collect surrounding-area names from the city page."""
    Area_list = []
    soup = BeautifulSoup(response_HTML, 'html.parser')
    div_table = soup.find(class_='sel-sec')
    for area_ in div_table.find_all('a'):
        area = area_.text
        Area_list.append(area)
    del Area_list[0]  # drop the leading catch-all link
    return Area_list
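
# The four functions above share one pattern: parse a page, find a block by its
# CSS class, collect the <a> texts, and drop the leading catch-all entry. A
# minimal parameterized sketch of that pattern (an optional refactor; the name
# below is new, and the module as written does not use it):
def _collect_link_texts(html, class_name, skip_first=True):
    """Return the text of every <a> inside the first element with class_name."""
    soup = BeautifulSoup(html, 'html.parser')
    block = soup.find(class_=class_name)
    texts = [a.text for a in block.find_all('a')]
    return texts[1:] if skip_first else texts

# e.g. get_province_area() is equivalent to:
#     _collect_link_texts(response_HTML_province, 'sel-content')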

def get_Year():
    """Collect the available years from the page's year selector."""
    Years_list = []
    soup = BeautifulSoup(response_HTML, 'html.parser')
    years = soup.find(class_='year-list')
    for year_ in years.find_all('a'):
        year = year_.text
        Years_list.append(year)
    return Years_list

def get_area_url():
    """Collect absolute URLs for each district link on the city page."""
    Urls_list = []
    soup = BeautifulSoup(response_HTML, 'html.parser')
    urls = soup.find(class_='sel-content')
    for url_area in urls.find_all('a'):
        href = url_area.get('href')
        Urls_list.append('https://www.anjuke.com' + href)
    del Urls_list[0]  # drop the leading catch-all link
    return Urls_list

def get_zhoubian_url():
    """Collect URLs for each surrounding-area link on the city page."""
    Urls_list = []
    soup = BeautifulSoup(response_HTML, 'html.parser')
    urls = soup.find(class_='sel-sec')
    for url_area in urls.find_all('a'):
        href = url_area.get('href')
        Urls_list.append(href)  # taken as-is, no domain prefix here
    del Urls_list[0]  # drop the leading catch-all link
    return Urls_list
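
# The two URL collectors repeat the same shape with a.get('href') in place of
# a.text. A matching sketch under the same assumptions as the helper above
# (new name, optional, unused by the module as written):
def _collect_link_hrefs(html, class_name, prefix=''):
    """Return the (optionally prefixed) hrefs of the block's <a> tags, minus the first."""
    soup = BeautifulSoup(html, 'html.parser')
    block = soup.find(class_=class_name)
    return [prefix + a.get('href') for a in block.find_all('a')][1:]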

if __name__ == '__main__':
    # URL templates and response values (from anjuke.utils.setting)
    print(urls_template_high, response_200)
    print(urls_template_low, response_2002)
    # Time span and per-area unique URLs
    print('Time span:', get_Year())
    # Sub-URLs
    print('Provinces/municipalities:', get_province_area())
    print('Cities:', get_city_area())
    # Sub-URLs
    print('Districts:', get_qu_area())
    print('District unique URLs:', get_area_url())
    print('Surrounding areas:', get_zhoubian_area())
    print('Surrounding-area unique URLs:', get_zhoubian_url())
    # # Raw HTML content
    # print(response_HTML)