Anjuke historical housing-price scraper

Cloudmistery, 7 months ago
commit
892414c84c

+ 4 - 0
__init__.py

@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# @Author  : ChenZhaoyuchen
+# @Time    : 2024/9/26 15:59
+# @File    : __init__.py

BIN
__pycache__/__init__.cpython-39.pyc


+ 6 - 0
main.py

@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+# @Author  : ChenZhaoyuchen
+# @Time    : 2024/9/26 15:59
+# @File    : main.py
+import requests
+

+ 4 - 0
utils/__init__.py

@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# @Author  : ChenZhaoyuchen
+# @Time    : 2024/9/26 16:36
+# @File    : __init__.py

BIN
utils/__pycache__/__init__.cpython-39.pyc


BIN
utils/__pycache__/get_message.cpython-39.pyc


BIN
utils/__pycache__/setting.cpython-39.pyc


+ 105 - 0
utils/get_message.py

@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+# @Author  : ChenZhaoyuchen
+# @Time    : 2024/9/26 16:20
+# @File    : get_message.py
+
+from anjuke.utils.setting import *
+from bs4 import BeautifulSoup
+
+
+def get_province_area():
+    Area_list = []
+    soup = BeautifulSoup(response_HTML_province, 'html.parser')
+    div_table = soup.find(class_='sel-content')
+
+    for area_ in div_table.find_all('a'):
+        area = area_.text
+        Area_list.append(area)
+    # earlier bug: the del was mis-indented inside the for loop, so each
+    # append was immediately deleted and the function returned an empty list
+    del Area_list[0]
+    return Area_list
+
+
+def get_city_area():
+    Area_list = []
+    soup = BeautifulSoup(response_HTML_province, 'html.parser')
+    div_table = soup.find(class_='sel-sec')
+
+    for area_ in div_table.find_all('a'):
+        area = area_.text
+        Area_list.append(area)
+    del Area_list[0]
+    return Area_list
+
+def get_qu_area():
+    Area_list = []
+    soup = BeautifulSoup(response_HTML, 'html.parser')
+    div_table = soup.find(class_='sel-content')
+
+    for area_ in div_table.find_all('a'):
+        area = area_.text
+        Area_list.append(area)
+    del Area_list[0]
+    return Area_list
+
+def get_zhoubian_area():
+    Area_list = []
+    soup = BeautifulSoup(response_HTML, 'html.parser')
+    div_table = soup.find(class_='sel-sec')
+
+    for area_ in div_table.find_all('a'):
+        area = area_.text
+        Area_list.append(area)
+    del Area_list[0]
+    return Area_list
+
+def get_Year():
+    Years_list = []
+    soup = BeautifulSoup(response_HTML, 'html.parser')
+    years = soup.find(class_="year-list")
+    for year_ in years.find_all('a'):
+        year = year_.text
+        Years_list.append(year)
+    return Years_list
+
+def get_area_url():
+    Urls_list = []
+    soup = BeautifulSoup(response_HTML, 'html.parser')
+    urls = soup.find(class_='sel-content')
+
+    for url_area in urls.find_all('a'):
+        url_area = url_area.get('href')
+        Urls_list.append('https://www.anjuke.com' + url_area)
+    del Urls_list[0]
+    return Urls_list
+
+def get_zhoubian_url():
+    Urls_list = []
+    soup = BeautifulSoup(response_HTML, 'html.parser')
+    urls = soup.find(class_='sel-sec')
+
+    for url_area in urls.find_all('a'):
+        url_area = url_area.get('href')
+        Urls_list.append(url_area)
+    del Urls_list[0]
+    return Urls_list
+
+if __name__ == '__main__':
+    # URL templates and their response objects
+    print(urls_template_high, response_200)
+    print(urls_template_low, response_2002)
+    # time span covered, plus per-level name and unique-URL lists
+    print('Time span:', get_Year())
+    # sub-URLs
+    print('Provinces/municipalities:', get_province_area())
+    print('Cities:', get_city_area())
+    # sub-URLs
+    print('Districts:', get_qu_area())
+    print('District unique URLs:', get_area_url())
+    print('Surrounding areas:', get_zhoubian_area())
+    print('Surrounding area unique URLs:', get_zhoubian_url())
+    # # page HTML
+    # print(response_HTML)
+
+
+
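Note on get_message.py: each name getter has a URL getter that scrapes the same block, so the two lists only line up positionally. A minimal usage sketch under that assumption (district level shown; the surrounding-area pair works the same way):

from anjuke.utils.get_message import get_qu_area, get_area_url

# Both lists are scraped from the same 'sel-content' block in document
# order, so zip pairs each district name with its URL.
for name, url in zip(get_qu_area(), get_area_url()):
    print(name, url)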

+ 36 - 0
utils/get_price.py

@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+# @Author  : ChenZhaoyuchen
+# @Time    : 2024/9/27 16:17
+# @File    : get_price.py
+
+# requests and headers also arrive via the star imports below, but an
+# explicit import keeps the dependency visible
+import requests
+from bs4 import BeautifulSoup
+from anjuke.utils.get_message import *
+from anjuke.utils.setting import *
+
+def get_price():
+    price_list = []
+    zhoubian_name_list = get_zhoubian_area()
+    zhoubian_url_list = get_zhoubian_url()
+    # the two lists are built positionally, so zip keeps each area name
+    # aligned with its URL
+    for name, url in zip(zhoubian_name_list, zhoubian_url_list):
+        response_price = requests.get(url=url, headers=headers).content.decode('utf8')
+        soup = BeautifulSoup(response_price, 'html.parser')
+        price_ = soup.find(class_="table is-headless")
+        table_trs = price_.find_all('div', class_="table-tr")
+
+        for table_tr in table_trs:
+            # one table row = the month, price and change-rate cells
+            row = [td.text.strip() for td in table_tr.find_all('div', class_='td')]
+            # TODO: tag each row as rising or falling (sketched after this file)
+            price_list.append((name, row))
+            print(name, row)
+    return price_list
+
+if __name__ == '__main__':
+    get_price()
+
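get_price() leaves the rise/fall tagging as a TODO; a sketch of one way to do it, assuming the page marks the change cell with an 'up' or 'down' CSS class (an assumption about the live markup, not verified):

def price_direction(table_tr):
    # 'up'/'down' are assumed marker classes; rows carrying neither
    # are treated as flat.
    if table_tr.find('div', class_='up') is not None:
        return 'rising'
    if table_tr.find('div', class_='down') is not None:
        return 'falling'
    return 'flat'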

+ 11 - 0
utils/readme.txt

@@ -0,0 +1,11 @@
+## Anjuke housing prices
+### 1. Request headers
+### 2. Fix the basic record format: down to the area level, with prices; three fields: area, date, price
+e.g. (province): China - Anhui - Hefei - Baohe - Baohe Industrial Park
+e.g. (municipality): China - municipality - Beijing - Chaoyang - CBD
+
+### 3. Format of the scraped output:
+
+Each province has its own cities, each city its own districts, and each district its own surrounding areas,
+so the scraper is built to walk each level in turn (see the sketch after this file).
+
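The hierarchy described here implies a nested crawl, as sketched below. get_province_area exists in utils/get_message.py; list_cities, list_districts and list_areas are hypothetical per-level helpers that would be built the same way:

from anjuke.utils.get_message import get_province_area

# Walk the readme's hierarchy: province -> city -> district -> surrounding area.
for province in get_province_area():
    for city in list_cities(province):           # hypothetical helper
        for district in list_districts(city):    # hypothetical helper
            for area in list_areas(district):    # hypothetical helper
                print(province, city, district, area)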

+ 41 - 0
utils/setting.py

@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# @Author  : ChenZhaoyuchen
+# @Time    : 2024/9/26 16:00
+# @File    : setting.py
+
+import requests
+
+# start by scraping Baohe Industrial Park prices for Jan-Dec 2024
+year = '2024'
+province = 'anhui'
+city = 'hf'
+area = 'baohequ'
+position = 'bhgyy'
+
+# URL templates
+urls_template_high = f'https://www.anjuke.com/fangjia/{province}/'
+urls_template_low = f'https://www.anjuke.com/fangjia/{city}{year}/{area}/'
+
+
+# province-level URL: the province name is the suffix; the page redirects after loading
+url_province = 'https://www.anjuke.com/fangjia/anhui/'
+
+# note: once a city is selected, the province no longer appears in the URL
+url_sjzxs = f'https://www.anjuke.com/fangjia/'
+url_city = f'https://www.anjuke.com/fangjia/hf/'
+
+# e.g. this URL points at Baohe District in Hefei, with no mention of Anhui Province
+url_qu = f'https://www.anjuke.com/fangjia/hf/baohequ/'
+
+headers = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'Accept-Language': 'zh-CN,zh;q=0.9',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
+}
+
+# note: these module-level requests fire at import time, once per session
+response_HTML_province = requests.get(url=urls_template_high, headers=headers).content.decode('utf8')
+response_HTML = requests.get(url=urls_template_low, headers=headers).content.decode('utf8')
+
+response_200 = requests.get(url=urls_template_high, headers=headers)
+response_2002 = requests.get(url=urls_template_low, headers=headers)
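For the sample parameters above, the two templates expand as follows; note that the low template joins city and year with no separator (hf2024), matching the pattern the comments describe:

>>> urls_template_high
'https://www.anjuke.com/fangjia/anhui/'
>>> urls_template_low
'https://www.anjuke.com/fangjia/hf2024/baohequ/'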