
In progress

Cloudmistery, 8 months ago
Parent
Current commit
1c1ba64d56

Binary
utils/__pycache__/anjuke_response.cpython-39.pyc

Binary
utils/__pycache__/get_message.cpython-39.pyc

Binary
utils/__pycache__/setting.cpython-39.pyc


+ 13 - 0
utils/anjuke_response.py

@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+# @Author  : ChenZhaoyuchen
+# @Time    : 2024/9/30 11:00
+# @File    : anjuke_response.py
+
+import requests
+from anjuke.utils.setting import *
+
+# Initial request: keep the raw Response and its decoded html
+# (the original fetched the same url twice; the rewrite reuses one Response)
+response_200 = requests.get(url=url_start, headers=headers)
+response_origion = response_200.content.decode('utf8')
+
+if __name__ == '__main__':
+    print(response_origion, response_200)
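
A shared fetch helper would let every module reuse one connection and centralize error handling instead of firing requests at import time. A minimal sketch, assuming only the `url_start` and `headers` defined in `utils/setting.py`; the `fetch_html` name and the 10-second timeout are illustrative, not part of this commit:

import requests
from anjuke.utils.setting import url_start, headers

# One Session reuses the TCP connection and sends the same headers every time
_session = requests.Session()
_session.headers.update(headers)

def fetch_html(url):
    """Download a page and return its decoded html, raising on HTTP errors."""
    resp = _session.get(url, timeout=10)
    resp.raise_for_status()   # fail loudly on 4xx/5xx instead of parsing an error page
    resp.encoding = 'utf-8'   # Anjuke pages are UTF-8 encoded
    return resp.text

if __name__ == '__main__':
    print(fetch_html(url_start)[:200])  # smoke test: first 200 characters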

+ 80 - 86
utils/get_message.py

@@ -3,103 +3,97 @@
 # @Time    : 2024/9/26 16:20
 # @File    : get_message.py
 
-from anjuke.utils.setting import *
 from bs4 import BeautifulSoup
+from anjuke.utils.anjuke_response import *
+from anjuke.utils.setting import *
 
-
-def get_province_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML_province, 'html.parser')
-    div_table = soup.find(class_='sel-content')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    # The earlier bug: del was mis-indented inside the for loop, so each
-    # append was immediately deleted and the output came back empty
-    del Area_list[0]
-    return Area_list
-
-
-def get_city_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML_province, 'html.parser')
-    div_table = soup.find(class_='sel-sec')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    del Area_list[0]
-    return Area_list
-
-def get_qu_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    div_table = soup.find(class_= 'sel-content')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    del Area_list[0]
-    return Area_list
-
-def get_zhoubian_area():
-    Area_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    div_table = soup.find(class_= 'sel-sec')
-
-    for area_ in div_table.find_all('a'):
-        area = area_.text
-        Area_list.append(area)
-    del Area_list[0]
-    return Area_list
-
+# Province level
+def get_province():
+    province_name_list = []
+    province_url_list = []
+    response_province = requests.get(url=url_start, headers=headers).content.decode('utf8')
+    soup = BeautifulSoup(response_province, 'html.parser')
+    filter_area_wrap = soup.find(class_="filter-area-wrap")
+    for province_ in filter_area_wrap.find_all('a'):
+        province_name = province_.text
+        province_url = province_.get('href')
+        province_name_list.append(province_name)
+        province_url_list.append('https://www.anjuke.com' + province_url)
+    # The first link is the aggregate entry, not a province, so drop it
+    del province_name_list[0], province_url_list[0]
+    return province_name_list, province_url_list
+
+# City level
+def get_city():
+    province_name_list, province_url_list = get_province()
+    city_name_list = []
+    city_url_list = []
+
+    for i in range(len(province_url_list)):
+        province_url = province_url_list[i]
+        province_name = province_name_list[i]
+        response_city = requests.get(url=province_url, headers=headers).content.decode('utf8')
+        soup = BeautifulSoup(response_city, 'html.parser')
+        filter_area_wrap = soup.find(class_="sel-content bank")
+        if filter_area_wrap is None:  # skip pages where the selector matches nothing
+            continue
+
+        # Skip each page's aggregate first link; the old del city_name_list[0]
+        # inside the loop discarded real data after the first province
+        for city_ in filter_area_wrap.find_all('a')[1:]:
+            city_name = province_name + city_.text
+            city_url = city_.get('href')
+            city_name_list.append(city_name)
+            city_url_list.append(city_url)
+    # Return once every province has been visited, not inside the loop
+    return city_name_list, city_url_list
+
+# District level: takes the city page url whose districts should be scraped
+# (the original called requests.get(url='') with a placeholder url, which raises)
+def get_area(city_url):
+    area_name_list = []
+    area_url_list = []
+    response_area = requests.get(url=city_url, headers=headers).content.decode('utf8')
+    soup = BeautifulSoup(response_area, 'html.parser')
+    filter_area_wrap = soup.find(class_="filter-area-wrap")
+
+    for area_ in filter_area_wrap.find_all('a'):
+        area_name = area_.text
+        area_url = area_.get('href')
+        area_name_list.append(area_name)
+        area_url_list.append('https://www.anjuke.com' + area_url)
+    del area_name_list[0], area_url_list[0]
+    return area_name_list, area_url_list
+
+# Surrounding areas: takes the district page url whose surroundings should be scraped
+def get_periphery(area_url):
+    periphery_name_list = []
+    periphery_url_list = []
+    response_periphery = requests.get(url=area_url, headers=headers).content.decode('utf8')
+    soup = BeautifulSoup(response_periphery, 'html.parser')
+    filter_area_wrap = soup.find(class_="filter-area-wrap")
+
+    for periphery_ in filter_area_wrap.find_all('a'):
+        periphery_name = periphery_.text
+        periphery_url = periphery_.get('href')
+        periphery_name_list.append(periphery_name)
+        periphery_url_list.append('https://www.anjuke.com' + periphery_url)
+    del periphery_name_list[0], periphery_url_list[0]
+    return periphery_name_list, periphery_url_list
+
+# Fetch the list of available years
 def get_Year():
     Years_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
+    soup = BeautifulSoup(response_origion, 'html.parser')
     years = soup.find(class_="year-list")
     for year_ in years.find_all('a'):
         year = year_.text
         Years_list.append(year)
     return Years_list
 
-def get_area_url():
-    Urls_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    urls = soup.find(class_='sel-content')
-
-    for url_area in urls.find_all('a'):
-        url_area = url_area.get('href')
-        Urls_list.append('https://www.anjuke.com' + url_area)
-    del Urls_list[0]
-    return Urls_list
-
-def get_zhoubian_url():
-    Urls_list = []
-    soup = BeautifulSoup(response_HTML, 'html.parser')
-    urls = soup.find(class_='sel-sec')
-
-    for url_area in urls.find_all('a'):
-        url_area = url_area.get('href')
-        Urls_list.append( url_area)
-    del Urls_list[0]
-    return Urls_list
-
+# Test entry point
 if __name__ == '__main__':
-    # url and response values
-    print(urls_template_high, response_200)
-    print(urls_template_low, response_2002)
-    # Time span and unique per-area urls
-    print('Time span:', get_Year())
-    # Sub-urls
-    print('Province / municipality:', get_province_area())
-    print('City level:', get_city_area())
-    # Sub-urls
-    print('District level:', get_qu_area())
-    print('District unique urls:', get_area_url())
-    print('Surrounding areas:', get_zhoubian_area())
-    print('Surrounding unique urls:', get_zhoubian_url())
-    # # html content
-    # print(response_HTML)
+    # print('Time span:', get_Year())
+    print('Province level:', get_province())
+    # print('City level:', get_city())
+    # print('District level:', get_area(city_url))          # needs a city page url
+    # print('Surrounding areas:', get_periphery(area_url))  # needs a district page url
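
All four level scrapers repeat the same fetch-select-collect pattern, so they could collapse into one helper. A minimal sketch under that assumption; the name `scrape_links` and its `prefix` parameter are illustrative, not part of this commit, and it reuses the `headers` and `url_start` from `utils/setting.py`:

def scrape_links(page_url, container_class, prefix=''):
    # Fetch a page, select the container by class, and collect each link's
    # text and href, skipping the aggregate first entry
    html = requests.get(url=page_url, headers=headers).content.decode('utf8')
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find(class_=container_class)
    links = container.find_all('a')[1:]
    names = [a.text for a in links]
    urls = [prefix + a.get('href') for a in links]
    return names, urls

# get_province() would then reduce to:
# scrape_links(url_start, "filter-area-wrap", prefix='https://www.anjuke.com')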
 
 
 

+ 38 - 20
utils/get_price.py

@@ -2,35 +2,53 @@
 # @Author  : ChenZhaoyuchen
 # @Time    : 2024/9/27 16:17
 # @File    : get_price.py
-
-from bs4 import BeautifulSoup
+import requests
 from anjuke.utils.get_message import *
 from anjuke.utils.setting import *
 
 def get_price():
-    price_list = []
-    zhoubian_name_list = get_zhoubian_area()
-    zhoubian_url_list = get_zhoubian_url()
-    for i in range(len(zhoubian_name_list)):
-        name = zhoubian_name_list[i]
-        url = zhoubian_url_list[i]
+    # # Full crawl over every surrounding area (disabled while testing;
+    # # note get_periphery now takes a district page url)
+    # periphery_name_list, periphery_url_list = get_periphery(area_url)
+    #
+    # # Main loop
+    # for i in range(len(periphery_name_list)):
+    #     name = periphery_name_list[i]
+    #     url = periphery_url_list[i]
+    price_list = []
+    # Test run: a single page (Baohe Industrial Park, Hefei, 2024) stands in
+    # for the periphery url list, keeping a loop for the indented body below
+    for url in ['https://www.anjuke.com/fangjia/hf2024/bhgyy/']:
         response_price = requests.get(url = url, headers = headers).content.decode('utf8')
         soup = BeautifulSoup(response_price, 'html.parser')
         price_ = soup.find(class_ = "table is-headless")
         table_trs = price_.find_all('div',class_ = "table-tr")
+        # Extracted from the html: date (time), unit price (price),
+        # trend (tab), change rate (Price_Rate)
 
         for table_tr in table_trs:
-            list = []
-            yue_fang_lv = table_tr.find_all('div',class_ = 'td')
-            for td in yue_fang_lv:
-                a = td.text.strip()
-                # Price_Rate = table_tr.find('div',class_ = "up") or table_tr.find('div',class_ = "down")
-                # if Price_Rate.find('div',class_ = "up"):
-                #     tab = '上涨'
-                # else:
-                #     tab = '下降'
-                print(a)
+            all_mes = table_tr.find_all('div', class_='td')
+            row = []
+            for td in all_mes:
+                a = td.text.strip()  # strip() guards against stray newlines and whitespace
+                row.append(a)
 
-if __name__ == '__main__':
-    get_price()
+            # find('div', class_="up") on the matched div searched its children,
+            # so the branch never fired; test the row itself for each class instead
+            if table_tr.find('div', class_='up'):
+                tab = '↑'
+            elif table_tr.find('div', class_='down'):
+                tab = '↓'
+            else:
+                tab = 'no trend found'
+
+            row.insert(2, tab)  # slot the trend between price and change rate
 
+            price_list.append(row)
+        print('price_list', price_list)
+
+
+    return price_list
+if __name__ == '__main__':
+    print(get_price())
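
With get_price() returning rows of [date, price, trend, change rate], persisting a run is one csv.writer call away. A minimal sketch; the output filename and the sample row in the comment are invented for illustration:

import csv
from anjuke.utils.get_price import get_price

rows = get_price()  # e.g. [['2024年1月', '15040', '↑', '0.5%'], ...] -- sample values, not real data
with open('anjuke_prices.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['date', 'price(yuan/sqm)', 'trend', 'change'])
    writer.writerows(rows)

The utf-8-sig encoding keeps the Chinese dates readable when the file is opened in Excel.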

+ 5 - 7
utils/readme.txt

@@ -1,11 +1,9 @@
 ## Anjuke housing prices
 ### 1. Request headers
 ### 2. Base format: down to the area level, with price; three parameters: area, date, price
-e.g. (province): China - Anhui - Hefei - Baohe - Baohe Industrial Park
-e.g. (municipality): China - municipality - Beijing - Chaoyang - CBD
-
+           Format: China - province - city - district - surrounding area
+   e.g. (province): China - Anhui - Hefei - Baohe - Baohe Industrial Park
+e.g. (municipality): China - municipality - Beijing - Chaoyang - CBD
 ### 3. Scraped output format:
-
-Every province has its own cities, every city its own districts, and every district
-its own surrounding areas, so the crawler walks each level in turn
-
+Four data fields A B C D
+Format: date A, price B yuan/㎡, trend C (↑ or ↓), change rate D%
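
Concretely, one scraped record under this format reads like the line below; the numbers are invented for illustration:

2024年1月  15040元/㎡  ↑  0.5%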

+ 3 - 29
utils/setting.py

@@ -3,39 +3,13 @@
 # @Time    : 2024/9/26 16:00
 # @File    : setting.py
 
-import requests
-
-# First attempt: scrape Baohe Industrial Park prices for Jan-Dec 2024
-year = '2024'
-province = 'anhui'
-city = 'hf'
-area = 'baohequ'
-position = 'bhgyy'
-
-# url templates
-urls_template_high = f'https://www.anjuke.com/fangjia/{province}/'
-urls_template_low = f'https://www.anjuke.com/fangjia/{city}{year}/{area}/'
-
-
-# Province-level url: the province name is the suffix, and the page redirects on entry
-url_province = 'https://www.anjuke.com/fangjia/anhui/'
-
-# Note: once a city is chosen, the province no longer appears in the url
-url_sjzxs = f'https://www.anjuke.com/fangjia/'
-url_city = f'https://www.anjuke.com/fangjia/hf/'
-
-# e.g. this url names Baohe district of Hefei, but not Anhui province
-url_qu = f'https://www.anjuke.com/fangjia/hf/baohequ/'
+# Starting url (nationwide, 2024)
+url_start = 'https://www.anjuke.com/fangjia/quanguo2024/'
+
+# Request headers
 headers = {
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
     'Accept-Encoding': 'gzip, deflate, br',
     'Accept-Language': 'zh-CN,zh;q=0.9',
     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
 }
-
-response_HTML_province = requests.get(url = urls_template_high, headers=headers).content.decode('utf8')
-response_HTML = requests.get(url = urls_template_low, headers=headers).content.decode('utf8')
-
-response_200 = requests.get(url = urls_template_high, headers=headers)
-response_2002 = requests.get(url = urls_template_low, headers=headers)
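
With setting.py reduced to the start url and headers, a quick smoke test confirms the constants still fetch a normal page. A minimal sketch, assuming only the names defined above; the timeout value is illustrative:

import requests
from anjuke.utils.setting import url_start, headers

if __name__ == '__main__':
    resp = requests.get(url=url_start, headers=headers, timeout=10)
    print(resp.status_code, len(resp.text))  # expect 200 and a non-trivial body length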