Cloudmistery · commit a8b2040c47 · 8 months ago

BIN
utils/__pycache__/get_message.cpython-39.pyc


BIN
utils/__pycache__/setting.cpython-39.pyc


+ 54 - 32
utils/get_message.py

@@ -6,22 +6,24 @@
 from bs4 import BeautifulSoup
 from anjuke.utils.anjuke_response import *
 from anjuke.utils.setting import *
-
+import time, random
 # Province level
 def get_province():
     province_name_list = []
     province_url_list = []
-    response_province = requests.get(url = url_start, headers=headers).content.decode('utf8')
+    response_province = requests.get(url = url_start, headers = headers).content.decode('utf8')
+    time.sleep(random.uniform(0.5, 1))
     soup = BeautifulSoup(response_province, 'html.parser')
-    print(soup)
+    # print(soup)
     filter_area_wrap = soup.find(class_="filter-area-wrap")
-    print(filter_area_wrap)
+    # print(filter_area_wrap)
     for province_ in filter_area_wrap.find_all('a'):
         province_name = province_.text
         province_url = province_.get('href')
         province_name_list.append(province_name)
         province_url_list.append('https://www.anjuke.com'+province_url)
     del province_name_list[0],province_url_list[0]
+    time.sleep(random.uniform(0.5, 1))
     return province_name_list,province_url_list
 
 # City level
@@ -29,53 +31,73 @@ def get_city():
     province_name_list,province_url_list = get_province()
     city_name_list = []
     city_url_list = []
-
     for i in range(len(province_url_list)):
         province_url = province_url_list[i]
         province_name = province_name_list[i]
-        response_city = requests.get(url = province_url, headers=headers).content.decode('utf8')
+        response_city = requests.get(url = province_url, headers = headers).content.decode('utf8')
+        time.sleep(random.uniform(3, 4))
         soup = BeautifulSoup(response_city, 'html.parser')
         filter_area_wrap = soup.find(class_="sel-content bank")
-
+        zhongji_name_list = []
+        zhongji_url_list = []
         for city_ in filter_area_wrap.find_all('a'):
             city_name = province_name + city_.text
             city_url = city_.get('href')
-            city_name_list.append(city_name)
-            city_url_list.append(city_url)
-        del city_name_list[0],city_url_list[0]
-        return city_name_list,city_url_list
+            zhongji_name_list.append(city_name)
+            zhongji_url_list.append(city_url)
+        del zhongji_name_list[0], zhongji_url_list[0]
+        city_name_list += zhongji_name_list
+        city_url_list += zhongji_url_list
+        print(f'Processed province-level unit {i}: {province_name}')
+    return city_name_list,city_url_list
 
 # District level
 def get_area():
+    city_name_list, city_url_list = get_city()
     area_name_list = []
     area_url_list = []
-    response_area = requests.get(url = '', headers=headers).content.decode('utf8')
-    soup = BeautifulSoup(response_area, 'html.parser')
-    filter_area_wrap = soup.find(class_="filter-area-wrap")
 
-    for area_ in filter_area_wrap.find_all('a'):
-        area_name = area_.text
-        area_url = area_.get('href')
-        area_name_list.append(area_name)
-        area_url_list.append('https://www.anjuke.com'+area_url)
-    del area_name_list[0],area_url_list[0]
+    for i in range(len(city_url_list)):
+        city_url = city_url_list[i]
+        city_name = city_name_list[i]
+        response_area = requests.get(url = city_url, headers = headers).content.decode('utf8')
+        time.sleep(random.uniform(2, 3))
+        soup = BeautifulSoup(response_area, 'html.parser')
+        filter_area_wrap = soup.find(class_="sel-content bank")
+        zhongji_name_list = []
+        zhongji_url_list = []
+        for area_ in filter_area_wrap.find_all('a'):
+            area_name = city_name + area_.text
+            area_url = area_.get('href')
+            zhongji_name_list.append(area_name)
+            zhongji_url_list.append(area_url)
+        del zhongji_name_list[0], zhongji_url_list[0]
+        area_name_list += zhongji_name_list
+        area_url_list += zhongji_url_list
     return area_name_list,area_url_list
 
+
 # Surrounding areas
 def get_periphery():
+    area_name_list, area_url_list = get_area()
     periphery_name_list = []
     periphery_url_list = []
-    response_periphery = requests.get(url = '', headers=headers).content.decode('utf8')
-    soup = BeautifulSoup(response_periphery, 'html.parser')
-    filter_area_wrap = soup.find(class_="filter-area-wrap")
 
-    for periphery_ in filter_area_wrap.find_all('a'):
-        periphery_name = periphery_.text
-        periphery_url = periphery_.get('href')
-        periphery_name_list.append(periphery_name)
-        periphery_url_list.append('https://www.anjuke.com'+periphery_url)
-    del periphery_name_list[0],periphery_url_list[0]
-    return periphery_name_list,periphery_url_list
+    for i in range(len(area_url_list)):
+        area_url = area_url_list[i]
+        area_name = area_name_list[i]
+        response_periphery = requests.get(url = area_url, headers = headers).content.decode('utf8')
+        time.sleep(random.uniform(3, 5))
+        soup = BeautifulSoup(response_periphery, 'html.parser')
+        filter_area_wrap = soup.find(class_="sel-content bank")
+        links = filter_area_wrap.find_all('a')[1:]  # skip the leading "all" link
+        for periphery_ in links:
+            periphery_name = area_name + periphery_.text
+            periphery_url = periphery_.get('href')
+            periphery_name_list.append(periphery_name)
+            periphery_url_list.append(periphery_url)
+        time.sleep(random.uniform(3, 5))
+    return periphery_name_list, periphery_url_list
 
 # Get the year range
 def get_Year():
@@ -90,8 +112,8 @@ def get_Year():
 # Test functions
 if __name__ == '__main__':
     # print('Time span:', get_Year())
-    print('Province-level units:', get_province())
-    # print('City-level units:', get_city())
+    # print('Province-level units:', get_province())
+    print('City-level units:', get_city())
     # print('District-level units:', get_area())
     # print('Surrounding units:', get_periphery())
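
The four level scrapers in get_message.py now share the same fetch, parse, and collect shape. Below is a minimal sketch of that shared pattern as one helper, assuming the markup used in the diff (the "sel-content bank" class and a leading "all" link); fetch_links is a hypothetical consolidation, not part of the commit:

import random
import time

import requests
from bs4 import BeautifulSoup

def fetch_links(url, headers, css_class="sel-content bank", delay=(2, 4)):
    """Fetch one listing page and return (names, urls) for its filter links."""
    html = requests.get(url, headers=headers, timeout=10).content.decode('utf8')
    time.sleep(random.uniform(*delay))  # polite random delay between requests
    soup = BeautifulSoup(html, 'html.parser')
    wrap = soup.find(class_=css_class)
    names, urls = [], []
    for a in wrap.find_all('a'):
        names.append(a.text)
        urls.append(a.get('href'))
    return names[1:], urls[1:]  # drop the leading "all" entry, as the del [0] calls do

With such a helper, each of get_city, get_area, and get_periphery would reduce to a loop over the previous level's URLs.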
 

+ 24 - 28
utils/get_price.py

@@ -7,48 +7,44 @@ from anjuke.utils.get_message import *
 from anjuke.utils.setting import *
 
 def get_price():
-    # # price lists
-    # price_list = []
-    # periphery_name_list = get_periphery()[0]
-    # periphery_url_list = get_periphery()[1]
-    #
-    # # main loop
-    # for i in range(len(periphery_name_list)):
-    #     name = periphery_name_list[i]
-    #     url = periphery_url_list[i]
-        url = 'https://www.anjuke.com/fangjia/hf2024/bhgyy/'
+    # Price lists
+    periphery_name_list = get_periphery()[0]
+    periphery_url_list = get_periphery()[1]
+
+    # Main loop
+    result = []
+    for i in range(len(periphery_name_list)):
+        name = periphery_name_list[i]
+        url = periphery_url_list[i]
+        # url = 'https://www.anjuke.com/fangjia/beijing2024/chaoyang/'
         response_price = requests.get(url = url, headers = headers).content.decode('utf8')
         soup = BeautifulSoup(response_price, 'html.parser')
         price_ = soup.find(class_ = "table is-headless")
         table_trs = price_.find_all('div',class_ = "table-tr")
         # extract from the html: time, unit price, trend tab, change rate Price_Rate
-        list = []
-
+        list1 = []
         for table_tr in table_trs:
             all_mes = table_tr.find_all('div',class_ = 'td')
             zhongji = []
             for td in all_mes:
                 a = td.text.strip() # guard against newlines and other stray characters
                 print('a',a)
+                # three values per row: time, unit price, change rate Price_Rate
                 zhongji.append(a)
-
-            UporDown = table_tr.find('div',class_ = "up") or table_tr.find('div',class_ = "down")
-            if UporDown.find('div',class_ = "up"):
+            UporDown = table_tr.find(
+                'div', class_="up") or table_tr.find('div', class_="down")
+            attr = UporDown['class'][0] if UporDown else ''
+            if attr == "up":
                 tab = '↑'
+            elif attr == "down":
+                tab = '↓'
             else:
-                if UporDown.find('div', class_="down"):
-                    tab = '↓'
-                else:
-                    tab = 'trend not found'
-
+                tab = '-'
             zhongji.insert(2,tab)
+            zhongji.append(name)  # tag the row with its location name
+            list1.append(zhongji)
+        result.append(list1)
+    return result
 
-            list.append(zhongji)
-            print('zhongji',zhongji)
-        print('list',list)
-        # print(list[0][2])
-
-
-            # return list
 if __name__ == '__main__':
-    print(get_price())
+    get_price()
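
The trend-arrow logic above maps each row's up/down class to a symbol. Here is a hedged sketch of the same row parsing as a standalone function (parse_row is hypothetical, assuming the table-tr markup shown in the diff); tagging rows with their location as they are parsed avoids interleaving names and rows in one flat list:

def parse_row(table_tr):
    """Return [time, price, tab, rate] for one 'table-tr' element."""
    values = [td.text.strip() for td in table_tr.find_all('div', class_='td')]
    trend = table_tr.find('div', class_='up') or table_tr.find('div', class_='down')
    tab = {'up': '↑', 'down': '↓'}.get(trend['class'][0], '-') if trend else '-'
    values.insert(2, tab)  # row becomes [time, price, tab, rate]
    return values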

+ 40 - 4
utils/setting.py

@@ -2,14 +2,50 @@
 # @Author  : ChenZhaoyuchen
 # @Time    : 2024/9/26 16:00
 # @File    : setting.py
+import random
+
+
+# User-Agent pool for simulated request headers
+agent_list = [
+	"Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+	"Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
+	"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
+	"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
+	"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
+	"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
+	"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
+	"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
+	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
+	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
+	"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
+	"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
+	"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
+	"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10"
+]
 
 # Initial url
 url_start = 'https://www.anjuke.com/fangjia/quanguo2024/'
 
 # Request headers
+# headers = {
+#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+#     'Accept-Encoding': 'gzip, deflate, br',
+#     'Accept-Language': 'zh-CN,zh;q=0.9',
+#     'referer':'https://www.anjuke.com/',
+#     'cookie':'xxzlxxid=pfmxpoucXXdKPZe3nePjn1oG3tEFYp6CwGDK9cSqkSE8FQ+YKsyHR+C1hZCtXLFDNP0S',
+#     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
+# }
+
 headers = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-    'Accept-Encoding': 'gzip, deflate, br',
-    'Accept-Language': 'zh-CN,zh;q=0.9',
-    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
+    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'accept-encoding': 'gzip, deflate, br, zstd',
+    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
+    'cache-control': 'max-age=0',
+    'connection': 'keep-alive',
+    'cookie': 'aQQ_ajkguid=8A58B742-7F9E-4169-9684-065B9DF9AC96; sessid=2C9914D2-0EC1-4CF8-8B9F-BB2DD2432060; ajk-appVersion=; ctid=33; obtain_by=2; twe=2; id58=CkwAb2cHJ2CSvzjwJRw2Ag==; xxzlclientid=ac98d936-a0a8-41cf-b57a-1728522083413; xxzlxxid=pfmxpoucXXdKPZe3nePjn1oG3tEFYp6CwGDK9cSqkSE8FQ+YKsyHR+C1hZCtXLFDNP0S; fzq_h=bfec261cadc68ed6a35b0159901cf584_1728609619502_56d94ea5cdbf416fac02c5b4f2d27600_47896428890875912854068920960365571880; xxzlbbid=pfmbM3wxMDM0NnwxLjEwLjF8MTcyODYxMDg4MjI2MTg3OTkyMXwvak5hTThZUkZuRzE1TkkxbnJKaVBDZnZvTUR5WjB0QXA4dUtlZDZ2VWVrPXw0ZTM5ZmI1NzdkM2QyMGM1ZGJkM2I3MDEyNDQ5ODU3N18xNzI4NjEwODgxODU5Xzk1MjY2MTViNmRhMjQ3NmQ5ZGU5MDlkNWI4OGYyNzM4XzM3MDQ0ODAwNjB8ZTc0OWUyMTUyNTMzMzUzMzM4ZGZjZmE5ODY4NWE3OGNfMTcyODYxMDg4MTg0NV8yNTU=',
+    'host': 'www.anjuke.com',
+    'if-none-match': '2db91-inlYvGp0xIvRpS6/mdxeLVOjQ9k',
+    'user-agent': random.choice(agent_list),
 }
+
+
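
One caveat worth noting: random.choice(agent_list) in the headers dict above runs once, when setting.py is imported, so every request reuses the same User-Agent. A minimal sketch of per-request rotation (build_headers is a hypothetical helper, not part of the commit):

import random

def build_headers(base_headers, agent_list):
    """Return a copy of base_headers with a freshly chosen User-Agent."""
    h = dict(base_headers)
    h['user-agent'] = random.choice(agent_list)
    return h

# usage: requests.get(url, headers=build_headers(headers, agent_list))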