@@ -6,22 +6,24 @@
 from bs4 import BeautifulSoup
 from anjuke.utils.anjuke_response import *
 from anjuke.utils.setting import *
-
+import time, random
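+# time/random power the randomized inter-request delays added below (a simple anti-blocking throttle)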
 
 # Province level
 def get_province():
     province_name_list = []
     province_url_list = []
-    response_province = requests.get(url = url_start, headers=headers).content.decode('utf8')
+    response_province = requests.get(url = url_start, headers = headers).content.decode('utf8')
+    time.sleep(random.uniform(0.5, 1))
     soup = BeautifulSoup(response_province, 'html.parser')
-    print(soup)
+    # print(soup)
     filter_area_wrap = soup.find(class_="filter-area-wrap")
-    print(filter_area_wrap)
+    # print(filter_area_wrap)
     for province_ in filter_area_wrap.find_all('a'):
         province_name = province_.text
         province_url = province_.get('href')
         province_name_list.append(province_name)
         province_url_list.append('https://www.anjuke.com'+province_url)
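+    # the first <a> in the filter list is the "全部" (all) link, so it is removed below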
     del province_name_list[0],province_url_list[0]
+    time.sleep(random.uniform(0.5, 1))
     return province_name_list,province_url_list
 
 # City level
@@ -29,53 +31,73 @@ def get_city():
     province_name_list,province_url_list = get_province()
     city_name_list = []
     city_url_list = []
-
     for i in range(len(province_url_list)):
         province_url = province_url_list[i]
         province_name = province_name_list[i]
-        response_city = requests.get(url = province_url, headers=headers).content.decode('utf8')
+        response_city = requests.get(url = province_url, headers = headers).content.decode('utf8')
+        time.sleep(random.uniform(3, 4))
         soup = BeautifulSoup(response_city, 'html.parser')
         filter_area_wrap = soup.find(class_="sel-content bank")
-
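+        # collect this province's cities in temp lists so its leading "全部" entry can be stripped before merging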
+        zhongji_name_list = []
+        zhongji_url_list = []
         for city_ in filter_area_wrap.find_all('a'):
             city_name = province_name + city_.text
             city_url = city_.get('href')
-            city_name_list.append(city_name)
-            city_url_list.append(city_url)
-    del city_name_list[0],city_url_list[0]
-    return city_name_list,city_url_list
+            zhongji_name_list.append(city_name)
+            zhongji_url_list.append(city_url)
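+            # (the city hrefs appear to be absolute URLs, so no host prefix is added here)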
+        del zhongji_name_list[0], zhongji_url_list[0]
+        city_name_list += zhongji_name_list
+        city_url_list += zhongji_url_list
+        print(f'Processed province-level unit {i}: {province_name_list[i]}')
+    return city_name_list,city_url_list
 
 # District level
 def get_area():
+    city_name_list, city_url_list = get_city()
     area_name_list = []
     area_url_list = []
-    response_area = requests.get(url = '', headers=headers).content.decode('utf8')
-    soup = BeautifulSoup(response_area, 'html.parser')
-    filter_area_wrap = soup.find(class_="filter-area-wrap")
-
-    for area_ in filter_area_wrap.find_all('a'):
-        area_name = area_.text
-        area_url = area_.get('href')
-        area_name_list.append(area_name)
-        area_url_list.append('https://www.anjuke.com'+area_url)
-    del area_name_list[0],area_url_list[0]
+    for i in range(len(city_url_list)):
+        city_url = city_url_list[i]
+        city_name = city_name_list[i]
+        response_area = requests.get(url = city_url, headers = headers).content.decode('utf8')
+        time.sleep(random.uniform(2, 3))
+        soup = BeautifulSoup(response_area, 'html.parser')
+        filter_area_wrap = soup.find(class_="sel-content bank")
+        zhongji_name_list = []
+        zhongji_url_list = []
+        for area_ in filter_area_wrap.find_all('a'):
+            area_name = city_name + area_.text
+            area_url = area_.get('href')
+            zhongji_name_list.append(area_name)
+            zhongji_url_list.append(area_url)
+        del zhongji_name_list[0], zhongji_url_list[0]
+        area_name_list += zhongji_name_list
+        area_url_list += zhongji_url_list
     return area_name_list,area_url_list
 
+
 # Periphery
 def get_periphery():
+    area_name_list, area_url_list = get_area()
     periphery_name_list = []
     periphery_url_list = []
-    response_periphery = requests.get(url = '', headers=headers).content.decode('utf8')
-    soup = BeautifulSoup(response_periphery, 'html.parser')
-    filter_area_wrap = soup.find(class_="filter-area-wrap")
-
-    for periphery_ in filter_area_wrap.find_all('a'):
-        periphery_name = periphery_.text
-        periphery_url = periphery_.get('href')
-        periphery_name_list.append(periphery_name)
-        periphery_url_list.append('https://www.anjuke.com'+periphery_url)
-    del periphery_name_list[0],periphery_url_list[0]
-    return periphery_name_list,periphery_url_list
+    for i in range(len(area_url_list)):
+        area_url = area_url_list[i]
+        area_name = area_name_list[i]
+        response_periphery = requests.get(url = area_url, headers = headers).content.decode('utf8')
+        time.sleep(random.uniform(3, 5))
+        soup = BeautifulSoup(response_periphery, 'html.parser')
+        filter_area_wrap = soup.find(class_="sel-content bank")
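+        # soup.find() returns None if the class is missing (e.g. a block or CAPTCHA page)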
+        zhongji_name_list = []
+        zhongji_url_list = []
+        for periphery_ in filter_area_wrap.find_all('a'):
+            periphery_name = area_name + periphery_.text
+            periphery_url = periphery_.get('href')
+            zhongji_name_list.append(periphery_name)
+            zhongji_url_list.append(periphery_url)
+        del zhongji_name_list[0], zhongji_url_list[0]
+        periphery_name_list += zhongji_name_list
+        periphery_url_list += zhongji_url_list
+    time.sleep(random.uniform(3, 5))
+    return periphery_name_list, periphery_url_list
 
 # Get the year range
 def get_Year():
@@ -90,8 +112,8 @@ def get_Year():
 
 # Test functions
 if __name__ == '__main__':
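+    # enable one level at a time; each level re-crawls every level above it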
     # print('Time span:',get_Year())
-    print('Province-level units:',get_province())
-    # print('City-level units:',get_city())
+    # print('Province-level units:',get_province())
+    print('City-level units:',get_city())
     # print('District-level units',get_area())
     # print('Periphery units',get_periphery())
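
All four levels repeat the same routine: fetch a page, parse it, take every <a> in the filter block, and drop the leading "全部" (all) link. They differ only in the CSS class, the delay range, and whether a host prefix must be prepended. A minimal refactoring sketch, not part of the patch (the helper name and defaults are hypothetical; headers and url_start are assumed to be exported by anjuke.utils.setting):

    import time, random
    import requests
    from bs4 import BeautifulSoup

    def fetch_filter_links(url, css_class, delay=(2, 4), prefix='', headers=None):
        # Fetch one filter page and return (names, urls), skipping the leading "全部" link.
        html = requests.get(url, headers=headers).content.decode('utf8')
        time.sleep(random.uniform(*delay))    # randomized throttle, as in the patch
        soup = BeautifulSoup(html, 'html.parser')
        wrap = soup.find(class_=css_class)
        if wrap is None:                      # e.g. a block or CAPTCHA page
            return [], []
        anchors = wrap.find_all('a')[1:]      # drop the "全部" (all) entry
        names = [a.text for a in anchors]
        urls = [prefix + a.get('href') for a in anchors]
        return names, urls

get_province() would then reduce to fetch_filter_links(url_start, "filter-area-wrap", delay=(0.5, 1), prefix='https://www.anjuke.com', headers=headers), and the city, district, and periphery loops could call the same helper with "sel-content bank".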