|
@@ -6,35 +6,62 @@
|
|
from bs4 import BeautifulSoup
|
|
from bs4 import BeautifulSoup
|
|
from anjuke.utils.anjuke_response import *
|
|
from anjuke.utils.anjuke_response import *
|
|
from anjuke.utils.setting import *
|
|
from anjuke.utils.setting import *
|
|
-import time,random
|
|
|
|
|
|
+from anjuke.utils.mysqlClass import *
|
|
|
|
+import mysql.connector
|
|
|
|
+
|
|
# 省级
|
|
# 省级
|
|
def get_province():
    """Scrape province-level names and URLs and store them in MySQL.

    Requests ``url_start``, reads every ``<a>`` inside the element whose
    class is ``filter-area-wrap``, discards the first link, and inserts each
    remaining (name, url) pair into the ``anjuke_province`` table through
    the module-level ``db`` connection.

    Returns:
        tuple: ``(province_name_list, province_url_list)``. Every URL is
        the link's ``href`` prefixed with ``https://www.anjuke.com``.
    """
    # A timeout keeps a stalled proxy from blocking the crawl forever.
    response_province = requests.get(
        url=url_start, headers=headers, proxies=proxies, timeout=10
    ).content.decode('utf8')
    soup = BeautifulSoup(response_province, 'html.parser')
    filter_area_wrap = soup.find(class_="filter-area-wrap")

    province_name_list = []
    province_url_list = []
    for province_ in filter_area_wrap.find_all('a'):
        province_name_list.append(province_.text)
        province_url_list.append('https://www.anjuke.com' + province_.get('href'))

    # The first link is not a province (presumably an "all" entry; confirm
    # against the page). Slice deletion tolerates an empty result.
    del province_name_list[:1], province_url_list[:1]

    sql = "INSERT INTO anjuke_province (place_name,place_url) VALUES (%s,%s)"
    inserted = 0
    # Create a cursor for executing SQL statements.
    cursor1 = db.cursor()
    try:
        for values in zip(province_name_list, province_url_list):
            # Insert one record into the "anjuke_province" table.
            cursor1.execute(sql, values)
            # rowcount only covers the last statement, so accumulate it.
            inserted += cursor1.rowcount
        # Commit the changes to the database once every row is queued.
        # NOTE(review): the source's indentation was lost, so it is unclear
        # whether the commit used to run per row; confirm a single commit
        # is acceptable.
        db.commit()
    finally:
        # Release the cursor even when an insert fails.
        cursor1.close()

    print(f"插入了 {inserted} 条记录")
    print('已获取并添加全部省级单位')
    return province_name_list, province_url_list
|
|
|
|
|
|
# 市级
|
|
# 市级
|
|
def get_city():
|
|
def get_city():
|
|
- province_name_list,province_url_list = get_province()
|
|
|
|
|
|
+ name = 'city'
|
|
|
|
+ # 设置空列表
|
|
city_name_list = []
|
|
city_name_list = []
|
|
city_url_list = []
|
|
city_url_list = []
|
|
- for i in range(len(province_url_list)):
|
|
|
|
- province_url = province_url_list[i]
|
|
|
|
- province_name = province_name_list[i]
|
|
|
|
- response_city = requests.get(url = province_url, headers = headers, proxies=proxies, timeout=10).content.decode('utf8')
|
|
|
|
- time.sleep(random.uniform(3, 4))
|
|
|
|
|
|
+ # 设置游标cursor
|
|
|
|
+ cursor2 = db.cursor()
|
|
|
|
+ # 传参
|
|
|
|
+ query = "SELECT * FROM anjuke_province"
|
|
|
|
+ cursor2.execute(query)
|
|
|
|
+ results = cursor2.fetchall()
|
|
|
|
+ province_list = [list(row) for row in results]
|
|
|
|
+ # 记得关掉cursor
|
|
|
|
+ cursor2.close()
|
|
|
|
+
|
|
|
|
+ # 开始循环
|
|
|
|
+ print('开始循环省份-城市')
|
|
|
|
+ for i in range(len(province_list)):
|
|
|
|
+ province_name = province_list[i][0]
|
|
|
|
+ province_url = province_list[i][1]
|
|
|
|
+ response_city = requests.get(url = province_url, headers = headers, proxies=proxies).content.decode('utf8')
|
|
soup = BeautifulSoup(response_city, 'html.parser')
|
|
soup = BeautifulSoup(response_city, 'html.parser')
|
|
filter_area_wrap = soup.find(class_="sel-content bank")
|
|
filter_area_wrap = soup.find(class_="sel-content bank")
|
|
zhongji_name_list = []
|
|
zhongji_name_list = []
|
|
@@ -47,20 +74,35 @@ def get_city():
|
|
del zhongji_name_list[0], zhongji_url_list[0]
|
|
del zhongji_name_list[0], zhongji_url_list[0]
|
|
city_name_list += zhongji_name_list
|
|
city_name_list += zhongji_name_list
|
|
city_url_list += zhongji_url_list
|
|
city_url_list += zhongji_url_list
|
|
- print(f'已循环到第{i}个省级单位:{province_name_list[i]}')
|
|
|
|
|
|
+
|
|
|
|
+ # # 上传代码
|
|
|
|
+ # cursor2 = db.cursor()
|
|
|
|
+ # for i in range(len(city_name_list)):
|
|
|
|
+ # # 插入一条记录到"anjuke_province"表中
|
|
|
|
+ # sql = "INSERT INTO anjuke_city (place_name,place_url) VALUES (%s,%s)"
|
|
|
|
+ # values = (city_name_list[i], city_url_list[i])
|
|
|
|
+ # cursor2.execute(sql, values)
|
|
|
|
+ # # 提交更改到数据库
|
|
|
|
+ # db.commit()
|
|
|
|
+ # print(f"插入了 {cursor2.rowcount} 条记录")
|
|
|
|
+ # print(f"插入了 {i} 条记录")
|
|
|
|
+ # cursor2.close()
|
|
|
|
+
|
|
|
|
+ print(f'已循环到第{i}个省级单位:{province_name}')
|
|
|
|
+ print(city_name_list)
|
|
return city_name_list,city_url_list
|
|
return city_name_list,city_url_list
|
|
|
|
|
|
# 区级
|
|
# 区级
|
|
def get_area():
|
|
def get_area():
|
|
|
|
+ name = 'area'
|
|
city_name_list, city_url_list = get_city()
|
|
city_name_list, city_url_list = get_city()
|
|
area_name_list = []
|
|
area_name_list = []
|
|
area_url_list = []
|
|
area_url_list = []
|
|
-
|
|
|
|
|
|
+ print('开始循环城市-区域')
|
|
for i in range(len(city_url_list)):
|
|
for i in range(len(city_url_list)):
|
|
city_url = city_url_list[i]
|
|
city_url = city_url_list[i]
|
|
city_name = city_name_list[i]
|
|
city_name = city_name_list[i]
|
|
- response_area = requests.get(url = city_url, headers = headers ,proxies=proxies, timeout=10).content.decode('utf8')
|
|
|
|
- time.sleep(random.uniform(2, 3))
|
|
|
|
|
|
+ response_area = requests.get(url = city_url, headers = headers ,proxies=proxies).content.decode('utf8')
|
|
soup = BeautifulSoup(response_area, 'html.parser')
|
|
soup = BeautifulSoup(response_area, 'html.parser')
|
|
filter_area_wrap = soup.find(class_="sel-content bank")
|
|
filter_area_wrap = soup.find(class_="sel-content bank")
|
|
zhongji_name_list = []
|
|
zhongji_name_list = []
|
|
@@ -70,32 +112,37 @@ def get_area():
|
|
area_url = area_.get('href')
|
|
area_url = area_.get('href')
|
|
zhongji_name_list.append(area_name)
|
|
zhongji_name_list.append(area_name)
|
|
zhongji_url_list.append(area_url)
|
|
zhongji_url_list.append(area_url)
|
|
- area_name_list.append(area_name)
|
|
|
|
- area_url_list.append(area_url)
|
|
|
|
- del area_name_list[0],area_url_list[0]
|
|
|
|
|
|
+ del zhongji_name_list[0],zhongji_url_list[0]
|
|
|
|
+ area_name_list += zhongji_name_list
|
|
|
|
+ area_url_list += zhongji_url_list
|
|
|
|
+ print(f'已循环到第{i}个市级单位:{city_name_list[i]}')
|
|
return area_name_list,area_url_list
|
|
return area_name_list,area_url_list
|
|
|
|
|
|
|
|
|
|
# 周边
|
|
# 周边
|
|
def get_periphery():
    """Collect the periphery (sub-area) names and URLs for every area.

    Calls ``get_area()`` for the area list, requests each area page, and
    reads every ``<a>`` inside the element whose class is
    ``sel-content bank``. The first link of each page is discarded; the
    remaining links are accumulated across all areas.

    Returns:
        tuple: ``(periphery_name_list, periphery_url_list)``. Each name is
        the area name concatenated with the link text; each URL is the
        link's raw ``href``.
    """
    area_name_list, area_url_list = get_area()
    periphery_name_list = []
    periphery_url_list = []
    print('开始循环区域-周边')
    for i, (area_name, area_url) in enumerate(zip(area_name_list, area_url_list)):
        # A timeout keeps a stalled proxy from blocking the crawl forever.
        response_periphery = requests.get(
            url=area_url, headers=headers, proxies=proxies, timeout=10
        ).content.decode('utf8')
        soup = BeautifulSoup(response_periphery, 'html.parser')
        filter_area_wrap = soup.find(class_="sel-content bank")

        # Drop the first link of THIS page only, the same way get_area()
        # trims its per-city lists. Deleting index 0 of the accumulated
        # lists would fail on the first area (empty list) and would remove
        # a valid earlier entry on later ones.
        links = filter_area_wrap.find_all('a')[1:]
        periphery_name_list += [area_name + link.text for link in links]
        periphery_url_list += [link.get('href') for link in links]
        print(f'已循环到第{i}个周边单位:{area_name}')
    return periphery_name_list, periphery_url_list
|
|
|
|
|
|
# 获取年份
|
|
# 获取年份
|
|
@@ -108,7 +155,10 @@ def get_Year():
|
|
Years_list.append(year)
|
|
Years_list.append(year)
|
|
return Years_list
|
|
return Years_list
|
|
|
|
|
|
-# 测试函数
|
|
|
|
|
|
+# 创建mysql数据库方法
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+# # 测试函数
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
# print('时间跨度:',get_Year())
|
|
# print('时间跨度:',get_Year())
|
|
# print('省级单位:',get_province())
|
|
# print('省级单位:',get_province())
|