# -*- coding: utf-8 -*-
# @Author : ChenZhaoyuchen
# @Time : 2024/9/26 16:20
# @File : get_message.py
"""Scrape the Anjuke region hierarchy (province -> city -> district -> periphery).

The names ``requests``, ``url_start``, ``headers``, ``proxies``, ``db`` and
``response_origion`` are not defined in this file; they are expected to be
provided by the wildcard imports below (NOTE(review): confirm which module
exports each of them).
"""
from bs4 import BeautifulSoup
from anjuke.utils.anjuke_response import *
from anjuke.utils.setting import *
from anjuke.utils.mysqlClass import *
import mysql.connector

# Site root prepended to the hrefs found on the start page.
_BASE_URL = 'https://www.anjuke.com'
# CSS class of the element holding the province links on the start page.
_PROVINCE_FILTER_CLASS = "filter-area-wrap"
# CSS class of the element holding the sub-region links on every other page.
_SUB_FILTER_CLASS = "sel-content bank"
# Seconds to wait for an HTTP response; without it a stalled proxy would
# block the whole crawl indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _fetch_child_links(page_url, container_class, name_prefix='', url_prefix=''):
    """Download ``page_url`` and extract the region links from its filter element.

    Args:
        page_url: URL of the page to download.
        container_class: ``class`` attribute of the element whose ``<a>`` tags
            are the region links.
        name_prefix: Text prepended to every link text (the parent region name).
        url_prefix: Text prepended to every ``href``.

    Returns:
        A ``(names, urls)`` pair of equally long lists.

    Raises:
        ValueError: If the page contains no element with ``container_class``.
    """
    html = requests.get(
        url=page_url,
        headers=headers,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT_SECONDS,
    ).content.decode('utf8')
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find(class_=container_class)
    if container is None:
        # find() returns None when nothing matches; fail with a message that
        # names the page instead of an AttributeError on None.
        raise ValueError(
            f'no element with class "{container_class}" found at {page_url}'
        )

    names = []
    urls = []
    # The first anchor is always discarded (presumably an "all"/"unlimited"
    # entry -- confirm against the live page). Slicing also copes with a page
    # that has no anchors at all.
    for anchor in container.find_all('a')[1:]:
        href = anchor.get('href')
        if not href:
            # An anchor without a target cannot be crawled further.
            continue
        names.append(name_prefix + anchor.text)
        urls.append(url_prefix + href)
    return names, urls


def _collect_children(parents, level_label):
    """Fetch the sub-regions of every parent region and concatenate them.

    Args:
        parents: Iterable of ``(name, url)`` pairs of the parent regions.
        level_label: Label of the parent level used in the progress message.

    Returns:
        A ``(names, urls)`` pair covering the children of all parents; each
        child name is prefixed with its parent's name.
    """
    names = []
    urls = []
    for index, (parent_name, parent_url) in enumerate(parents):
        child_names, child_urls = _fetch_child_links(
            parent_url, _SUB_FILTER_CLASS, name_prefix=parent_name
        )
        names += child_names
        urls += child_urls
        print(f'已循环到第{index}个{level_label}:{parent_name}')
    return names, urls


# Province level
def get_province():
    """Scrape the provinces from the start page and store them in MySQL.

    Every province is inserted into the ``anjuke_province`` table as a
    ``(place_name, place_url)`` row.

    Returns:
        A ``(province_name_list, province_url_list)`` pair.
    """
    province_name_list, province_url_list = _fetch_child_links(
        url_start, _PROVINCE_FILTER_CLASS, url_prefix=_BASE_URL
    )

    sql = "INSERT INTO anjuke_province (place_name,place_url) VALUES (%s,%s)"
    # Cursor used to run the SQL statements; closed even if an insert fails.
    cursor = db.cursor()
    try:
        for values in zip(province_name_list, province_url_list):
            # Insert one record into the "anjuke_province" table.
            cursor.execute(sql, values)
            # Commit the change to the database.
            db.commit()
            print(f"插入了 {cursor.rowcount} 条记录")
    finally:
        cursor.close()
    print('已获取并添加全部省级单位')
    return province_name_list, province_url_list


# City level
def get_city():
    """Scrape the cities of every province stored in ``anjuke_province``.

    Returns:
        A ``(city_name_list, city_url_list)`` pair; each city name is prefixed
        with its province name.
    """
    # Select the two columns explicitly so the result does not depend on the
    # table's column order or on additional columns.
    query = "SELECT place_name, place_url FROM anjuke_province"
    cursor = db.cursor()
    try:
        cursor.execute(query)
        province_rows = cursor.fetchall()
    finally:
        # Always release the cursor.
        cursor.close()

    print('开始循环省份-城市')
    city_name_list, city_url_list = _collect_children(province_rows, '省级单位')

    # Upload step, currently disabled:
    # cursor2 = db.cursor()
    # for i in range(len(city_name_list)):
    #     # Insert one record into the "anjuke_city" table.
    #     sql = "INSERT INTO anjuke_city (place_name,place_url) VALUES (%s,%s)"
    #     values = (city_name_list[i], city_url_list[i])
    #     cursor2.execute(sql, values)
    #     # Commit the change to the database.
    #     db.commit()
    #     print(f"插入了 {cursor2.rowcount} 条记录")
    #     print(f"插入了 {i} 条记录")
    # cursor2.close()

    print(city_name_list)
    return city_name_list, city_url_list


# District level
def get_area():
    """Scrape the districts of every city returned by ``get_city()``.

    Returns:
        An ``(area_name_list, area_url_list)`` pair; each district name is
        prefixed with its city name.
    """
    city_name_list, city_url_list = get_city()
    print('开始循环城市-区域')
    return _collect_children(zip(city_name_list, city_url_list), '市级单位')


# Periphery level
def get_periphery():
    """Scrape the periphery entries of every district returned by ``get_area()``.

    Returns:
        A ``(periphery_name_list, periphery_url_list)`` pair; each name is
        prefixed with its district name.
    """
    area_name_list, area_url_list = get_area()
    print('开始循环区域-周边')
    # The leading entry is dropped from each page's own link list (inside the
    # helper), never from the accumulated result, which is empty on the first
    # iteration.
    return _collect_children(zip(area_name_list, area_url_list), '周边单位')


# Get the years
def get_Year():
    """Return the link texts found in the "year-list" element.

    The HTML is read from ``response_origion``.

    Raises:
        ValueError: If the HTML contains no element with class "year-list".
    """
    soup = BeautifulSoup(response_origion, 'html.parser')
    year_container = soup.find(class_="year-list")
    if year_container is None:
        raise ValueError('no element with class "year-list" found')
    return [anchor.text for anchor in year_container.find_all('a')]


# Method for creating the MySQL database (not implemented here)

# Manual test entry point
if __name__ == '__main__':
    # print('时间跨度:',get_Year())
    # print('省级单位:',get_province())
    print('市级单位:', get_city())
    # print('区级单位',get_area())
    # print('周边单位',get_periphery())