123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170 |
- # -*- coding: utf-8 -*-
- # @Author : ChenZhaoyuchen
- # @Time : 2024/9/26 16:20
- # @File : get_message.py
- from bs4 import BeautifulSoup
- from anjuke.utils.anjuke_response import *
- from anjuke.utils.setting import *
- from anjuke.utils.mysqlClass import *
- import mysql.connector
- # 省级
def get_province():
    """Scrape province-level links from the start page and store them in MySQL.

    Fetches ``url_start``, reads the ``<a>`` tags inside the element whose
    class is ``filter-area-wrap``, skips the first anchor, and inserts every
    remaining (name, absolute URL) pair into the ``anjuke_province`` table
    (columns ``place_name``, ``place_url``).

    Returns:
        tuple[list, list]: province names and province URLs, index-aligned.
        Both lists are empty when the container element is missing or holds
        no usable links; nothing is inserted in that case.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    response_province = requests.get(
        url=url_start, headers=headers, proxies=proxies
    ).content.decode('utf8')
    soup = BeautifulSoup(response_province, 'html.parser')
    filter_area_wrap = soup.find(class_="filter-area-wrap")
    if filter_area_wrap is None:
        # The page layout differs from what is expected; report it instead of
        # failing with an AttributeError on None.
        print('未找到省级列表(filter-area-wrap),未插入任何记录')
        return [], []

    province_name_list = []
    province_url_list = []
    # The first anchor is skipped, as in the original code.
    # NOTE(review): presumably it is a catch-all ("all regions") entry - confirm.
    for anchor in filter_area_wrap.find_all('a')[1:]:
        href = anchor.get('href')
        if not href:
            # Anchors without a target cannot be crawled; skipping them keeps
            # the two result lists index-aligned.
            continue
        province_name_list.append(anchor.text)
        # urljoin handles both site-relative and already-absolute hrefs.
        province_url_list.append(urljoin('https://www.anjuke.com', href))

    rows = list(zip(province_name_list, province_url_list))
    if rows:
        sql = "INSERT INTO anjuke_province (place_name,place_url) VALUES (%s,%s)"
        cursor1 = db.cursor()
        try:
            # One batched insert and one commit instead of one per row.
            cursor1.executemany(sql, rows)
            db.commit()
            print(f"插入了 {cursor1.rowcount} 条记录")
        finally:
            # Release the cursor even when the insert or commit fails.
            cursor1.close()
    print('已获取并添加全部省级单位')
    return province_name_list, province_url_list
- # 市级
def get_city():
    """Collect city-level links for every province stored in ``anjuke_province``.

    Reads ``place_name`` and ``place_url`` for each province from the
    database, downloads each province page, and gathers the ``<a>`` tags
    inside the element whose class is ``sel-content bank``. The first anchor
    on each page is skipped. Each city name is prefixed with its province
    name. The results are only returned and printed; they are not written to
    the database.

    Returns:
        tuple[list, list]: city names and city URLs, index-aligned.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    cursor2 = db.cursor()
    try:
        # Name the columns explicitly so the result does not depend on the
        # physical column order of the table.
        cursor2.execute("SELECT place_name, place_url FROM anjuke_province")
        province_list = cursor2.fetchall()
    finally:
        cursor2.close()

    city_name_list = []
    city_url_list = []
    print('开始循环省份-城市')
    for i, row in enumerate(province_list):
        province_name, province_url = row[0], row[1]
        response_city = requests.get(
            url=province_url, headers=headers, proxies=proxies
        ).content.decode('utf8')
        soup = BeautifulSoup(response_city, 'html.parser')
        filter_area_wrap = soup.find(class_="sel-content bank")
        if filter_area_wrap is None:
            # One unexpected page should not abort the whole crawl.
            print(f'第{i}个省级单位:{province_name} 页面缺少城市列表,已跳过')
            continue
        # The first anchor is skipped, as in the original code.
        # NOTE(review): presumably it is a catch-all entry - confirm.
        for anchor in filter_area_wrap.find_all('a')[1:]:
            href = anchor.get('href')
            if not href:
                continue
            city_name_list.append(province_name + anchor.text)
            # Resolve against the page URL; absolute hrefs pass through as-is.
            city_url_list.append(urljoin(province_url, href))
        print(f'已循环到第{i}个省级单位:{province_name}')
    print(city_name_list)
    return city_name_list, city_url_list
- # 区级
def get_area():
    """Collect district-level links for every city returned by ``get_city()``.

    Downloads each city page and gathers the ``<a>`` tags inside the element
    whose class is ``sel-content bank``. The first anchor on each page is
    skipped. Each district name is prefixed with its city name.

    Returns:
        tuple[list, list]: district names and district URLs, index-aligned.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    city_name_list, city_url_list = get_city()
    area_name_list = []
    area_url_list = []
    print('开始循环城市-区域')
    for i, (city_name, city_url) in enumerate(zip(city_name_list, city_url_list)):
        response_area = requests.get(
            url=city_url, headers=headers, proxies=proxies
        ).content.decode('utf8')
        soup = BeautifulSoup(response_area, 'html.parser')
        filter_area_wrap = soup.find(class_="sel-content bank")
        if filter_area_wrap is None:
            # One unexpected page should not abort the whole crawl.
            print(f'第{i}个市级单位:{city_name} 页面缺少区域列表,已跳过')
            continue
        # The first anchor is skipped, as in the original code.
        # NOTE(review): presumably it is a catch-all entry - confirm.
        for anchor in filter_area_wrap.find_all('a')[1:]:
            href = anchor.get('href')
            if not href:
                # Skipping keeps the two result lists index-aligned.
                continue
            area_name_list.append(city_name + anchor.text)
            # Resolve against the page URL; absolute hrefs pass through as-is.
            area_url_list.append(urljoin(city_url, href))
        print(f'已循环到第{i}个市级单位:{city_name}')
    return area_name_list, area_url_list
- # 周边
def get_periphery():
    """Collect periphery-level links for every district from ``get_area()``.

    Downloads each district page and gathers the ``<a>`` tags inside the
    element whose class is ``sel-content bank``. The first anchor on each
    page is skipped. Each periphery name is prefixed with its district name.

    Returns:
        tuple[list, list]: periphery names and periphery URLs, index-aligned.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    area_name_list, area_url_list = get_area()
    periphery_name_list = []
    periphery_url_list = []
    print('开始循环区域-周边')
    for i, (area_name, area_url) in enumerate(zip(area_name_list, area_url_list)):
        response_periphery = requests.get(
            url=area_url, headers=headers, proxies=proxies
        ).content.decode('utf8')
        soup = BeautifulSoup(response_periphery, 'html.parser')
        filter_area_wrap = soup.find(class_="sel-content bank")
        if filter_area_wrap is None:
            # One unexpected page should not abort the whole crawl.
            print(f'第{i}个周边单位:{area_name} 页面缺少周边列表,已跳过')
            continue
        # Drop the first anchor of THIS page. The previous code deleted
        # element 0 of the accumulated result lists instead, which raised
        # IndexError on the first page and discarded valid earlier results.
        # NOTE(review): the first anchor is presumably a catch-all entry,
        # as in the sibling functions - confirm.
        for anchor in filter_area_wrap.find_all('a')[1:]:
            href = anchor.get('href')
            if not href:
                # Skipping keeps the two result lists index-aligned.
                continue
            periphery_name_list.append(area_name + anchor.text)
            # Resolve against the page URL; absolute hrefs pass through as-is.
            periphery_url_list.append(urljoin(area_url, href))
        print(f'已循环到第{i}个周边单位:{area_name}')
    return periphery_name_list, periphery_url_list
- # 获取年份
def get_Year():
    """Return the text of every ``<a>`` inside the ``year-list`` element.

    Parses the module-level ``response_origion`` HTML with BeautifulSoup and
    reads the anchors in document order.

    Returns:
        list: the anchor texts found under the element with class
        ``year-list``.
    """
    document = BeautifulSoup(response_origion, 'html.parser')
    year_container = document.find(class_="year-list")
    return [link.text for link in year_container.find_all('a')]
- # 创建mysql数据库方法
- # # 测试函数
if __name__ == '__main__':
    # Manual smoke test: run one scraping level and print what it returns.
    # Swap in one of the commented calls below to exercise another level.
    # print('时间跨度:',get_Year())
    # print('省级单位:',get_province())
    # print('区级单位',get_area())
    # print('周边单位',get_periphery())
    city_result = get_city()
    print('市级单位:', city_result)
|