专业爬取中国房产中介网站(安居客、搜房网、贝壳找房、链家)数据的通用爬虫技能,内置应对网站反爬虫机制的策略和自动数据提取功能
技能名称: Real Estate Spider
详细描述:
本技能专门用于爬取中国主流房产中介网站数据,包括安居客、贝壳找房、链家和搜房网。
bash
bash
python
import csv
import json
import random
import re
import time
from dataclasses import asdict, dataclass
# Per-site scraping configuration, keyed by the short site name that is
# passed to RealEstateSpider(...).
#
# Each entry holds:
#   "url"            - base URL of the site.
#   "data_selectors" - CSS-style class selectors, one per listing field
#                      (price, area, location, type).
CONFIG = {
    "anjuke": {
        "url": "https://www.anjuke.com",
        "data_selectors": {
            "price": ".property-price",
            "area": ".property-area",
            "location": ".property-location",
            "type": ".property-type",
        },
    },
    "ke": {
        "url": "https://ke.com",
        "data_selectors": {
            "price": ".price-text",
            "area": ".area-text",
            "location": ".location-text",
            "type": ".type-text",
        },
    },
    "lianjia": {
        "url": "https://www.lianjia.com",
        "data_selectors": {
            "price": ".total-price",
            "area": ".area-num",
            "location": ".location-text",
            "type": ".house-type",
        },
    },
    "soufun": {
        "url": "https://www.soufun.com",
        "data_selectors": {
            "price": ".price-num",
            "area": ".area-num",
            "location": ".location-text",
            "type": ".type-text",
        },
    },
}
python
@dataclass
class PropertyData:
    """One scraped property listing.

    Every field is kept as the raw text extracted from the page; no numeric
    conversion or unit normalisation is applied.
    """

    title: str        # listing title
    price: str        # price text as scraped (unit not normalised)
    area: str         # floor-area text as scraped (unit not normalised)
    location: str     # location text; may be empty when not extracted
    house_type: str   # layout / house-type text; may be empty when not extracted
    age: str          # building-age text as scraped
    orientation: str  # orientation text as scraped
    decoration: str   # decoration / fit-out text as scraped
class RealEstateSpider:
    """Scraper for one of the real-estate listing sites registered in ``CONFIG``.

    Typical use: construct with a site key, call :meth:`crawl` to fetch and
    parse listings, then :meth:`save_data` to write them to disk.
    """

    # Headers imitating a desktop Chrome browser.
    # "br" is deliberately not advertised in Accept-Encoding: Requests can
    # only decode brotli when an optional brotli package is installed, and
    # an undecoded body would make ``response.text`` unusable.
    _HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/146.0.0.0 Safari/537.36"
        ),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8"
        ),
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Cache-Control": "no-cache",
        "Upgrade-Insecure-Requests": "1",
    }

    # Path segment placed between the base URL and the city, per site.
    _LISTING_PATHS = {
        "anjuke": "fangyuan",
        "ke": "city",
        "lianjia": "ershoufang",
        "soufun": "esf",
    }

    # Seconds to wait for the server before giving up on a request.
    _REQUEST_TIMEOUT = 30

    # Extracts the fields of one listing from JSON-like text embedded in
    # the page. Group order: price, avg_price, area_num, house_age, orient,
    # fitment_name, title.
    # NOTE(review): this pattern was reconstructed from a garbled source;
    # the exact key names and quoting must be verified against a live page.
    _LISTING_PATTERN = re.compile(
        r'"price":"([\d.]+)",.*?'
        r'"avg_price":"([\d.]+)",.*?'
        r'"area_num":"([\d.]+)",.*?'
        r'"house_age":"([\d年]+)",.*?'
        r'"orient":"([^"]+)",.*?'
        r'"fitment_name":"([^"]+)",.*?'
        r'"title":"([^"]+)"'
    )

    # Column order used for the CSV header and for every CSV row.
    _CSV_COLUMNS = (
        "title", "price", "area", "location",
        "house_type", "age", "orientation", "decoration",
    )

    def __init__(self, website_name):
        """Bind the spider to one configured site.

        Args:
            website_name: Key into ``CONFIG`` (e.g. ``"anjuke"``).

        Raises:
            KeyError: If ``website_name`` is not a key of ``CONFIG``.
        """
        self.website_name = website_name
        self.config = CONFIG[website_name]
        self.base_url = self.config["url"]
        self.selectors = self.config["data_selectors"]

    def crawl(self, city="北京", district=None):
        """Fetch and parse the listings for a city.

        Args:
            city: City identifier inserted into the listing URL.
            district: Accepted and forwarded to :meth:`build_url`, which
                currently does not use it.

        Returns:
            A list of ``PropertyData`` records (possibly empty).
        """
        url = self.build_url(city, district)
        html_data = self.send_request(url)
        return self.parse_data(html_data)

    def build_url(self, city, district):
        """Build the listing URL for ``city`` on the configured site.

        Args:
            city: City identifier appended as the last path segment.
            district: Currently unused; kept for interface compatibility.

        Returns:
            ``{base_url}/{site path}/{city}``, or just ``base_url`` when no
            listing path is known for the site.
        """
        path = self._LISTING_PATHS.get(self.website_name)
        if path is None:
            return self.base_url
        return f"{self.base_url}/{path}/{city}"

    def send_request(self, url):
        """GET ``url`` with browser-like headers and return the body text.

        A random 2-5 second pause precedes each request to reduce the
        chance of tripping request-frequency detection.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The decoded response body.

        Raises:
            requests.RequestException: On connection failure, timeout, or
                a 4xx/5xx response status.
        """
        # Third-party dependency, imported here so the rest of the class
        # can be loaded and used without it.
        import requests

        time.sleep(random.uniform(2, 5))
        response = requests.get(
            url, headers=self._HEADERS, timeout=self._REQUEST_TIMEOUT
        )
        # Surface block pages / server errors instead of parsing them into
        # an empty result.
        response.raise_for_status()
        return response.text

    def parse_data(self, html_data: str) -> "list[PropertyData]":
        """Extract listing records from raw page text.

        This is example parsing logic built on a single regular expression;
        it has to be adapted to each site's real markup.

        Args:
            html_data: Page text as returned by :meth:`send_request`.

        Returns:
            One ``PropertyData`` per regex match, in document order;
            an empty list when nothing matches.
        """
        listings = []
        for match in self._LISTING_PATTERN.findall(html_data):
            price, _avg_price, area, age, orientation, decoration, title = match
            listings.append(
                PropertyData(
                    title=title,
                    price=price,
                    area=area,
                    location="",    # not extracted yet; site-specific
                    house_type="",  # not extracted yet; site-specific
                    age=age,
                    orientation=orientation,
                    decoration=decoration,
                )
            )
        return listings

    def save_data(self, properties, format="json"):
        """Write listings to ``{website_name}_properties.{format}``.

        The file is created in the current working directory and overwrites
        any existing file of the same name.

        Args:
            properties: Iterable of ``PropertyData`` records.
            format: ``"json"`` (default) or ``"csv"``.

        Raises:
            ValueError: If ``format`` is neither ``"json"`` nor ``"csv"``.
        """
        # Validate before touching the filesystem so a typo in ``format``
        # is reported instead of silently writing nothing.
        if format not in ("json", "csv"):
            raise ValueError(
                f"unsupported format {format!r}; expected 'json' or 'csv'"
            )

        path = f"{self.website_name}_properties.{format}"
        if format == "json":
            with open(path, "w", encoding="utf-8") as f:
                json.dump(
                    [asdict(prop) for prop in properties],
                    f,
                    ensure_ascii=False,
                    indent=2,
                )
        else:
            # newline="" lets the csv module control line endings.
            with open(path, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(self._CSV_COLUMNS)
                writer.writerows(
                    [getattr(prop, column) for column in self._CSV_COLUMNS]
                    for prop in properties
                )
if __name__ == "__main__":
    # Example run: scrape Anjuke listings for Nanjing and save them as JSON.
    spider = RealEstateSpider("anjuke")
    properties = spider.crawl(city="南京")
    spider.save_data(properties, format="json")
bash
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 real-estate-spider-1775987762 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 real-estate-spider-1775987762 技能
skillhub install real-estate-spider-1775987762
文件大小: 31.44 KB | 发布时间: 2026-4-13 11:45