Predict construction project costs using Machine Learning. Use Linear Regression, K-Nearest Neighbors, and Random Forest models on historical project data. Train, evaluate, and deploy cost prediction models.
技能名称:成本预测
详细描述:
基于DDC方法论(第4.5章),该技能能够利用历史数据和机器学习算法预测建筑项目成本。该方法将传统的基于专家的估算转变为数据驱动的预测。
书籍参考: Будущее: прогнозы и машинное обучение / 未来:预测与机器学习
基于历史数据的预测和预报使企业能够就项目成本和工期做出更准确的决策。
— DDC书籍,第4.5章
历史数据 → 特征工程 → 机器学习模型 → 成本预测
│ │ │ │
▼ ▼ ▼ ▼
过去项目 准备数据 训练模型 新项目
含成本 用于机器学习 基于历史 成本预测
python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
python
import pandas as pd
import numpy as np
def preparecostdataset(df, current_year=2024, inflation_rate=0.03):
    """Prepare historical project data for machine learning.

    Args:
        df: DataFrame holding at least the columns ``area_m2``, ``floors``,
            ``building_type``, ``location``, ``year_completed``,
            ``complexity_score``, ``material_quality`` and ``total_cost``.
            It is not modified; a copy is processed.
        current_year: Reference year that costs are adjusted to.
        inflation_rate: Annual inflation rate as a fraction (0.03 == 3%).

    Returns:
        A new DataFrame with rows lacking ``total_cost`` removed, missing
        ``complexity_score`` filled with the median, ``building_type`` and
        ``location`` one-hot encoded, and the derived columns
        ``cost_per_m2``, ``cost_per_floor``, ``years_ago`` and
        ``adjusted_cost`` appended.
    """
    # Select the relevant features.
    features = [
        "area_m2",
        "floors",
        "building_type",
        "location",
        "year_completed",
        "complexity_score",
        "material_quality",
        "total_cost",
    ]
    df = df[features].copy()

    # Handle missing values: the target must be present; complexity is
    # imputed with the median of the remaining rows.
    df = df.dropna(subset=["total_cost"])
    df["complexity_score"] = df["complexity_score"].fillna(
        df["complexity_score"].median()
    )

    # Encode categorical variables.
    df = pd.get_dummies(df, columns=["building_type", "location"])

    # Derived features.
    df["cost_per_m2"] = df["total_cost"] / df["area_m2"]
    df["cost_per_floor"] = df["total_cost"] / df["floors"]

    # Adjust for inflation (compound growth up to the reference year).
    df["years_ago"] = current_year - df["year_completed"]
    df["adjusted_cost"] = (
        df["total_cost"] * (1 + inflation_rate) ** df["years_ago"]
    )
    return df
python
def engineer_features(df):
    """Create additional features to improve predictions.

    Adds the columns ``area_x_floors``, ``area_x_complexity``,
    ``area_squared``, ``log_area`` and ``size_category`` to ``df``.

    Args:
        df: DataFrame with the columns ``area_m2``, ``floors`` and
            ``complexity_score``. It is modified in place.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Interaction features.
    df["area_x_floors"] = df["area_m2"] * df["floors"]
    df["area_x_complexity"] = df["area_m2"] * df["complexity_score"]

    # Polynomial feature.
    df["area_squared"] = df["area_m2"] ** 2

    # Log transform (for skewed features); log1p keeps an area of 0 finite.
    df["log_area"] = np.log1p(df["area_m2"])

    # Binned feature. Intervals are right-closed, so an area of exactly 0
    # falls outside the first bin and yields NaN.
    df["size_category"] = pd.cut(
        df["area_m2"],
        bins=[0, 1000, 5000, 10000, float("inf")],
        labels=["small", "medium", "large", "xlarge"],
    )
    return df
python
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
def trainlinearmodel(Xtrain, ytrain):
    """Train a linear regression model with feature standardization.

    Args:
        Xtrain: Training features. Must expose ``.columns`` (e.g. a pandas
            DataFrame) because the column names label the coefficients.
        ytrain: Training target values.

    Returns:
        A ``(pipeline, coefficients)`` tuple: the fitted scaler + regressor
        pipeline, and a DataFrame with columns ``feature`` and
        ``coefficient`` sorted by absolute coefficient, largest first.
    """
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ])
    pipeline.fit(Xtrain, ytrain)

    # Feature importance (coefficients). They are fitted on standardized
    # inputs, so their magnitudes are comparable across features.
    coefficients = pd.DataFrame({
        "feature": Xtrain.columns,
        "coefficient": pipeline.named_steps["regressor"].coef_,
    }).sort_values("coefficient", key=abs, ascending=False)
    return pipeline, coefficients
python
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
def trainknnmodel(Xtrain, ytrain):
    """Train a KNN regressor, choosing k by cross-validated grid search.

    Args:
        Xtrain: Training features.
        ytrain: Training target values.

    Returns:
        A ``(model, scaler)`` tuple: the best KNN estimator found by the
        grid search and the fitted ``StandardScaler``. New data must be
        passed through ``scaler.transform`` before calling
        ``model.predict``.
    """
    # Standardize features; KNN is distance-based.
    # NOTE: the scaler is fitted on the full training set before the
    # cross-validation split, so validation folds influence the scaling.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(Xtrain)

    # Find the best k (3..19) with 5-fold cross-validation.
    param_grid = {"n_neighbors": range(3, 20)}
    knn = KNeighborsRegressor()
    grid_search = GridSearchCV(
        knn, param_grid, cv=5, scoring="neg_mean_absolute_error"
    )
    grid_search.fit(X_scaled, ytrain)

    print(f"最优k值: {grid_search.best_params_['n_neighbors']}")
    # The score is a negated MAE, so flip the sign for display.
    print(f"最优平均绝对误差: ${-grid_search.best_score_:,.0f}")
    return grid_search.best_estimator_, scaler
python
from sklearn.ensemble import RandomForestRegressor
def trainrandomforest(Xtrain, ytrain):
    """Train a random forest regression model.

    Args:
        Xtrain: Training features. Must expose ``.columns`` (e.g. a pandas
            DataFrame) because the column names label the importances.
        ytrain: Training target values.

    Returns:
        A ``(model, importance)`` tuple: the fitted forest and a DataFrame
        with columns ``feature`` and ``importance`` sorted by importance,
        largest first.
    """
    rf = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
    )
    rf.fit(Xtrain, ytrain)

    # Feature importance.
    importance = pd.DataFrame({
        "feature": Xtrain.columns,
        "importance": rf.feature_importances_,
    }).sort_values("importance", ascending=False)
    return rf, importance
python
from sklearn.ensemble import GradientBoostingRegressor
def traingradientboosting(Xtrain, ytrain):
    """Train a gradient boosting regression model.

    Args:
        Xtrain: Training features.
        ytrain: Training target values.

    Returns:
        The fitted ``GradientBoostingRegressor``.
    """
    gb = GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
    )
    gb.fit(Xtrain, ytrain)
    return gb
python
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


def evaluatemodel(model, Xtest, ytest, modelname="模型"):
    """Evaluate a fitted model on held-out data and print a summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        Xtest: Test features.
        ytest: True target values for ``Xtest``.
        modelname: Label used in the printed report.

    Returns:
        A ``(metrics, predictions)`` tuple. ``metrics`` is a dict with the
        keys ``MAE``, ``RMSE``, ``R²`` and ``MAPE`` (MAPE in percent).
    """
    predictions = model.predict(Xtest)

    metrics = {
        "MAE": mean_absolute_error(ytest, predictions),
        "RMSE": np.sqrt(mean_squared_error(ytest, predictions)),
        "R²": r2_score(ytest, predictions),
        # NOTE: divides by the true values, so any zero in ytest makes
        # MAPE infinite or NaN.
        "MAPE": np.mean(np.abs((ytest - predictions) / ytest)) * 100,
    }

    print(f"\n{modelname} 评估:")
    print(f"  平均绝对误差: ${metrics['MAE']:,.0f}")
    print(f"  均方根误差: ${metrics['RMSE']:,.0f}")
    print(f"  R²: {metrics['R²']:.3f}")
    print(f"  平均绝对百分比误差: {metrics['MAPE']:.1f}%")
    return metrics, predictions
python
def comparemodels(models, Xtest, y_test):
比较多个模型
results = []
for name, model in models.items():
metrics, = evaluatemodel(model, Xtest, ytest, name)
metrics[模型] = name
results.append(metrics)
comparison = pd.DataFrame
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 cost-prediction-1776344306 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 cost-prediction-1776344306 技能
skillhub install cost-prediction-1776344306
文件大小: 6.52 KB | 发布时间: 2026-4-17 15:54