World-class senior data scientist skill specialising in statistical modeling, experiment design, causal inference, and predictive analytics. Covers A/B testing (sample sizing, two-proportion z-tests, Bonferroni correction), difference-in-differences, feature engineering pipelines (Scikit-learn, XGBoost), cross-validated model evaluation (AUC-ROC, AUC-PR, SHAP), and MLflow experiment tracking — using Python (NumPy, Pandas, Scikit-learn), R, and SQL. Use when designing or analysing controlled experiments.
面向生产级AI/ML/数据系统的世界级高级数据科学家技能。
python
import numpy as np
from scipy import stats
def calculatesamplesize(baseline_rate, mde, alpha=0.05, power=0.8):
    """Compute the required sample size per variant for a two-sided test.

    Args:
        baseline_rate: Current conversion rate (e.g. 0.10).
        mde: Minimum detectable effect, relative to the baseline
            (e.g. 0.05 = a 5% lift).
        alpha: Two-sided significance level.
        power: Desired statistical power.

    Returns:
        The per-variant sample size, rounded up to the next integer.

    Raises:
        ValueError: If ``baseline_rate`` is not strictly between 0 and 1,
            if ``mde`` is zero, or if the implied treatment rate falls
            outside [0, 1].
    """
    if not 0 < baseline_rate < 1:
        raise ValueError("baseline_rate must be strictly between 0 and 1")
    if mde == 0:
        # A zero effect gives effect_size == 0 and an infinite sample size.
        raise ValueError("mde must be non-zero")

    p1 = baseline_rate
    p2 = baseline_rate * (1 + mde)
    if not 0 <= p2 <= 1:
        raise ValueError("baseline_rate * (1 + mde) must lie within [0, 1]")

    # Standardised effect: rate difference over the root of the mean of the
    # two binomial variances.
    effect_size = abs(p2 - p1) / np.sqrt((p1 * (1 - p1) + p2 * (1 - p2)) / 2)
    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)
    n = ((z_alpha + z_beta) / effect_size) ** 2
    return int(np.ceil(n))
def analyze_experiment(control, treatment, alpha=0.05):
    """Run a two-proportion z-test and return a structured result.

    Args:
        control: Dict with ``"conversions"`` and ``"visitors"`` entries.
        treatment: Dict with ``"conversions"`` and ``"visitors"`` entries.
        alpha: Two-sided significance level.

    Returns:
        A dict with:
            ``"lift"``: relative difference ``(p_t - p_c) / p_c``.
            ``"pvalue"``: two-sided p-value of the z statistic.
            ``"significant"``: whether the p-value is below ``alpha``.
            ``"ci95"``: ``(low, high)`` interval for ``p_t - p_c`` at level
                ``1 - alpha`` (a 95% interval only when ``alpha`` is 0.05).
    """
    p_c = control["conversions"] / control["visitors"]
    p_t = treatment["conversions"] / treatment["visitors"]
    pooled = (control["conversions"] + treatment["conversions"]) / (
        control["visitors"] + treatment["visitors"]
    )
    se = np.sqrt(
        pooled * (1 - pooled) * (1 / control["visitors"] + 1 / treatment["visitors"])
    )
    z = (p_t - p_c) / se
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))

    # The interval reuses the pooled standard error from the test statistic.
    margin = stats.norm.ppf(1 - alpha / 2) * se
    ci_low = (p_t - p_c) - margin
    ci_high = (p_t - p_c) + margin
    return {
        "lift": (p_t - p_c) / p_c,
        "pvalue": p_value,
        "significant": p_value < alpha,
        "ci95": (ci_low, ci_high),
    }
python
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
def buildfeaturepipeline(numericcols, categoricalcols, date_cols=None):
    """Return an unfitted ColumnTransformer for structured tabular data.

    Args:
        numericcols: Columns routed through median imputation followed by
            standard scaling.
        categoricalcols: Columns routed through most-frequent imputation
            followed by one-hot encoding (unknown categories ignored,
            dense output).
        date_cols: Accepted for interface compatibility; not used here.

    Returns:
        A ``ColumnTransformer`` that drops every column not listed in
        ``numericcols`` or ``categoricalcols``.
    """
    numeric_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ])
    categorical_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    transformers = [
        ("num", numeric_pipeline, numericcols),
        ("cat", categorical_pipeline, categoricalcols),
    ]
    return ColumnTransformer(transformers, remainder="drop")
def addtimefeatures(df, date_col):
    """Add cyclical calendar encodings and a weekend flag from a date column.

    Works on a copy, so the caller's frame is left untouched.

    Args:
        df: Input DataFrame.
        date_col: Name of the column to parse with ``pd.to_datetime``.

    Returns:
        A copy of ``df`` with ``date_col`` converted to datetime and these
        columns added: ``dowsin``/``dowcos`` (day of week on a 7-step
        circle), ``monthsin``/``monthcos`` (month number on a 12-step
        circle), and ``isweekend`` (1 when day of week >= 5, else 0).
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])

    day_of_week = df[date_col].dt.dayofweek
    month = df[date_col].dt.month

    # Sine/cosine pairs keep the ends of each cycle adjacent.
    df["dowsin"] = np.sin(2 * np.pi * day_of_week / 7)
    df["dowcos"] = np.cos(2 * np.pi * day_of_week / 7)
    df["monthsin"] = np.sin(2 * np.pi * month / 12)
    df["monthcos"] = np.cos(2 * np.pi * month / 12)
    df["isweekend"] = (day_of_week >= 5).astype(int)
    return df
python
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score
import xgboost as xgb
import mlflow
# Probability-based scorers shared by the cross-validation helper.
# NOTE(review): `needs_proba` is deprecated in recent scikit-learn releases in
# favour of `response_method="predict_proba"` — confirm against the pinned
# scikit-learn version before upgrading.
SCORERS = {
    "rocauc": make_scorer(roc_auc_score, needs_proba=True),
    "avgprec": make_scorer(average_precision_score, needs_proba=True),
}
def evaluate_model(model, X, y, cv=5):
    """Cross-validate a model and summarise each scorer in ``SCORERS``.

    Folds come from a shuffled ``StratifiedKFold`` with a fixed seed, so
    class proportions are preserved in every fold.

    Args:
        model: Estimator passed to ``cross_validate``.
        X: Feature matrix.
        y: Target labels.
        cv: Number of stratified folds.

    Returns:
        A dict keyed by scorer name. Each value holds ``"mean"`` and
        ``"std"`` of the test scores, and ``"overfitgap"``: mean train
        score minus mean test score.
    """
    cv_results = cross_validate(
        model, X, y,
        cv=StratifiedKFold(n_splits=cv, shuffle=True, random_state=42),
        scoring=SCORERS,
        return_train_score=True,
    )
    summary = {}
    for metric in SCORERS:
        test_scores = cv_results[f"test_{metric}"]
        summary[metric] = {"mean": test_scores.mean(), "std": test_scores.std()}
        # Flag overfitting: a large gap between train and test scores.
        train_mean = cv_results[f"train_{metric}"].mean()
        summary[metric]["overfitgap"] = train_mean - test_scores.mean()
    return summary
def trainandlog(model, Xtrain, ytrain, Xtest, ytest, run_name):
    """Fit a model and log its parameters, metrics and artifact to MLflow.

    Args:
        model: Estimator exposing ``fit``, ``predict_proba`` and
            ``get_params``.
        Xtrain: Training features.
        ytrain: Training labels.
        Xtest: Hold-out features.
        ytest: Hold-out labels.
        run_name: Name given to the MLflow run.

    Returns:
        A dict with ``"rocauc"`` and ``"avgprec"`` computed on the hold-out
        set.
    """
    with mlflow.start_run(run_name=run_name):
        model.fit(Xtrain, ytrain)
        # Column 1 of predict_proba is used as the score for both metrics.
        proba = model.predict_proba(Xtest)[:, 1]
        metrics = {
            "rocauc": roc_auc_score(ytest, proba),
            "avgprec": average_precision_score(ytest, proba),
        }
        mlflow.log_params(model.get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, "model")
        return metrics
python
import statsmodels.formula.api as smf
def diffindiff(df, outcome, treatmentcol, postcol, controls=None):
    """Estimate the ATT with an OLS difference-in-differences regression.

    Fits ``outcome ~ treatmentcol * postcol [+ controls]`` using HC3
    robust standard errors.

    Args:
        df: DataFrame containing the outcome, ``treatmentcol`` (0/1) and
            ``postcol`` (0/1), plus any control columns.
        outcome: Name of the outcome column.
        treatmentcol: Name of the treatment-group indicator column.
        postcol: Name of the post-period indicator column.
        controls: Optional iterable of covariate column names appended to
            the formula.

    Returns:
        A dict with ``"att"`` (coefficient on the treatment x post
        interaction) and ``"p_value"`` (its p-value).
    """
    covariates = " + ".join(controls) if controls else ""
    formula = (
        f"{outcome} ~ {treatmentcol} * {postcol}"
        + (f" + {covariates}" if covariates else "")
    )
    result = smf.ols(formula, data=df).fit(cov_type="HC3")
    interaction = f"{treatmentcol}:{postcol}"
    # NOTE(review): the source listing breaks off inside this dict; only the
    # two entries visible there are returned — confirm nothing was lost.
    return {
        "att": result.params[interaction],
        "p_value": result.pvalues[interaction],
    }
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 senior-data-scientist-1776349810 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 senior-data-scientist-1776349810 技能
skillhub install senior-data-scientist-1776349810
文件大小: 10.64 KB | 发布时间: 2026-4-17 16:07