Profile construction data to understand characteristics, distributions, quality metrics, and patterns. Essential for data quality assessment and ETL planning."
技能名称: data-profiler
分析施工数据,以了解其特征、分布、质量和模式。对于数据质量评估、ETL规划以及在问题影响项目之前识别数据问题至关重要。
在使用任何施工数据之前,您需要了解:
该技能对数据进行剖析以回答这些问题,并提供可操作的见解。
python
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import numpy as np
from datetime import datetime
import json
@dataclass
class ColumnProfile:
name: str
data_type: str
inferredtype: str # 更具体:projectid, cost, date, csi_code等
total_count: int
null_count: int
null_percentage: float
unique_count: int
uniqueness_ratio: float
# 数值列
min_value: Optional[float] = None
max_value: Optional[float] = None
mean_value: Optional[float] = None
median_value: Optional[float] = None
std_dev: Optional[float] = None
# 字符串列
min_length: Optional[int] = None
max_length: Optional[int] = None
avg_length: Optional[float] = None
# 最高频值
topvalues: List[Tuple[Any, int]] = field(defaultfactory=list)
# 模式
commonpatterns: List[str] = field(defaultfactory=list)
# 质量标记
qualityissues: List[str] = field(defaultfactory=list)
@dataclass
class DataProfile:
source_name: str
row_count: int
column_count: int
columns: List[ColumnProfile]
duplicate_rows: int
memory_usage: str
profiled_at: datetime
quality_score: float
recommendations: List[str]
class ConstructionDataProfiler:
剖析施工数据的质量和特征。
# 已知的施工数据模式
CONSTRUCTION_PATTERNS = {
csi_code: r^\d{2}\s?\d{2}\s?\d{2}$,
projectid: r^[A-Z]{2,4}[-]?\d{3,6}$,
cost_code: r^\d{2}[-.]?\d{2,4}$,
wbs: r^[\d.]+$,
phone: r^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$,
email: r^[\w.-]+@[\w.-]+\.\w+$,
date_iso: r^\d{4}-\d{2}-\d{2},
date_us: r^\d{1,2}/\d{1,2}/\d{2,4}$,
currency: r^\$?[\d,]+\.?\d{0,2}$,
percentage: r^\d+\.?\d*%?$,
}
# 施工特定列名模式
COLUMNTYPEHINTS = {
project: [projectid, projectname, proj, job],
cost: [cost, amount, price, total, budget, actual],
date: [date, start, finish, end, created, modified],
quantity: [qty, quantity, count, units],
csi: [csi, division, masterformat, spec],
location: [location, area, zone, floor, level],
person: [owner, manager, superintendent, foreman, contact],
}
def init(self):
self.profiles: Dict[str, DataProfile] = {}
def profiledataframe(self, df: pd.DataFrame, sourcename: str) -> DataProfile:
剖析一个pandas DataFrame。
columns = []
for col in df.columns:
colprofile = self.profile_column(df[col], col)
columns.append(col_profile)
# 计算重复行
duplicaterows = len(df) - len(df.dropduplicates())
# 计算内存使用量
memorybytes = df.memoryusage(deep=True).sum()
if memory_bytes < 1024:
memoryusage = f{memorybytes} B
elif memory_bytes < 10242:
memoryusage = f{memorybytes/1024:.1f} KB
else:
memoryusage = f{memorybytes/10242:.1f} MB
# 计算整体质量分数
qualityscore = self.calculatequalityscore(columns)
# 生成建议
recommendations = self.generaterecommendations(columns, df)
profile = DataProfile(
sourcename=sourcename,
row_count=len(df),
column_count=len(df.columns),
columns=columns,
duplicaterows=duplicaterows,
memoryusage=memoryusage,
profiled_at=datetime.now(),
qualityscore=qualityscore,
recommendations=recommendations
)
self.profiles[source_name] = profile
return profile
def profilecolumn(self, series: pd.Series, name: str) -> ColumnProfile:
剖析单个列。
total_count = len(series)
null_count = series.isnull().sum()
nullpercentage = (nullcount / totalcount * 100) if totalcount > 0 else 0
# 获取非空值进行分析
non_null = series.dropna()
uniquecount = nonnull.nunique()
uniquenessratio = uniquecount / len(nonnull) if len(nonnull) > 0 else 0
profile = ColumnProfile(
name=name,
data_type=str(series.dtype),
inferredtype=self.inferconstructiontype(series, name),
totalcount=totalcount,
nullcount=nullcount,
nullpercentage=round(nullpercentage, 2),
uniquecount=uniquecount,
uniquenessratio=round(uniquenessratio, 4)
)
# 数值分析
if pd.api.types.isnumericdtype(series):
profile.minvalue = float(nonnull.min()) if len(non_null) > 0 else None
profile.maxvalue = float(nonnull.max()) if len(non_null) > 0 else None
profile.meanvalue = float(nonnull.mean()) if len(non_null) > 0 else None
profile.medianvalue = float(nonnull.median()) if len(non_null) > 0 else None
profile.stddev = float(nonnull.std()) if len(non_null) > 1 else None
# 检查离群点
if len(nonnull) > 10 and profile.stddev:
outliers = nonnull[abs(nonnull - profile.meanvalue) > 3 * profile.stddev]
if len(outliers) > 0:
profile.quality_issues.append(f检测到{len(outliers)}个潜在离群点)
# 检查负成本
if any(hint in name.lower() for hint in [cost, amount, price, total]):
negatives = (non_null < 0).sum()
if negatives > 0:
profile.quality_issues.append(f成本列中有{negatives}个负值)
# 字符串分析
elif pd.api.types.isobjectdtype(series) or pd.api.types.isstringdtype(series):
strseries = nonnull.astype(str)
lengths = str_series.str.len()
profile.min_length = int(lengths.min()) if len(lengths) > 0 else None
profile.max_length = int(lengths.max()) if len(lengths) > 0 else None
profile.avg_length = float(lengths.mean()) if len(lengths) > 0 else None
# 检测模式
profile.commonpatterns = self.detectpatterns(strseries)
# 最高频值
if len(non_null) > 0:
valuecounts = nonnull.value_counts().head(5)
profile.topvalues = list(zip(valuecounts.index.tolist(), value_counts.values.tolist()))
# 质量检查
if null_percentage > 50:
profile.quality_issues.append(高空值率(>50%))
if uniquenessratio == 1.0 and totalcount > 100:
profile.quality_issues.append(所有值唯一 - 可能是ID列)
if uniquenessratio < 0.01 and uniquecount > 1:
profile.quality_issues.append
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 data-profiler-1776345019 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 data-profiler-1776345019 技能
skillhub install data-profiler-1776345019
文件大小: 6.71 KB | 发布时间: 2026-4-17 14:46