返回顶部
d

data-profiler" 数据剖析器

Profile construction data to understand characteristics, distributions, quality metrics, and patterns. Essential for data quality assessment and ETL planning."

作者: admin | 来源: ClawHub
源自
ClawHub
版本
V 2.1.0
安全检测
已通过
1,138
下载量
免费
免费
0
收藏
概述
安装方式
版本历史

data-profiler"

技能名称: data-profiler

施工数据剖析器

概述

分析施工数据,以了解其特征、分布、质量和模式。对于数据质量评估、ETL规划以及在问题影响项目之前识别数据问题至关重要。

业务场景

在使用任何施工数据之前,您需要了解:

  • - 存在哪些数据类型
  • 值的分布情况
  • 缺失数据的模式
  • 异常值和离群点
  • 参照完整性问题

该技能对数据进行剖析以回答这些问题,并提供可操作的见解。

技术实现

python
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
import numpy as np
from datetime import datetime
import json

@dataclass
class ColumnProfile:
name: str
data_type: str
inferredtype: str # 更具体:projectid, cost, date, csi_code等
total_count: int
null_count: int
null_percentage: float
unique_count: int
uniqueness_ratio: float
# 数值列
min_value: Optional[float] = None
max_value: Optional[float] = None
mean_value: Optional[float] = None
median_value: Optional[float] = None
std_dev: Optional[float] = None
# 字符串列
min_length: Optional[int] = None
max_length: Optional[int] = None
avg_length: Optional[float] = None
# 最高频值
topvalues: List[Tuple[Any, int]] = field(defaultfactory=list)
# 模式
commonpatterns: List[str] = field(defaultfactory=list)
# 质量标记
qualityissues: List[str] = field(defaultfactory=list)

@dataclass
class DataProfile:
source_name: str
row_count: int
column_count: int
columns: List[ColumnProfile]
duplicate_rows: int
memory_usage: str
profiled_at: datetime
quality_score: float
recommendations: List[str]

class ConstructionDataProfiler:
剖析施工数据的质量和特征。

# 已知的施工数据模式
CONSTRUCTION_PATTERNS = {
csi_code: r^\d{2}\s?\d{2}\s?\d{2}$,
projectid: r^[A-Z]{2,4}[-]?\d{3,6}$,
cost_code: r^\d{2}[-.]?\d{2,4}$,
wbs: r^[\d.]+$,
phone: r^\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$,
email: r^[\w.-]+@[\w.-]+\.\w+$,
date_iso: r^\d{4}-\d{2}-\d{2},
date_us: r^\d{1,2}/\d{1,2}/\d{2,4}$,
currency: r^\$?[\d,]+\.?\d{0,2}$,
percentage: r^\d+\.?\d*%?$,
}

# 施工特定列名模式
COLUMNTYPEHINTS = {
project: [projectid, projectname, proj, job],
cost: [cost, amount, price, total, budget, actual],
date: [date, start, finish, end, created, modified],
quantity: [qty, quantity, count, units],
csi: [csi, division, masterformat, spec],
location: [location, area, zone, floor, level],
person: [owner, manager, superintendent, foreman, contact],
}

def init(self):
self.profiles: Dict[str, DataProfile] = {}

def profiledataframe(self, df: pd.DataFrame, sourcename: str) -> DataProfile:
剖析一个pandas DataFrame。
columns = []

for col in df.columns:
colprofile = self.profile_column(df[col], col)
columns.append(col_profile)

# 计算重复行
duplicaterows = len(df) - len(df.dropduplicates())

# 计算内存使用量
memorybytes = df.memoryusage(deep=True).sum()
if memory_bytes < 1024:
memoryusage = f{memorybytes} B
elif memory_bytes < 10242:
memoryusage = f{memorybytes/1024:.1f} KB
else:
memoryusage = f{memorybytes/10242:.1f} MB

# 计算整体质量分数
qualityscore = self.calculatequalityscore(columns)

# 生成建议
recommendations = self.generaterecommendations(columns, df)

profile = DataProfile(
sourcename=sourcename,
row_count=len(df),
column_count=len(df.columns),
columns=columns,
duplicaterows=duplicaterows,
memoryusage=memoryusage,
profiled_at=datetime.now(),
qualityscore=qualityscore,
recommendations=recommendations
)

self.profiles[source_name] = profile
return profile

def profilecolumn(self, series: pd.Series, name: str) -> ColumnProfile:
剖析单个列。
total_count = len(series)
null_count = series.isnull().sum()
nullpercentage = (nullcount / totalcount * 100) if totalcount > 0 else 0

# 获取非空值进行分析
non_null = series.dropna()
uniquecount = nonnull.nunique()
uniquenessratio = uniquecount / len(nonnull) if len(nonnull) > 0 else 0

profile = ColumnProfile(
name=name,
data_type=str(series.dtype),
inferredtype=self.inferconstructiontype(series, name),
totalcount=totalcount,
nullcount=nullcount,
nullpercentage=round(nullpercentage, 2),
uniquecount=uniquecount,
uniquenessratio=round(uniquenessratio, 4)
)

# 数值分析
if pd.api.types.isnumericdtype(series):
profile.minvalue = float(nonnull.min()) if len(non_null) > 0 else None
profile.maxvalue = float(nonnull.max()) if len(non_null) > 0 else None
profile.meanvalue = float(nonnull.mean()) if len(non_null) > 0 else None
profile.medianvalue = float(nonnull.median()) if len(non_null) > 0 else None
profile.stddev = float(nonnull.std()) if len(non_null) > 1 else None

# 检查离群点
if len(nonnull) > 10 and profile.stddev:
outliers = nonnull[abs(nonnull - profile.meanvalue) > 3 * profile.stddev]
if len(outliers) > 0:
profile.quality_issues.append(f检测到{len(outliers)}个潜在离群点)

# 检查负成本
if any(hint in name.lower() for hint in [cost, amount, price, total]):
negatives = (non_null < 0).sum()
if negatives > 0:
profile.quality_issues.append(f成本列中有{negatives}个负值)

# 字符串分析
elif pd.api.types.isobjectdtype(series) or pd.api.types.isstringdtype(series):
strseries = nonnull.astype(str)
lengths = str_series.str.len()
profile.min_length = int(lengths.min()) if len(lengths) > 0 else None
profile.max_length = int(lengths.max()) if len(lengths) > 0 else None
profile.avg_length = float(lengths.mean()) if len(lengths) > 0 else None

# 检测模式
profile.commonpatterns = self.detectpatterns(strseries)

# 最高频值
if len(non_null) > 0:
valuecounts = nonnull.value_counts().head(5)
profile.topvalues = list(zip(valuecounts.index.tolist(), value_counts.values.tolist()))

# 质量检查
if null_percentage > 50:
profile.quality_issues.append(高空值率(>50%))
if uniquenessratio == 1.0 and totalcount > 100:
profile.quality_issues.append(所有值唯一 - 可能是ID列)
if uniquenessratio < 0.01 and uniquecount > 1:
profile.quality_issues.append

标签

skill ai

通过对话安装

该技能支持在以下平台通过对话安装:

OpenClaw WorkBuddy QClaw Kimi Claude

方式一:安装 SkillHub 和技能

帮我安装 SkillHub 和 data-profiler-1776345019 技能

方式二:设置 SkillHub 为优先技能安装源

设置 SkillHub 为我的优先技能安装源,然后帮我安装 data-profiler-1776345019 技能

通过命令行安装

skillhub install data-profiler-1776345019

下载

⬇ 下载 data-profiler" v2.1.0(免费)

文件大小: 6.71 KB | 发布时间: 2026-4-17 14:46

v2.1.0 最新 2026-4-17 14:46
**Summary: Enhanced profiling and construction-specific insights**

- Improved profiling to capture characteristics, distributions, nulls, quality metrics, and patterns in construction data.
- Added construction-specific pattern and column type detection (e.g., csi_code, project_id, cost fields).
- Now includes quality issue flags and actionable recommendations.
- Reports memory usage, duplicate rows, and overall quality score.
- Documentation updated with clearer business case, technical overview, and usage examples.

Archiver·手机版·闲社网·闲社论坛·羊毛社区· 多链控股集团有限公司 · 苏ICP备2025199260号-1

Powered by Discuz! X5.0   © 2024-2025 闲社网·线报更新论坛·羊毛分享社区·http://xianshe.com

p2p_official_large
返回顶部