Comprehensive audit of all construction data sources and systems. Map data flows, identify silos, assess quality, and create an integration roadmap.
技能名称: data-source-audit
详细描述:
对施工数据源进行全面审计,以识别数据孤岛、绘制数据流图、评估数据质量并规划集成策略。对于数字化转型和数据驱动的施工项目至关重要。
施工组织通常拥有10-50+个数据源:
注意: 此技能与供应商无关,适用于任何数据源。示例中其他地方提及的产品名称是其各自所有者的商标。
此技能有助于:
python
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Set
from enum import Enum
from datetime import datetime
import pandas as pd
import json
class DataSourceType(Enum):
    """Kinds of system or medium a data source can be.

    Every member's value is a lowercase string, so a member can be looked
    up from plain text, e.g. ``DataSourceType("api")``.
    """

    # NOTE(review): the compound names and values (FILESHARE / "fileshare",
    # MANUALENTRY / "manualentry", ...) look as if they lost an underscore in
    # transit. They are kept exactly as found so existing references keep
    # resolving -- confirm against callers before renaming.
    DATABASE = "database"
    API = "api"
    FILESHARE = "fileshare"
    CLOUDAPP = "cloudapp"
    SPREADSHEET = "spreadsheet"
    LEGACYSYSTEM = "legacysystem"
    IOTSENSOR = "iotsensor"
    MANUALENTRY = "manualentry"
class DataDomain(Enum):
    """Business domains that a data source can hold data for.

    Every member's value is a lowercase string, so a member can be looked
    up from plain text, e.g. ``DataDomain("cost")``.
    """

    COST = "cost"
    SCHEDULE = "schedule"
    BIM = "bim"
    DOCUMENT = "document"
    # Must be the string "field": a bare ``field`` would bind the
    # ``dataclasses.field`` function imported at the top of the file.
    FIELD = "field"
    SAFETY = "safety"
    QUALITY = "quality"
    HR = "hr"
    ACCOUNTING = "accounting"
    PROCUREMENT = "procurement"
@dataclass
class DataSource:
    """Catalogue entry describing one data source.

    The first twelve fields are required; quality scores, integration
    status, issues and audit metadata all have defaults.
    """

    name: str
    source_type: DataSourceType
    domains: List[DataDomain]
    owner: str
    department: str
    description: str

    # Technical details
    technology: str
    location: str  # cloud, on-prem, hybrid
    access_method: str  # API, ODBC, file export, manual

    # Data characteristics
    update_frequency: str  # real-time, daily, weekly, monthly, ad-hoc
    data_volume: str  # small, medium, large
    retention_period: str

    # Quality metrics
    completeness_score: float = 0.0
    accuracy_score: float = 0.0
    timeliness_score: float = 0.0

    # Integration status
    # List defaults go through default_factory so that every instance gets
    # its own list instead of sharing one.
    integrations: List[str] = field(default_factory=list)
    is_master: bool = False  # Is this the master source for any entity?
    # NOTE(review): "masterfor", "knownissues" and "lastauditdate" look as if
    # they lost an underscore in transit; the names are kept as found so the
    # constructor keywords stay unchanged -- confirm before renaming.
    masterfor: List[str] = field(default_factory=list)

    # Issues
    knownissues: List[str] = field(default_factory=list)

    # Metadata
    lastauditdate: Optional[datetime] = None
    audit_notes: str = ""
@dataclass
class DataFlow:
    """One data flow between two sources, each referred to by name."""

    source: str
    target: str
    flow_type: str  # push, pull, bidirectional, manual
    frequency: str
    entities: List[str]  # What data entities flow
    transformation: str  # none, simple, complex
    status: str  # active, planned, deprecated
@dataclass
class DataSilo:
    """A data-silo finding: the sources involved, its impact and remedies."""

    name: str
    sources: List[str]
    impact: str  # high, medium, low
    description: str
    resolution_options: List[str]
class DataSourceAuditor:
    """Audit and analyze construction data sources."""

    def __init__(self):
        """Start with an empty catalogue: no sources, flows or silos."""
        self.sources: Dict[str, DataSource] = {}
        self.flows: List[DataFlow] = []
        self.silos: List[DataSilo] = []

    # The initializer was previously spelled ``init``, which Python never
    # calls on construction; the old name stays available as an alias.
    init = __init__
def add_source(self, source: DataSource):
    """Register a data source under its ``name``.

    A source registered earlier under the same name is replaced.
    """
    self.sources[source.name] = source
def add_flow(self, flow: DataFlow):
    """Register a data flow between sources.

    Flows are appended in call order; the source and target names are not
    checked against the registered sources.
    """
    self.flows.append(flow)
def discoversourcesfromsurvey(self, surveyresponses: List[Dict]) -> List[DataSource]:
    """Create data sources from survey responses and register each one.

    Args:
        surveyresponses: One mapping per surveyed system. Each must provide
            "system_name", "type", "domains", "owner", "department",
            "description", "technology", "location", "access_method",
            "update_frequency", "data_volume" and "retention_period".
            "type" must be a ``DataSourceType`` value and every entry of
            "domains" a ``DataDomain`` value.

    Returns:
        The newly created sources, in the order of the responses.

    Raises:
        KeyError: A response lacks one of the required keys.
        ValueError: "type" or a domain is not a known enum value.

    Sources built from responses that precede a failing one are already
    registered when the exception propagates.
    """
    # NOTE(review): the keys "access_method", "update_frequency",
    # "data_volume" and "retention_period" are assumed to be spelled like the
    # DataSource fields they fill -- confirm against the survey export.
    sources = []
    for response in surveyresponses:
        source = DataSource(
            name=response["system_name"],
            source_type=DataSourceType(response["type"]),
            domains=[DataDomain(d) for d in response["domains"]],
            owner=response["owner"],
            department=response["department"],
            description=response["description"],
            technology=response["technology"],
            location=response["location"],
            access_method=response["access_method"],
            update_frequency=response["update_frequency"],
            data_volume=response["data_volume"],
            retention_period=response["retention_period"],
        )
        sources.append(source)
        self.add_source(source)
    return sources
def identify_silos(self) -> List[DataSilo]:
    """Identify data silos based on integration analysis.

    Three checks run over the registered sources and flows:

    1. Sources with no integrations (manual-entry sources are exempt) are
       grouped into one high-impact "Isolated Systems" silo.
    2. Each domain served by more than one source, none of them flagged
       ``is_master``, yields a medium-impact silo.
    3. Each pair of sources linked by exactly one non-bidirectional flow
       and sharing at least one domain yields a low-impact silo.

    The result is also stored in ``self.silos``, replacing earlier findings.

    Returns:
        The silos found, in the order of the checks above.
    """
    silos = []

    # Find sources with no integrations
    isolated_sources = [
        name
        for name, source in self.sources.items()
        if not source.integrations
        and source.source_type != DataSourceType.MANUALENTRY
    ]
    if isolated_sources:
        silos.append(DataSilo(
            name="Isolated Systems",
            sources=isolated_sources,
            impact="high",
            description="Systems with no integrations, requiring manual data transfer",
            resolution_options=[
                "Implement API integration",
                "Set up automated file exports",
                "Migrate to integrated platform",
            ],
        ))

    # Find duplicate data domains without master
    domain_sources: Dict[DataDomain, List[str]] = {}
    for name, source in self.sources.items():
        for domain in source.domains:
            domain_sources.setdefault(domain, []).append(name)

    for domain, names in domain_sources.items():
        if len(names) <= 1:
            continue
        # Nothing to report when any of the sources is a designated master.
        if any(self.sources[n].is_master for n in names):
            continue
        silos.append(DataSilo(
            name=f"No Master for {domain.value}",
            sources=names,
            impact="medium",
            description=f"Multiple sources for {domain.value} data without designated master",
            resolution_options=[
                "Designate master data source",
                "Implement MDM solution",
                "Create data reconciliation process",
            ],
        ))

    # Find one-way flows that should be bidirectional.
    # Flows are grouped by unordered endpoint pair, so A->B and B->A land in
    # the same bucket.
    flow_pairs = {}
    for flow in self.flows:
        key = tuple(sorted([flow.source, flow.target]))
        flow_pairs.setdefault(key, []).append(flow)

    for (s1, s2), flows in flow_pairs.items():
        if len(flows) != 1 or flows[0].flow_type == "bidirectional":
            continue
        only_flow = flows[0]
        first = self.sources.get(s1)
        second = self.sources.get(s2)
        if first is None or second is None:
            # A flow may name a system that was never registered; without
            # its domains the overlap cannot be judged, so skip the pair
            # instead of failing the whole audit with a KeyError.
            continue
        if set(first.domains) & set(second.domains):  # Overlapping domains
            silos.append(DataSilo(
                # Report the flow's real direction; (s1, s2) is only the
                # alphabetically sorted grouping key.
                name=f"One-way flow: {only_flow.source} -> {only_flow.target}",
                sources=[s1, s2],
                impact="low",
                description="Data flows one direction only between systems with overlapping domains",
                resolution_options=[
                    "Evaluate need for bidirectional sync",
                    "Implement change data capture",
                ],
            ))

    self.silos = silos
    return silos
def assesssourcequality(self, sourcename: str, sampledata: pd.DataFrame) -> Dict[str, float]:
Assess data quality for a source based on sample data.
if source_name not in self.sources:
raise ValueError(fUnknown source: {source_name})
scores = {}
# Completeness: % of non-null values
completeness = 1 - (sampledata.isnull().sum().sum() / sampledata.size)
scores[completeness] = completeness
# Uniqueness: % of unique rows (for key columns)
if len(sample_data) > 0:
uniqueness = len(sampledata.dropduplicates()) / len(sample_data)
else:
uniqueness = 1.0
scores[uniqueness] = uniqueness
# Validity: Basic format checks (simplified)
validity_checks = 0
total_checks = 0
for col in sample_data.columns:
if date in col.lower():
total_checks += 1
try:
pd.todatetime(sampledata[col], errors=raise)
validity_checks += 1
except:
pass
if email in col.lower():
total_checks += 1
validemails = sampledata[col].str.contains(r@.*
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 data-source-audit-1776345073 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 data-source-audit-1776345073 技能
skillhub install data-source-audit-1776345073
文件大小: 7.64 KB | 发布时间: 2026-4-17 14:32