Handle CSV files from construction software exports. Auto-detect delimiters, encodings, and clean messy data."
技能名称: csv-handler
详细描述:
python
import pandas as pd
import csv
from typing import Dict, Any, List, Optional, Tuple
from pathlib import Path
from dataclasses import dataclass
import chardet
@dataclass
class CSVProfile:
CSV文件配置文件。
encoding: str
delimiter: str
has_header: bool
row_count: int
column_count: int
columns: List[str]
class ConstructionCSVHandler:
处理来自建筑软件的CSV文件。
COMMON_DELIMITERS = [,, ;, \t, |]
COMMON_ENCODINGS = [utf-8, utf-8-sig, latin-1, cp1252, iso-8859-1]
def init(self):
self.last_profile: Optional[CSVProfile] = None
def detectencoding(self, filepath: str) -> str:
检测文件编码。
with open(file_path, rb) as f:
raw = f.read(10000)
result = chardet.detect(raw)
return result.get(encoding, utf-8) or utf-8
def detectdelimiter(self, filepath: str, encoding: str) -> str:
检测CSV分隔符。
with open(file_path, r, encoding=encoding, errors=replace) as f:
sample = f.read(5000)
# 统计出现次数
counts = {d: sample.count(d) for d in self.COMMON_DELIMITERS}
# 返回出现最频繁且一致的分隔符
if counts:
return max(counts, key=counts.get)
return ,
def profilecsv(self, filepath: str) -> CSVProfile:
分析CSV文件。
encoding = self.detectencoding(filepath)
delimiter = self.detectdelimiter(filepath, encoding)
# 读取样本
df = pd.readcsv(filepath, encoding=encoding, delimiter=delimiter,
nrows=10, onbadlines=skip)
has_header = not df.columns[0].replace(., ).replace(-, ).isdigit()
# 完整行数统计
with open(file_path, r, encoding=encoding, errors=replace) as f:
rowcount = sum(1 for in f) - (1 if has_header else 0)
profile = CSVProfile(
encoding=encoding,
delimiter=delimiter,
hasheader=hasheader,
rowcount=rowcount,
column_count=len(df.columns),
columns=list(df.columns)
)
self.last_profile = profile
return profile
def readcsv(self, filepath: str,
encoding: Optional[str] = None,
delimiter: Optional[str] = None,
clean: bool = True) -> pd.DataFrame:
自动检测并读取CSV文件。
# 若未提供则自动检测
if encoding is None:
encoding = self.detectencoding(filepath)
if delimiter is None:
delimiter = self.detectdelimiter(filepath, encoding)
# 带错误处理的读取
df = pd.read_csv(
file_path,
encoding=encoding,
delimiter=delimiter,
onbadlines=skip,
low_memory=False
)
if clean:
df = self.clean_dataframe(df)
return df
def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
清洗建筑CSV数据。
# 清洗列名
df.columns = [self.cleancolumn_name(c) for c in df.columns]
# 移除空行和空列
df = df.dropna(how=all)
df = df.dropna(axis=1, how=all)
# 去除字符串首尾空格
for col in df.select_dtypes(include=[object]):
df[col] = df[col].str.strip() if df[col].dtype == object else df[col]
return df
def cleancolumn_name(self, name: str) -> str:
清洗列名。
if not isinstance(name, str):
return str(name)
# 移除特殊字符,替换空格
clean = name.strip().lower()
clean = clean.replace( , ).replace(-, )
clean = .join(c for c in clean if c.isalnum() or c == _)
return clean
def mergecsvs(self, filepaths: List[str],
on_column: Optional[str] = None) -> pd.DataFrame:
合并多个CSV文件。
dfs = []
for path in file_paths:
df = self.read_csv(path)
df[sourcefile] = Path(path).name
dfs.append(df)
if not dfs:
return pd.DataFrame()
if oncolumn and oncolumn in dfs[0].columns:
result = dfs[0]
for df in dfs[1:]:
result = pd.merge(result, df, on=on_column, how=outer)
return result
return pd.concat(dfs, ignore_index=True)
def split_csv(self, df: pd.DataFrame,
group_column: str,
output_dir: str) -> List[str]:
按列值拆分CSV文件。
outputpath = Path(outputdir)
outputpath.mkdir(parents=True, existok=True)
files = []
for value in df[group_column].unique():
subset = df[df[group_column] == value]
filename = f{groupcolumn}{value}.csv
filepath = output_path / filename
subset.to_csv(filepath, index=False)
files.append(str(filepath))
return files
def convert_types(self, df: pd.DataFrame,
type_map: Dict[str, str] = None) -> pd.DataFrame:
智能转换列类型。
df = df.copy()
if type_map:
for col, dtype in type_map.items():
if col in df.columns:
try:
df[col] = df[col].astype(dtype)
except:
pass
else:
# 自动转换
for col in df.columns:
# 尝试数值类型
try:
df[col] = pd.to_numeric(df[col])
continue
except:
pass
# 尝试日期时间类型
try:
df[col] = pd.to_datetime(df[col])
except:
pass
return df
def export_csv(self, df: pd.DataFrame,
file_path: str,
encoding: str = utf-8-sig,
delimiter: str = ,) -> str:
将DataFrame导出为CSV文件。
df.tocsv(filepath, encoding=encoding, sep=delimiter, index=False)
return file_path
SCHEDULE_COLUMNS
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 csv-handler-1776345028 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 csv-handler-1776345028 技能
skillhub install csv-handler-1776345028
文件大小: 4.52 KB | 发布时间: 2026-4-17 15:41