返回顶部
r

rag-construction RAG建筑系统

Build RAG systems for construction knowledge bases. Create searchable AI-powered construction document systems.

作者: admin | 来源: ClawHub
源自
ClawHub
版本
V 2.1.0
安全检测
已通过
1,533
下载量
免费
免费
6
收藏
概述
安装方式
版本历史

rag-construction

RAG 构建

概述

基于 DDC 方法论(第 2.3 章),本技能构建用于施工知识库的检索增强生成(RAG)系统,实现对施工文档的语义搜索和基于 AI 的问答。

参考书籍: Pandas DataFrame и LLM ChatGPT / Pandas DataFrame and LLM ChatGPT

快速开始

python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Callable
from datetime import datetime
import json
import hashlib
import re

class DocumentType(Enum):
    """Types of construction documents."""

    # Values are plain strings so a member can be looked up from its value,
    # e.g. ``DocumentType("rfi")``.
    SPECIFICATION = "specification"
    DRAWING = "drawing"
    CONTRACT = "contract"
    RFI = "rfi"
    SUBMITTAL = "submittal"
    CHANGE_ORDER = "change_order"
    MEETING_MINUTES = "meeting_minutes"
    DAILY_REPORT = "daily_report"
    SAFETY_REPORT = "safety_report"
    INSPECTION = "inspection"
    MANUAL = "manual"
    STANDARD = "standard"

class ChunkingStrategy(Enum):
    """Strategies for splitting document text into chunks."""

    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEMANTIC = "semantic"
    SENTENCE = "sentence"

@dataclass
class DocumentChunk:
    """A chunk of document text.

    Attributes:
        id: Identifier of this chunk.
        document_id: Identifier of the document the chunk belongs to.
        content: Text of the chunk.
        metadata: Arbitrary key/value data attached to the chunk.
        embedding: Optional embedding vector; ``None`` until one is assigned.
        token_count: Token count recorded for the chunk (defaults to 0).
        position: Position index of the chunk (defaults to 0).
    """

    id: str
    document_id: str
    content: str
    metadata: Dict[str, Any]
    embedding: Optional[List[float]] = None
    token_count: int = 0
    position: int = 0

@dataclass
class Document:
    """A construction document.

    Attributes:
        id: Identifier of the document.
        title: Document title.
        doc_type: Kind of document (a ``DocumentType`` member).
        content: Full text of the document.
        source: Where the document came from.
        metadata: Arbitrary key/value data; a fresh dict per instance.
        chunks: Chunks produced from this document; a fresh list per instance.
        created_at: Timestamp taken with ``datetime.now`` at construction.
    """

    id: str
    title: str
    doc_type: DocumentType
    content: str
    source: str
    # default_factory gives each instance its own container / timestamp
    # instead of sharing one value across all instances.
    metadata: Dict[str, Any] = field(default_factory=dict)
    chunks: List[DocumentChunk] = field(default_factory=list)
    created_at: datetime = field(default_factory=datetime.now)

@dataclass
class SearchResult:
    """A search result from the vector store.

    Attributes:
        chunk: The matched chunk.
        score: Score assigned to the match.
        document_title: Title of the document the chunk belongs to.
        doc_type: Type of that document.
    """

    chunk: DocumentChunk
    score: float
    document_title: str
    doc_type: DocumentType

@dataclass
class RAGResponse:
    """A response from the RAG system.

    Attributes:
        query: The query that was asked.
        answer: The generated answer text.
        sources: Search results the answer is based on.
        confidence: Confidence value attached to the answer.
        tokens_used: Number of tokens used to produce the response.
    """

    query: str
    answer: str
    sources: List[SearchResult]
    confidence: float
    tokens_used: int

class TextChunker:
将文档拆分为用于嵌入的块

def __init__(
    self,
    strategy: ChunkingStrategy = ChunkingStrategy.PARAGRAPH,
    chunk_size: int = 500,
    chunk_overlap: int = 50
):
    """Configure how documents are split.

    Args:
        strategy: Chunking strategy that ``chunk_document`` dispatches on.
        chunk_size: Target chunk size, measured in characters.
        chunk_overlap: Number of characters shared by consecutive
            fixed-size chunks.
    """
    # Must be the ``__init__`` dunder: the class is instantiated with
    # positional arguments (``TextChunker(strategy, size)``), which only
    # works if this runs as the constructor.
    self.strategy = strategy
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap

def chunk_document(self, document: Document) -> List[DocumentChunk]:
    """Split a document into chunks using the configured strategy.

    Args:
        document: The document to split.

    Returns:
        The chunks produced by the strategy-specific method. Any strategy
        without a dedicated method (e.g. ``SEMANTIC``) falls back to
        fixed-size chunking.
    """
    if self.strategy == ChunkingStrategy.PARAGRAPH:
        return self._chunk_by_paragraph(document)
    if self.strategy == ChunkingStrategy.SECTION:
        return self._chunk_by_section(document)
    if self.strategy == ChunkingStrategy.SENTENCE:
        return self._chunk_by_sentence(document)
    # FIXED_SIZE and every unhandled strategy share the same implementation.
    return self._chunk_fixed_size(document)

def _chunk_fixed_size(self, document: Document) -> List[DocumentChunk]:
    """Split the document into fixed-size character windows with overlap.

    Each window spans at most ``self.chunk_size`` characters. When a window
    ends before the end of the text, its end is moved back to the nearest
    space, newline or tab so words are not cut in half; if the window holds
    no such character it is cut at the hard limit instead. Consecutive
    windows overlap by ``self.chunk_overlap`` characters.

    Args:
        document: Document whose ``content`` is split.

    Returns:
        Chunks in reading order with consecutive ``position`` values
        starting at 0. Windows that are empty after stripping are skipped.
    """
    chunks: List[DocumentChunk] = []
    text = document.content
    text_len = len(text)
    # Guard against a non-positive size, which could never make progress.
    window = max(self.chunk_size, 1)
    start = 0
    position = 0

    while start < text_len:
        hard_end = start + window
        end = hard_end

        # Look for a word boundary.
        if end < text_len:
            while end > start and text[end] not in " \n\t":
                end -= 1
            if end == start:
                # No whitespace in the whole window: cut at the hard limit
                # rather than producing an empty chunk.
                end = hard_end

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunk_id = self._generate_chunk_id(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=chunk_text,
                metadata={
                    "doc_type": document.doc_type.value,
                    "title": document.title,
                    **document.metadata
                },
                token_count=len(chunk_text.split()),
                position=position
            ))
            position += 1

        if end >= text_len:
            # This window already reached the end of the text; stepping back
            # by the overlap would only re-emit a tail of the same chunk.
            break

        next_start = end - self.chunk_overlap
        if next_start <= start:
            # The overlap would move the window backwards (or not at all),
            # which loops forever; continue right after this window instead.
            next_start = end
        start = next_start

    return chunks

def _chunk_by_paragraph(self, document: Document) -> List[DocumentChunk]:
    """Split the document on blank lines and pack paragraphs into chunks.

    Paragraphs (separated by ``"\\n\\n"``) are stripped, empty ones are
    dropped, and the rest are joined with a blank line for as long as the
    combined length stays below ``self.chunk_size`` characters. A single
    paragraph longer than the limit becomes a chunk of its own.

    Args:
        document: Document whose ``content`` is split.

    Returns:
        Chunks in reading order with consecutive ``position`` values
        starting at 0.
    """
    chunks: List[DocumentChunk] = []

    def emit(text: str, index: int) -> None:
        # Build one chunk; shared by the in-loop flush and the final flush.
        chunks.append(DocumentChunk(
            id=self._generate_chunk_id(document.id, index),
            document_id=document.id,
            content=text,
            metadata={
                "doc_type": document.doc_type.value,
                "title": document.title,
                **document.metadata
            },
            token_count=len(text.split()),
            position=index
        ))

    current_chunk = ""
    position = 0

    for para in document.content.split("\n\n"):
        para = para.strip()
        if not para:
            continue

        if len(current_chunk) + len(para) < self.chunk_size:
            current_chunk = (
                f"{current_chunk}\n\n{para}" if current_chunk else para
            )
        else:
            if current_chunk:
                emit(current_chunk, position)
                position += 1
            current_chunk = para

    # Flush whatever is left after the last paragraph.
    if current_chunk:
        emit(current_chunk, position)

    return chunks

def _chunk_by_section(self, document: Document) -> List[DocumentChunk]:
    """Split the document at section headings.

    The text is split at every newline that is followed by a heading-like
    prefix (a number with a dot or whitespace, or ``SECTION`` / ``ARTICLE``
    / ``PART``), then whitespace and an uppercase letter. Sections longer
    than twice ``self.chunk_size`` characters are split again by paragraph.

    Args:
        document: Document whose ``content`` is split.

    Returns:
        Chunks in reading order. A section that fits gets ``position`` equal
        to its section index; sub-chunks of an oversized section get
        ``section_index * 100 + sub_index``.
    """
    # Split on common section patterns.
    section_pattern = r'\n(?=(?:\d+\.|\d+\s|SECTION|ARTICLE|PART)\s+[A-Z])'
    sections = re.split(section_pattern, document.content)

    chunks: List[DocumentChunk] = []
    for position, section in enumerate(sections):
        section = section.strip()
        if not section:
            continue

        # If the section is too large, split it further.
        if len(section) > self.chunk_size * 2:
            sub_chunker = TextChunker(ChunkingStrategy.PARAGRAPH, self.chunk_size)
            sub_doc = Document(
                id=f"{document.id}_sec{position}",
                title=document.title,
                doc_type=document.doc_type,
                content=section,
                source=document.source,
                metadata=document.metadata
            )
            sub_chunks = sub_chunker.chunk_document(sub_doc)
            for i, chunk in enumerate(sub_chunks):
                chunk.id = self._generate_chunk_id(document.id, position * 100 + i)
                # The temporary per-section document is only a vehicle for
                # the sub-chunker; point the chunk back at the real document
                # so its id and document_id agree.
                chunk.document_id = document.id
                chunk.position = position * 100 + i
            chunks.extend(sub_chunks)
        else:
            chunk_id = self._generate_chunk_id(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=section,
                metadata={
                    "doc_type": document.doc_type.value,
                    "title": document.title,
                    **document.metadata
                },
                token_count=len(section.split()),
                position=position
            ))

    return chunks

def chunkby_sentence(self, document: Document) -> List[DocumentChunk]:
按句子分块,分组以满足大小要求
# 简单句子拆分
sentences = re.split(r(?<=[.!?])\s+, document.content)

chunks = []
current_chunk =
position = 0

for sentence in sentences:
if len(currentchunk) + len(sentence) < self.chunksize:
currentchunk += + sentence if currentchunk else sentence
else:
if current_chunk:
chunkid = self.generatechunkid(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=current_chunk.strip(),
metadata={
doctype: document.doctype.value,
title: document.title,
document.metadata
},
tokencount=len(currentchunk.split()),
position=position
))
position += 1
current_chunk = sentence

if current_chunk:
chunkid = self.generatechunkid(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=current_ch

标签

skill ai

通过对话安装

该技能支持在以下平台通过对话安装:

OpenClaw WorkBuddy QClaw Kimi Claude

方式一:安装 SkillHub 和技能

帮我安装 SkillHub 和 rag-construction-1776344772 技能

方式二:设置 SkillHub 为优先技能安装源

设置 SkillHub 为我的优先技能安装源,然后帮我安装 rag-construction-1776344772 技能

通过命令行安装

skillhub install rag-construction-1776344772

下载

⬇ 下载 rag-construction v2.1.0(免费)

文件大小: 7.51 KB | 发布时间: 2026-4-17 13:58

v2.1.0 最新 2026-4-17 13:58
rag-construction 2.1.0

- Added SKILL.md describing the feature set, quick start guide, and technical reference.
- Provides tools for building Retrieval-Augmented Generation (RAG) systems for construction knowledge bases.
- Includes classes for document management, chunking strategies, and semantic search support.
- Enables creation of AI-powered, searchable document systems for construction projects.
- Metadata now describes supported platforms and requirements.

Archiver·手机版·闲社网·闲社论坛·羊毛社区· 多链控股集团有限公司 · 苏ICP备2025199260号-1

Powered by Discuz! X5.0   © 2024-2025 闲社网·线报更新论坛·羊毛分享社区·http://xianshe.com

p2p_official_large
返回顶部