Build RAG systems for construction knowledge bases. Create searchable, AI-powered construction document systems.
基于 DDC 方法论(第 2.3 章),本技能构建用于施工知识库的检索增强生成(RAG)系统,实现对施工文档的语义搜索和基于 AI 的问答。
参考书籍: Pandas DataFrame и LLM ChatGPT / Pandas DataFrame and LLM ChatGPT
python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Callable
from datetime import datetime
import json
import hashlib
import re
class DocumentType(Enum):
    """Types of construction documents.

    Each member's value is the lowercase, underscore-separated form of its
    name, so a member can be looked up from its string value with
    ``DocumentType("change_order")``.
    """

    SPECIFICATION = "specification"
    DRAWING = "drawing"
    CONTRACT = "contract"
    RFI = "rfi"
    SUBMITTAL = "submittal"
    CHANGE_ORDER = "change_order"
    MEETING_MINUTES = "meeting_minutes"
    DAILY_REPORT = "daily_report"
    SAFETY_REPORT = "safety_report"
    INSPECTION = "inspection"
    MANUAL = "manual"
    STANDARD = "standard"
class ChunkingStrategy(Enum):
    """Text chunking strategies.

    Each member's value is the lowercase form of its name.
    """

    FIXED_SIZE = "fixed_size"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEMANTIC = "semantic"
    SENTENCE = "sentence"
@dataclass
class DocumentChunk:
    """A chunk of document text."""

    id: str
    # Identifier of the document this chunk was cut from.
    document_id: str
    content: str
    metadata: Dict[str, Any]
    # Not populated at construction time; defaults to None.
    embedding: Optional[List[float]] = None
    token_count: int = 0
    # Ordinal of the chunk within its document.
    position: int = 0
@dataclass
class Document:
    """A construction document."""

    id: str
    title: str
    doc_type: DocumentType
    content: str
    source: str
    # Mutable defaults go through default_factory so instances never share
    # the same dict or list.
    metadata: Dict[str, Any] = field(default_factory=dict)
    chunks: List[DocumentChunk] = field(default_factory=list)
    # Defaults to the (naive, local) time at which the instance is created.
    created_at: datetime = field(default_factory=datetime.now)
@dataclass
class SearchResult:
    """A search result from the vector store."""

    # The chunk that matched.
    chunk: DocumentChunk
    # NOTE(review): score scale and direction (similarity vs. distance) are
    # not established by this definition; confirm against the vector store.
    score: float
    document_title: str
    doc_type: DocumentType
@dataclass
class RAGResponse:
    """Response from the RAG system."""

    query: str
    answer: str
    # Search results associated with the answer.
    sources: List[SearchResult]
    # NOTE(review): the range of this value is not established here; confirm
    # against the code that produces it.
    confidence: float
    tokens_used: int
class TextChunker:
将文档拆分为用于嵌入的块
def __init__(
    self,
    strategy: ChunkingStrategy = ChunkingStrategy.PARAGRAPH,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """Configure the chunker.

    Args:
        strategy: Chunking strategy used to pick the splitting method.
        chunk_size: Size limit for a chunk, measured in characters (it is
            compared against ``len()`` of the text being accumulated).
        chunk_overlap: Number of characters by which consecutive
            fixed-size chunks overlap.
    """
    self.strategy = strategy
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
def chunk_document(self, document: Document) -> List[DocumentChunk]:
    """Split a document into chunks using the configured strategy.

    Args:
        document: The document to split.

    Returns:
        The chunks produced by the strategy-specific method. Strategies
        without a dedicated method (currently ``SEMANTIC``) fall back to
        fixed-size chunking.
    """
    dispatch = {
        ChunkingStrategy.FIXED_SIZE: self._chunk_fixed_size,
        ChunkingStrategy.PARAGRAPH: self._chunk_by_paragraph,
        ChunkingStrategy.SECTION: self._chunk_by_section,
        ChunkingStrategy.SENTENCE: self._chunk_by_sentence,
    }
    chunk_fn = dispatch.get(self.strategy, self._chunk_fixed_size)
    return chunk_fn(document)
def _chunk_fixed_size(self, document: Document) -> List[DocumentChunk]:
    """Chunk by fixed character size, with overlap between chunks.

    Each window spans at most ``self.chunk_size`` characters. When a window
    ends before the end of the text, its end is pulled back to the nearest
    preceding space, newline or tab so words are not cut in half. If the
    window contains no such boundary, it is cut at the full window size.

    Args:
        document: The document whose ``content`` is split.

    Returns:
        Chunks in document order. Whitespace-only windows produce no chunk.
    """
    chunks: List[DocumentChunk] = []
    text = document.content
    text_len = len(text)
    start = 0
    position = 0

    while start < text_len:
        end = min(start + self.chunk_size, text_len)

        # Look for a word boundary, but only when text remains after this
        # window; the final window always runs to the end of the text.
        if end < text_len:
            boundary = end
            while boundary > start and text[boundary] not in " \n\t":
                boundary -= 1
            # Without a usable boundary, keep the hard cut at `end`;
            # collapsing the window to `start` would yield an empty chunk
            # and stall the loop.
            if boundary > start:
                end = boundary

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunk_id = self._generate_chunk_id(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=chunk_text,
                metadata={
                    "doc_type": document.doc_type.value,
                    "title": document.title,
                    **document.metadata,
                },
                token_count=len(chunk_text.split()),
                position=position,
            ))
            position += 1

        # The text is exhausted; stepping back by the overlap here would
        # only re-emit a tail already contained in the last chunk.
        if end >= text_len:
            break

        next_start = end - self.chunk_overlap
        if next_start <= start:
            # The overlap is at least as long as the window just consumed.
            # Drop it for this step so the scan always moves forward.
            next_start = end if end > start else start + 1
        start = next_start

    return chunks
def _chunk_by_paragraph(self, document: Document) -> List[DocumentChunk]:
    """Chunk by paragraphs, grouping them up to the size limit.

    The content is split on blank lines (``"\\n\\n"``). Consecutive
    paragraphs are joined while the running length plus the next paragraph
    stays below ``self.chunk_size``. A single paragraph longer than the
    limit is kept whole as its own chunk.

    Args:
        document: The document whose ``content`` is split.

    Returns:
        Chunks in document order, with consecutive ``position`` values
        starting at 0.
    """
    chunks: List[DocumentChunk] = []

    def emit(text: str) -> None:
        # One chunk is appended per call, so the next position is always
        # the number of chunks emitted so far.
        position = len(chunks)
        chunks.append(DocumentChunk(
            id=self._generate_chunk_id(document.id, position),
            document_id=document.id,
            content=text,
            metadata={
                "doc_type": document.doc_type.value,
                "title": document.title,
                **document.metadata,
            },
            token_count=len(text.split()),
            position=position,
        ))

    current_chunk = ""
    for raw_para in document.content.split("\n\n"):
        para = raw_para.strip()
        if not para:
            continue

        if len(current_chunk) + len(para) < self.chunk_size:
            current_chunk = f"{current_chunk}\n\n{para}" if current_chunk else para
        else:
            if current_chunk:
                emit(current_chunk)
            current_chunk = para

    # Flush whatever is still buffered.
    if current_chunk:
        emit(current_chunk)

    return chunks
def _chunk_by_section(self, document: Document) -> List[DocumentChunk]:
    """Chunk by document sections (headings).

    The content is split at newlines that are followed by a heading-like
    prefix: a number with a dot, a bare number, or one of the keywords
    ``SECTION``/``ARTICLE``/``PART``, then whitespace and an uppercase
    letter. Sections longer than twice ``self.chunk_size`` are split
    further by paragraph.

    Args:
        document: The document whose ``content`` is split.

    Returns:
        Chunks in document order. A section kept whole gets the section
        index as ``position``; sub-chunks of an oversized section get
        ``section_index * 100 + i``.
    """
    # Split on common section-heading patterns.
    section_pattern = r"\n(?=(?:\d+\.|\d+\s|SECTION|ARTICLE|PART)\s+[A-Z])"
    sections = re.split(section_pattern, document.content)

    chunks: List[DocumentChunk] = []
    for position, section in enumerate(sections):
        section = section.strip()
        if not section:
            continue

        if len(section) > self.chunk_size * 2:
            # Section is too large: split it further by paragraph using a
            # temporary document that holds only this section's text.
            sub_chunker = TextChunker(ChunkingStrategy.PARAGRAPH, self.chunk_size)
            sub_doc = Document(
                id=f"{document.id}_sec{position}",
                title=document.title,
                doc_type=document.doc_type,
                content=section,
                source=document.source,
                metadata=document.metadata,
            )
            sub_chunks = sub_chunker.chunk_document(sub_doc)
            for i, chunk in enumerate(sub_chunks):
                # NOTE(review): positions from different sections collide
                # if one section yields 100 or more sub-chunks.
                sub_position = position * 100 + i
                chunk.id = self._generate_chunk_id(document.id, sub_position)
                # Re-attach the chunk to the real document; the temporary
                # "<id>_sec<n>" identifier does not name a stored document.
                chunk.document_id = document.id
                chunk.position = sub_position
            chunks.extend(sub_chunks)
        else:
            chunk_id = self._generate_chunk_id(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=section,
                metadata={
                    "doc_type": document.doc_type.value,
                    "title": document.title,
                    **document.metadata,
                },
                token_count=len(section.split()),
                position=position,
            ))

    return chunks
def chunkby_sentence(self, document: Document) -> List[DocumentChunk]:
按句子分块,分组以满足大小要求
# 简单句子拆分
sentences = re.split(r(?<=[.!?])\s+, document.content)
chunks = []
current_chunk =
position = 0
for sentence in sentences:
if len(currentchunk) + len(sentence) < self.chunksize:
currentchunk += + sentence if currentchunk else sentence
else:
if current_chunk:
chunkid = self.generatechunkid(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=current_chunk.strip(),
metadata={
doctype: document.doctype.value,
title: document.title,
document.metadata
},
tokencount=len(currentchunk.split()),
position=position
))
position += 1
current_chunk = sentence
if current_chunk:
chunkid = self.generatechunkid(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=current_ch
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 rag-construction-1776344772 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 rag-construction-1776344772 技能
skillhub install rag-construction-1776344772
文件大小: 7.51 KB | 发布时间: 2026-4-17 13:58