RAG Construction

Overview

Based on DDC methodology (Chapter 2.3), this skill builds Retrieval-Augmented Generation (RAG) systems for construction knowledge bases, enabling semantic search and AI-powered question answering over construction documents.

Book Reference: "Pandas DataFrame и LLM ChatGPT" / "Pandas DataFrame and LLM ChatGPT"

Quick Start

CODEBLOCK0

Common Use Cases

Build Construction Knowledge Base

CODEBLOCK1

Search Knowledge Base

CODEBLOCK2

Answer Questions with RAG

CODEBLOCK3

Quick Reference

Component	Purpose
INLINECODE0	Main RAG system
INLINECODE1

Resources

- Book: "Data-Driven Construction" by Artem Boiko, Chapter 2.3
- Website: https://datadrivenconstruction.io

Next Steps

- Use llm-data-automation for automation
- Use vector-search for advanced search
- Use document-classification-nlp for classification

RAG 构建

概述

基于 DDC 方法论（第 2.3 章），本技能构建用于施工知识库的检索增强生成（RAG）系统，实现对施工文档的语义搜索和基于 AI 的问答。

参考书籍： Pandas DataFrame и LLM ChatGPT / Pandas DataFrame and LLM ChatGPT

快速开始

python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Dict, Optional, Any, Callable
from datetime import datetime
import json
import hashlib
import re

class DocumentType(Enum):
    """Construction document types."""

    SPECIFICATION = "specification"
    DRAWING = "drawing"
    CONTRACT = "contract"
    RFI = "rfi"
    SUBMITTAL = "submittal"
    CHANGEORDER = "changeorder"
    MEETINGMINUTES = "meetingminutes"
    DAILYREPORT = "dailyreport"
    SAFETYREPORT = "safetyreport"
    INSPECTION = "inspection"
    MANUAL = "manual"
    STANDARD = "standard"

class ChunkingStrategy(Enum):
    """Text chunking strategies."""

    # Spelled FIXED_SIZE to match how TextChunker.chunk_document refers to it.
    FIXED_SIZE = "fixedsize"
    PARAGRAPH = "paragraph"
    SECTION = "section"
    SEMANTIC = "semantic"
    SENTENCE = "sentence"

@dataclass
class DocumentChunk:
    """A chunk of text cut from a document.

    ``embedding`` stays ``None`` until a vector is attached; ``token_count``
    and ``position`` default to 0 when the creator does not supply them.
    """

    id: str
    document_id: str
    content: str
    metadata: Dict[str, Any]
    embedding: Optional[List[float]] = None
    token_count: int = 0
    position: int = 0

@dataclass
class Document:
    """A construction document.

    ``metadata`` and ``chunks`` get a fresh empty container per instance, and
    ``createdat`` is stamped with ``datetime.now()`` at construction time
    unless a value is passed explicitly.
    """

    id: str
    title: str
    doc_type: DocumentType
    content: str
    source: str
    metadata: Dict[str, Any] = field(default_factory=dict)
    chunks: List[DocumentChunk] = field(default_factory=list)
    createdat: datetime = field(default_factory=datetime.now)

@dataclass
class SearchResult:
    """A search result from the vector store.

    Pairs the matched chunk with its score and the title and type of the
    document it belongs to.
    """

    chunk: DocumentChunk
    score: float
    document_title: str
    doc_type: DocumentType

@dataclass
class RAGResponse:
    """A response from the RAG system.

    Carries the original query, the generated answer, the search results
    used as sources, a confidence value and the number of tokens used.
    """

    query: str
    answer: str
    sources: List[SearchResult]
    confidence: float
    tokens_used: int

class TextChunker:
将文档拆分为用于嵌入的块

def __init__(
    self,
    strategy: ChunkingStrategy = ChunkingStrategy.PARAGRAPH,
    chunk_size: int = 500,
    chunk_overlap: int = 50
):
    """Configure how documents are split.

    Args:
        strategy: Which chunking strategy ``chunk_document`` dispatches to.
        chunk_size: Target chunk size, measured in characters.
        chunk_overlap: Number of characters the fixed-size strategy steps
            back before starting the next chunk.
    """
    self.strategy = strategy
    # Stored under the same names the chunking methods read back.
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap

def chunk_document(self, document: Document) -> List[DocumentChunk]:
    """Split a document into chunks using the configured strategy.

    Args:
        document: The document to split.

    Returns:
        The chunks produced by the strategy-specific splitter.
    """
    if self.strategy == ChunkingStrategy.PARAGRAPH:
        return self.chunkby_paragraph(document)
    if self.strategy == ChunkingStrategy.SECTION:
        return self.chunkby_section(document)
    if self.strategy == ChunkingStrategy.SENTENCE:
        return self.chunkby_sentence(document)
    # FIXED_SIZE, plus any strategy without a dedicated splitter
    # (e.g. SEMANTIC), uses fixed-size chunking.
    return self.chunkfixed_size(document)

def chunkfixed_size(self, document: Document) -> List[DocumentChunk]:
    """Chunk by a fixed character size, with overlap between chunks.

    Each window of ``self.chunk_size`` characters is pulled back to the
    nearest preceding whitespace so words are not split. If the window
    contains no whitespace it is cut at the size limit instead. The next
    window starts ``self.chunk_overlap`` characters before the previous cut.

    Args:
        document: The document whose ``content`` is split.

    Returns:
        Non-empty, stripped chunks in document order. ``position`` counts
        emitted chunks from 0 and ``token_count`` is the whitespace-split
        word count of the chunk.
    """
    chunks = []
    text = document.content
    text_len = len(text)
    # A non-positive size would never consume any text; treat it as 1.
    window = max(self.chunk_size, 1)
    start = 0
    position = 0

    while start < text_len:
        end = min(start + window, text_len)

        # Look backwards for a word boundary, unless this is the final window.
        if end < text_len:
            boundary = end
            while boundary > start and text[boundary] not in " \n\t":
                boundary -= 1
            # No whitespace in the window: keep the hard cut rather than
            # producing an empty chunk and stalling.
            if boundary > start:
                end = boundary

        chunk_text = text[start:end].strip()
        if chunk_text:
            chunk_id = self.generatechunkid(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=chunk_text,
                metadata={
                    "doctype": document.doc_type.value,
                    "title": document.title,
                    **document.metadata
                },
                token_count=len(chunk_text.split()),
                position=position
            ))
            position += 1

        # The final window already reached the end of the text; stepping back
        # by the overlap would only re-emit a tail of the chunk just made.
        if end >= text_len:
            break

        # Always move forward by at least one character, so an overlap that
        # is as large as the consumed span cannot loop forever.
        start = max(end - self.chunk_overlap, start + 1)

    return chunks

def chunkby_paragraph(self, document: Document) -> List[DocumentChunk]:
    """Chunk by paragraphs, packing consecutive paragraphs together.

    The content is split on blank lines. Paragraphs are appended to the
    current chunk (re-joined with a blank line) while the combined character
    length stays below ``self.chunk_size``; otherwise the current chunk is
    emitted and a new one starts with that paragraph. A single paragraph
    longer than ``self.chunk_size`` is emitted whole.

    Args:
        document: The document whose ``content`` is split.

    Returns:
        Chunks in document order, with ``position`` counting from 0.
    """
    chunks = []

    def emit(chunk_content: str) -> None:
        # Position grows by one per emitted chunk, so it always equals the
        # number of chunks produced so far.
        position = len(chunks)
        chunk_id = self.generatechunkid(document.id, position)
        chunks.append(DocumentChunk(
            id=chunk_id,
            document_id=document.id,
            content=chunk_content,
            metadata={
                "doctype": document.doc_type.value,
                "title": document.title,
                **document.metadata
            },
            token_count=len(chunk_content.split()),
            position=position
        ))

    current_chunk = ""
    for para in document.content.split("\n\n"):
        para = para.strip()
        if not para:
            continue

        if len(current_chunk) + len(para) < self.chunk_size:
            current_chunk += "\n\n" + para if current_chunk else para
        else:
            if current_chunk:
                emit(current_chunk)
            current_chunk = para

    # Emit whatever is left over.
    if current_chunk:
        emit(current_chunk)

    return chunks

def chunkby_section(self, document: Document) -> List[DocumentChunk]:
    """Chunk by document sections (headings).

    The content is split at newlines that precede a heading-like line: a
    number, or one of SECTION / ARTICLE / PART, followed by whitespace and
    an uppercase letter. A section longer than twice ``self.chunk_size``
    characters is split again by paragraph.

    Args:
        document: The document whose ``content`` is split.

    Returns:
        Chunks in document order. A whole-section chunk uses the section
        index as ``position``; sub-chunks of an oversized section use
        ``section_index * 100 + i``.
    """
    # Split on common section-heading patterns.
    section_pattern = r"\n(?=(?:\d+\.|\d+\s|SECTION|ARTICLE|PART)\s+[A-Z])"
    sections = re.split(section_pattern, document.content)

    chunks = []
    for position, section in enumerate(sections):
        section = section.strip()
        if not section:
            continue

        # If the section is too large, split it further.
        if len(section) > self.chunk_size * 2:
            sub_chunker = TextChunker(ChunkingStrategy.PARAGRAPH, self.chunk_size)
            sub_doc = Document(
                id=f"{document.id}_sec{position}",
                title=document.title,
                doc_type=document.doc_type,
                content=section,
                source=document.source,
                metadata=document.metadata
            )
            sub_chunks = sub_chunker.chunk_document(sub_doc)
            for i, chunk in enumerate(sub_chunks):
                # NOTE(review): position * 100 + i can collide with the next
                # section's range once a section yields 100+ sub-chunks.
                chunk.id = self.generatechunkid(document.id, position * 100 + i)
                # The temporary sub-document exists only for splitting; point
                # the chunk back at the real parent document.
                chunk.document_id = document.id
                chunk.position = position * 100 + i
            chunks.extend(sub_chunks)
        else:
            chunk_id = self.generatechunkid(document.id, position)
            chunks.append(DocumentChunk(
                id=chunk_id,
                document_id=document.id,
                content=section,
                metadata={
                    "doctype": document.doc_type.value,
                    "title": document.title,
                    **document.metadata
                },
                token_count=len(section.split()),
                position=position
            ))

    return chunks

def chunkby_sentence(self, document: Document) -> List[DocumentChunk]:
按句子分块，分组以满足大小要求
# 简单句子拆分
sentences = re.split(r(?<=[.!?])\s+, document.content)

chunks = []
current_chunk =
position = 0

for sentence in sentences:
if len(currentchunk) + len(sentence) < self.chunksize:
currentchunk += + sentence if currentchunk else sentence
else:
if current_chunk:
chunkid = self.generatechunkid(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=current_chunk.strip(),
metadata={
doctype: document.doctype.value,
title: document.title,
document.metadata
},
tokencount=len(currentchunk.split()),
position=position
))
position += 1
current_chunk = sentence

if current_chunk:
chunkid = self.generatechunkid(document.id, position)
chunks.append(DocumentChunk(
id=chunk_id,
document_id=document.id,
content=current_ch

rag-construction" RAG建筑系统