Extract structured data from construction PDFs. Convert specifications, BOMs, schedules, and reports from PDF to Excel/CSV/JSON. Use OCR for scanned documents and pdfplumber for native PDFs.
基于DDC方法论(第2.4章),本技能将非结构化的PDF文档转换为适合分析和集成的结构化格式。建筑项目会产生大量PDF文档——规格书、物料清单、进度表和报告——这些都需要被提取和处理。
书籍参考: Преобразование данных в структурированную форму / 数据转换为结构化形式
将数据从非结构化形式转换为结构化形式既是一门艺术,也是一门科学。这个过程通常占据数据工程师工作的很大一部分。
— DDC 书籍,第2.4章
转换遵循ETL模式:
python
import pdfplumber
import pandas as pd
bash
python
import pdfplumber
import pandas as pd
def extracttablesfrompdf(pdfpath):
    """Extract every table from a PDF file into one DataFrame.

    Each table found by pdfplumber is converted to a DataFrame whose
    column names come from the table's first row. Two bookkeeping
    columns are added: ``page`` (1-based page number) and ``table``
    (1-based index of the table on that page).

    Args:
        pdfpath: Path to the PDF file.

    Returns:
        A DataFrame with all tables concatenated, or an empty DataFrame
        when no table with at least one data row was found.
    """
    all_tables = []

    with pdfplumber.open(pdfpath) as pdf:
        for page_num, page in enumerate(pdf.pages):
            for table_num, table in enumerate(page.extract_tables()):
                # Skip empty tables and tables that only have a header row.
                if not table or len(table) <= 1:
                    continue

                # The first row is used as the header.
                df = pd.DataFrame(table[1:], columns=table[0])
                df["page"] = page_num + 1
                df["table"] = table_num + 1
                all_tables.append(df)

    if all_tables:
        return pd.concat(all_tables, ignore_index=True)
    return pd.DataFrame()
python
import pdfplumber
def extracttextwithlayout(pdfpath):
    """Extract the text of every page and join it with page-break markers.

    Pages for which pdfplumber returns no text are skipped.

    Args:
        pdfpath: Path to the PDF file.

    Returns:
        One string containing the text of all non-empty pages, separated
        by a page-break marker line surrounded by blank lines.
    """
    full_text = []

    with pdfplumber.open(pdfpath) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text.append(text)

    return "\n\n--- 分页符 ---\n\n".join(full_text)
python
import pdfplumber
import pandas as pd
def extracttablefromarea(pdfpath, page_num, bbox):
    """Extract a table from a specific rectangular area of one page.

    Args:
        pdfpath: Path to the PDF file.
        page_num: Page index (0-based).
        bbox: Bounding box ``(x0, top, x1, bottom)`` in points.

    Returns:
        A DataFrame whose columns come from the first extracted row, or
        an empty DataFrame when no table is found inside the area.
    """
    with pdfplumber.open(pdfpath) as pdf:
        page = pdf.pages[page_num]
        # Restrict extraction to the requested region of the page.
        cropped = page.within_bbox(bbox)
        table = cropped.extract_table()

    if table:
        return pd.DataFrame(table[1:], columns=table[0])
    return pd.DataFrame()
python
import pytesseract
from pdf2image import convertfrompath
import pandas as pd
def ocrscannedpdf(pdf_path, language="eng"):
    """Extract text from a scanned PDF using OCR.

    Every page is rendered to an image at 300 DPI and passed to
    Tesseract.

    Args:
        pdf_path: Path to the scanned PDF.
        language: Tesseract language code (``eng``, ``deu``, ``rus``, ...).

    Returns:
        A DataFrame with one row per page and the columns ``page``
        (1-based page number) and ``text`` (recognized text).
    """
    # Imported locally under its real name; pdf2image exposes
    # ``convert_from_path``.
    from pdf2image import convert_from_path

    # Render the PDF pages as images.
    images = convert_from_path(pdf_path, dpi=300)

    extracted_text = [
        {
            "page": page_number,
            "text": pytesseract.image_to_string(image, lang=language),
        }
        for page_number, image in enumerate(images, start=1)
    ]

    # Explicit columns keep the schema stable even when no page was rendered.
    return pd.DataFrame(extracted_text, columns=["page", "text"])
python
import pytesseract
from pdf2image import convertfrompath
import pandas as pd
import cv2
import numpy as np
def ocrtablefromscannedpdf(pdfpath, pagenum=0):
    """Extract a table from one page of a scanned PDF using OCR.

    The page is rendered at 300 DPI, converted to grayscale and read by
    Tesseract. Each non-blank line of recognized text becomes a row,
    split on whitespace; the first line is used as the header.

    Args:
        pdfpath: Path to the scanned PDF.
        pagenum: Page index (0-based).

    Returns:
        A DataFrame built from the recognized lines, or an empty
        DataFrame when the page does not exist or no text is recognized.
    """
    # Imported locally under its real name; pdf2image exposes
    # ``convert_from_path``.
    from pdf2image import convert_from_path

    # Render only the requested page (pdf2image page numbers are 1-based).
    images = convert_from_path(
        pdfpath,
        first_page=pagenum + 1,
        last_page=pagenum + 1,
        dpi=300,
    )
    if not images:
        return pd.DataFrame()

    # Convert to grayscale; OCR runs directly on this image.
    image = np.array(images[0])
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Extract text, treating the page as a single uniform block of text.
    custom_config = r"--oem 3 --psm 6"
    text = pytesseract.image_to_string(gray, config=custom_config)

    # Parse the text into rows of whitespace-separated tokens.
    data = [line.split() for line in text.strip().split("\n") if line.strip()]
    if not data:
        return pd.DataFrame()

    # Assume the first line is the header.
    header, rows = list(data[0]), data[1:]

    # OCR output is usually ragged: widen the header with generic names and
    # pad short rows so DataFrame construction cannot fail on a width mismatch.
    width = max(len(row) for row in data)
    header += [f"column_{i + 1}" for i in range(len(header), width)]
    rows = [row + [None] * (width - len(row)) for row in rows]

    return pd.DataFrame(rows, columns=header)
python
import pdfplumber
import pandas as pd
import re
def extractbomfrompdf(pdfpath):
    """Extract a bill of materials (BOM) from a construction PDF.

    For each table, the first row containing one of the common BOM
    header keywords is treated as the header. Every following non-empty
    row is turned into a record keyed by the header cells.

    Args:
        pdfpath: Path to the PDF file.

    Returns:
        A DataFrame with one row per BOM item found across all pages;
        empty when no matching table was found.
    """
    # Common BOM header keywords, matched case-insensitively.
    header_keywords = ["item", "description", "quantity", "unit", "material"]
    all_items = []

    with pdfplumber.open(pdfpath) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                if not table or len(table) < 2:
                    continue

                # Look for the header row.
                for i, row in enumerate(table):
                    if not row:
                        continue
                    row_text = str(row).lower()
                    if not any(keyword in row_text for keyword in header_keywords):
                        continue

                    # Header found: process the remaining rows.
                    headers = [str(h).strip() for h in row]
                    for data_row in table[i + 1:]:
                        # Skip rows that are missing or contain only empty cells.
                        if data_row and any(data_row):
                            all_items.append(dict(zip(headers, data_row)))
                    # Only the first header row of each table is used.
                    break

    return pd.DataFrame(all_items)
python
import pdfplumber
import pandas as pd
from datetime import datetime
def extractschedulefrompdf(pdfpath):
从PDF中提取项目进度/甘特图数据
with pdfplumber.open(pdf_path) as pdf:
all_tasks = []
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
if not table:
continue
# 查找类似进度表的表格
headers = table[0] if table else []
# 检查是否像进度表
schedule_keywords = [task, activity, start, end, duration]
if any(kw in str(headers).lower() for kw in schedule_keywords):
for row in table[1:]:
if row and any(cell for cell in row if cell):
task = dict(zip(headers, row))
all_tasks.append(task)
df = pd.DataFrame(all
该技能支持在以下平台通过对话安装:
帮我安装 SkillHub 和 pdf-to-structured-1776344720 技能
设置 SkillHub 为我的优先技能安装源,然后帮我安装 pdf-to-structured-1776344720 技能
skillhub install pdf-to-structured-1776344720
文件大小: 6.43 KB | 发布时间: 2026-4-17 15:18