2026-02-03 22:48:22 +08:00
|
|
|
|
# -*- coding: utf-8 -*-
"""
Document parsing module (LLM-enhanced edition).

Supports PDF and DOCX input, tuned for SRS documents that follow the GJB 438B standard.
"""
|
|
|
|
|
|
|
|
|
|
|
|
import importlib
# ``importlib.util`` is a submodule: a bare ``import importlib`` does not
# guarantee it is loaded, so ``importlib.util.find_spec`` below could raise
# AttributeError depending on what else had been imported. Import it explicitly.
import importlib.util
import logging
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# python-docx is optional; DocxParser raises ImportError at construction
# time when this flag is False.
try:
    from docx import Document
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

# PyPDF2 is optional; PDFParser raises ImportError at construction time
# when this flag is False.
try:
    import PyPDF2
    HAS_PDF = True
except ImportError:
    HAS_PDF = False

# pdfplumber is only probed here (not imported); the PDF table extractor
# loads it on demand through importlib.import_module.
HAS_PDF_TABLE = importlib.util.find_spec("pdfplumber") is not None

logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Section:
    """One section of a parsed document: a heading plus the body below it.

    Besides the flat ``content`` string and the ``tables`` list, every piece
    of body material is also recorded in ``blocks`` in the order it was
    added, as ``{"type": "text", ...}`` or ``{"type": "table", ...}`` dicts.
    """

    def __init__(self, level: int, title: str, number: Optional[str] = None,
                 content: str = "", uid: str = ""):
        """Create a section.

        Args:
            level: Heading depth (1 for a top-level section).
            title: Heading text without the section number.
            number: Section number such as ``"3.1"``; ``None`` or ``""``
                when the document did not provide one.
            content: Initial body text.
            uid: Identifier assigned by the parser.
        """
        self.level = level
        self.title = title
        self.number = number
        self.content = content
        self.uid = uid
        self.parent: Optional["Section"] = None
        self.children: List["Section"] = []
        self.tables: List[List[List[str]]] = []
        self.blocks: List[dict] = []

        # add_content() records every text chunk in ``blocks``; do the same
        # for text handed to the constructor so ``blocks`` never lags behind
        # ``content``. ``content`` itself is stored exactly as given.
        initial_text = (content or "").strip()
        if initial_text:
            self.blocks.append({"type": "text", "text": initial_text})

    def add_child(self, child: 'Section') -> None:
        """Append ``child`` to this section and point its ``parent`` here."""
        self.children.append(child)
        child.parent = self

    def add_content(self, text: str) -> None:
        """Append a text chunk to the body.

        The text is stripped; ``None``, empty and whitespace-only input is
        ignored. Chunks are joined in ``content`` with a newline and also
        recorded as a text block.
        """
        text = (text or "").strip()
        if not text:
            return
        self.content = f"{self.content}\n{text}" if self.content else text
        self.blocks.append({"type": "text", "text": text})

    def add_table(self, table_data: List[List[str]]) -> None:
        """Attach a table (a list of rows of cell strings); empty input is ignored.

        The table is stored in ``tables`` and a table block carrying its
        index in that list is appended to ``blocks``.
        """
        if not table_data:
            return
        self.tables.append(table_data)
        self.blocks.append({
            "type": "table",
            "table_index": len(self.tables) - 1,
            "table": table_data,
        })

    def generate_auto_number(self, parent_number: str = "", sibling_index: int = 1) -> None:
        """Assign a generated number if this section has none.

        An existing (truthy) ``number`` is never overwritten.

        Args:
            parent_number: Number of the parent section; empty for a
                top-level section.
            sibling_index: 1-based position used as the last component.
        """
        if self.number:
            return
        if parent_number:
            self.number = f"{parent_number}.{sibling_index}"
        else:
            self.number = str(sibling_index)

    def __repr__(self) -> str:
        return f"Section(level={self.level}, number='{self.number}', title='{self.title}')"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DocumentParser(ABC):
    """Abstract base class shared by the format-specific document parsers."""

    def __init__(self, file_path: str):
        """Remember the source path and initialise empty parse results."""
        self.file_path = file_path
        self.sections: List["Section"] = []
        self.document_title = ""
        self.raw_text = ""
        self.llm = None
        self._uid_counter = 0

    def set_llm(self, llm) -> None:
        """Store the LLM instance for parsers that consult ``self.llm``."""
        self.llm = llm

    @abstractmethod
    def parse(self) -> List["Section"]:
        """Parse the document and return its top-level sections."""
        pass

    def get_document_title(self) -> str:
        """Return the document title recorded by the parser."""
        return self.document_title

    def _next_uid(self) -> str:
        """Return the next section uid (``sec-1``, ``sec-2``, ...)."""
        self._uid_counter += 1
        return f"sec-{self._uid_counter}"

    def _auto_number_sections(self, sections: List["Section"], parent_number: str = "") -> None:
        """Fill in numbers for sections that lack one, recursively.

        Numbers that came from the document are kept. Within one sibling
        list, only sections without a usable number advance the counter, so
        generated numbers run 1, 2, 3... independently of siblings that
        already carry a number. A section whose number is a pure Chinese
        numeral also advances the counter, but keeps its numeral because
        ``generate_auto_number`` never overwrites an existing number.

        Args:
            sections: Sibling sections to process.
            parent_number: Number of their parent; empty at the top level.
        """
        if not sections:
            return

        sibling_index = 0
        for section in sections:
            has_number = (
                bool((section.number or "").strip())
                and not self._is_chinese_number(section.number)
            )
            if not has_number:
                sibling_index += 1
                section.generate_auto_number(parent_number, sibling_index)

            if section.children:
                self._auto_number_sections(section.children, section.number)

    def _is_chinese_number(self, text: str) -> bool:
        """Return True when ``text`` is non-empty and made only of Chinese numerals."""
        chinese_numbers = '一二三四五六七八九十百千万'
        # bool(...) so that empty or None input yields False rather than
        # leaking the falsy operand itself ("" / None) to the caller.
        return bool(text) and all(c in chinese_numbers for c in text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DocxParser(DocumentParser):
    """Parser for DOCX documents, built on python-docx."""

    def __init__(self, file_path: str):
        """Create the parser.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if not HAS_DOCX:
            raise ImportError("python-docx库未安装,请运行: pip install python-docx")
        super().__init__(file_path)
        self.document = None

    def parse(self) -> List[Section]:
        """Parse the DOCX file into a tree of sections.

        Paragraphs and tables are visited in document order. A paragraph
        recognised as a heading opens a new section; every other non-empty
        paragraph and every non-empty table is attached to the deepest
        currently open section. Material that appears before any heading
        goes into an automatically created placeholder section.

        Returns:
            The top-level sections (also stored in ``self.sections``).

        Raises:
            Exception: Any error raised while reading or walking the
                document is logged and re-raised unchanged.
        """
        try:
            # Imported once per parse; these imports are loop-invariant and
            # do not need to be re-executed for every block.
            from docx.text.paragraph import Paragraph
            from docx.table import Table

            self.document = Document(self.file_path)
            self.document_title = self.document.core_properties.title or "SRS Document"

            # Start from an empty tree so that calling parse() a second time
            # does not append a duplicate copy of every section.
            self.sections = []
            # Maps heading level -> most recently opened section at that level.
            section_stack: Dict[int, Section] = {}

            for block in self._iter_block_items(self.document):
                if isinstance(block, Paragraph):
                    text = block.text.strip()
                    if not text:
                        continue

                    heading_info = self._parse_heading(block, text)
                    if heading_info:
                        number, title, level = heading_info
                        section = Section(level=level, title=title, number=number, uid=self._next_uid())
                        self._place_heading(section, section_stack)
                    else:
                        self._current_section(section_stack).add_content(text)
                elif isinstance(block, Table):
                    table_data = self._extract_table_data(block)
                    if table_data:
                        self._current_section(section_stack).add_table(table_data)

            # Fill in numbers for sections that did not carry one.
            self._auto_number_sections(self.sections)

            logger.info(f"完成Docx解析,提取{len(self.sections)}个顶级章节")
            return self.sections

        except Exception as e:
            logger.error(f"解析Docx文档失败: {e}")
            raise

    def _place_heading(self, section: Section, section_stack: Dict[int, Section]) -> None:
        """Insert a freshly created heading section into the tree.

        ``section_stack`` is updated in place: the section is registered at
        its level and entries for deeper levels are dropped.
        """
        level = section.level
        if level == 1 or not section_stack:
            # A level-1 heading, or the very first heading of the document
            # whatever its level, becomes a new root.
            self.sections.append(section)
            section_stack.clear()
            section_stack[1] = section
            return

        # Nearest open ancestor: walk down from level-1 until a level is found.
        parent_level = level - 1
        while parent_level >= 1 and parent_level not in section_stack:
            parent_level -= 1

        if parent_level >= 1 and parent_level in section_stack:
            section_stack[parent_level].add_child(section)
        elif self.sections:
            self.sections[-1].add_child(section)

        section_stack[level] = section
        for deeper in [lvl for lvl in section_stack if lvl > level]:
            del section_stack[deeper]

    def _current_section(self, section_stack: Dict[int, Section]) -> Section:
        """Return the deepest open section, creating a placeholder root if none exists."""
        if section_stack:
            return section_stack[max(section_stack)]

        # No heading seen yet: collect the material in a default section.
        default_section = Section(level=1, title="未命名章节", number="", uid=self._next_uid())
        self.sections.append(default_section)
        section_stack[1] = default_section
        return default_section

    def _is_valid_heading(self, text: str) -> bool:
        """Return True if ``text`` is plausible as heading text.

        Rejected: text longer than 120 characters, text containing ``...``,
        text without any CJK character or ASCII letter, and text that ends
        in digits preceded by dots or whitespace (typical of table-of-contents
        entries such as "概述 2").
        """
        if len(text) > 120 or '...' in text:
            return False
        # A heading must contain at least one CJK character or letter.
        if not re.search(r'[\u4e00-\u9fa5A-Za-z]', text):
            return False
        # Trailing page number after dots/whitespace marks a TOC entry. This
        # single pattern also covers the "two or more spaces then digits" case.
        if re.search(r'[\.。\s]+\d+$', text):
            return False
        return True

    def _parse_heading(self, paragraph, text: str) -> Optional[Tuple[str, str, int]]:
        """Classify a paragraph as a heading.

        Tried in order: a leading dotted decimal number ("3.1 标题"; level is
        the number of components), a leading Chinese numeral followed by
        "、" or "." (level 1), and finally a paragraph style whose name
        starts with "heading" (level taken from the first integer in the
        style name, default 1, with an empty number).

        Returns:
            ``(number, title, level)``, or ``None`` for a non-heading.
        """
        style_name = (paragraph.style.name if paragraph.style else "") or ""
        is_heading_style = style_name.lower().startswith('heading')

        # Decimal-numbered heading.
        match = re.match(r'^(\d+(?:\.\d+)*)\s*[\.、]?\s*(.+)$', text)
        if match and self._is_valid_heading(match.group(2)):
            number = match.group(1)
            return number, match.group(2).strip(), len(number.split('.'))

        # Chinese-numeral heading.
        match = re.match(r'^([一二三四五六七八九十]+)[、\.]+\s*(.+)$', text)
        if match and self._is_valid_heading(match.group(2)):
            return match.group(1), match.group(2).strip(), 1

        # Heading identified by paragraph style only.
        if is_heading_style and self._is_valid_heading(text):
            level_match = re.search(r'(\d+)', style_name)
            level = int(level_match.group(1)) if level_match else 1
            return "", text, level

        return None

    def _iter_block_items(self, parent):
        """Yield the paragraphs and tables of the document body in document order."""
        from docx.text.paragraph import Paragraph
        from docx.table import Table
        from docx.oxml.text.paragraph import CT_P
        from docx.oxml.table import CT_Tbl

        for child in parent.element.body.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent)
            elif isinstance(child, CT_Tbl):
                yield Table(child, parent)

    def _extract_table_data(self, table) -> List[List[str]]:
        """Return the table as rows of whitespace-normalised cell strings.

        Rows in which every cell is empty are dropped.
        """
        table_data = []
        for row in table.rows:
            # Collapse every whitespace run (including newlines) to one space.
            row_data = [re.sub(r'\s+', ' ', cell.text).strip() for cell in row.cells]
            if any(row_data):
                table_data.append(row_data)
        return table_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PDFParser(DocumentParser):
    """Parser for PDF documents.

    Text is extracted with PyPDF2 and the section tree is rebuilt from
    numbered heading lines using heuristics tuned for GJB 438B style SRS
    documents. When pdfplumber is available, tables are extracted as well
    and attached to the best matching section.
    """

    # Keywords expected in valid section titles of a GJB 438B SRS document.
    VALID_TITLE_KEYWORDS = [
        '范围', '标识', '概述', '引用', '文档',
        '需求', '功能', '接口', '性能', '安全', '保密',
        '环境', '资源', '质量', '设计', '约束',
        '人员', '培训', '保障', '验收', '交付', '包装',
        '优先', '关键', '合格', '追踪', '注释',
        'CSCI', '计算机', '软件', '硬件', '通信', '通讯',
        '数据', '适应', '可靠', '内部', '外部',
        '描述', '要求', '规定', '说明', '定义'
    ]

    # Keywords a top-level (level 1) title must contain.
    TOP_LEVEL_TITLE_KEYWORDS = [
        '范围', '标识', '概述', '引用', '文档', '需求', '接口', '性能',
        '安全', '保密', '环境', '资源', '质量', '设计', '约束', '验收',
        '交付', '包装', '注释'
    ]

    # Substrings that mark a candidate title as noise.
    INVALID_TITLE_PATTERNS = [
        '本文档可作为', '参比电位', '补偿电流', '以太网',
        '电源', '软件接', '功能\\', '性能 \\', '输入/输出 \\',
        '数据处理要求 \\', '固件 \\', '质量控制要求',
        '信安科技', '浙江', '公司'
    ]

    # "3.1 功能需求" style heading: dotted number, optional separators, title.
    # Full-width "." (U+FF0E) and ")" (U+FF09) are accepted as separators
    # next to their ASCII forms.
    _SECTION_HEADER_RE = re.compile(r'^(\d+(?:\.\d+)*)[\s、.)\uFF0E\uFF09]*(.+)$')

    # Table caption reference such as "表3-5". Besides the ASCII hyphen, the
    # Unicode hyphen/dash range U+2010..U+2015, the minus sign U+2212 and the
    # full-width hyphen U+FF0D are accepted between the number parts.
    _TABLE_REF_RE = re.compile(r"表\s*(\d+(?:[-\u2010-\u2015\u2212\uFF0D]\d+){1,3})")
    _DASH_VARIANTS_RE = re.compile(r"[\u2010-\u2015\u2212\uFF0D]")

    def __init__(self, file_path: str):
        """Create the parser.

        Raises:
            ImportError: If PyPDF2 is not installed.
        """
        if not HAS_PDF:
            raise ImportError("PyPDF2库未安装,请运行: pip install PyPDF2")
        super().__init__(file_path)
        self.document_title = "SRS Document"
        # Extracted text per page; index i is page i ("" for pages without text).
        self._page_texts: List[str] = []

    def parse(self) -> List[Section]:
        """Parse the PDF into a tree of sections.

        Returns:
            The top-level sections (also stored in ``self.sections``).

        Raises:
            Exception: Any error from the pipeline is logged and re-raised.
        """
        try:
            # 1. Extract the text of every page.
            self.raw_text = self._extract_all_text()

            # 2. Drop page numbers, TOC lines and blank lines.
            cleaned_text = self._clean_text(self.raw_text)

            # 3. Rebuild the section tree from heading lines.
            self.sections = self._parse_sections(cleaned_text)

            # 4. Optional post-filtering, only run when an LLM has been set.
            if self.llm:
                self.sections = self._llm_validate_sections(self.sections)

            # If no section survived, create a fallback section so that the
            # text and any tables still have somewhere to go.
            if not self.sections:
                fallback = Section(level=1, title="未命名章节", number="1", uid=self._next_uid())
                if cleaned_text:
                    fallback.add_content(cleaned_text)
                self.sections = [fallback]

            # 5. Extract tables (if pdfplumber is available) and attach them.
            pdf_tables = self._extract_pdf_tables()
            if pdf_tables:
                self._attach_pdf_tables_to_sections(pdf_tables)

            # 6. Fill in numbers for sections that lack one.
            self._auto_number_sections(self.sections)

            logger.info(f"完成PDF解析,提取{len(self.sections)}个顶级章节")
            return self.sections

        except Exception as e:
            logger.error(f"解析PDF文档失败: {e}")
            raise

    def _extract_all_text(self) -> str:
        """Extract the text of all pages.

        Side effect: ``self._page_texts`` is filled with one entry per page.
        Pages that yield no text are stored as ``""`` so that list indices
        stay equal to page indices; the table extractor looks page text up by
        page index and would otherwise read the wrong page after any page
        without text.

        Returns:
            The non-empty page texts joined with newlines.
        """
        page_texts: List[str] = []
        with open(self.file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                page_texts.append(page.extract_text() or "")

        self._page_texts = page_texts
        return '\n'.join(text for text in page_texts if text)

    def _extract_pdf_tables(self) -> List[Dict[str, Any]]:
        """Extract the tables of the PDF with pdfplumber.

        Returns:
            One dict per non-empty table with the keys ``page_idx``,
            ``table_idx``, ``page_text``, ``data``, ``section_hint`` and
            ``table_ref``. An empty list is returned when pdfplumber is not
            installed or when extraction fails (best effort: the failure is
            logged as a warning and text-only parsing continues).
        """
        if not HAS_PDF_TABLE:
            logger.warning("未安装pdfplumber,跳过PDF表格提取。可执行: pip install pdfplumber")
            return []

        tables: List[Dict[str, Any]] = []
        try:
            pdfplumber = importlib.import_module("pdfplumber")
            with pdfplumber.open(self.file_path) as pdf:
                for page_idx, page in enumerate(pdf.pages):
                    page_text = ""
                    if page_idx < len(self._page_texts):
                        page_text = self._page_texts[page_idx]

                    # Prefer table objects because they carry a bounding box;
                    # fall back to plain extraction when none are found.
                    table_objs = page.find_tables() or []
                    if table_objs:
                        extracted_tables = [(idx, t.extract(), t.bbox) for idx, t in enumerate(table_objs)]
                    else:
                        raw_tables = page.extract_tables() or []
                        extracted_tables = [(idx, t, None) for idx, t in enumerate(raw_tables)]

                    for table_idx, table, bbox in extracted_tables:
                        cleaned_table = self._clean_table_rows(table)
                        if not cleaned_table:
                            continue
                        tables.append(
                            {
                                "page_idx": page_idx,
                                "table_idx": table_idx,
                                "page_text": page_text,
                                "data": cleaned_table,
                                "section_hint": self._section_hint_above(page, bbox),
                                "table_ref": self._extract_table_reference(cleaned_table),
                            }
                        )
        except Exception as e:
            logger.warning(f"PDF表格提取失败,继续纯文本流程: {e}")
            return []

        logger.info(f"PDF表格提取完成,共{len(tables)}个表格")
        return tables

    def _clean_table_rows(self, table) -> List[List[str]]:
        """Normalise whitespace in every cell and drop rows that are entirely empty."""
        cleaned: List[List[str]] = []
        for row in table or []:
            cells = [re.sub(r'\s+', ' ', str(cell or '')).strip() for cell in row]
            # Keep the row as soon as one cell has content.
            if any(cells):
                cleaned.append(cells)
        return cleaned

    def _section_hint_above(self, page, bbox) -> str:
        """Return the last section number found in the page text above ``bbox``.

        Returns ``""`` when there is no bounding box or when cropping or
        text extraction fails (best effort; the failure is logged at debug
        level instead of being silently discarded).
        """
        if not bbox:
            return ""
        try:
            top = float(bbox[1])
            text_above = page.crop((0, 0, page.width, top)).extract_text() or ""
            return self._find_last_section_number(text_above)
        except Exception as exc:
            logger.debug("表格上方章节号定位失败,忽略: %s", exc)
            return ""

    def _extract_table_reference(self, table: List[List[str]]) -> str:
        """Extract a table reference such as "3-5" from the first two rows.

        Returns:
            The reference with every dash variant normalised to an ASCII
            hyphen, or ``""`` when none is found.
        """
        if not table:
            return ""

        merged = " ".join(" ".join(str(c or "") for c in row) for row in table[:2])
        merged = re.sub(r"\s+", "", merged)
        m = self._TABLE_REF_RE.search(merged)
        if not m:
            return ""
        return self._DASH_VARIANTS_RE.sub("-", m.group(1))

    def _build_table_reference_index(self, sections: List[Section]) -> Dict[str, List[Section]]:
        """Map each table reference mentioned in section content to the mentioning sections.

        References are normalised the same way as in
        ``_extract_table_reference`` so that both sides compare equal.
        """
        index: Dict[str, List[Section]] = {}
        for section in sections:
            content = re.sub(r"\s+", "", section.content or "")
            for m in self._TABLE_REF_RE.finditer(content):
                ref = self._DASH_VARIANTS_RE.sub("-", m.group(1))
                index.setdefault(ref, []).append(section)
        return index

    def _find_last_section_number(self, text: str) -> str:
        """Return the number of the last heading-like line in ``text``, or ``""``."""
        if not text:
            return ""

        found = ""
        for line in text.split("\n"):
            line = line.strip()
            if not line:
                continue
            # A fresh empty set: duplicates must not be suppressed here.
            section_info = self._match_section_header(line, set())
            if section_info:
                found = section_info[0]
        return found

    def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
        """Attach every extracted table to the best matching section.

        Strategies, in order: (1) a section whose content mentions the
        table's reference number, (2) the section whose number equals the
        heading found above the table, (3) a title match against the page
        text, (4) the section the previous table was attached to. A table
        for which all four fail is skipped with a warning.
        """
        flat_sections = self._flatten_sections(self.sections)
        if not flat_sections:
            return

        section_by_number = {
            (s.number or "").strip(): s
            for s in flat_sections
            if (s.number or "").strip()
        }
        table_ref_index = self._build_table_reference_index(flat_sections)

        last_section: Optional[Section] = None
        for table in tables:
            target: Optional[Section] = None

            table_ref = (table.get("table_ref") or "").strip()
            if table_ref and table_ref in table_ref_index:
                candidates = table_ref_index[table_ref]
                # Several sections mention the table: prefer the deepest one
                # (then the longest content) so a summarising parent does not
                # take the table from the section that actually owns it.
                target = max(candidates, key=lambda s: (s.level, len(s.content or "")))

            section_hint = (table.get("section_hint") or "").strip()
            if target is None and section_hint and section_hint in section_by_number:
                target = section_by_number[section_hint]

            if target is None:
                target = self._match_table_section(table.get("page_text", ""), flat_sections)

            # Last resort: reuse the previous target rather than the first
            # section, which would pollute an unrelated chapter.
            if target is None:
                target = last_section

            if target is None:
                logger.warning(
                    "未定位到表格归属章节,跳过: page=%s table=%s",
                    table.get("page_idx", -1),
                    table.get("table_idx", -1),
                )
                continue

            target.add_table(table["data"])
            last_section = target

    def _flatten_sections(self, sections: List[Section]) -> List[Section]:
        """Return all sections of the tree in document (pre-order) order."""
        result: List[Section] = []
        for section in sections:
            result.append(section)
            if section.children:
                result.extend(self._flatten_sections(section.children))
        return result

    def _match_table_section(self, page_text: str, sections: List[Section]) -> Optional[Section]:
        """Pick the section whose title best matches the text of a page.

        Page text and candidates are compared with all whitespace removed
        and lower-cased. The longest matching candidate wins, ties going to
        the deeper section and then to the earlier one.

        Returns:
            The best matching section, or ``None`` when nothing matches.
        """
        normalized_page = re.sub(r"\s+", "", (page_text or "")).lower()
        if not normalized_page:
            return None

        matched: Optional[Section] = None
        matched_score = (-1, -1)
        for section in sections:
            title = (section.title or "").strip()
            if not title:
                continue

            number = (section.number or "").strip()
            candidates = [title]
            if number:
                # "number title" and "numbertitle" are identical once
                # whitespace is removed, so one candidate covers both.
                candidates.append(f"{number}{title}")

            for candidate in candidates:
                normalized_candidate = re.sub(r"\s+", "", candidate).lower()
                if normalized_candidate and normalized_candidate in normalized_page:
                    score = (len(normalized_candidate), section.level)
                    if score > matched_score:
                        matched = section
                        matched_score = score

        return matched

    def _clean_text(self, text: str) -> str:
        """Strip every line and drop blank lines, bare page numbers and TOC lines."""
        cleaned_lines = []
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            # A line of only 1-3 digits is treated as a page number.
            if re.match(r'^\d{1,3}$', line):
                continue
            # Dot leaders indicate a table-of-contents line.
            if line.count('.') > 10 and '...' in line:
                continue
            cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    def _parse_sections(self, text: str) -> List[Section]:
        """Rebuild the section tree from cleaned text.

        Each line is either a heading (see ``_match_section_header``) or
        body text for the section opened most recently.

        Returns:
            The top-level sections.
        """
        sections: List[Section] = []
        # Maps numbering depth -> most recently opened section at that depth.
        section_stack: Dict[int, Section] = {}
        current_section: Optional[Section] = None
        content_buffer: List[str] = []
        found_sections = set()
        last_top_level_number = 0

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            section_info = self._match_section_header(line, found_sections)
            if not section_info:
                if not self._is_noise(line):
                    content_buffer.append(line)
                continue

            number, title = section_info
            level = len(number.split('.'))
            top_level_number = int(number.split('.')[0])

            # A top-level number that jumps ahead by more than one, or goes
            # backwards, is usually body text (an enumeration item or a
            # table fragment) rather than a real chapter: keep it as content.
            if level == 1 and last_top_level_number and (
                top_level_number > last_top_level_number + 1
                or top_level_number < last_top_level_number
            ):
                if not self._is_noise(line):
                    content_buffer.append(line)
                continue

            if level > 6:
                continue

            # Flush the text collected so far into the section being closed.
            # NOTE: lines seen before the first heading stay in the buffer
            # and therefore end up in the first section's content.
            if current_section is not None and content_buffer:
                current_section.add_content('\n'.join(content_buffer))
                content_buffer = []

            section = Section(level=level, title=title, number=number, uid=self._next_uid())
            found_sections.add(number)

            if level == 1:
                sections.append(section)
                section_stack = {1: section}
                last_top_level_number = top_level_number
            else:
                # Nearest open ancestor by numbering depth.
                parent_level = level - 1
                while parent_level >= 1 and parent_level not in section_stack:
                    parent_level -= 1

                if parent_level >= 1 and parent_level in section_stack:
                    parent = section_stack[parent_level]
                    parent.add_child(section)
                    # When intermediate levels are missing (e.g. "3" followed
                    # directly by "3.1.1"), fall back to parent level + 1 so
                    # levels stay contiguous in the tree. The level is only
                    # ever lowered, never raised.
                    section.level = min(level, parent.level + 1)
                elif sections:
                    sections[-1].add_child(section)
                else:
                    # Very first heading is a nested one: treat it as a root.
                    sections.append(section)
                    section_stack = {1: section}

                # The stack stays keyed by numbering depth so that siblings
                # of this section resolve to the same parent.
                section_stack[level] = section
                for deeper in [lvl for lvl in section_stack if lvl > level]:
                    del section_stack[deeper]

            current_section = section

        # Flush the text of the last section.
        if current_section is not None and content_buffer:
            current_section.add_content('\n'.join(content_buffer))

        return sections

    def _match_section_header(self, line: str, found_sections: set) -> Optional[Tuple[str, str]]:
        """Decide whether ``line`` is a numbered section heading.

        Args:
            line: A stripped line of text.
            found_sections: Section numbers already accepted; a repeated
                number is rejected.

        Returns:
            ``(number, title)`` for a heading, otherwise ``None``.
        """
        match = self._SECTION_HEADER_RE.match(line)
        if not match:
            return None

        number = match.group(1)
        title = match.group(2).strip()
        parts = number.split('.')
        level = len(parts)

        # Table-of-contents line.
        if '...' in title or title.count('.') > 5:
            return None

        # Plausible numbering: chapter 1..30, sub-numbers up to 20.
        first_part = int(parts[0])
        if first_part < 1 or first_part > 30:
            return None
        if any(int(part) > 20 for part in parts[1:]):
            return None

        # Each number is accepted only once.
        if number in found_sections:
            return None

        if len(title) > 60 or len(title) < 2:
            return None

        # Titles that read like a body sentence.
        if self._looks_like_statement(title):
            return None

        # Long titles containing sentence punctuation (ASCII or full-width
        # semicolon, ideographic full stop).
        if len(title) > 24 and re.search(r'[。;\uFF1B]', title):
            return None

        # Two or more commas (ASCII or full-width) in a longer title usually
        # indicate a fragment of body text.
        if title.count(',') + title.count('\uFF0C') >= 2 and len(title) > 20:
            return None

        # The title needs at least one CJK character or ASCII letter.
        if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
            return None

        if any(pattern in title for pattern in self.INVALID_TITLE_PATTERNS):
            return None

        # A title never starts with a digit.
        if title[0].isdigit():
            return None

        digit_ratio = sum(c.isdigit() for c in title) / max(len(title), 1)
        if digit_ratio > 0.3:
            return None

        # A backslash usually comes from table noise.
        if '\\' in title and '需求' not in title:
            return None

        # Require a known heading keyword to keep body text out.
        if not any(k in title for k in self.VALID_TITLE_KEYWORDS):
            return None

        # Top-level titles must contain one of the structural SRS keywords.
        if level == 1 and not any(k in title for k in self.TOP_LEVEL_TITLE_KEYWORDS):
            return None

        return (number, title)

    def _looks_like_statement(self, title: str) -> bool:
        """Return True when ``title`` reads like a body sentence rather than a heading."""
        if not title:
            return False

        statement_hints = ["应", "能够", "可以", "进行", "通过", "并", "同时", "当", "如果", "则"]

        # A whitelisted heading keyword may itself contain a hint (e.g.
        # "适应" contains "应"). Such keywords are removed before looking for
        # hints; otherwise a title like "适应性需求" could never be accepted
        # although its keyword is explicitly listed as valid.
        probe = title
        for keyword in self.VALID_TITLE_KEYWORDS:
            if any(hint in keyword for hint in statement_hints):
                probe = probe.replace(keyword, "")
        if any(hint in probe for hint in statement_hints):
            return True

        # Long title with sentence punctuation (ASCII and full-width forms).
        if len(title) > 24 and re.search(r'[,\uFF0C。;\uFF1B:\uFF1A]', title):
            return True

        return False

    def _is_noise(self, line: str) -> bool:
        """Return True for lines that carry no content worth keeping."""
        # Only digits, whitespace, commas and dots.
        if re.match(r'^[\d\s,.]+$', line):
            return True
        # Very short line.
        if len(line) < 3:
            return True
        # Roman numeral (e.g. a front-matter page number).
        if re.match(r'^[ivxIVX]+$', line):
            return True
        return False

    def _llm_validate_sections(self, sections: List[Section]) -> List[Section]:
        """Filter the section tree when an LLM has been configured.

        Without an LLM the input is returned unchanged. Otherwise every
        section, at every depth, is kept only if
        ``_is_valid_section_with_llm`` accepts it; a rejected section is
        removed together with its subtree.
        """
        if not self.llm:
            return sections
        return self._validate_children(sections)

    def _validate_children(self, children: List[Section]) -> List[Section]:
        """Recursively keep the sections accepted by ``_is_valid_section_with_llm``."""
        validated = []
        for child in children:
            if self._is_valid_section_with_llm(child):
                child.children = self._validate_children(child.children)
                validated.append(child)
        return validated

    def _is_valid_section_with_llm(self, section: Section) -> bool:
        """Return True if the section passes the title filters.

        NOTE: despite its name this method applies fixed rules only and does
        not call ``self.llm``.
        """
        # Titles containing any of these markers are rejected outright.
        invalid_titles = [
            '本文档可作为', '故障', '实时', '输入/输出',
            '固件', '功能\\', '\\4.', '\\3.'
        ]
        if any(invalid in section.title for invalid in invalid_titles):
            logger.debug(f"过滤无效章节: {section.number} {section.title}")
            return False

        # Extra check for chapter 3 (requirements) and its subsections. The
        # first number component is compared exactly so that chapter 30 is
        # not mistaken for chapter 3.
        if section.number and section.number.split('.')[0] == '3':
            # A slash or backslash is only tolerated in input/output or
            # interface titles.
            if '\\' in section.title or '/' in section.title:
                if not any(kw in section.title for kw in ['输入', '输出', '接口']):
                    return False

        return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_parser(file_path: str) -> DocumentParser:
    """
    Factory: build the parser that matches the file extension.

    The extension is compared case-insensitively; ``.docx`` yields a
    DocxParser and ``.pdf`` a PDFParser. Any other extension raises
    ValueError.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == '.pdf':
        return PDFParser(file_path)
    if suffix == '.docx':
        return DocxParser(file_path)

    raise ValueError(f"不支持的文件格式: {suffix}")
|