只保留LLM提取模式,修改提取逻辑
This commit is contained in:
@@ -4,7 +4,6 @@
|
||||
支持PDF和Docx格式,针对GJB438B标准SRS文档优化
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import importlib
|
||||
@@ -119,43 +118,19 @@ class DocumentParser(ABC):
|
||||
sections: 章节列表
|
||||
parent_number: 父章节编号
|
||||
"""
|
||||
# 仅在顶级章节重编号
|
||||
if not parent_number:
|
||||
# 前置章节关键词(需要跳过的)
|
||||
skip_keywords = ['目录', '封面', '扉页', '未命名', '年', '月']
|
||||
# 正文章节关键词(遇到这些说明正文开始)
|
||||
content_keywords = ['外部接口', '接口', '软件需求', '需求', '功能', '性能', '设计', '概述', '标识', '引言']
|
||||
|
||||
start_index = 0
|
||||
for idx, section in enumerate(sections):
|
||||
# 优先检查是否是正文章节
|
||||
is_content = any(kw in section.title for kw in content_keywords)
|
||||
if is_content and section.level == 1:
|
||||
start_index = idx
|
||||
break
|
||||
|
||||
# 重新编号所有章节
|
||||
counter = 1
|
||||
for i, section in enumerate(sections):
|
||||
if i < start_index:
|
||||
# 前置章节不编号
|
||||
section.number = ""
|
||||
else:
|
||||
# 正文章节:顶级章节从1开始编号
|
||||
if section.level == 1:
|
||||
section.number = str(counter)
|
||||
counter += 1
|
||||
|
||||
# 递归处理子章节
|
||||
if section.children:
|
||||
self._auto_number_sections(section.children, section.number)
|
||||
else:
|
||||
# 子章节编号
|
||||
for i, section in enumerate(sections, 1):
|
||||
if not section.number or self._is_chinese_number(section.number):
|
||||
section.generate_auto_number(parent_number, i)
|
||||
if section.children:
|
||||
self._auto_number_sections(section.children, section.number)
|
||||
if not sections:
|
||||
return
|
||||
|
||||
# 仅为缺失编号的章节补号;已存在的文档原始编号必须保留。
|
||||
sibling_index = 0
|
||||
for section in sections:
|
||||
has_number = bool((section.number or "").strip()) and not self._is_chinese_number(section.number)
|
||||
if not has_number:
|
||||
sibling_index += 1
|
||||
section.generate_auto_number(parent_number, sibling_index)
|
||||
|
||||
if section.children:
|
||||
self._auto_number_sections(section.children, section.number)
|
||||
|
||||
def _is_chinese_number(self, text: str) -> bool:
|
||||
"""检查是否是中文数字编号"""
|
||||
@@ -327,8 +302,13 @@ class PDFParser(DocumentParser):
|
||||
'优先', '关键', '合格', '追踪', '注释',
|
||||
'CSCI', '计算机', '软件', '硬件', '通信', '通讯',
|
||||
'数据', '适应', '可靠', '内部', '外部',
|
||||
'描述', '要求', '规定', '说明', '定义',
|
||||
'电场', '防护', '装置', '控制', '监控', '显控'
|
||||
'描述', '要求', '规定', '说明', '定义'
|
||||
]
|
||||
|
||||
# Keywords a level-1 (top-level) chapter title must contain to be accepted as
# a real chapter heading rather than a numbered phrase from body text.
# '合格' is included so the standard SRS chapter "合格性规定" is accepted;
# a rejected top-level chapter would leave a gap in the chapter numbering.
TOP_LEVEL_TITLE_KEYWORDS = [
    '范围', '标识', '概述', '引用', '文档', '需求', '接口', '性能',
    '安全', '保密', '环境', '资源', '质量', '设计', '约束', '验收',
    '交付', '包装', '注释', '合格',
]
|
||||
|
||||
# 明显无效的章节标题模式(噪声)
|
||||
@@ -411,21 +391,41 @@ class PDFParser(DocumentParser):
|
||||
if page_idx < len(self._page_texts):
|
||||
page_text = self._page_texts[page_idx]
|
||||
|
||||
extracted_tables = page.extract_tables() or []
|
||||
for table_idx, table in enumerate(extracted_tables):
|
||||
table_objs = page.find_tables() or []
|
||||
if table_objs:
|
||||
extracted_tables = [(idx, t.extract(), t.bbox) for idx, t in enumerate(table_objs)]
|
||||
else:
|
||||
raw_tables = page.extract_tables() or []
|
||||
extracted_tables = [(idx, t, None) for idx, t in enumerate(raw_tables)]
|
||||
|
||||
for table_idx, table, bbox in extracted_tables:
|
||||
cleaned_table: List[List[str]] = []
|
||||
for row in table or []:
|
||||
cells = [re.sub(r'\s+', ' ', str(cell or '')).strip() for cell in row]
|
||||
# 只要存在非空单元格就保留,避免有效行被误丢弃。
|
||||
if any(cells):
|
||||
cleaned_table.append(cells)
|
||||
|
||||
if cleaned_table:
|
||||
section_hint = ""
|
||||
if bbox:
|
||||
try:
|
||||
top = float(bbox[1])
|
||||
text_above = page.crop((0, 0, page.width, top)).extract_text() or ""
|
||||
section_hint = self._find_last_section_number(text_above)
|
||||
except Exception:
|
||||
section_hint = ""
|
||||
|
||||
table_ref = self._extract_table_reference(cleaned_table)
|
||||
|
||||
tables.append(
|
||||
{
|
||||
"page_idx": page_idx,
|
||||
"table_idx": table_idx,
|
||||
"page_text": page_text,
|
||||
"data": cleaned_table,
|
||||
"section_hint": section_hint,
|
||||
"table_ref": table_ref,
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -435,16 +435,86 @@ class PDFParser(DocumentParser):
|
||||
logger.info(f"PDF表格提取完成,共{len(tables)}个表格")
|
||||
return tables
|
||||
|
||||
def _extract_table_reference(self, table: List[List[str]]) -> str:
|
||||
"""从表格前几行中提取表号引用,如“表3-5”。"""
|
||||
if not table:
|
||||
return ""
|
||||
|
||||
head_rows = table[:2]
|
||||
merged = " ".join(" ".join(str(c or "") for c in row) for row in head_rows)
|
||||
merged = re.sub(r"\s+", "", merged)
|
||||
m = re.search(r"表\s*(\d+(?:[--]\d+){1,3})", merged)
|
||||
if not m:
|
||||
return ""
|
||||
return m.group(1).replace("-", "-")
|
||||
|
||||
def _build_table_reference_index(self, sections: List[Section]) -> Dict[str, List[Section]]:
|
||||
"""构建“表号 -> 章节”索引,用于优先精确挂接表格。"""
|
||||
index: Dict[str, List[Section]] = {}
|
||||
for section in sections:
|
||||
content = re.sub(r"\s+", "", section.content or "")
|
||||
for m in re.finditer(r"表\s*(\d+(?:[--]\d+){1,3})", content):
|
||||
ref = m.group(1).replace("-", "-")
|
||||
index.setdefault(ref, []).append(section)
|
||||
return index
|
||||
|
||||
def _find_last_section_number(self, text: str) -> str:
|
||||
"""从文本中提取最后出现的章节号。"""
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
found = ""
|
||||
for line in text.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
section_info = self._match_section_header(line, set())
|
||||
if section_info:
|
||||
found = section_info[0]
|
||||
return found
|
||||
|
||||
def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
|
||||
"""将提取出的PDF表格挂接到最匹配的章节。"""
|
||||
flat_sections = self._flatten_sections(self.sections)
|
||||
if not flat_sections:
|
||||
return
|
||||
|
||||
section_by_number = {
|
||||
(s.number or "").strip(): s
|
||||
for s in flat_sections
|
||||
if (s.number or "").strip()
|
||||
}
|
||||
table_ref_index = self._build_table_reference_index(flat_sections)
|
||||
|
||||
last_section: Optional[Section] = None
|
||||
for table in tables:
|
||||
matched = self._match_table_section(table.get("page_text", ""), flat_sections)
|
||||
target = matched or last_section or flat_sections[0]
|
||||
target = None
|
||||
|
||||
table_ref = (table.get("table_ref") or "").strip()
|
||||
if table_ref and table_ref in table_ref_index:
|
||||
candidates = table_ref_index[table_ref]
|
||||
# 同表号命中多个章节时,优先更深层章节,避免父级“汇总章节”抢占。
|
||||
target = max(candidates, key=lambda s: (s.level, len(s.content or "")))
|
||||
|
||||
section_hint = (table.get("section_hint") or "").strip()
|
||||
if not target and section_hint and section_hint in section_by_number:
|
||||
target = section_by_number[section_hint]
|
||||
|
||||
if not target:
|
||||
target = self._match_table_section(table.get("page_text", ""), flat_sections)
|
||||
|
||||
# 兜底优先使用上一个命中章节,避免错误挂到首章节造成跨章污染。
|
||||
if not target:
|
||||
target = last_section
|
||||
|
||||
if not target:
|
||||
logger.warning(
|
||||
"未定位到表格归属章节,跳过: page=%s table=%s",
|
||||
table.get("page_idx", -1),
|
||||
table.get("table_idx", -1),
|
||||
)
|
||||
continue
|
||||
|
||||
target.add_table(table["data"])
|
||||
last_section = target
|
||||
|
||||
@@ -464,7 +534,7 @@ class PDFParser(DocumentParser):
|
||||
return None
|
||||
|
||||
matched: Optional[Section] = None
|
||||
matched_score = -1
|
||||
matched_score = (-1, -1)
|
||||
for section in sections:
|
||||
title = (section.title or "").strip()
|
||||
if not title:
|
||||
@@ -479,7 +549,7 @@ class PDFParser(DocumentParser):
|
||||
for candidate in candidates:
|
||||
normalized_candidate = re.sub(r"\s+", "", candidate).lower()
|
||||
if normalized_candidate and normalized_candidate in normalized_page:
|
||||
score = len(normalized_candidate)
|
||||
score = (len(normalized_candidate), section.level)
|
||||
if score > matched_score:
|
||||
matched = section
|
||||
matched_score = score
|
||||
@@ -514,6 +584,7 @@ class PDFParser(DocumentParser):
|
||||
current_section = None
|
||||
content_buffer = []
|
||||
found_sections = set()
|
||||
last_top_level_number = 0
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
@@ -526,6 +597,22 @@ class PDFParser(DocumentParser):
|
||||
if section_info:
|
||||
number, title = section_info
|
||||
level = len(number.split('.'))
|
||||
top_level_number = int(number.split('.')[0])
|
||||
|
||||
# 顶级章节序号大幅跳跃通常是误识别(如正文中的“8 表...”)。
|
||||
if level == 1 and last_top_level_number and top_level_number > last_top_level_number + 1:
|
||||
if line and not self._is_noise(line):
|
||||
content_buffer.append(line)
|
||||
continue
|
||||
|
||||
# 顶级章节编号倒退通常是正文枚举项被误识别(如“1 综合监控...”)。
|
||||
if level == 1 and last_top_level_number and top_level_number < last_top_level_number:
|
||||
if line and not self._is_noise(line):
|
||||
content_buffer.append(line)
|
||||
continue
|
||||
|
||||
if level > 6:
|
||||
continue
|
||||
|
||||
# 保存之前章节的内容
|
||||
if current_section and content_buffer:
|
||||
@@ -540,6 +627,7 @@ class PDFParser(DocumentParser):
|
||||
if level == 1:
|
||||
sections.append(section)
|
||||
section_stack = {1: section}
|
||||
last_top_level_number = top_level_number
|
||||
else:
|
||||
parent_level = level - 1
|
||||
while parent_level >= 1 and parent_level not in section_stack:
|
||||
@@ -557,6 +645,10 @@ class PDFParser(DocumentParser):
|
||||
for l in list(section_stack.keys()):
|
||||
if l > level:
|
||||
del section_stack[l]
|
||||
|
||||
# 若出现层级跳跃(如1->3),自动回退到父级+1。
|
||||
if level > 1 and (level - 1) not in section_stack:
|
||||
section.level = max(section_stack.keys()) if section_stack else 1
|
||||
|
||||
current_section = section
|
||||
else:
|
||||
@@ -577,13 +669,14 @@ class PDFParser(DocumentParser):
|
||||
Returns:
|
||||
(章节编号, 章节标题) 或 None
|
||||
"""
|
||||
# 模式: "3.1功能需求" 或 "3.1 功能需求"
|
||||
match = re.match(r'^(\d+(?:\.\d+)*)\s*(.+)$', line)
|
||||
# 模式: "3.1 功能需求" / "3.1.2 电场..."
|
||||
match = re.match(r'^(\d+(?:\.\d+)*)[\s、.))]*(.+)$', line)
|
||||
if not match:
|
||||
return None
|
||||
|
||||
number = match.group(1)
|
||||
title = match.group(2).strip()
|
||||
level = len(number.split('.'))
|
||||
|
||||
# 排除目录行
|
||||
if '...' in title or title.count('.') > 5:
|
||||
@@ -609,6 +702,18 @@ class PDFParser(DocumentParser):
|
||||
# 标题长度检查
|
||||
if len(title) > 60 or len(title) < 2:
|
||||
return None
|
||||
|
||||
# 过滤更像正文描述的句式。
|
||||
if self._looks_like_statement(title):
|
||||
return None
|
||||
|
||||
# 过滤疑似正文句子(含句号/分号且过长)。
|
||||
if len(title) > 24 and re.search(r'[。;;]', title):
|
||||
return None
|
||||
|
||||
# 过滤指令拼接噪声标题(逗号过多通常是正文残片)。
|
||||
if title.count(',') >= 2 and len(title) > 20:
|
||||
return None
|
||||
|
||||
# 放宽标题字符要求(兼容部分PDF字体导致中文抽取异常的情况)
|
||||
if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
|
||||
@@ -631,8 +736,30 @@ class PDFParser(DocumentParser):
|
||||
# 检查标题是否包含反斜杠(通常是表格噪声)
|
||||
if '\\' in title and '需求' not in title:
|
||||
return None
|
||||
|
||||
# 常见有效标题关键词兜底,降低正文被识别为标题的概率。
|
||||
if not any(k in title for k in self.VALID_TITLE_KEYWORDS):
|
||||
return None
|
||||
|
||||
# 顶级章节标题需符合SRS结构性关键词,避免“综合监控”“电场”等正文短语被识别。
|
||||
if level == 1 and not any(k in title for k in self.TOP_LEVEL_TITLE_KEYWORDS):
|
||||
return None
|
||||
|
||||
return (number, title)
|
||||
|
||||
def _looks_like_statement(self, title: str) -> bool:
|
||||
"""判断标题是否更像正文语句而非章节名。"""
|
||||
if not title:
|
||||
return False
|
||||
|
||||
statement_hints = ["应", "能够", "可以", "进行", "通过", "并", "同时", "当", "如果", "则"]
|
||||
if any(h in title for h in statement_hints):
|
||||
return True
|
||||
|
||||
if len(title) > 24 and re.search(r'[,。;;::]', title):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _is_noise(self, line: str) -> bool:
|
||||
"""检查是否是噪声行"""
|
||||
|
||||
Reference in New Issue
Block a user