完善了识别逻辑,允许轻微改动需求描述

This commit is contained in:
2026-04-12 21:45:55 +08:00
parent aa8fd4d84a
commit f01ddf045d
10 changed files with 1099 additions and 183 deletions

View File

@@ -7,8 +7,9 @@
import os
import re
import logging
import importlib
from abc import ABC, abstractmethod
from typing import List, Dict, Tuple, Optional
from typing import List, Dict, Tuple, Optional, Any
from pathlib import Path
try:
@@ -23,6 +24,8 @@ try:
except ImportError:
HAS_PDF = False
# ``import importlib`` does not guarantee that the ``importlib.util`` submodule
# is loaded; import it explicitly so the probe below cannot fail with
# ``AttributeError: module 'importlib' has no attribute 'util'``.
import importlib.util

# True when pdfplumber can be imported. ``find_spec`` only locates the package,
# it does not import it.
HAS_PDF_TABLE = importlib.util.find_spec("pdfplumber") is not None
logger = logging.getLogger(__name__)
@@ -38,19 +41,28 @@ class Section:
self.parent = None
self.children = []
self.tables = []
self.blocks = []
def add_child(self, child: 'Section') -> None:
    """Register ``child`` as a direct sub-section of this section.

    The child is appended to ``self.children`` and its ``parent`` attribute
    is pointed back at this section.
    """
    siblings = self.children
    siblings.append(child)
    child.parent = self
def add_content(self, text: str) -> None:
    """Append a piece of text to this section.

    Leading/trailing whitespace is stripped first; ``None``, empty and
    whitespace-only input is ignored. Accepted text is joined onto
    ``self.content`` with a newline separator (no separator when the content
    is still empty) and also recorded as a ``{"type": "text"}`` entry in
    ``self.blocks``.
    """
    stripped = (text or "").strip()
    if stripped:
        # Only insert the newline separator when there is earlier content.
        self.content = f"{self.content}\n{stripped}" if self.content else stripped
        self.blocks.append({"type": "text", "text": stripped})
def add_table(self, table_data: List[List[str]]) -> None:
    """Attach a table (a list of rows of cell strings) to this section.

    Empty or ``None`` tables are ignored. Otherwise the table object is
    appended to ``self.tables`` and a ``{"type": "table"}`` entry carrying
    the table's index in ``self.tables`` and the same table object is
    appended to ``self.blocks``.
    """
    if table_data:
        # The index the table is about to occupy in self.tables.
        position = len(self.tables)
        self.tables.append(table_data)
        self.blocks.append({"type": "table", "table_index": position, "table": table_data})
def generate_auto_number(self, parent_number: str = "", sibling_index: int = 1) -> None:
"""
@@ -332,6 +344,7 @@ class PDFParser(DocumentParser):
raise ImportError("PyPDF2库未安装请运行: pip install PyPDF2")
super().__init__(file_path)
self.document_title = "SRS Document"
self._page_texts: List[str] = []
def parse(self) -> List[Section]:
"""解析PDF文档"""
@@ -348,8 +361,20 @@ class PDFParser(DocumentParser):
# 4. 使用LLM验证和清理章节如果可用
if self.llm:
self.sections = self._llm_validate_sections(self.sections)
# 章节识别失败时,创建兜底章节避免后续表格数据丢失。
if not self.sections:
fallback = Section(level=1, title="未命名章节", number="1", uid=self._next_uid())
if cleaned_text:
fallback.add_content(cleaned_text)
self.sections = [fallback]
# 5. 提取并挂接PDF表格到章节若依赖可用
pdf_tables = self._extract_pdf_tables()
if pdf_tables:
self._attach_pdf_tables_to_sections(pdf_tables)
# 5. 为没有编号的章节自动生成编号
# 6. 为没有编号的章节自动生成编号
self._auto_number_sections(self.sections)
logger.info(f"完成PDF解析提取{len(self.sections)}个顶级章节")
@@ -368,7 +393,98 @@ class PDFParser(DocumentParser):
text = page.extract_text()
if text:
all_text.append(text)
self._page_texts = all_text
return '\n'.join(all_text)
def _extract_pdf_tables(self) -> List[Dict[str, Any]]:
    """Extract the tables of the PDF with pdfplumber.

    Returns:
        One dict per non-empty table, in page order, with the keys
        ``page_idx`` (0-based page index), ``table_idx`` (index of the table
        among the tables pdfplumber found on that page), ``page_text`` (text
        of the page the table is on) and ``data`` (rows of cell strings with
        whitespace runs collapsed to a single space; rows whose cells are all
        empty are dropped).

        An empty list is returned when pdfplumber is not installed or when
        extraction raises; both cases are logged as warnings so the caller
        can continue with the text-only flow.
    """
    if not HAS_PDF_TABLE:
        logger.warning("未安装pdfplumber跳过PDF表格提取。可执行: pip install pdfplumber")
        return []

    tables: List[Dict[str, Any]] = []
    try:
        pdfplumber = importlib.import_module("pdfplumber")
        with pdfplumber.open(self.file_path) as pdf:
            pages = pdf.pages
            # The cached page texts only contain pages that yielded text, so
            # their indices line up with pdfplumber's page indices only when
            # no page was skipped. If the counts differ, indexing the cache
            # by page_idx would return another page's text.
            cached_texts = self._page_texts
            texts_aligned = len(cached_texts) == len(pages)

            for page_idx, page in enumerate(pages):
                page_text: Optional[str] = None  # resolved lazily, once per page
                for table_idx, table in enumerate(page.extract_tables() or []):
                    cleaned_table: List[List[str]] = []
                    for row in table or []:
                        cells = [re.sub(r'\s+', ' ', str(cell or '')).strip() for cell in row]
                        if any(cells):
                            cleaned_table.append(cells)
                    if not cleaned_table:
                        continue

                    if page_text is None:
                        if texts_aligned:
                            page_text = cached_texts[page_idx]
                        else:
                            # Fall back to pdfplumber's own text for this
                            # exact page so the text and the table agree.
                            page_text = page.extract_text() or ""

                    tables.append(
                        {
                            "page_idx": page_idx,
                            "table_idx": table_idx,
                            "page_text": page_text,
                            "data": cleaned_table,
                        }
                    )
    except Exception as e:
        # Deliberate best effort: table extraction must never abort parsing.
        logger.warning(f"PDF表格提取失败继续纯文本流程: {e}")
        return []

    logger.info(f"PDF表格提取完成{len(tables)}个表格")
    return tables
def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
"""将提取出的PDF表格挂接到最匹配的章节。"""
flat_sections = self._flatten_sections(self.sections)
if not flat_sections:
return
last_section: Optional[Section] = None
for table in tables:
matched = self._match_table_section(table.get("page_text", ""), flat_sections)
target = matched or last_section or flat_sections[0]
target.add_table(table["data"])
last_section = target
def _flatten_sections(self, sections: List[Section]) -> List[Section]:
"""按文档顺序拉平章节树。"""
result: List[Section] = []
for section in sections:
result.append(section)
if section.children:
result.extend(self._flatten_sections(section.children))
return result
def _match_table_section(self, page_text: str, sections: List[Section]) -> Optional[Section]:
"""基于页文本匹配表格归属章节。"""
normalized_page = re.sub(r"\s+", "", (page_text or "")).lower()
if not normalized_page:
return None
matched: Optional[Section] = None
matched_score = -1
for section in sections:
title = (section.title or "").strip()
if not title:
continue
number = (section.number or "").strip()
candidates = [title]
if number:
candidates.append(f"{number}{title}")
candidates.append(f"{number} {title}")
for candidate in candidates:
normalized_candidate = re.sub(r"\s+", "", candidate).lower()
if normalized_candidate and normalized_candidate in normalized_page:
score = len(normalized_candidate)
if score > matched_score:
matched = section
matched_score = score
return matched
def _clean_text(self, text: str) -> str:
"""清洗PDF提取的文本"""
@@ -494,11 +610,7 @@ class PDFParser(DocumentParser):
if len(title) > 60 or len(title) < 2:
return None
# 标题必须包含中文
if not re.search(r'[\u4e00-\u9fa5]', title):
return None
# 放宽标题关键词要求非严格GJB结构
# 放宽标题字符要求兼容部分PDF字体导致中文抽取异常的情况
if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
return None