完善了识别逻辑,允许轻微改动需求描述
This commit is contained in:
@@ -7,8 +7,9 @@
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import importlib
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from typing import List, Dict, Tuple, Optional, Any
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
@@ -23,6 +24,8 @@ try:
|
||||
except ImportError:
|
||||
HAS_PDF = False
|
||||
|
||||
# Import the submodule explicitly: a bare `import importlib` does not
# guarantee that `importlib.util` is available as an attribute, which
# would make the lookup below fail with AttributeError.
import importlib.util

# True when the optional `pdfplumber` package can be located; the PDF
# table-extraction step is skipped when this is False.
HAS_PDF_TABLE = importlib.util.find_spec("pdfplumber") is not None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -38,19 +41,28 @@ class Section:
|
||||
self.parent = None
|
||||
self.children = []
|
||||
self.tables = []
|
||||
self.blocks = []
|
||||
|
||||
def add_child(self, child: 'Section') -> None:
    """Register ``child`` as a direct sub-section of this section.

    The child's ``parent`` attribute is pointed back at this section and
    the child is appended to the end of ``self.children``.
    """
    child.parent = self
    self.children.append(child)
|
||||
|
||||
def add_content(self, text: str) -> None:
    """Append a piece of body text to this section.

    Leading/trailing whitespace is stripped first; ``None`` or
    whitespace-only input is ignored. Accepted text is joined onto
    ``self.content`` with a newline separator (or becomes the content
    when there is none yet) and is also recorded as a ``"text"`` entry
    in ``self.blocks``.
    """
    stripped = (text or "").strip()
    if stripped:
        # Only insert the newline separator when earlier content exists.
        self.content = (self.content + "\n" + stripped) if self.content else stripped
        self.blocks.append({"type": "text", "text": stripped})
|
||||
|
||||
def add_table(self, table_data: List[List[str]]) -> None:
    """Attach a table (a list of rows of cell strings) to this section.

    Empty or ``None`` input is ignored. Otherwise the table is appended
    to ``self.tables`` and a ``"table"`` entry carrying both the table's
    index in ``self.tables`` and the table itself is appended to
    ``self.blocks``.
    """
    if table_data:
        # The index the table is about to occupy in ``self.tables``.
        position = len(self.tables)
        self.tables.append(table_data)
        self.blocks.append(
            {"type": "table", "table_index": position, "table": table_data}
        )
|
||||
|
||||
def generate_auto_number(self, parent_number: str = "", sibling_index: int = 1) -> None:
|
||||
"""
|
||||
@@ -332,6 +344,7 @@ class PDFParser(DocumentParser):
|
||||
raise ImportError("PyPDF2库未安装,请运行: pip install PyPDF2")
|
||||
super().__init__(file_path)
|
||||
self.document_title = "SRS Document"
|
||||
self._page_texts: List[str] = []
|
||||
|
||||
def parse(self) -> List[Section]:
|
||||
"""解析PDF文档"""
|
||||
@@ -348,8 +361,20 @@ class PDFParser(DocumentParser):
|
||||
# 4. 使用LLM验证和清理章节(如果可用)
|
||||
if self.llm:
|
||||
self.sections = self._llm_validate_sections(self.sections)
|
||||
|
||||
# 章节识别失败时,创建兜底章节避免后续表格数据丢失。
|
||||
if not self.sections:
|
||||
fallback = Section(level=1, title="未命名章节", number="1", uid=self._next_uid())
|
||||
if cleaned_text:
|
||||
fallback.add_content(cleaned_text)
|
||||
self.sections = [fallback]
|
||||
|
||||
# 5. 提取并挂接PDF表格到章节(若依赖可用)
|
||||
pdf_tables = self._extract_pdf_tables()
|
||||
if pdf_tables:
|
||||
self._attach_pdf_tables_to_sections(pdf_tables)
|
||||
|
||||
# 5. 为没有编号的章节自动生成编号
|
||||
# 6. 为没有编号的章节自动生成编号
|
||||
self._auto_number_sections(self.sections)
|
||||
|
||||
logger.info(f"完成PDF解析,提取{len(self.sections)}个顶级章节")
|
||||
@@ -368,7 +393,98 @@ class PDFParser(DocumentParser):
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
all_text.append(text)
|
||||
self._page_texts = all_text
|
||||
return '\n'.join(all_text)
|
||||
|
||||
def _extract_pdf_tables(self) -> List[Dict[str, Any]]:
    """Extract the tables contained in the PDF at ``self.file_path``.

    Returns:
        One dict per non-empty table with the keys ``"page_idx"``
        (0-based page position), ``"table_idx"`` (0-based position of
        the table on its page), ``"page_text"`` (the entry of
        ``self._page_texts`` at the same index, or ``""`` when there is
        none) and ``"data"`` (the cleaned rows). An empty list is
        returned when pdfplumber is unavailable or when extraction
        raises; in the latter case tables collected so far are dropped.
    """
    if not HAS_PDF_TABLE:
        logger.warning("未安装pdfplumber,跳过PDF表格提取。可执行: pip install pdfplumber")
        return []

    tables: List[Dict[str, Any]] = []
    try:
        # Resolved by name at call time, so pdfplumber stays optional.
        plumber = importlib.import_module("pdfplumber")
        with plumber.open(self.file_path) as document:
            for page_idx, page in enumerate(document.pages):
                # NOTE(review): this lookup is positional. The visible
                # text-extraction code stores only non-empty page texts,
                # so indexes may drift after a blank page - confirm.
                if page_idx < len(self._page_texts):
                    page_text = self._page_texts[page_idx]
                else:
                    page_text = ""

                for table_idx, raw_table in enumerate(page.extract_tables() or []):
                    rows: List[List[str]] = []
                    for raw_row in raw_table or []:
                        # Collapse whitespace runs inside each cell;
                        # falsy cells become empty strings.
                        cleaned_row = [
                            re.sub(r'\s+', ' ', str(value or '')).strip()
                            for value in raw_row
                        ]
                        # Rows whose cells are all empty are dropped.
                        if any(cleaned_row):
                            rows.append(cleaned_row)

                    if not rows:
                        continue
                    tables.append(
                        {
                            "page_idx": page_idx,
                            "table_idx": table_idx,
                            "page_text": page_text,
                            "data": rows,
                        }
                    )
    except Exception as e:
        # Best effort: table extraction must never break the text flow.
        logger.warning(f"PDF表格提取失败,继续纯文本流程: {e}")
        return []

    logger.info(f"PDF表格提取完成,共{len(tables)}个表格")
    return tables
|
||||
|
||||
def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
    """Attach each extracted PDF table to the best-matching section.

    For every table, the section is chosen in this order of preference:
    the section returned by ``_match_table_section`` for the table's
    page text, else the section the previous table was attached to,
    else the first section in document order. Nothing happens when
    there are no sections at all.

    Args:
        tables: Table dicts providing a ``"data"`` entry and,
            optionally, a ``"page_text"`` entry.
    """
    ordered_sections = self._flatten_sections(self.sections)
    if ordered_sections:
        anchor = None  # section that received the previous table
        for entry in tables:
            hit = self._match_table_section(entry.get("page_text", ""), ordered_sections)
            if hit:
                anchor = hit
            elif not anchor:
                # No match and no earlier table: use the first section.
                anchor = ordered_sections[0]
            anchor.add_table(entry["data"])
|
||||
|
||||
def _flatten_sections(self, sections: List[Section]) -> List[Section]:
    """Flatten a section tree into a list in document (pre-order) order.

    Each section is emitted before its children, and children keep their
    original left-to-right order.
    """
    ordered: List[Section] = []
    # Explicit stack; entries are pushed reversed so that popping
    # yields them in their original order.
    pending = list(sections)
    pending.reverse()
    while pending:
        node = pending.pop()
        ordered.append(node)
        if node.children:
            pending.extend(reversed(node.children))
    return ordered
|
||||
|
||||
def _match_table_section(self, page_text: str, sections: List[Section]) -> Optional[Section]:
    """Pick the section a table belongs to, based on its page's text.

    Both the page text and every candidate heading are compared with all
    whitespace removed and lower-cased. A section matches when its title,
    or its number immediately followed by its title, occurs in the page
    text. The section with the longest matching candidate wins; on a tie
    the earliest section in ``sections`` is kept.

    Args:
        page_text: Text of the page that contains the table.
        sections: Candidate sections, in document order.

    Returns:
        The best-matching section, or ``None`` when the page text is
        empty or no section heading occurs in it.
    """
    whitespace = re.compile(r"\s+")

    normalized_page = whitespace.sub("", page_text or "").lower()
    if not normalized_page:
        return None

    matched: Optional[Section] = None
    matched_score = -1
    for section in sections:
        title = (section.title or "").strip()
        if not title:
            continue

        number = (section.number or "").strip()
        # "<number><title>" and "<number> <title>" are identical once
        # whitespace is stripped, so a single numbered candidate suffices.
        candidates = [title]
        if number:
            candidates.append(number + title)

        for candidate in candidates:
            normalized_candidate = whitespace.sub("", candidate).lower()
            if not normalized_candidate or normalized_candidate not in normalized_page:
                continue
            # Longer matches are more specific; strict ">" keeps the
            # first section on ties.
            score = len(normalized_candidate)
            if score > matched_score:
                matched = section
                matched_score = score

    return matched
|
||||
|
||||
def _clean_text(self, text: str) -> str:
|
||||
"""清洗PDF提取的文本"""
|
||||
@@ -494,11 +610,7 @@ class PDFParser(DocumentParser):
|
||||
if len(title) > 60 or len(title) < 2:
|
||||
return None
|
||||
|
||||
# 标题必须包含中文
|
||||
if not re.search(r'[\u4e00-\u9fa5]', title):
|
||||
return None
|
||||
|
||||
# 放宽标题关键词要求(非严格GJB结构)
|
||||
# 放宽标题字符要求(兼容部分PDF字体导致中文抽取异常的情况)
|
||||
if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
|
||||
return None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user