完善了识别逻辑，允许轻微改动需求描述

2026-04-12 21:45:55 +08:00
parent aa8fd4d84a
commit f01ddf045d
10 changed files with 1099 additions and 183 deletions
--- a/src/document_parser.py
+++ b/src/document_parser.py
@@ -7,8 +7,9 @@
 import os
 import re
 import logging
+import importlib.util
 from abc import ABC, abstractmethod
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, Any
 from pathlib import Path

 try:
@@ -23,6 +24,8 @@ try:
 except ImportError:
    HAS_PDF = False

+HAS_PDF_TABLE = importlib.util.find_spec("pdfplumber") is not None
+
 logger = logging.getLogger(__name__)


@@ -38,19 +41,28 @@ class Section:
        self.parent = None
        self.children = []
        self.tables = []
+        self.blocks = []
    
    def add_child(self, child: 'Section') -> None:
        self.children.append(child)
        child.parent = self
    
     def add_content(self, text: str) -> None:
+        text = (text or "").strip()  # treat None as empty and trim surrounding whitespace
+        if not text:
+            return  # blank input leaves content and blocks untouched
         if self.content:
             self.content += "\n" + text
         else:
             self.content = text
+        self.blocks.append({"type": "text", "text": text})  # also recorded in self.blocks, in call order
    
     def add_table(self, table_data: List[List[str]]) -> None:
+        if not table_data:
+            return  # None or empty tables are ignored
         self.tables.append(table_data)
+        table_index = len(self.tables) - 1  # index of the table appended just above
+        self.blocks.append({"type": "table", "table_index": table_index, "table": table_data})
    
    def generate_auto_number(self, parent_number: str = "", sibling_index: int = 1) -> None:
        """
@@ -332,6 +344,7 @@ class PDFParser(DocumentParser):
            raise ImportError("PyPDF2库未安装，请运行: pip install PyPDF2")
        super().__init__(file_path)
        self.document_title = "SRS Document"
+        self._page_texts: List[str] = []
    
    def parse(self) -> List[Section]:
        """解析PDF文档"""
@@ -348,8 +361,20 @@ class PDFParser(DocumentParser):
            # 4. 使用LLM验证和清理章节（如果可用）
            if self.llm:
                self.sections = self._llm_validate_sections(self.sections)
+
+            # 章节识别失败时，创建兜底章节避免后续表格数据丢失。
+            if not self.sections:
+                fallback = Section(level=1, title="未命名章节", number="1", uid=self._next_uid())
+                if cleaned_text:
+                    fallback.add_content(cleaned_text)
+                self.sections = [fallback]
+
+            # 5. 提取并挂接PDF表格到章节（若依赖可用）
+            pdf_tables = self._extract_pdf_tables()
+            if pdf_tables:
+                self._attach_pdf_tables_to_sections(pdf_tables)
            
-            # 5. 为没有编号的章节自动生成编号
+            # 6. 为没有编号的章节自动生成编号
            self._auto_number_sections(self.sections)
            
            logger.info(f"完成PDF解析，提取{len(self.sections)}个顶级章节")
@@ -368,7 +393,98 @@ class PDFParser(DocumentParser):
                text = page.extract_text()
                if text:
                    all_text.append(text)
+        self._page_texts = all_text
        return '\n'.join(all_text)
+
+    def _extract_pdf_tables(self) -> List[Dict[str, Any]]:
+        """Extract every non-empty table from the PDF using pdfplumber.
+
+        Returns:
+            One dict per table with keys ``page_idx``, ``table_idx``,
+            ``page_text`` and ``data`` (rows of whitespace-normalised cell
+            strings). Returns ``[]`` when pdfplumber is missing or fails.
+        """
+        if not HAS_PDF_TABLE:
+            logger.warning("未安装pdfplumber，跳过PDF表格提取。可执行: pip install pdfplumber")
+            return []
+
+        tables: List[Dict[str, Any]] = []
+        try:
+            pdfplumber = importlib.import_module("pdfplumber")
+            with pdfplumber.open(self.file_path) as pdf:
+                # _page_texts omits pages without text, so it is only indexable
+                # by page number when no page was skipped.
+                aligned = len(self._page_texts) == len(pdf.pages)
+                for page_idx, page in enumerate(pdf.pages):
+                    if aligned:
+                        page_text = self._page_texts[page_idx]
+                    else:
+                        page_text = page.extract_text() or ""
+                    for table_idx, table in enumerate(page.extract_tables() or []):
+                        rows = [
+                            [re.sub(r'\s+', ' ', str(cell or '')).strip() for cell in row]
+                            for row in table or []
+                        ]
+                        data = [cells for cells in rows if any(cells)]
+                        if data:
+                            tables.append({"page_idx": page_idx, "table_idx": table_idx,
+                                           "page_text": page_text, "data": data})
+        except Exception as e:  # best effort: fall back to the text-only flow
+            logger.warning(f"PDF表格提取失败，继续纯文本流程: {e}")
+            return []
+        logger.info(f"PDF表格提取完成，共{len(tables)}个表格")
+        return tables
+
+    def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
+        """Attach each extracted PDF table to its best-matching section.
+
+        Unmatched tables follow the previous table's section, else the first section.
+        """
+        ordered = self._flatten_sections(self.sections)
+        previous: Optional[Section] = None
+        # With no sections at all there is nothing to attach to, so iterate nothing.
+        for entry in (tables if ordered else ()):
+            hit = self._match_table_section(entry.get("page_text", ""), ordered)
+            previous = hit or previous or ordered[0]
+            previous.add_table(entry["data"])
+
+    def _flatten_sections(self, sections: List[Section]) -> List[Section]:
+        """Return the section tree as a flat list in document (pre-order) order."""
+        flat: List[Section] = []
+        pending = list(reversed(sections))  # stack; reversed so pop() yields document order
+        while pending:
+            node = pending.pop()
+            flat.append(node)
+            pending.extend(reversed(node.children or []))
+
+    def _match_table_section(self, page_text: str, sections: List[Section]) -> Optional[Section]:
+        """Pick the section whose heading appears in the given page text.
+
+        Whitespace is removed and case is folded with ``lower()`` on both sides
+        before a substring test. Each section is tried as its bare title and as
+        number + title; the longest heading found wins and ties keep the
+        earliest section.
+
+        Args:
+            page_text: Page text to search for section headings.
+            sections: Candidate sections, scanned in the order given.
+
+        Returns:
+            The best-matching section, or ``None`` when the page text is empty
+            or no heading occurs in it.
+        """
+        haystack = re.sub(r"\s+", "", page_text or "").lower()
+        best: Optional[Section] = None
+        best_len = -1
+        for candidate in sections if haystack else []:
+            heading = (candidate.title or "").strip()
+            prefix = (candidate.number or "").strip()
+            # "number title" equals "numbertitle" once whitespace is stripped.
+            for needle in ((heading, prefix + heading) if heading else ()):
+                compact = re.sub(r"\s+", "", needle).lower()
+                if compact and compact in haystack and len(compact) > best_len:
+                    best, best_len = candidate, len(compact)
+        return best
    
    def _clean_text(self, text: str) -> str:
        """清洗PDF提取的文本"""
@@ -494,11 +610,7 @@ class PDFParser(DocumentParser):
        if len(title) > 60 or len(title) < 2:
            return None
        
-        # 标题必须包含中文
-        if not re.search(r'[\u4e00-\u9fa5]', title):
-            return None
-        
-        # 放宽标题关键词要求（非严格GJB结构）
+        # 放宽标题字符要求（兼容部分PDF字体导致中文抽取异常的情况）
        if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
            return None