只保留LLM提取模式，修改提取逻辑

2026-04-18 20:33:58 +08:00
parent f01ddf045d
commit e274e7faa2
9 changed files with 1427 additions and 403 deletions
--- a/src/document_parser.py
+++ b/src/document_parser.py
@@ -4,7 +4,6 @@
 支持PDF和Docx格式，针对GJB438B标准SRS文档优化
 """

-import os
 import re
 import logging
 import importlib
@@ -119,43 +118,19 @@ class DocumentParser(ABC):
            sections: 章节列表
            parent_number: 父章节编号
        """
-        # 仅在顶级章节重编号
-        if not parent_number:
-            # 前置章节关键词（需要跳过的）
-            skip_keywords = ['目录', '封面', '扉页', '未命名', '年', '月']
-            # 正文章节关键词（遇到这些说明正文开始）
-            content_keywords = ['外部接口', '接口', '软件需求', '需求', '功能', '性能', '设计', '概述', '标识', '引言']
-            
-            start_index = 0
-            for idx, section in enumerate(sections):
-                # 优先检查是否是正文章节
-                is_content = any(kw in section.title for kw in content_keywords)
-                if is_content and section.level == 1:
-                    start_index = idx
-                    break
-            
-            # 重新编号所有章节
-            counter = 1
-            for i, section in enumerate(sections):
-                if i < start_index:
-                    # 前置章节不编号
-                    section.number = ""
-                else:
-                    # 正文章节：顶级章节从1开始编号
-                    if section.level == 1:
-                        section.number = str(counter)
-                        counter += 1
-                
-                # 递归处理子章节
-                if section.children:
-                    self._auto_number_sections(section.children, section.number)
-        else:
-            # 子章节编号
-            for i, section in enumerate(sections, 1):
-                if not section.number or self._is_chinese_number(section.number):
-                    section.generate_auto_number(parent_number, i)
-                if section.children:
-                    self._auto_number_sections(section.children, section.number)
+        if not sections:
+            return
+
+        # 仅为缺失编号的章节补号；已存在的文档原始编号必须保留。
+        sibling_index = 0
+        for section in sections:
+            has_number = bool((section.number or "").strip()) and not self._is_chinese_number(section.number)
+            if not has_number:
+                sibling_index += 1
+                section.generate_auto_number(parent_number, sibling_index)
+
+            if section.children:
+                self._auto_number_sections(section.children, section.number)
    
    def _is_chinese_number(self, text: str) -> bool:
        """检查是否是中文数字编号"""
@@ -327,8 +302,13 @@ class PDFParser(DocumentParser):
        '优先', '关键', '合格', '追踪', '注释',
        'CSCI', '计算机', '软件', '硬件', '通信', '通讯',
        '数据', '适应', '可靠', '内部', '外部',
-        '描述', '要求', '规定', '说明', '定义',
-        '电场', '防护', '装置', '控制', '监控', '显控'
+        '描述', '要求', '规定', '说明', '定义'
+    ]
+
+    TOP_LEVEL_TITLE_KEYWORDS = [
+        '范围', '标识', '概述', '引用', '文档', '需求', '接口', '性能',
+        '安全', '保密', '环境', '资源', '质量', '设计', '约束', '验收',
+        '交付', '包装', '注释'
    ]
    
    # 明显无效的章节标题模式（噪声）
@@ -411,21 +391,41 @@ class PDFParser(DocumentParser):
                    if page_idx < len(self._page_texts):
                        page_text = self._page_texts[page_idx]

-                    extracted_tables = page.extract_tables() or []
-                    for table_idx, table in enumerate(extracted_tables):
+                    table_objs = page.find_tables() or []
+                    if table_objs:
+                        extracted_tables = [(idx, t.extract(), t.bbox) for idx, t in enumerate(table_objs)]
+                    else:
+                        raw_tables = page.extract_tables() or []
+                        extracted_tables = [(idx, t, None) for idx, t in enumerate(raw_tables)]
+
+                    for table_idx, table, bbox in extracted_tables:
                        cleaned_table: List[List[str]] = []
                        for row in table or []:
                            cells = [re.sub(r'\s+', ' ', str(cell or '')).strip() for cell in row]
+                            # 只要存在非空单元格就保留，避免有效行被误丢弃。
                            if any(cells):
                                cleaned_table.append(cells)

                        if cleaned_table:
+                            section_hint = ""
+                            if bbox:
+                                try:
+                                    top = float(bbox[1])
+                                    text_above = page.crop((0, 0, page.width, top)).extract_text() or ""
+                                    section_hint = self._find_last_section_number(text_above)
+                                except Exception:
+                                    section_hint = ""
+
+                            table_ref = self._extract_table_reference(cleaned_table)
+
                            tables.append(
                                {
                                    "page_idx": page_idx,
                                    "table_idx": table_idx,
                                    "page_text": page_text,
                                    "data": cleaned_table,
+                                    "section_hint": section_hint,
+                                    "table_ref": table_ref,
                                }
                            )
        except Exception as e:
@@ -435,16 +435,86 @@ class PDFParser(DocumentParser):
        logger.info(f"PDF表格提取完成，共{len(tables)}个表格")
        return tables

+    def _extract_table_reference(self, table: List[List[str]]) -> str:
+        """Return the table number (e.g. "3-5" for "表3-5") found in the first two rows, else ""."""
+        if not table:
+            return ""
+
+        # Whitespace carries no meaning for the match, so the cells are fused into one compact string.
+        cell_texts = [str(cell or "") for row in table[:2] for cell in row]
+        compact = re.sub(r"\s+", "", " ".join(cell_texts))
+        hit = re.search(r"表\s*(\d+(?:[-－]\d+){1,3})", compact)
+
+        # Full-width hyphens are rewritten to ASCII ones in the returned number.
+        return hit.group(1).replace("－", "-") if hit else ""
+
+    def _build_table_reference_index(self, sections: List[Section]) -> Dict[str, List[Section]]:
+        """Map each table number cited in a section's content (e.g. "3-5") to the citing sections."""
+        ref_pattern = re.compile(r"表\s*(\d+(?:[-－]\d+){1,3})")
+        by_ref: Dict[str, List[Section]] = {}
+        for sec in sections:
+            compact = re.sub(r"\s+", "", sec.content or "")
+            for raw_ref in ref_pattern.findall(compact):
+                by_ref.setdefault(raw_ref.replace("－", "-"), []).append(sec)
+        return by_ref
+
+    def _find_last_section_number(self, text: str) -> str:
+        """Return the number of the last section header found in ``text``, or "" if there is none.
+
+        Every non-blank line is stripped and offered to ``_match_section_header`` together
+        with a fresh, empty ``set``; the first element of the latest truthy result wins.
+        """
+
+        last_number = ""
+        stripped_lines = (raw.strip() for raw in (text or "").split("\n"))
+        for candidate in filter(None, stripped_lines):
+            header = self._match_section_header(candidate, set())
+            if header:
+                last_number = header[0]
+        return last_number
+
     def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
         """将提取出的PDF表格挂接到最匹配的章节。"""
         flat_sections = self._flatten_sections(self.sections)
         if not flat_sections:
             return

+        section_by_number = {
+            (s.number or "").strip(): s
+            for s in flat_sections
+            if (s.number or "").strip()
+        }
+        table_ref_index = self._build_table_reference_index(flat_sections)
+
         last_section: Optional[Section] = None
         for table in tables:
-            matched = self._match_table_section(table.get("page_text", ""), flat_sections)
-            target = matched or last_section or flat_sections[0]
+            target = None
+
+            table_ref = (table.get("table_ref") or "").strip()
+            if table_ref and table_ref in table_ref_index:
+                candidates = table_ref_index[table_ref]
+                # When several sections cite the same table number, pick the one with the greatest level (ties: longer content) so a parent summary section does not claim the table.
+                target = max(candidates, key=lambda s: (s.level, len(s.content or "")))
+
+            section_hint = (table.get("section_hint") or "").strip()
+            if not target and section_hint and section_hint in section_by_number:
+                target = section_by_number[section_hint]
+
+            if not target:
+                target = self._match_table_section(table.get("page_text", ""), flat_sections)
+
+            # Fallback: reuse the section chosen for the previous table instead of the first section, which could file the table under an unrelated chapter.
+            if not target:
+                target = last_section
+
+            if not target:
+                logger.warning(
+                    "未定位到表格归属章节，跳过: page=%s table=%s",
+                    table.get("page_idx", -1),
+                    table.get("table_idx", -1),
+                )
+                continue
+
             target.add_table(table["data"])
             last_section = target

@@ -464,7 +534,7 @@ class PDFParser(DocumentParser):
            return None

        matched: Optional[Section] = None
-        matched_score = -1
+        matched_score = (-1, -1)
        for section in sections:
            title = (section.title or "").strip()
            if not title:
@@ -479,7 +549,7 @@ class PDFParser(DocumentParser):
            for candidate in candidates:
                normalized_candidate = re.sub(r"\s+", "", candidate).lower()
                if normalized_candidate and normalized_candidate in normalized_page:
-                    score = len(normalized_candidate)
+                    score = (len(normalized_candidate), section.level)
                    if score > matched_score:
                        matched = section
                        matched_score = score
@@ -514,6 +584,7 @@ class PDFParser(DocumentParser):
        current_section = None
        content_buffer = []
        found_sections = set()
+        last_top_level_number = 0
        
        for line in lines:
            line = line.strip()
@@ -526,6 +597,22 @@ class PDFParser(DocumentParser):
            if section_info:
                number, title = section_info
                level = len(number.split('.'))
+                top_level_number = int(number.split('.')[0])
+
+                # 顶级章节序号大幅跳跃通常是误识别（如正文中的“8 表...”）。
+                if level == 1 and last_top_level_number and top_level_number > last_top_level_number + 1:
+                    if line and not self._is_noise(line):
+                        content_buffer.append(line)
+                    continue
+
+                # 顶级章节编号倒退通常是正文枚举项被误识别（如“1 综合监控...”）。
+                if level == 1 and last_top_level_number and top_level_number < last_top_level_number:
+                    if line and not self._is_noise(line):
+                        content_buffer.append(line)
+                    continue
+
+                if level > 6:
+                    continue
                
                # 保存之前章节的内容
                if current_section and content_buffer:
@@ -540,6 +627,7 @@ class PDFParser(DocumentParser):
                if level == 1:
                    sections.append(section)
                    section_stack = {1: section}
+                    last_top_level_number = top_level_number
                else:
                    parent_level = level - 1
                    while parent_level >= 1 and parent_level not in section_stack:
@@ -557,6 +645,10 @@ class PDFParser(DocumentParser):
                for l in list(section_stack.keys()):
                    if l > level:
                        del section_stack[l]
+
+                # 若出现层级跳跃（如1->3），自动回退到父级+1。
+                if level > 1 and (level - 1) not in section_stack:
+                    section.level = max(section_stack.keys()) if section_stack else 1
                
                current_section = section
            else:
@@ -577,13 +669,14 @@ class PDFParser(DocumentParser):
        Returns:
            (章节编号, 章节标题) 或 None
        """
-        # 模式: "3.1功能需求" 或 "3.1 功能需求"
-        match = re.match(r'^(\d+(?:\.\d+)*)\s*(.+)$', line)
+        # 模式: "3.1 功能需求" / "3.1.2 电场..."
+        match = re.match(r'^(\d+(?:\.\d+)*)[\s、.)）]*(.+)$', line)
        if not match:
            return None
        
        number = match.group(1)
        title = match.group(2).strip()
+        level = len(number.split('.'))
        
        # 排除目录行
        if '...' in title or title.count('.') > 5:
@@ -609,6 +702,18 @@ class PDFParser(DocumentParser):
        # 标题长度检查
        if len(title) > 60 or len(title) < 2:
            return None
+
+        # 过滤更像正文描述的句式。
+        if self._looks_like_statement(title):
+            return None
+
+        # 过滤疑似正文句子（含句号/分号且过长）。
+        if len(title) > 24 and re.search(r'[。；;]', title):
+            return None
+
+        # 过滤指令拼接噪声标题（逗号过多通常是正文残片）。
+        if title.count('，') >= 2 and len(title) > 20:
+            return None
        
        # 放宽标题字符要求（兼容部分PDF字体导致中文抽取异常的情况）
        if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
@@ -631,8 +736,30 @@ class PDFParser(DocumentParser):
        # 检查标题是否包含反斜杠（通常是表格噪声）
        if '\\' in title and '需求' not in title:
            return None
+
+        # 常见有效标题关键词兜底，降低正文被识别为标题的概率。
+        if not any(k in title for k in self.VALID_TITLE_KEYWORDS):
+            return None
+
+        # 顶级章节标题需符合SRS结构性关键词，避免“综合监控”“电场”等正文短语被识别。
+        if level == 1 and not any(k in title for k in self.TOP_LEVEL_TITLE_KEYWORDS):
+            return None
        
        return (number, title)
+
+    def _looks_like_statement(self, title: str) -> bool:
+        """Tell whether ``title`` reads like a body sentence rather than a section heading.
+
+        Returns True when it holds a statement cue word, or is longer than 24 characters
+        and carries sentence punctuation; False otherwise, including for an empty title.
+        """
+        if not title:
+            return False
+        # 应/并/当/则 also sit inside nouns of valid headings ("适应性需求"), so mask those nouns first.
+        masked = re.sub(r"适应|响应|应用|应急|对应|供应|并发|并行|合并|当前|规则|原则|准则", "", title)
+        if any(c in masked for c in "应并当则") or any(p in title for p in ("能够", "可以", "进行", "通过", "同时", "如果")):
+            return True
+        return len(title) > 24 and bool(re.search(r'[，。；;:：]', title))
    
    def _is_noise(self, line: str) -> bool:
        """检查是否是噪声行"""