From e274e7faa2db28eb59b96d7cdf77bec4452a33d7 Mon Sep 17 00:00:00 2001
From: junlan <15167915727@163.com>
Date: Sat, 18 Apr 2026 20:33:58 +0800
Subject: [PATCH] =?UTF-8?q?=E5=8F=AA=E4=BF=9D=E7=95=99LLM=E6=8F=90?=
 =?UTF-8?q?=E5=8F=96=E6=A8=A1=E5=BC=8F=EF=BC=8C=E4=BF=AE=E6=94=B9=E6=8F=90?=
 =?UTF-8?q?=E5=8F=96=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                    |   45 +-
 config.yaml                  |   92 ++-
 json_to_excel.py             |   10 +-
 main.py                      |  123 +++-
 src/document_parser.py       |  223 +++++--
 src/json_generator.py        |    4 +-
 src/requirement_extractor.py | 1178 +++++++++++++++++++++++++---------
 src/requirement_splitter.py  |   21 +-
 src/settings.py              |  134 ++++
 9 files changed, 1427 insertions(+), 403 deletions(-)

diff --git a/README.md b/README.md
index 6b6b6e7..f37c0ad 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # SRS需求文档解析工具
 
-一个智能的SRS（软件需求规格说明书）文档解析工具，支持PDF和Docx格式，能够自动提取需求并生成结构化JSON输出。
+一个基于大模型的SRS（软件需求规格说明书）文档解析工具，支持PDF和Docx格式，能够自动提取需求并生成结构化JSON输出。
 
 ## 特性
 
@@ -12,6 +12,8 @@
 - **表格需求识别**：支持从表格中提取功能/接口/其他需求
 - **PDF表格提取**：支持从PDF中提取表格并自动挂接到章节
 - **长句原子拆分**：自动将包含多个需求点的长句拆分为多个可验证需求项
+- **章节筛选提取**：支持按章节号提取（如输入`3`提取第3章及其全部子章节）
+- **LLM-only**：当前版本仅支持LLM提取链路，不再提供规则提取模式
 
 ## 快速开始
 
@@ -27,7 +29,7 @@ pip install dashscope
 pip install pdfplumber
 ```
 
-### 配置API密钥（LLM模式）
+### 配置API密钥（必需）
 
 ```bash
 # 方式1：环境变量（推荐）
@@ -45,11 +47,11 @@ llm:
 ### 运行
 
 ```bash
-# LLM增强模式
+# LLM增强模式（唯一模式）
 python main.py -i ".\input\DC-SRS.pdf" -o ".\output\output.json"
 
-# 纯规则模式（不使用LLM）
-python main.py -i DC-SRS.pdf -o output.json --no-llm
+# 按章节提取（输入3表示提取第3章及3.x子章节）
+python main.py -i ".\input\DC-SRS.pdf" -o ".\output\output_ch3.json" --chapters 3
 ```
 
 <!-- ```bash
@@ -73,16 +75,33 @@ python -c "from src.document_parser import DocxParser; parser = DocxParser('test
 
 | 字段 | 说明 |
 |------|------|
-| **接口名称** | 接口的名称
-| **接口类型** | 接口的类型
-| **来源** | 数据或信号的来源/发送方 |
-| **目的地** | 数据或信号的目的地/接收方 |
+| **接口名称** | 接口的名称 |
+| **接口类型** | 接口的类型 |
+| **数据来源** | 数据或信号的来源/发送方 |
+| **数据目的地** | 数据或信号的目的地/接收方 |
 
-### 需求描述规则
+### 需求描述策略（LLM驱动）
 
-- **功能需求**：保持原文描述，不改写润色
-- **接口需求**：允许改写润色，确保描述清晰完整
-- **其他需求**：保持原文描述，不改写润色
+- **功能需求**：以原文为主，必要时轻微补全语义
+- **接口需求**：允许适度改写润色，并补齐接口字段
+- **其他需求**：以原文为主，避免无意义改写
+
+### 表格处理策略
+
+- **系统功能要求表、性能要求表**：默认忽略，不提取需求
+- **接口要求表**：可提取接口需求，且接口字段优先从表格列提取
+- **硬件/软件/运行环境表**：按“一表一条”生成需求，避免拆成多条
+
+### 润色约束
+
+- 除接口需求外，需求描述尽量保持原文
+- 非接口需求的润色改动上限为20个字（超限则回退原描述）
+
+## 运行约束
+
+- 必须配置可用的 `DASHSCOPE_API_KEY`（或在 `config.yaml` 中配置 `llm.api_key`）
+- 当LLM初始化失败或调用失败时，程序会直接报错退出，不会降级为规则提取
+- `--chapters` 为空时提取全量；设置为 `3` 时仅提取第3章及其子章节
 
 ## 目录结构
 
diff --git a/config.yaml b/config.yaml
index 8fa4a5c..4e2ae26 100644
--- a/config.yaml
+++ b/config.yaml
@@ -3,12 +3,12 @@
 
 # LLM配置 - 阿里云千问
 llm:
-  # 是否启用LLM（设为false则使用纯规则提取）
+  # 是否启用LLM（当前版本必须为true）
   enabled: true
   # LLM提供商：qwen（阿里云千问）
   provider: "qwen"
   # 模型名称
-  model: "qwen3-max-2026-01-23"
+  model: "glm-5"
   # API密钥（建议使用环境变量 DASHSCOPE_API_KEY）
   api_key: "sk-7097f7842f724f0c9e70c4bf3b16dacb"
   # 可选参数
@@ -48,7 +48,7 @@ extraction:
       priority: 1
     接口需求:
       prefix: "IR"
-      keywords: ["接口", "interface", "api", "外部接口", "内部接口", "CAN", "以太网", "通信"]
+      keywords: ["接口", "interface", "api", "外部接口", "内部接口", "输入输出"]
       priority: 2
     性能需求:
       prefix: "PR"
@@ -68,23 +68,105 @@ extraction:
       priority: 6
   splitter:
     enabled: true
-    max_sentence_len: 120
-    min_clause_len: 12
+    max_sentence_len: 160
+    min_clause_len: 20
+  semantic_type_policy:
+    interface_section_hints:
+      - "接口描述"
+      - "接口需求"
+      - "接口要求"
+      - "外部接口"
+      - "内部接口"
+      - "I/O"
+    interface_title_excludes:
+      - "计算机通信需求"
+      - "通信需求"
+      - "通信要求"
+    functional_section_hints:
+      - "功能需求"
+      - "功能要求"
+    other_section_hints:
+      - "安全性需求"
+      - "保密性需求"
+      - "适应性需求"
+      - "环境需求"
+      - "资源需求"
+      - "质量"
+      - "设计约束"
+      - "培训需求"
+      - "软件保障"
+      - "验收"
+      - "交付"
+      - "包装"
+      - "通信需求"
+      - "计算机通信需求"
+      - "硬件环境"
+      - "软件环境"
+      - "运行环境"
   semantic_guard:
     enabled: true
     preserve_condition_action_chain: true
     preserve_alarm_chain: true
+  system_description_hints:
+    - "系统描述"
+    - "功能描述"
+    - "概述"
+    - "示意图"
+    - "组成"
+    - "架构"
+    - "原理"
   table_strategy:
     llm_semantic_enabled: true
     sequence_table_merge: "single_requirement"
     merge_time_series_rows_min: 3
+    skip_keywords:
+      - "系统功能要求"
+      - "性能要求"
+      - "系统性能要求"
+      - "系统接口要求"
+      - "功能矩阵"
+      - "能力对照"
+      - "性能指标对照"
+    interface_keywords:
+      - "接口"
+      - "interface"
+      - "输入输出"
+      - "I/O"
+      - "数据来源"
+      - "数据目的地"
+      - "来源"
+      - "目的地"
+    single_requirement_keywords:
+      - "硬件要求"
+      - "软件要求"
+      - "运行环境"
+      - "硬件环境"
+      - "软件环境"
+      - "运行硬件环境"
+      - "运行软件环境"
+      - "环境需求"
+      - "资源需求"
+      - "计算机资源"
   rewrite_policy:
     llm_light_rewrite_enabled: true
     preserve_ratio_min: 0.65
     max_length_growth_ratio: 1.25
+    non_interface_max_edit_distance: 20
   renumber_policy:
     enabled: true
     mode: "section_continuous"
+  dedup_policy:
+    similarity_threshold: 0.88
+    enable_cross_section_dedup: true
+    prefer_text_over_table: true
+  interface_policy:
+    unknown_fallback: "未知"
+  normalization_policy:
+    ocr_spacing_normalize: true
+  fidelity_policy:
+    preserve_source_text_for_text_blocks: true
+  punctuation_policy:
+    ensure_terminal_period: true
 
 # 输出配置
 output:
diff --git a/json_to_excel.py b/json_to_excel.py
index d662b1f..1a4ba63 100644
--- a/json_to_excel.py
+++ b/json_to_excel.py
@@ -45,8 +45,8 @@ def parse_requirements_from_json(json_data, parent_section=""):
                 "需求描述": req.get("需求描述", ""),
                 "接口名称": req.get("接口名称", ""),
                 "接口类型": req.get("接口类型", ""),
-                "来源": req.get("来源", ""),
-                "目的地": req.get("目的地", "")
+                "数据来源": req.get("数据来源", ""),
+                "数据目的地": req.get("数据目的地", "")
             }
             requirements.append(req_data)
         
@@ -108,7 +108,7 @@ def create_excel(json_file, output_file):
     # 定义表头（按用户要求的顺序）
     headers = [
         "章节编号", "章节标题", "需求类型", "需求编号", "需求描述",
-        "接口名称", "接口类型", "来源", "目的地"
+        "接口名称", "接口类型", "数据来源", "数据目的地"
     ]
     
     # 写入表头
@@ -154,8 +154,8 @@ def create_excel(json_file, output_file):
         'E': 80,  # 需求描述
         'F': 25,  # 接口名称
         'G': 25,  # 接口类型
-        'H': 25,  # 来源
-        'I': 25   # 目的地
+        'H': 25,  # 数据来源
+        'I': 25   # 数据目的地
     }
     
     for col, width in column_widths.items():
diff --git a/main.py b/main.py
index 7b496c7..3649ab2 100644
--- a/main.py
+++ b/main.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 """
 SRS 解析工具 - 主程序入口
-LLM 增强版 - 默认阿里云千问大模型
 """
 
 import argparse
@@ -16,6 +15,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
 from src.utils import load_config, setup_logging, validate_file_path, ensure_directory_exists, get_env_or_config
 from src.document_parser import create_parser
+from src.document_parser import Section
 from src.requirement_extractor import RequirementExtractor
 from src.json_generator import JSONGenerator
 
@@ -34,10 +34,9 @@ def create_llm(config: dict):
     """
     llm_config = config.get('llm', {})
     
-    # 检查是否启用LLM
+    # 当前版本仅支持LLM模式
     if not llm_config.get('enabled', True):
-        logger.info("LLM已禁用，使用纯规则提取模式")
-        return None
+        raise ValueError("当前版本仅支持LLM模式，请将配置 llm.enabled 设为 true")
     
     provider = llm_config.get('provider', 'qwen')
     
@@ -45,9 +44,7 @@ def create_llm(config: dict):
     api_key = get_env_or_config('DASHSCOPE_API_KEY', llm_config.get('api_key'))
     
     if not api_key:
-        logger.warning("未配置API密钥，请使用纯规则提取模式")
-        logger.warning("请设置环境变量 DASHSCOPE_API_KEY 或在 config.yaml 中配置 llm.api_key")
-        return None
+        raise ValueError("未配置API密钥：请设置环境变量 DASHSCOPE_API_KEY 或在 config.yaml 中配置 llm.api_key")
     
     try:
         from src.llm_interface import QwenLLM
@@ -67,12 +64,80 @@ def create_llm(config: dict):
         return llm
         
     except ImportError as e:
-        logger.warning(f"无法导入LLM模块: {e}")
-        logger.warning("请运行: pip install dashscope")
-        return None
+        raise RuntimeError(f"无法导入LLM模块: {e}。请安装依赖：pip install dashscope") from e
     except Exception as e:
-        logger.warning(f"创建LLM实例失败: {e}")
-        return None
+        raise RuntimeError(f"创建LLM实例失败: {e}") from e
+
+
+def parse_chapter_selector(selector: str) -> list:
+    """Parse a selector such as "3" or "3,4.1" into a list of chapter numbers; ValueError on bad items."""
+    if not selector:
+        return []
+    chapters = [x.strip() for x in selector.split(',') if x.strip()]
+    for chapter in chapters:
+        # Only ASCII digits are allowed: str.isdigit() alone also accepts characters such as
+        # "３" or "²", which pass validation but can never match a parsed section number.
+        if not all(p and set(p) <= set("0123456789") for p in chapter.split('.')):
+            raise ValueError(f"无效章节编号: {chapter}，仅支持如 3 或 3.1 的格式")
+    return chapters
+
+
+def _clone_section_with_children(section: Section) -> Section:
+    """Return a copy of ``section`` together with copies of all of its descendants.
+
+    ``tables`` and ``blocks`` become new lists holding the same element objects.
+    """
+    clone = Section(
+        level=section.level, title=section.title, number=section.number,
+        content=section.content, uid=section.uid,
+    )
+    clone.tables, clone.blocks = list(section.tables), list(section.blocks)
+    for sub in section.children:
+        clone.add_child(_clone_section_with_children(sub))
+    return clone
+
+
+def filter_sections_by_chapters(sections: list, chapters: list) -> list:
+    """Filter the section tree by chapter-number prefix ("3" keeps 3 and every 3.x).
+
+    ``sections`` are the top-level Section objects (never mutated); ``chapters`` is a list
+    such as ["3", "4.1"], normally produced by ``parse_chapter_selector``.
+    Returns ``sections`` unchanged when ``chapters`` is empty; otherwise returns copies:
+    a matched section keeps its whole subtree, while an unmatched ancestor of a match
+    is kept only as an empty container (its own content/tables/blocks are dropped).
+    """
+    if not chapters:
+        return sections
+
+    def matched(number: str) -> bool:
+        number = (number or "").strip()
+        # Compare with a trailing "." so that "3" selects "3.1" but not "30".
+        return bool(number) and any(
+            number == chapter or number.startswith(f"{chapter}.") for chapter in chapters
+        )
+
+    def recurse(section: Section) -> Section:
+        if matched(section.number):
+            return _clone_section_with_children(section)
+
+        kept_children = [c for c in map(recurse, section.children) if c]
+        if not kept_children:
+            return None
+
+        # An unselected ancestor is only scaffolding for its selected descendants. Copying its
+        # own content/tables/blocks (as before) made e.g. "--chapters 3.1" also extract
+        # requirements from chapter 3's body, so the container is left empty.
+        container = Section(
+            level=section.level, title=section.title, number=section.number,
+            content="", uid=section.uid,
+        )
+        container.tables = []
+        container.blocks = []
+        for child in kept_children:
+            container.add_child(child)
+        return container
+
+    return [fs for fs in map(recurse, sections) if fs]
 
 
 def main():
@@ -86,7 +151,7 @@ def main():
 示例用法：
   python main.py --input sample.pdf --output output.json
   python main.py -i requirements.docx -o output.json --verbose
-  python main.py -i DC-SRS.pdf -o output.json --no-llm  # 禁用LLM
+  python main.py -i DC-SRS.pdf -o output.json
         """
     )
     
@@ -116,11 +181,12 @@ def main():
         action='store_true',
         help='输出详细日志'
     )
-    
+
     parser.add_argument(
-        '--no-llm',
-        action='store_true',
-        help='禁用LLM，使用纯规则提取'
+        '--chapters',
+        type=str,
+        default=None,
+        help='按章节提取（如: 3 或 3,4.1）；输入3表示提取第3章及其子章节'
     )
     
     # 解析命令行参数
@@ -129,10 +195,6 @@ def main():
     # 加载配置
     config = load_config(args.config)
     
-    # 命令行参数覆盖配置
-    if args.no_llm:
-        config.setdefault('llm', {})['enabled'] = False
-    
     # 设置日志
     if args.verbose:
         config.setdefault('logging', {})['level'] = 'DEBUG'
@@ -158,12 +220,9 @@ def main():
         
         logger.info(f"输出文件: {args.output}")
         
-        # 创建LLM实例
+        # 创建LLM实例（必需）
         llm = create_llm(config)
-        if llm:
-            logger.info("LLM增强模式已启用")
-        else:
-            logger.info("使用纯规则提取模式")
+        logger.info("LLM增强模式已启用")
         
         # 步骤1：解析文档
         logger.info("\n" + "=" * 60)
@@ -176,6 +235,13 @@ def main():
         
         sections = doc_parser.parse()
         document_title = doc_parser.get_document_title()
+
+        selected_chapters = parse_chapter_selector(args.chapters) if args.chapters else []
+        if selected_chapters:
+            sections = filter_sections_by_chapters(sections, selected_chapters)
+            if not sections:
+                raise ValueError(f"未匹配到指定章节: {', '.join(selected_chapters)}")
+            logger.info(f"章节筛选已启用: {', '.join(selected_chapters)}")
         
         logger.info(f"成功解析文档，提取{len(sections)}个顶级章节")
         
@@ -192,10 +258,7 @@ def main():
         
         # 步骤2：提取需求
         logger.info("\n" + "=" * 60)
-        if llm:
-            logger.info("步骤2：提取需求（LLM增强模式）")
-        else:
-            logger.info("步骤2：提取需求（规则匹配模式）")
+        logger.info("步骤2：提取需求（LLM增强模式）")
         logger.info("=" * 60)
         
         extractor = RequirementExtractor(config, llm=llm)
diff --git a/src/document_parser.py b/src/document_parser.py
index 859029a..dd4505d 100644
--- a/src/document_parser.py
+++ b/src/document_parser.py
@@ -4,7 +4,6 @@
 支持PDF和Docx格式，针对GJB438B标准SRS文档优化
 """
 
-import os
 import re
 import logging
 import importlib
@@ -119,43 +118,19 @@ class DocumentParser(ABC):
             sections: 章节列表
             parent_number: 父章节编号
         """
-        # 仅在顶级章节重编号
-        if not parent_number:
-            # 前置章节关键词（需要跳过的）
-            skip_keywords = ['目录', '封面', '扉页', '未命名', '年', '月']
-            # 正文章节关键词（遇到这些说明正文开始）
-            content_keywords = ['外部接口', '接口', '软件需求', '需求', '功能', '性能', '设计', '概述', '标识', '引言']
-            
-            start_index = 0
-            for idx, section in enumerate(sections):
-                # 优先检查是否是正文章节
-                is_content = any(kw in section.title for kw in content_keywords)
-                if is_content and section.level == 1:
-                    start_index = idx
-                    break
-            
-            # 重新编号所有章节
-            counter = 1
-            for i, section in enumerate(sections):
-                if i < start_index:
-                    # 前置章节不编号
-                    section.number = ""
-                else:
-                    # 正文章节：顶级章节从1开始编号
-                    if section.level == 1:
-                        section.number = str(counter)
-                        counter += 1
-                
-                # 递归处理子章节
-                if section.children:
-                    self._auto_number_sections(section.children, section.number)
-        else:
-            # 子章节编号
-            for i, section in enumerate(sections, 1):
-                if not section.number or self._is_chinese_number(section.number):
-                    section.generate_auto_number(parent_number, i)
-                if section.children:
-                    self._auto_number_sections(section.children, section.number)
+        if not sections:
+            return
+
+        # 仅为缺失编号的章节补号；已存在的文档原始编号必须保留。
+        sibling_index = 0
+        for section in sections:
+            has_number = bool((section.number or "").strip()) and not self._is_chinese_number(section.number)
+            if not has_number:
+                sibling_index += 1
+                section.generate_auto_number(parent_number, sibling_index)
+
+            if section.children:
+                self._auto_number_sections(section.children, section.number)
     
     def _is_chinese_number(self, text: str) -> bool:
         """检查是否是中文数字编号"""
@@ -327,8 +302,13 @@ class PDFParser(DocumentParser):
         '优先', '关键', '合格', '追踪', '注释',
         'CSCI', '计算机', '软件', '硬件', '通信', '通讯',
         '数据', '适应', '可靠', '内部', '外部',
-        '描述', '要求', '规定', '说明', '定义',
-        '电场', '防护', '装置', '控制', '监控', '显控'
+        '描述', '要求', '规定', '说明', '定义'
+    ]
+
+    TOP_LEVEL_TITLE_KEYWORDS = [
+        '范围', '标识', '概述', '引用', '文档', '需求', '接口', '性能',
+        '安全', '保密', '环境', '资源', '质量', '设计', '约束', '验收',
+        '交付', '包装', '注释'
     ]
     
     # 明显无效的章节标题模式（噪声）
@@ -411,21 +391,41 @@ class PDFParser(DocumentParser):
                     if page_idx < len(self._page_texts):
                         page_text = self._page_texts[page_idx]
 
-                    extracted_tables = page.extract_tables() or []
-                    for table_idx, table in enumerate(extracted_tables):
+                    table_objs = page.find_tables() or []
+                    if table_objs:
+                        extracted_tables = [(idx, t.extract(), t.bbox) for idx, t in enumerate(table_objs)]
+                    else:
+                        raw_tables = page.extract_tables() or []
+                        extracted_tables = [(idx, t, None) for idx, t in enumerate(raw_tables)]
+
+                    for table_idx, table, bbox in extracted_tables:
                         cleaned_table: List[List[str]] = []
                         for row in table or []:
                             cells = [re.sub(r'\s+', ' ', str(cell or '')).strip() for cell in row]
+                            # 只要存在非空单元格就保留，避免有效行被误丢弃。
                             if any(cells):
                                 cleaned_table.append(cells)
 
                         if cleaned_table:
+                            section_hint = ""
+                            if bbox:
+                                try:
+                                    top = float(bbox[1])
+                                    text_above = page.crop((0, 0, page.width, top)).extract_text() or ""
+                                    section_hint = self._find_last_section_number(text_above)
+                                except Exception:
+                                    section_hint = ""
+
+                            table_ref = self._extract_table_reference(cleaned_table)
+
                             tables.append(
                                 {
                                     "page_idx": page_idx,
                                     "table_idx": table_idx,
                                     "page_text": page_text,
                                     "data": cleaned_table,
+                                    "section_hint": section_hint,
+                                    "table_ref": table_ref,
                                 }
                             )
         except Exception as e:
@@ -435,16 +435,86 @@ class PDFParser(DocumentParser):
         logger.info(f"PDF表格提取完成，共{len(tables)}个表格")
         return tables
 
+    def _extract_table_reference(self, table: List[List[str]]) -> str:
+        """Return the table number (e.g. "3-5" for "表3-5") found in the first two rows, or ""."""
+        if not table:
+            return ""
+
+        merged = " ".join(" ".join(str(c or "") for c in row) for row in table[:2])
+        # Tolerate stray spaces inside the reference itself instead of deleting all whitespace first:
+        # deleting it fuses the number with a following numeric cell ("表3-5" + "10" -> "3-510").
+        m = re.search(r"表\s*(\d+(?:\s*[-－]\s*\d+){1,3})", merged)
+        if not m:
+            return ""
+        return re.sub(r"\s+", "", m.group(1)).replace("－", "-")
+
+    def _build_table_reference_index(self, sections: List[Section]) -> Dict[str, List[Section]]:
+        """Map each table number cited in a section's content (e.g. "3-5") to the citing sections."""
+        index: Dict[str, List[Section]] = {}
+        for section in sections:
+            # Match the raw content: stripping whitespace first glues "表3-2" to digits that follow it.
+            for m in re.finditer(r"表\s*(\d+(?:\s*[-－]\s*\d+){1,3})", section.content or ""):
+                ref = re.sub(r"\s+", "", m.group(1)).replace("－", "-")
+                index.setdefault(ref, []).append(section)
+        return index
+
+    def _find_last_section_number(self, text: str) -> str:
+        """Return the number of the last section header found in ``text``, or "" if none.
+
+        Each non-blank line is stripped and tested with ``_match_section_header``
+        (passing a fresh empty set); the first tuple element of the last match wins.
+        """
+        last_number = ""
+        for raw_line in (text or "").split("\n"):
+            candidate = raw_line.strip()
+            header = self._match_section_header(candidate, set()) if candidate else None
+            if header:
+                last_number = header[0]
+
+        return last_number
+
     def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
         """将提取出的PDF表格挂接到最匹配的章节。"""
         flat_sections = self._flatten_sections(self.sections)
         if not flat_sections:
             return
 
+        section_by_number = {
+            (s.number or "").strip(): s
+            for s in flat_sections
+            if (s.number or "").strip()
+        }
+        table_ref_index = self._build_table_reference_index(flat_sections)
+
         last_section: Optional[Section] = None
         for table in tables:
-            matched = self._match_table_section(table.get("page_text", ""), flat_sections)
-            target = matched or last_section or flat_sections[0]
+            target = None
+
+            table_ref = (table.get("table_ref") or "").strip()
+            if table_ref and table_ref in table_ref_index:
+                candidates = table_ref_index[table_ref]
+                # 同表号命中多个章节时，优先更深层章节，避免父级“汇总章节”抢占。
+                target = max(candidates, key=lambda s: (s.level, len(s.content or "")))
+
+            section_hint = (table.get("section_hint") or "").strip()
+            if not target and section_hint and section_hint in section_by_number:
+                target = section_by_number[section_hint]
+
+            if not target:
+                target = self._match_table_section(table.get("page_text", ""), flat_sections)
+
+            # 兜底优先使用上一个命中章节，避免错误挂到首章节造成跨章污染。
+            if not target:
+                target = last_section
+
+            if not target:
+                logger.warning(
+                    "未定位到表格归属章节，跳过: page=%s table=%s",
+                    table.get("page_idx", -1),
+                    table.get("table_idx", -1),
+                )
+                continue
+
             target.add_table(table["data"])
             last_section = target
 
@@ -464,7 +534,7 @@ class PDFParser(DocumentParser):
             return None
 
         matched: Optional[Section] = None
-        matched_score = -1
+        matched_score = (-1, -1)
         for section in sections:
             title = (section.title or "").strip()
             if not title:
@@ -479,7 +549,7 @@ class PDFParser(DocumentParser):
             for candidate in candidates:
                 normalized_candidate = re.sub(r"\s+", "", candidate).lower()
                 if normalized_candidate and normalized_candidate in normalized_page:
-                    score = len(normalized_candidate)
+                    score = (len(normalized_candidate), section.level)
                     if score > matched_score:
                         matched = section
                         matched_score = score
@@ -514,6 +584,7 @@ class PDFParser(DocumentParser):
         current_section = None
         content_buffer = []
         found_sections = set()
+        last_top_level_number = 0
         
         for line in lines:
             line = line.strip()
@@ -526,6 +597,22 @@ class PDFParser(DocumentParser):
             if section_info:
                 number, title = section_info
                 level = len(number.split('.'))
+                top_level_number = int(number.split('.')[0])
+
+                # 顶级章节序号大幅跳跃通常是误识别（如正文中的“8 表...”）。
+                if level == 1 and last_top_level_number and top_level_number > last_top_level_number + 1:
+                    if line and not self._is_noise(line):
+                        content_buffer.append(line)
+                    continue
+
+                # 顶级章节编号倒退通常是正文枚举项被误识别（如“1 综合监控...”）。
+                if level == 1 and last_top_level_number and top_level_number < last_top_level_number:
+                    if line and not self._is_noise(line):
+                        content_buffer.append(line)
+                    continue
+
+                if level > 6:
+                    continue
                 
                 # 保存之前章节的内容
                 if current_section and content_buffer:
@@ -540,6 +627,7 @@ class PDFParser(DocumentParser):
                 if level == 1:
                     sections.append(section)
                     section_stack = {1: section}
+                    last_top_level_number = top_level_number
                 else:
                     parent_level = level - 1
                     while parent_level >= 1 and parent_level not in section_stack:
@@ -557,6 +645,10 @@ class PDFParser(DocumentParser):
                 for l in list(section_stack.keys()):
                     if l > level:
                         del section_stack[l]
+
+                # 若出现层级跳跃（如1->3），自动回退到父级+1。
+                if level > 1 and (level - 1) not in section_stack:
+                    section.level = max(section_stack.keys()) if section_stack else 1
                 
                 current_section = section
             else:
@@ -577,13 +669,14 @@ class PDFParser(DocumentParser):
         Returns:
             (章节编号, 章节标题) 或 None
         """
-        # 模式: "3.1功能需求" 或 "3.1 功能需求"
-        match = re.match(r'^(\d+(?:\.\d+)*)\s*(.+)$', line)
+        # 模式: "3.1 功能需求" / "3.1.2 电场..."
+        match = re.match(r'^(\d+(?:\.\d+)*)[\s、.)）]*(.+)$', line)
         if not match:
             return None
         
         number = match.group(1)
         title = match.group(2).strip()
+        level = len(number.split('.'))
         
         # 排除目录行
         if '...' in title or title.count('.') > 5:
@@ -609,6 +702,18 @@ class PDFParser(DocumentParser):
         # 标题长度检查
         if len(title) > 60 or len(title) < 2:
             return None
+
+        # 过滤更像正文描述的句式。
+        if self._looks_like_statement(title):
+            return None
+
+        # 过滤疑似正文句子（含句号/分号且过长）。
+        if len(title) > 24 and re.search(r'[。；;]', title):
+            return None
+
+        # 过滤指令拼接噪声标题（逗号过多通常是正文残片）。
+        if title.count('，') >= 2 and len(title) > 20:
+            return None
         
         # 放宽标题字符要求（兼容部分PDF字体导致中文抽取异常的情况）
         if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
@@ -631,8 +736,30 @@ class PDFParser(DocumentParser):
         # 检查标题是否包含反斜杠（通常是表格噪声）
         if '\\' in title and '需求' not in title:
             return None
+
+        # 常见有效标题关键词兜底，降低正文被识别为标题的概率。
+        if not any(k in title for k in self.VALID_TITLE_KEYWORDS):
+            return None
+
+        # 顶级章节标题需符合SRS结构性关键词，避免“综合监控”“电场”等正文短语被识别。
+        if level == 1 and not any(k in title for k in self.TOP_LEVEL_TITLE_KEYWORDS):
+            return None
         
         return (number, title)
+
+    def _looks_like_statement(self, title: str) -> bool:
+        """Heuristically decide whether ``title`` reads like a body sentence, not a heading.
+
+        True when it contains any statement hint word, or when it is longer than
+        24 characters and contains one of ，。；;:： — False otherwise (including empty).
+        """
+        if not title:
+            return False
+
+        hint_words = ("应", "能够", "可以", "进行", "通过", "并", "同时", "当", "如果", "则")
+        has_hint = any(word in title for word in hint_words)
+        is_long_and_punctuated = len(title) > 24 and bool(re.search(r'[，。；;:：]', title))
+        return has_hint or is_long_and_punctuated
     
     def _is_noise(self, line: str) -> bool:
         """检查是否是噪声行"""
diff --git a/src/json_generator.py b/src/json_generator.py
index 1bc46a3..b6408b8 100644
--- a/src/json_generator.py
+++ b/src/json_generator.py
@@ -146,8 +146,8 @@ class JSONGenerator:
                 if req.type == 'interface':
                     req_dict["接口名称"] = req.interface_name
                     req_dict["接口类型"] = req.interface_type
-                    req_dict["来源"] = req.source
-                    req_dict["目的地"] = req.destination
+                    req_dict["数据来源"] = req.source
+                    req_dict["数据目的地"] = req.destination
                 result["需求列表"].append(req_dict)
         
         # 如果有子章节，添加子章节
diff --git a/src/requirement_extractor.py b/src/requirement_extractor.py
index dbfef14..f589f80 100644
--- a/src/requirement_extractor.py
+++ b/src/requirement_extractor.py
@@ -7,6 +7,7 @@
 import re
 import json
 import logging
+from difflib import SequenceMatcher
 from typing import List, Dict, Optional, Tuple, Any
 from .document_parser import Section
 from .settings import AppSettings
@@ -51,8 +52,8 @@ class Requirement:
         if self.type == 'interface':
             result["接口名称"] = self.interface_name
             result["接口类型"] = self.interface_type
-            result["来源"] = self.source
-            result["目的地"] = self.destination
+            result["数据来源"] = self.source
+            result["数据目的地"] = self.destination
         return result
     
     def __repr__(self) -> str:
@@ -94,6 +95,9 @@ class RequirementExtractor:
         for section in sections:
             self._process_section(section)
 
+        if self.settings.enable_cross_section_dedup:
+            self.requirements = self._global_deduplicate_requirements(self.requirements)
+
         # 去重后统一连续重编号，避免出现跳号。
         if self.settings.renumber_enabled:
             self.requirements = self._renumber_requirements_continuous(self.requirements)
@@ -122,29 +126,53 @@ class RequirementExtractor:
         """判断是否应该跳过此章节"""
         if self.settings.is_non_requirement_section(section.title):
             return True
+
+        if self._is_diagram_or_overview_section(section.title):
+            return True
         
         # 检查是否是系统描述章节（如3.1.1通常是系统描述）
         if self._is_system_description(section):
             return True
         
         return False
+
+    def _is_diagram_or_overview_section(self, title: str) -> bool:
+        """Tell whether a section title denotes a diagram or an overview, which is skipped.
+
+        A title containing "示意图" always qualifies; one containing "概述" qualifies
+        only when it mentions neither "需求" nor "要求". Blank titles never qualify.
+        """
+        name = (title or "").strip()
+        if "示意图" in name:
+            return True
+
+        mentions_requirement = "需求" in name or "要求" in name
+        return "概述" in name and not mentions_requirement
     
     def _is_system_description(self, section: Section) -> bool:
         """判断是否是系统描述章节（应该跳过）"""
+        title = section.title or ""
+
+        # 明确的需求语义章节优先提取，避免误判导致漏提。
+        if self._is_requirement_semantic_section(title):
+            return False
+
         # 检查标题
-        desc_keywords = ['系统描述', '功能描述', '概述', '示意图', '组成']
+        desc_keywords = self.settings.system_description_hints
         for kw in desc_keywords:
-            if kw in section.title:
+            if kw in title:
                 return True
         
-        # 使用LLM判断
-        if self.llm and section.content:
+        if not self.llm:
+            raise RuntimeError("LLM实例未初始化，当前版本仅支持LLM提取")
+
+        if section.content:
             try:
                 result = self._llm_check_system_description(section)
                 return result
             except Exception as e:
-                logger.warning(f"LLM判断失败，使用规则判断: {e}")
-        
+                raise RuntimeError(f"LLM系统描述判断失败: section={section.number} {section.title}, error={e}") from e
+
         return False
     
     def _llm_check_system_description(self, section: Section) -> bool:
@@ -163,17 +191,62 @@ class RequirementExtractor:
 回答（只需要回答"是"或"否"）："""
         
         response = self.llm.call(prompt).strip()
-        return '是' in response
+        return response.startswith("是")
+
+    def _is_requirement_semantic_section(self, title: str) -> bool:
+        """Tell whether a section title carries explicit requirement semantics."""
+        heading = title or ""
+        return (
+            self.settings.is_interface_semantic_title(title)
+            or self.settings.is_functional_semantic_title(title)
+            or self.settings.is_other_semantic_title(title)
+            or "需求" in heading or "要求" in heading
+        )
+
+    def _should_extract_text_block(self, section: Section, text: str, req_type: str) -> bool:
+        """Decide whether a text block is worth sending to requirement extraction."""
+        cleaned = (text or "").strip()
+        if len(cleaned) < 8:
+            return False
+
+        if self._is_requirement_semantic_section(section.title):
+            return True
+        # English modals are often capitalised ("SHALL"), so match them case-insensitively.
+        hard_requirement_hints = ["应", "必须", "需", "shall", "should", "不得", "支持"]
+        lowered = cleaned.lower()
+        if any(h in lowered for h in hard_requirement_hints):
+            return True
+        prompt = f"""请判断以下文本是否包含具体、可验证的软件需求。
+
+章节标题：{section.title}
+需求类型：{req_type}
+文本内容：
+{cleaned[:700]}
+
+判定规则：
+- 回答“是”：包含可执行、可测试、可验收的具体要求。
+- 回答“否”：仅为概括、背景、系统组成说明、引言或泛化描述。
+
+回答仅输出“是”或“否”。"""
+        # No keyword evidence: ask the LLM; only an answer starting with "是" counts as yes.
+        response = self.llm.call(prompt).strip()
+        return response.startswith("是")
     
     def _extract_requirements_from_section(self, section: Section) -> List[Requirement]:
         """从单个章节按文档顺序提取需求。"""
         requirements: List[Requirement] = []
+        if not self.llm:
+            raise RuntimeError("LLM实例未初始化，当前版本仅支持LLM提取")
+
         req_type = self._identify_requirement_type(section.title, section.content)
+        is_environment_section = self._is_environment_section(section.title)
+
+        if self._should_suppress_section_requirements(section, req_type):
+            return []
 
         blocks = self._iter_section_blocks(section)
         for block in blocks:
             block_type = block.get("type", "text")
-            block_order = int(block.get("order", 0))
 
             temp_section = Section(
                 level=section.level,
@@ -184,20 +257,39 @@ class RequirementExtractor:
             )
 
             if block_type == "text":
-                temp_section.content = block.get("text", "")
-                if self.llm:
-                    block_reqs = self._llm_extract_requirements(temp_section, req_type)
-                else:
-                    block_reqs = self._rule_extract_requirements(temp_section, req_type)
+                # 环境类章节以表格为准，一表一条，正文不单独拆多条。
+                if is_environment_section:
+                    continue
+                temp_section.content = self._sanitize_block_text(block.get("text", ""), section.number)
+                if len(temp_section.content.strip()) < 8:
+                    continue
+                if not self._should_extract_text_block(temp_section, temp_section.content, req_type):
+                    continue
+                block_reqs = self._llm_extract_requirements(temp_section, req_type, source_type="text")
                 table_index = -1
             else:
                 table_data = block.get("table", [])
                 temp_section.tables = [table_data] if table_data else []
                 table_index = int(block.get("table_index", -1))
-                if self.llm and self.settings.table_llm_semantic_enabled:
-                    block_reqs = self._llm_extract_table_requirements(temp_section, req_type)
+
+                # 安全/保密/适应性章节若误挂接口风格表格，直接忽略。
+                if self._should_skip_table_for_section(section.title, table_data, req_type):
+                    continue
+
+                if is_environment_section:
+                    table_mode = "single"
                 else:
-                    block_reqs = self._rule_extract_requirements(temp_section, req_type)
+                    table_mode = self._classify_table_mode(section.title, table_data, req_type)
+                if table_mode == "skip":
+                    continue
+                if table_mode == "single":
+                    block_reqs = self._llm_extract_table_as_single_requirement(temp_section, req_type)
+                else:
+                    interface_rows = self._extract_interface_rows_from_table(table_data) if table_mode == "interface" else []
+                    if self.llm and self.settings.table_llm_semantic_enabled:
+                        block_reqs = self._llm_extract_table_requirements(temp_section, req_type, interface_rows=interface_rows)
+                    else:
+                        block_reqs = self._llm_extract_requirements(temp_section, req_type, source_type="table")
 
             for req in block_reqs:
                 self._global_order += 1
@@ -205,11 +297,188 @@ class RequirementExtractor:
                 req.source_order = self._global_order
                 req.source_table_index = table_index
                 req.source_row_span = block.get("row_span", "")
-                req.description = self._maybe_light_rewrite(req.description, block_type)
+                req.description = self._maybe_light_rewrite(req.description, block_type, req.type)
+                req.description = self._clean_description(req.description)
+                if self._is_low_quality_requirement(req.description, req.type):
+                    continue
+                if req.type != 'interface' and block_type == 'text':
+                    req.description = self._snap_to_source_sentence(req.description, temp_section.content)
+                    if self._is_low_quality_requirement(req.description, req.type):
+                        continue
+                if req.type == 'interface':
+                    if self._is_generic_reference_requirement(req.description):
+                        continue
+                    req.interface_name = self._normalize_interface_field(req.interface_name)
+                    req.interface_type = self._normalize_interface_field(req.interface_type)
+                    req.source = self._normalize_interface_field(req.source)
+                    req.destination = self._normalize_interface_field(req.destination)
                 requirements.append(req)
 
         requirements = self._semantic_integrity_postprocess(requirements)
-        return self._deduplicate_requirements(requirements)
+        requirements = self._merge_fragment_requirements(requirements)
+        requirements = self._deduplicate_requirements(requirements)
+        requirements = self._drop_inferior_interface_duplicates(requirements)
+        return requirements
+
+    def _is_environment_section(self, title: str) -> bool:
+        hints = ("硬件环境", "软件环境", "运行环境", "计算机硬件", "计算机软件")  # environment-section title keywords
+        return any(hint in (title or "").strip() for hint in hints)
+
+    def _should_skip_table_for_section(self, section_title: str, table: List[List[str]], req_type: str) -> bool:
+        """Skip interface-style tables found under safety/confidentiality/adaptability sections."""
+        if not table:
+            return False
+        heading = section_title or ""
+        header_cells = " ".join(str(cell or "") for cell in table[0]).lower()
+        guarded_titles = ("安全性需求", "保密性需求", "适应性需求")
+        interface_markers = ("来源", "目的地", "标识", "数据消息", "接口")
+
+        # An interface-looking header in these sections usually means the table was mis-attached.
+        in_guarded_section = any(key in heading for key in guarded_titles)
+        looks_like_interface = any(mark in header_cells for mark in interface_markers)
+        return in_guarded_section and looks_like_interface
+
+    def _is_low_quality_requirement(self, description: str, req_type: str) -> bool:  # True => caller drops the item
+        text = (description or "").strip()
+        if len(text) < 6:
+            return True
+
+        normalized = re.sub(r"\s+", "", text)  # whitespace-free copy used for the marker checks
+        if "无。" in normalized and "否。" in normalized:
+            return True
+        bad_markers = ["无。", "否。", "是。", "无，", "否，"]
+        if any(m in normalized for m in bad_markers):
+            action_hints = ["应", "必须", "需", "支持", "通过", "提供", "实现", "具备", "监测", "控制", "接口"]
+            if not any(h in normalized for h in action_hints):
+                return True
+
+        # Noise sentences stitched from table cells, e.g. "4。xxx。MOD_XXX。否。".
+        if re.match(r"^\d+[。．、]", normalized):
+            if normalized.count("。") >= 3 and ("否。" in normalized or "无。" in normalized):
+                return True
+
+        # A high punctuation ratio (> 22%) with no action predicate is usually stitched table noise.
+        punct_count = len(re.findall(r"[，,。；;：:（）()\[\]{}]", text))
+        if punct_count / max(len(text), 1) > 0.22:
+            action_hints = ["应", "必须", "需", "支持", "通过", "提供", "实现", "具备", "监测", "控制", "接口"]
+            if not any(h in text for h in action_hints):
+                return True
+
+        # OCR-broken text (five or more single characters each followed by whitespace) is rejected.
+        if re.search(r"(?:[\u4e00-\u9fa5A-Za-z0-9]\s+){5,}", text):
+            return True
+
+        # A short "title: fragment" (< 35 chars) without a modal verb is usually truncation noise.
+        modal_hints = ["应", "必须", "需", "shall", "should"]
+        if "：" in text and len(text) < 35 and not any(h in text for h in modal_hints):
+            return True
+
+        # Typical truncated tail words on a short (< 50 chars) sentence.
+        if text.endswith(("与。", "上。", "设。")) and len(text) < 50:
+            return True
+
+        return False
+
+    def _should_suppress_section_requirements(self, section: Section, req_type: str) -> bool:
+        """Suppress extraction on parent summary sections so child-section details are kept."""
+        children = section.children
+        if not children:
+            return False
+
+        heading = section.title or ""
+        joined_child_titles = " ".join((child.title or "") for child in children)
+
+        # An interface parent defers to its "接口描述" child sections.
+        if self.settings.is_interface_semantic_title(heading) and "接口描述" in joined_child_titles:
+            return True
+        # A functional parent titled "功能要求" commonly holds only a roll-up table.
+        if req_type == "functional" and "功能要求" in heading:
+            return True
+        if req_type == "interface" or not any(k in heading for k in ("功能要求", "性能要求", "需求")):
+            return False
+
+        # Otherwise suppress only when some table's first row reads like a summary header.
+        summary_hints = ("系统功能要求", "功能要求", "性能要求", "汇总", "总表")
+        header_rows = (" ".join(str(c or "") for c in row) for table in section.tables for row in table[:1])
+        return any(hint in row_text for row_text in header_rows for hint in summary_hints)
+
+    def _is_generic_reference_requirement(self, text: str) -> bool:
+        """Filter generic interface statements that merely reference other sections."""
+        candidate = (text or "").strip()
+        if not candidate:
+            return True
+        generic_patterns = (
+            r"应满足.*章节", r"应满足本文件", r"应满足.*规定",
+            r"适用于测试类", r"合格性方法",
+        )
+        for pattern in generic_patterns:
+            if re.search(pattern, candidate) is not None:
+                return True
+        return False
+
+    def _merge_fragment_requirements(self, requirements: List[Requirement]) -> List[Requirement]:
+        """Fold short fragment requirements into the preceding item of the same section and type."""
+        if not requirements:
+            return requirements
+        trim_chars = "；;。"
+        merged: List[Requirement] = [requirements[0]]
+        for current in requirements[1:]:
+            anchor = merged[-1]
+            same_origin = anchor.section_uid == current.section_uid and anchor.type == current.type
+            if not (same_origin and self._is_fragment_requirement(current.description)):
+                merged.append(current)
+                continue
+            joined = f"{anchor.description.rstrip(trim_chars)}；{current.description.lstrip(trim_chars)}"
+            anchor.description = self._clean_description(joined)
+        return merged
+
+    def _is_fragment_requirement(self, description: str) -> bool:
+        """Tell whether a description is a short fragment rather than a standalone requirement."""
+        body = (description or "").strip()
+        # Empty or very short text (<= 18 chars) always counts as a fragment.
+        if len(body) <= 18:
+            return True
+        if len(body) > 30:
+            return False
+        strong_modal = ("应", "必须", "需", "shall", "should", "不得")
+        if any(word in body for word in strong_modal):
+            return False
+        # Mid-length text without a modal verb is a fragment when it ends in a bare noun.
+        return body.rstrip("。").endswith(("启停", "报警", "电流", "电压", "参数", "状态"))
+
+    def _sanitize_block_text(self, text: str, current_section_number: str) -> str:
+        """Run OCR-spacing normalisation, then trim text that belongs to other sections."""
+        despaced = self._normalize_ocr_spacing(text or "")
+        trimmed = self._trim_text_to_current_section(despaced, current_section_number)
+        return trimmed.strip()
+
+    def _trim_text_to_current_section(self, text: str, current_section_number: str) -> str:
+        """Cut the text at the first line that looks like another section's heading."""
+        if not text:
+            return ""
+
+        current = (current_section_number or "").strip()
+        if not current:
+            return text  # without a section number there is nothing to compare against
+
+        current_depth = current.count(".") + 1  # "3.2.1" -> depth 3
+        kept: List[str] = []
+        for raw_line in text.splitlines():
+            line = raw_line.strip()
+            if not line:
+                kept.append(raw_line)  # blank lines are kept as they are
+                continue
+            # NOTE(review): a line starting with a decimal such as "0.5 ..." also matches this pattern.
+            m = re.match(r"^(\d+(?:\.\d+){1,5})\s*[、.)）]?\s*(.+)$", line)
+            if m:
+                found_no = m.group(1)
+                found_depth = found_no.count(".") + 1
+                # Another number at the same/shallower depth, or a child of this section, ends the body.
+                if found_no != current and (found_depth <= current_depth or found_no.startswith(f"{current}.")):
+                    break
+            kept.append(raw_line)
+
+        return "\n".join(kept)
 
     def _iter_section_blocks(self, section: Section) -> List[Dict[str, Any]]:
         """返回章节中的顺序块（文本/表格）。"""
@@ -256,9 +525,125 @@ class RequirementExtractor:
             )
             fallback_order += 1
         return blocks
+
+    def _classify_table_mode(self, section_title: str, table: List[List[str]], req_type: str) -> str:
+        """Classify how a table should be handled.
+
+        Returns one of: skip | single | interface | generic.
+        """
+        if not table:
+            return "generic"
+
+        cfg = self.settings
+        haystack = f"{section_title} {' '.join(str(cell or '') for cell in table[0])}".lower()
+
+        def mentions(keywords) -> bool:
+            return any(word.lower() in haystack for word in keywords)
+
+        interface_hinted = mentions(cfg.table_interface_keywords)
+        if mentions(cfg.table_skip_keywords) and not interface_hinted:
+            return "skip"
+
+        if mentions(cfg.table_single_requirement_keywords):
+            return "single"
+
+        if req_type == "interface":
+            return "interface"
+
+        # A keyword hit alone is not enough: the section title must also read as an interface section.
+        if interface_hinted and cfg.is_interface_semantic_title(section_title):
+            return "interface"
+        return "generic"
+
+    def _extract_interface_rows_from_table(self, table: List[List[str]]) -> List[Dict[str, str]]:
+        """Read interface name/type/source/destination from each data row of an interface table."""
+        if not table or len(table) < 2:
+            return []
+
+        header = [self._clean_description(str(c or "")) for c in table[0]]
+
+        def find_col(candidates: List[str]) -> int:
+            # Candidates are ordered specific -> generic, so "接口类型" must beat a bare "类型" hit.
+            for keyword in candidates:
+                for idx, h in enumerate(header):
+                    if keyword in h:
+                        return idx
+            return -1
+
+        name_idx = find_col(["接口名称", "接口名", "名称"])
+        type_idx = find_col(["接口类型", "类型", "通信类型"])
+        source_idx = find_col(["数据来源", "来源", "发送方", "源"])
+        dst_idx = find_col(["数据目的地", "目的地", "接收方", "去向"])
+
+        rows: List[Dict[str, str]] = []
+        for row in table[1:]:
+            if not row or not any(str(c or "").strip() for c in row):
+                continue
+
+            def pick(col_idx: int) -> str:
+                if col_idx < 0 or col_idx >= len(row):
+                    return self.settings.interface_unknown_fallback
+                return self._normalize_interface_field(row[col_idx])
+
+            rows.append({
+                "interface_name": pick(name_idx),
+                "interface_type": pick(type_idx),
+                "source": pick(source_idx),
+                "destination": pick(dst_idx),
+            })
+
+        return rows
+
+    def _llm_extract_table_as_single_requirement(self, section: Section, req_type: str) -> List[Requirement]:
+        """Extract a hardware/software/runtime-environment table as one requirement per table."""
+        if not section.tables:
+            return []
+
+        table_text = self._format_tables_for_prompt(section.tables)
+        prompt = f"""请将下列表格合并为一条完整需求。
+
+章节标题：{section.title}
+表格内容：
+{table_text}
+
+要求：
+1. 仅输出1条需求。
+2. 保留关键配置项、数值、阈值、版本信息。
+3. 使用原文措辞，尽量不改写。
+
+输出JSON：
+{{
+  "requirements": [
+    {{"req_id": "可为空", "description": "一条完整需求"}}
+  ]
+}}"""
+        # A missing or empty "requirements" list in the parsed reply yields no requirement.
+        response = self.llm.call(prompt)
+        data = self._parse_llm_json_response(response)
+        if not data or not isinstance(data.get("requirements"), list) or not data["requirements"]:
+            return []
+        # Only the first returned item is used, whatever the model sends back.
+        req_data = data["requirements"][0]
+        desc = self._clean_description(req_data.get("description", ""))
+        if not desc:
+            return []
+        # The index is fixed at 1 because a table yields at most one requirement here.
+        doc_req_id = self._normalize_req_id(req_data.get("req_id", ""))
+        req_id = self._generate_requirement_id(req_type, section.number, 1, doc_req_id, "")
+        return [
+            Requirement(
+                req_id=req_id,
+                description=desc,
+                req_type=req_type,
+                section_number=section.number,
+                section_title=section.title,
+                section_uid=section.uid,
+                source_type="table",
+            )
+        ]
     
-    def _llm_extract_requirements(self, section: Section, req_type: str) -> List[Requirement]:
-        """使用LLM提取需求"""
+    def _llm_extract_requirements(self, section: Section, req_type: str, source_type: str = "text") -> List[Requirement]:
+        """使用LLM提取需求。source_type: text|table"""
         requirements = []
         
         content_text = section.content or ""
@@ -266,10 +651,19 @@ class RequirementExtractor:
         if len(content_text.strip()) < 8 and not table_text:
             return requirements
         
+        is_text_source = source_type == "text"
+
         # 根据需求类型构建不同的提示词
         if req_type == 'interface':
-            # 接口需求：允许改写润色，并提取接口详细信息
-            prompt = f"""请从以下SRS文档章节中提取具体的接口需求，并对需求描述进行改写润色。同时智能识别每个接口的详细信息。
+            if is_text_source:
+                if self.settings.preserve_source_text_for_text_blocks:
+                    rewrite_rule = "需求描述必须尽量保持原文句式，不得润色改写，只允许去除换行、编号前缀和明显OCR噪声。"
+                else:
+                    rewrite_rule = "可在不改变语义和数值的前提下做轻微整理，使句子完整清晰。"
+            else:
+                rewrite_rule = "可在不改变语义和数值的前提下做轻微整理，使句子完整清晰。"
+
+            prompt = f"""请从以下SRS文档章节中提取具体的接口需求，并智能识别每个接口的详细信息。
 
 章节编号：{section.number}
 章节标题：{section.title}
@@ -283,14 +677,16 @@ class RequirementExtractor:
 1. 只提取具体的、可验证的接口需求
 2. 不要提取系统描述、背景说明等非需求内容
 3. 去除原文中的换行符、表格格式噪声
-4. 对提取的需求描述进行改写润色，使其更加清晰完整
+4. {rewrite_rule}
 5. 每条需求应该是完整的句子，描述清楚接口规范
 6. 如果有多条需求，请分别列出
-7. 对于每条接口需求，请智能识别以下信息：
+6.1 不要输出“a）/b）/1）”等前缀编号
+6.2 不要输出短语碎片（如“左舷艏侧推启停”），应并入主需求句
+7. 对于每条接口需求，请智能识别以下信息（若表格中存在对应列，优先按表格填写）：
    - interface_name: 接口名称
    - interface_type: 接口类型 （如：CAN接口、以太网接口、串口等）
-   - source: 来源/发送方（数据或信号从哪里来）
-   - destination: 目的地/接收方（数据或信号发送到哪里）
+   - source: 数据来源/发送方（数据或信号从哪里来）
+   - destination: 数据目的地/接收方（数据或信号发送到哪里）
 8. 如果某个字段无法从文本中识别，请填写"未知"
 9. 若原文给出需求编号，请优先使用原文编号（req_id）
 
@@ -302,8 +698,8 @@ class RequirementExtractor:
             "description": "接口需求描述",
             "interface_name": "接口名称",
             "interface_type": "接口类型",
-            "source": "来源",
-            "destination": "目的地"
+            "source": "数据来源",
+            "destination": "数据目的地"
         }}
     ]
 }}
@@ -313,8 +709,17 @@ class RequirementExtractor:
 
 JSON输出："""
         else:
-            # 功能需求、其他需求：以原文为主，允许轻微扩写补全
-            prompt = f"""请从以下SRS文档章节中提取具体的软件需求。以原文为主，允许轻微扩写补全语义。
+            if is_text_source:
+                if self.settings.preserve_source_text_for_text_blocks:
+                    source_rule = "需求描述必须保持原文，不得润色改写；仅允许移除换行、表格线噪声和列表编号。"
+                else:
+                    source_rule = "需求描述以原文为主，可做轻微重组以形成完整句子。"
+                split_rule = "仅在语义完全独立且原文可明确拆分时拆分，禁止将同一主句拆成碎片。"
+            else:
+                source_rule = "需求描述以原文为主，可做轻微重组以形成完整句子。"
+                split_rule = "仅在语义独立可验证时拆分。"
+
+            prompt = f"""请从以下SRS文档章节中提取具体的软件需求。
 
 章节编号：{section.number}
 章节标题：{section.title}
@@ -327,16 +732,18 @@ JSON输出："""
 提取要求：
 1. 同时提取正文与表格中的具体、可验证的软件需求
 2. 不要提取系统描述、背景说明等非需求内容
-3. 需求描述应保留原文大部分词语（建议保留率>=70%），仅做轻微补充以增强语义完整性
+3. {source_rule}
 4. 严禁改变任何数值、阈值、状态名、信号名和逻辑条件
 5. 去除原文中的多余换行符和表格格式符号，但保留语句内容
 5. 每条需求应该是完整的句子
 6. 如果有多条需求，请分别列出
-7. 如果一段需求描述内有多条需求点，必须拆分成多个独立需求项
-8. 拆分判定：出现“并/并且/同时/然后/且/以及”，或一条句子中出现多个动作（如判断+监测+发送）时必须拆分
-9. 每条需求尽量满足“单一动作、可单独验证”
-8. 过滤重复或过于相似的需求，只保留独特的需求
-9. 若原文给出需求编号，请优先使用原文编号（req_id）
+6.1 去除“a）/b）/1）”等编号前缀
+7. 如果一段需求描述内有多条需求点，按规则判断是否拆分
+8. 拆分判定：{split_rule} 条件-动作链、并列限定语、因果承接不应强拆
+8.1 对“具体包括/其中包括”后的短项，不要单独成条，必须并入主句
+9. 优先保持需求语义完整，避免过度拆分导致碎片化
+10. 过滤重复或过于相似的需求，只保留独特的需求
+11. 若原文给出需求编号，请优先使用原文编号（req_id）
 
 请以JSON格式输出，格式如下：
 {{
@@ -401,10 +808,18 @@ JSON输出："""
                             source = ""
                             destination = ""
                             if req_type == 'interface':
-                                interface_name = req_data.get('interface_name', '未知').strip()
-                                interface_type = req_data.get('interface_type', '未知').strip()
-                                source = req_data.get('source', '未知').strip()
-                                destination = req_data.get('destination', '未知').strip()
+                                interface_name = self._normalize_interface_field(
+                                    req_data.get('interface_name', self.settings.interface_unknown_fallback)
+                                )
+                                interface_type = self._normalize_interface_field(
+                                    req_data.get('interface_type', self.settings.interface_unknown_fallback)
+                                )
+                                source = self._normalize_interface_field(
+                                    req_data.get('source', self.settings.interface_unknown_fallback)
+                                )
+                                destination = self._normalize_interface_field(
+                                    req_data.get('destination', self.settings.interface_unknown_fallback)
+                                )
 
                             req = Requirement(
                                 req_id=req_id,
@@ -420,59 +835,23 @@ JSON输出："""
                             )
                             requirements.append(req)
         except Exception as e:
-            logger.warning(f"LLM提取需求失败: {e}，使用规则提取")
-            return self._rule_extract_requirements(section, req_type)
+            raise RuntimeError(
+                f"LLM提取需求失败: section={section.number} {section.title}, error={e}"
+            ) from e
         
         return requirements
 
-    def _build_table_requirements_rule(self, section: Section, req_type: str, start_index: int) -> List[Requirement]:
-        """仅从表格构建规则需求，用于LLM模式补充召回。"""
-        requirements: List[Requirement] = []
-        table_requirements = self._extract_requirements_from_tables_rule(section.tables)
-        if not table_requirements:
-            return requirements
-
-        parent_req_id = ""
-        complete_id_pattern = r'^[A-Za-z0-9]{2,10}[-_].+$'
-        for temp_id, _ in table_requirements:
-            if temp_id and re.match(complete_id_pattern, temp_id):
-                parent_req_id = temp_id.replace('_', '-')
-                break
-
-        index = start_index
-        for doc_req_id, desc in table_requirements:
-            split_descs = self._split_requirement_description(desc)
-            if not split_descs:
-                split_descs = [desc]
-
-            for split_idx, split_desc in enumerate(split_descs, 1):
-                req_id = self._generate_requirement_id(
-                    req_type=req_type,
-                    section_number=section.number,
-                    index=index,
-                    doc_req_id=doc_req_id,
-                    parent_req_id=parent_req_id,
-                    split_index=split_idx,
-                    split_total=len(split_descs),
-                )
-                requirements.append(
-                    Requirement(
-                        req_id=req_id,
-                        description=split_desc,
-                        req_type=req_type,
-                        section_number=section.number,
-                        section_title=section.title,
-                        section_uid=section.uid,
-                    )
-                )
-            index += 1
-
-        return requirements
-
-    def _llm_extract_table_requirements(self, section: Section, req_type: str) -> List[Requirement]:
+    def _llm_extract_table_requirements(
+        self,
+        section: Section,
+        req_type: str,
+        interface_rows: Optional[List[Dict[str, str]]] = None,
+    ) -> List[Requirement]:
         """使用LLM语义化提取表格需求。"""
-        if not self.llm or not section.tables:
-            return self._rule_extract_requirements(section, req_type)
+        if not self.llm:
+            raise RuntimeError("LLM实例未初始化，当前版本仅支持LLM提取")
+        if not section.tables:
+            return []
 
         table = section.tables[0]
         is_sequence_table = self._is_time_series_table(table)
@@ -514,6 +893,11 @@ JSON输出："""
                         continue
                     doc_req_id = self._normalize_req_id(req_data.get("req_id", ""))
                     req_id = self._generate_requirement_id(req_type, section.number, i, doc_req_id, "")
+
+                    row_info = {}
+                    if req_type == "interface" and interface_rows and i - 1 < len(interface_rows):
+                        row_info = interface_rows[i - 1]
+
                     requirements.append(
                         Requirement(
                             req_id=req_id,
@@ -522,26 +906,75 @@ JSON输出："""
                             section_number=section.number,
                             section_title=section.title,
                             section_uid=section.uid,
+                            interface_name=row_info.get("interface_name", self.settings.interface_unknown_fallback) if req_type == 'interface' else "",
+                            interface_type=row_info.get("interface_type", self.settings.interface_unknown_fallback) if req_type == 'interface' else "",
+                            source=row_info.get("source", self.settings.interface_unknown_fallback) if req_type == 'interface' else "",
+                            destination=row_info.get("destination", self.settings.interface_unknown_fallback) if req_type == 'interface' else "",
                             source_type="table",
                         )
                     )
 
+            # 小型表格兜底：若LLM漏行，则将剩余有效行补齐为需求，避免“6行只提4行”。
+            expected_rows = [
+                [self._clean_description(str(c or "")) for c in row]
+                for row in (table[1:] if len(table) > 1 else table)
+                if row and any(str(c or "").strip() for c in row)
+            ]
+            if req_type in {"functional", "other"} and expected_rows and len(expected_rows) <= 12 and len(requirements) < len(expected_rows):
+                next_index = len(requirements) + 1
+                for row in expected_rows[len(requirements):]:
+                    row_desc = self._clean_description("，".join([c for c in row if c]))
+                    if not row_desc or len(row_desc) < 6:
+                        continue
+
+                    # 避免补齐时引入明显重复。
+                    duplicated = any(
+                        SequenceMatcher(None, self._normalize_text_for_dedup(row_desc), self._normalize_text_for_dedup(r.description)).ratio() >= 0.92
+                        for r in requirements
+                    )
+                    if duplicated:
+                        continue
+
+                    req_id = self._generate_requirement_id(req_type, section.number, next_index, "", "")
+                    requirements.append(
+                        Requirement(
+                            req_id=req_id,
+                            description=row_desc,
+                            req_type=req_type,
+                            section_number=section.number,
+                            section_title=section.title,
+                            section_uid=section.uid,
+                            source_type="table",
+                        )
+                    )
+                    next_index += 1
+
             if not requirements:
-                return self._rule_extract_requirements(section, req_type)
+                logger.warning(
+                    "LLM表格提取未产出需求: section=%s %s",
+                    section.number,
+                    section.title,
+                )
+                return []
             return requirements
         except Exception as e:
-            logger.warning(f"LLM表格语义化提取失败，回退规则模式: {e}")
-            return self._rule_extract_requirements(section, req_type)
+            raise RuntimeError(
+                f"LLM表格语义化提取失败: section={section.number} {section.title}, error={e}"
+            ) from e
 
-    def _maybe_light_rewrite(self, description: str, source_type: str) -> str:
+    def _maybe_light_rewrite(self, description: str, source_type: str, req_type: str) -> str:
         """仅在LLM模式做轻微扩写，且通过保真校验。"""
         description = self._clean_description(description)
         if not description:
             return description
+        if req_type != "interface":
+            return description
+        if source_type != "table":
+            return description
         if not self.llm or not self.settings.llm_light_rewrite_enabled:
             return description
 
-        need_rewrite = source_type == "table" or len(description) < 28
+        need_rewrite = len(description) < 28
         if not need_rewrite:
             return description
 
@@ -572,6 +1005,33 @@ JSON输出："""
         except Exception:
             return description
 
+    def _edit_distance(self, a: str, b: str) -> int:
+        """Return the Levenshtein edit distance between ``a`` and ``b``.
+
+        Only two DP rows are kept, so memory is O(len(b)) instead of the
+        O(len(a) * len(b)) needed by a full matrix.
+        """
+        if a == b:
+            return 0
+        if not a or not b:
+            return max(len(a), len(b))
+
+        # prev[j] holds the distance between the processed prefix of a and b[:j].
+        prev = list(range(len(b) + 1))
+        for i, ch_a in enumerate(a, 1):
+            curr = [i]
+            for j, ch_b in enumerate(b, 1):
+                curr.append(
+                    min(
+                        prev[j] + 1,  # deletion
+                        curr[j - 1] + 1,  # insertion
+                        prev[j - 1] + (ch_a != ch_b),  # substitution
+                    )
+                )
+            prev = curr
+
+        return prev[-1]
+
     def _calculate_preserve_ratio(self, original: str, rewritten: str) -> float:
         original_tokens = [c for c in re.sub(r"\s+", "", original) if c]
         rewritten_tokens = set(c for c in re.sub(r"\s+", "", rewritten) if c)
@@ -642,121 +1102,6 @@ JSON输出："""
 
         return ordered
     
-    def _rule_extract_requirements(self, section: Section, req_type: str) -> List[Requirement]:
-        """使用规则提取需求（备用方法）"""
-        requirements = []
-        content = section.content
-        
-        # 正文需求
-        descriptions = []
-        if content and len(content.strip()) >= 8:
-            descriptions = self._extract_list_items(content)
-            
-            if not descriptions:
-                # 如果没有列表项，将整个内容作为一个需求
-                desc = self._clean_description(content)
-                if len(desc) > 5 and not section.tables:
-                    descriptions = [f"{section.title}：{desc}"]
-        
-        # 表格需求
-        table_requirements = self._extract_requirements_from_tables_rule(section.tables)
-        
-        # 查找父需求编号（第一个合法完整编号）
-        parent_req_id = ""
-        complete_id_pattern = r'^[A-Za-z0-9]{2,10}[-_].+$'
-        for desc in descriptions:
-            temp_id, _ = self._extract_requirement_id_from_text(desc)
-            # 验证是否为合法的完整编号格式
-            if temp_id and re.match(complete_id_pattern, temp_id):
-                parent_req_id = temp_id.replace('_', '-')
-                break
-        if not parent_req_id:
-            for temp_id, _ in table_requirements:
-                # 验证是否为合法的完整编号格式
-                if temp_id and re.match(complete_id_pattern, temp_id):
-                    parent_req_id = temp_id.replace('_', '-')
-                    break
-        
-        index = 1
-        for desc in descriptions:
-            desc = self._clean_description(desc)
-            if len(desc) > 5:
-                doc_req_id, cleaned_desc = self._extract_requirement_id_from_text(desc)
-                split_descs = self._split_requirement_description(cleaned_desc)
-                if not split_descs:
-                    split_descs = [cleaned_desc]
-
-                for split_idx, split_desc in enumerate(split_descs, 1):
-                    req_id = self._generate_requirement_id(
-                        req_type,
-                        section.number,
-                        index,
-                        doc_req_id,
-                        parent_req_id,
-                        split_idx,
-                        len(split_descs),
-                    )
-                    req = Requirement(
-                        req_id=req_id,
-                        description=split_desc,
-                        req_type=req_type,
-                        section_number=section.number,
-                        section_title=section.title,
-                        section_uid=section.uid
-                    )
-                    requirements.append(req)
-                index += 1
-        
-        for doc_req_id, desc in table_requirements:
-            split_descs = self._split_requirement_description(desc)
-            if not split_descs:
-                split_descs = [desc]
-
-            for split_idx, split_desc in enumerate(split_descs, 1):
-                req_id = self._generate_requirement_id(
-                    req_type,
-                    section.number,
-                    index,
-                    doc_req_id,
-                    parent_req_id,
-                    split_idx,
-                    len(split_descs),
-                )
-                req = Requirement(
-                    req_id=req_id,
-                    description=split_desc,
-                    req_type=req_type,
-                    section_number=section.number,
-                    section_title=section.title,
-                    section_uid=section.uid
-                )
-                requirements.append(req)
-            index += 1
-        
-        return requirements
-    
-    def _extract_list_items(self, content: str) -> List[str]:
-        """提取列表项"""
-        items = []
-        
-        # 模式1: a) b) c) 或 1) 2) 3)
-        patterns = [
-            r'([a-z][\)）])\s*(.+?)(?=[a-z][\)）]|$)',
-            r'(\d+[\)）])\s*(.+?)(?=\d+[\)）]|$)',
-            r'([①②③④⑤⑥⑦⑧⑨⑩])\s*(.+?)(?=[①②③④⑤⑥⑦⑧⑨⑩]|$)'
-        ]
-        
-        for pattern in patterns:
-            matches = re.findall(pattern, content, re.DOTALL)
-            if matches:
-                for marker, text in matches:
-                    text = text.strip()
-                    if text and len(text) > 5:
-                        items.append(text)
-                break
-        
-        return items
-    
     def _identify_requirement_type(self, title: str, content: str) -> str:
         """
         通过标题和内容识别需求类型
@@ -803,22 +1148,93 @@ JSON输出："""
     def _normalize_req_id(self, req_id: str) -> str:
         """规范化需求编号"""
         return self.id_generator.normalize(req_id)
+
+    def _normalize_ocr_spacing(self, text: str) -> str:
+        """归一化OCR导致的词内断裂空格。"""
+        normalized = str(text or "")
+        if not getattr(self.settings, "ocr_spacing_normalize", True):
+            return normalized
+
+        normalized = normalized.replace("\u3000", " ").replace("\xa0", " ")
+        normalized = re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", normalized)
+        normalized = re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[A-Za-z0-9])", "", normalized)
+        normalized = re.sub(r"(?<=[A-Za-z0-9])\s+(?=[\u4e00-\u9fff])", "", normalized)
+        return normalized
     
     def _clean_description(self, text: str) -> str:
         """清理需求描述"""
+        text = self._normalize_ocr_spacing(text)
         # 替换换行符为空格
         text = re.sub(r'\n+', ' ', text)
         # 替换多个空格为单个空格
         text = re.sub(r'\s+', ' ', text)
         # 去除表格噪声
         text = re.sub(r'[\|│┃]+', ' ', text)
+        # 去除前缀枚举编号（如a）/b）/1）/①））但保留正文语义。
+        text = self._strip_leading_enumeration(text)
+        text = self._normalize_ocr_spacing(text)
         # 去除首尾空白
         text = text.strip()
+        text = text.rstrip("；;，,")
         # 限制长度
         if len(text) > 1000:
             text = text[:1000] + '...'
+        if self.settings.ensure_terminal_period:
+            text = self._ensure_terminal_period(text)
         return text
 
+    def _strip_leading_enumeration(self, text: str) -> str:
+        cleaned = (text or "").strip()
+        patterns = [
+            r"^\d+\s*(?=[A-Za-z][\)）])",
+            r"^(?:[A-Za-z]|\d+|[①②③④⑤⑥⑦⑧⑨⑩]|[一二三四五六七八九十]+)\s*[\)）、:]\s*",
+            r"^(?:[A-Za-z]|\d+|[①②③④⑤⑥⑦⑧⑨⑩]|[一二三四五六七八九十]+)\s*\.\s*(?!\d)",
+        ]
+        for p in patterns:
+            cleaned = re.sub(p, "", cleaned)
+        return cleaned
+
+    def _ensure_terminal_period(self, text: str) -> str:
+        if not text:
+            return text
+        if text.endswith(("。", "！", "？")):
+            return text
+        return text + "。"
+
+    def _normalize_interface_field(self, value: Any) -> str:
+        text = self._normalize_ocr_spacing(str(value or "")).strip()
+        if not text:
+            return self.settings.interface_unknown_fallback
+        if text in {"N/A", "NA", "None", "null", "NULL", "-", "--", "未知"}:
+            return self.settings.interface_unknown_fallback
+        text = text.rstrip("；;，,。")
+        return text if text else self.settings.interface_unknown_fallback
+
+    def _snap_to_source_sentence(self, description: str, source_text: str) -> str:
+        """将非接口需求尽量贴合回原文句子。"""
+        source_text = self._normalize_ocr_spacing(source_text or "").strip()
+        if not description or not source_text:
+            return description
+
+        if description in source_text:
+            return description
+
+        normalized_desc = self._normalize_ocr_spacing(description)
+
+        candidates = [
+            self._clean_description(x)
+            for x in re.split(r"[\n。；;]", source_text)
+            if self._clean_description(x)
+        ]
+        if not candidates:
+            return description
+
+        best = min(candidates, key=lambda c: self._edit_distance(normalized_desc, self._normalize_ocr_spacing(c)))
+        dist = self._edit_distance(normalized_desc, self._normalize_ocr_spacing(best))
+        if dist <= self.settings.non_interface_max_edit_distance:
+            return best
+        return description
+
     def _format_tables_for_prompt(self, tables: List[List[List[str]]]) -> str:
         """格式化表格内容用于LLM提示词"""
         if not tables:
@@ -845,6 +1261,8 @@ JSON输出："""
     def _split_requirement_description(self, text: str) -> List[str]:
         if not text:
             return []
+        if any(h in text for h in ["具体包括", "其中包括", "包括但不限于", "主要包括"]):
+            return [text]
         if "时间序列" in text and "执行指令" in text:
             return [text]
         if not self.splitter:
@@ -852,72 +1270,266 @@ JSON输出："""
         return self.splitter.split(text)
 
     def _deduplicate_requirements(self, requirements: List[Requirement]) -> List[Requirement]:
-        seen = set()
         deduped: List[Requirement] = []
+        threshold = self.settings.dedup_similarity_threshold
         for req in requirements:
-            normalized_desc = re.sub(r'\s+', ' ', req.description).strip().lower()
-            key = (req.type, normalized_desc)
-            if key in seen:
-                continue
-            seen.add(key)
-            deduped.append(req)
+            normalized_desc = self._normalize_text_for_dedup(req.description)
+            drop = False
+            for kept in deduped:
+                if kept.type != req.type:
+                    continue
+
+                # 接口需求优先按结构化键去重（同章节同接口同源同目的）。
+                if req.type == "interface" and kept.section_uid == req.section_uid:
+                    if self._interface_dedup_key(kept) == self._interface_dedup_key(req):
+                        # 优先保留字段更完整的条目。
+                        if self._interface_field_completeness(req) > self._interface_field_completeness(kept):
+                            kept.interface_name = req.interface_name
+                            kept.interface_type = req.interface_type
+                            kept.source = req.source
+                            kept.destination = req.destination
+                            kept.description = req.description
+                            kept.source_type = req.source_type
+                            kept.source_order = req.source_order
+                        drop = True
+                        break
+
+                    # 同一来源与目的地的接口需求若语义高度重合，保留更完整一条。
+                    kept_src = self._normalize_text_for_dedup(kept.source)
+                    req_src = self._normalize_text_for_dedup(req.source)
+                    kept_dst = self._normalize_text_for_dedup(kept.destination)
+                    req_dst = self._normalize_text_for_dedup(req.destination)
+                    unknown = self._normalize_text_for_dedup(self.settings.interface_unknown_fallback)
+                    if kept_src and req_src and kept_dst and req_dst and kept_src == req_src and kept_dst == req_dst:
+                        if kept_src != unknown and kept_dst != unknown:
+                            name_sim = SequenceMatcher(
+                                None,
+                                self._normalize_text_for_dedup(kept.interface_name),
+                                self._normalize_text_for_dedup(req.interface_name),
+                            ).ratio()
+                            sim_sd = SequenceMatcher(
+                                None,
+                                normalized_desc,
+                                self._normalize_text_for_dedup(kept.description),
+                            ).ratio()
+                            if sim_sd >= 0.72 or name_sim >= 0.72:
+                                if self._interface_field_completeness(req) >= self._interface_field_completeness(kept):
+                                    kept.interface_name = req.interface_name
+                                    kept.interface_type = req.interface_type
+                                    kept.source = req.source
+                                    kept.destination = req.destination
+                                    if len(req.description) >= len(kept.description):
+                                        kept.description = req.description
+                                    kept.source_type = req.source_type
+                                    kept.source_order = min(kept.source_order, req.source_order)
+                                drop = True
+                                break
+
+                sim = SequenceMatcher(
+                    None,
+                    normalized_desc,
+                    self._normalize_text_for_dedup(kept.description),
+                ).ratio()
+                if sim >= threshold:
+                    # 同章节优先保留正文块。
+                    if (
+                        self.settings.prefer_text_over_table
+                        and kept.section_uid == req.section_uid
+                        and kept.source_type == "table"
+                        and req.source_type == "text"
+                    ):
+                        kept.description = req.description
+                        kept.source_type = req.source_type
+                        kept.source_order = req.source_order
+                    drop = True
+                    break
+            if not drop:
+                deduped.append(req)
         return deduped
 
-    def _extract_requirements_from_tables_rule(self, tables: List[List[List[str]]]) -> List[Tuple[Optional[str], str]]:
-        """从表格中提取需求（规则方式）"""
-        results = []
-        if not tables:
-            return results
-        
-        id_keywords = ['需求编号', '编号', '序号', 'id', 'ID']
-        desc_keywords = ['需求', '描述', '内容', '说明', '要求']
-        
-        for table in tables:
-            if not table:
+    def _interface_dedup_key(self, req: Requirement) -> Tuple[str, str, str, str]:
+        return (
+            self._normalize_text_for_dedup(req.interface_name),
+            self._normalize_text_for_dedup(req.interface_type),
+            self._normalize_text_for_dedup(req.source),
+            self._normalize_text_for_dedup(req.destination),
+        )
+
+    def _interface_field_completeness(self, req: Requirement) -> int:
+        fields = [req.interface_name, req.interface_type, req.source, req.destination]
+        score = 0
+        for f in fields:
+            value = (f or "").strip()
+            if value and value != self.settings.interface_unknown_fallback:
+                score += 1
+        return score
+
+    def _drop_inferior_interface_duplicates(self, requirements: List[Requirement]) -> List[Requirement]:
+        """同章节同接口名/来源/目的地重复时，保留字段更完整条目。"""
+        if not requirements:
+            return requirements
+
+        grouped: Dict[Tuple[str, str, str, str], List[Requirement]] = {}
+        others: List[Requirement] = []
+        for req in requirements:
+            if req.type != "interface":
+                others.append(req)
                 continue
 
-            if self._is_time_series_table(table) and self.settings.sequence_table_merge == "single_requirement":
-                merged_desc = self._build_sequence_table_requirement(table)
-                if merged_desc:
-                    results.append((None, merged_desc))
+            key = (
+                req.section_uid or req.section_number or "",
+                self._normalize_text_for_dedup(req.interface_name),
+                self._normalize_text_for_dedup(req.source),
+                self._normalize_text_for_dedup(req.destination),
+            )
+            grouped.setdefault(key, []).append(req)
+
+        kept_interfaces: List[Requirement] = []
+        for group in grouped.values():
+            if len(group) == 1:
+                kept_interfaces.append(group[0])
                 continue
 
-            header = table[0] if table else []
-            header_lower = [h.lower() for h in header]
-            id_idx = None
-            desc_idx = None
-            for i, h in enumerate(header_lower):
-                if any(k.lower() in h for k in id_keywords):
-                    id_idx = i
-                if any(k.lower() in h for k in desc_keywords):
-                    desc_idx = i
-            
-            start_row = 1 if (id_idx is not None or desc_idx is not None) else 0
-            for row in table[start_row:]:
-                if not row:
+            best = group[0]
+            for cand in group[1:]:
+                best_score = self._interface_field_completeness(best)
+                cand_score = self._interface_field_completeness(cand)
+                if cand_score > best_score:
+                    best = cand
                     continue
-                row = [self._clean_description(cell) for cell in row]
-                if not any(row):
+                if cand_score == best_score and len(cand.description) > len(best.description):
+                    best = cand
                     continue
-                
-                req_id = None
-                desc = ""
-                if id_idx is not None and id_idx < len(row):
-                    req_id = self._normalize_req_id(row[id_idx])
-                if desc_idx is not None and desc_idx < len(row):
-                    desc = row[desc_idx]
-                if not desc:
-                    # 如果无明确描述列，拼接整行作为描述
-                    desc = " | ".join([cell for cell in row if cell])
-                
-                # 若描述里包含编号，尝试再次提取
-                if not req_id:
-                    req_id, desc = self._extract_requirement_id_from_text(desc)
-                
-                if desc and len(desc) > 5:
-                    results.append((req_id, desc))
-        
-        return results
+                if cand_score == best_score and len(cand.description) == len(best.description):
+                    if cand.source_order < best.source_order:
+                        best = cand
+            kept_interfaces.append(best)
+
+        # 第二阶段：同章节同来源/目的地下，删除与“已知类型”高度相似的“未知类型”重复项。
+        by_section_src_dst: Dict[Tuple[str, str, str], List[Requirement]] = {}
+        for req in kept_interfaces:
+            key = (
+                req.section_uid or req.section_number or "",
+                self._normalize_text_for_dedup(req.source),
+                self._normalize_text_for_dedup(req.destination),
+            )
+            by_section_src_dst.setdefault(key, []).append(req)
+
+        pruned_interfaces: List[Requirement] = []
+        unknown_norm = self._normalize_text_for_dedup(self.settings.interface_unknown_fallback)
+        for key, group in by_section_src_dst.items():
+            known = [
+                r for r in group
+                if self._normalize_text_for_dedup(r.interface_type) != unknown_norm
+            ]
+            if not known:
+                pruned_interfaces.extend(group)
+                continue
+
+            for cand in group:
+                cand_type = self._normalize_text_for_dedup(cand.interface_type)
+                if cand_type != unknown_norm:
+                    pruned_interfaces.append(cand)
+                    continue
+
+                should_drop = False
+                for k in known:
+                    name_sim = SequenceMatcher(
+                        None,
+                        self._normalize_text_for_dedup(cand.interface_name),
+                        self._normalize_text_for_dedup(k.interface_name),
+                    ).ratio()
+                    desc_sim = SequenceMatcher(
+                        None,
+                        self._normalize_text_for_dedup(cand.description),
+                        self._normalize_text_for_dedup(k.description),
+                    ).ratio()
+                    if name_sim >= 0.60 or desc_sim >= 0.60:
+                        should_drop = True
+                        break
+
+                if not should_drop:
+                    pruned_interfaces.append(cand)
+
+        merged = others + pruned_interfaces
+        return sorted(merged, key=lambda r: (r.source_order, r.section_number or ""))
+
+    def _normalize_text_for_dedup(self, text: str) -> str:
+        normalized = self._normalize_ocr_spacing(text or "").strip().lower()
+        normalized = normalized.translate(str.maketrans({
+            "，": ",",
+            "；": ";",
+            "：": ":",
+            "（": "(",
+            "）": ")",
+            "。": ".",
+        }))
+        normalized = re.sub(r"\s+", " ", normalized)
+        return normalized
+
+    def _global_deduplicate_requirements(self, requirements: List[Requirement]) -> List[Requirement]:
+        deduped: List[Requirement] = []
+        threshold = self.settings.dedup_similarity_threshold
+        for req in requirements:
+            current_text = self._normalize_text_for_dedup(req.description)
+            duplicate_idx = -1
+            for i, kept in enumerate(deduped):
+                if kept.type != req.type:
+                    continue
+                sim = SequenceMatcher(
+                    None,
+                    current_text,
+                    self._normalize_text_for_dedup(kept.description),
+                ).ratio()
+                if sim >= threshold:
+                    duplicate_idx = i
+                    break
+
+            if duplicate_idx < 0:
+                deduped.append(req)
+                continue
+
+            kept = deduped[duplicate_idx]
+            if self._prefer_requirement_for_dedup(kept, req):
+                deduped[duplicate_idx] = req
+
+        return sorted(deduped, key=lambda r: (r.source_order, r.section_number or ""))
+
+    def _section_depth(self, section_number: str) -> int:
+        sn = (section_number or "").strip()
+        if not sn:
+            return 0
+        return sn.count(".") + 1
+
+    def _prefer_requirement_for_dedup(self, kept: Requirement, candidate: Requirement) -> bool:
+        # 同章节优先保留正文。
+        if (
+            self.settings.prefer_text_over_table
+            and kept.section_uid == candidate.section_uid
+            and kept.source_type == "table"
+            and candidate.source_type == "text"
+        ):
+            return True
+
+        # 优先保留更深层级章节（子章节）需求，抑制父级汇总重复。
+        kept_depth = self._section_depth(kept.section_number)
+        cand_depth = self._section_depth(candidate.section_number)
+        if cand_depth > kept_depth:
+            return True
+        if cand_depth < kept_depth:
+            return False
+
+        # 接口需求中，优先保留非“泛引用”条目。
+        if kept.type == "interface":
+            kept_generic = self._is_generic_reference_requirement(kept.description)
+            cand_generic = self._is_generic_reference_requirement(candidate.description)
+            if kept_generic and not cand_generic:
+                return True
+            if not kept_generic and cand_generic:
+                return False
+
+        # 同层级时保留更早出现的原文顺序。
+        return candidate.source_order < kept.source_order
 
     def _is_time_series_table(self, table: List[List[str]]) -> bool:
         if not table:
@@ -939,38 +1551,6 @@ JSON输出："""
 
         return (header_has_time and header_has_action) or (time_like_rows >= self.settings.merge_time_series_rows_min)
 
-    def _build_sequence_table_requirement(self, table: List[List[str]]) -> str:
-        if not table or len(table) < 2:
-            return ""
-
-        header = table[0]
-        time_idx = 0
-        action_idx = 1 if len(header) > 1 else 0
-        for i, col in enumerate(header):
-            col_text = (col or "")
-            if any(k in col_text for k in ["时间", "时刻", "time", "TIME"]):
-                time_idx = i
-            if any(k in col_text for k in ["指令", "动作", "行为", "操作", "名称"]):
-                action_idx = i
-
-        sequence_parts = []
-        for row in table[1:]:
-            if not row:
-                continue
-            row = [self._clean_description(c) for c in row]
-            if not any(row):
-                continue
-            t = row[time_idx] if time_idx < len(row) else ""
-            a = row[action_idx] if action_idx < len(row) else ""
-            if t and a:
-                sequence_parts.append(f"{t}执行{a}")
-            elif a:
-                sequence_parts.append(a)
-
-        if not sequence_parts:
-            return ""
-        return "系统应按以下时间序列依次执行指令：" + "；".join(sequence_parts)
-    
     def _parse_llm_json_response(self, response: str) -> Optional[Dict]:
         """解析LLM的JSON响应"""
         try:
diff --git a/src/requirement_splitter.py b/src/requirement_splitter.py
index b062082..1b80292 100644
--- a/src/requirement_splitter.py
+++ b/src/requirement_splitter.py
@@ -33,8 +33,10 @@ class RequirementSplitter:
     CONNECTOR_HINTS = ["并", "并且", "同时", "然后", "且", "以及", "及"]
     CONDITIONAL_HINTS = ["如果", "当", "若", "在", "其中", "此时", "满足"]
     CONTEXT_PRONOUN_HINTS = ["该", "其", "上述", "此", "这些", "那些"]
+    CHAIN_HINTS = ["从而", "以便", "用于", "以实现", "并据此", "进而", "从而实现"]
+    ENUMERATION_HINTS = ["具体包括", "包括但不限于", "主要包括", "其中包括", "如下"]
 
-    def __init__(self, max_sentence_len: int = 120, min_clause_len: int = 12):
+    def __init__(self, max_sentence_len: int = 160, min_clause_len: int = 20):
         self.max_sentence_len = max_sentence_len
         self.min_clause_len = min_clause_len
 
@@ -107,6 +109,14 @@ class RequirementSplitter:
         if len(current) < self.min_clause_len:
             return False
 
+        # “具体包括/其中包括”后的列举项通常是上一句延伸，不应拆分为独立需求。
+        if any(h in current for h in self.ENUMERATION_HINTS):
+            return False
+
+        # 承接链条短语一般不是独立需求动作，避免切断语义链。
+        if any(fragment.startswith(h) for h in self.CHAIN_HINTS):
+            return False
+
         # 指代承接片段通常是语义延续，不应切断。
         if any(fragment.startswith(h) for h in self.CONTEXT_PRONOUN_HINTS):
             return False
@@ -123,6 +133,12 @@ class RequirementSplitter:
         has_action = any(h in fragment for h in self.ACTION_HINTS)
         current_has_action = any(h in current for h in self.ACTION_HINTS)
 
+        # 并列连接词后接“控制/处理/显示”等限定短语时，优先视为同一需求。
+        if has_connector and len(fragment) < self.max_sentence_len // 3 and not any(
+            kw in fragment for kw in ["并输出", "并上传", "并记录", "并触发"]
+        ):
+            return False
+
         # 连接词 + 动作词，且当前片段已经包含动作，优先拆分。
         if has_connector and has_action and current_has_action:
             return True
@@ -147,6 +163,9 @@ class RequirementSplitter:
         return merged
 
     def _should_merge(self, prev: str, current: str) -> bool:
+        if any(h in prev for h in self.ENUMERATION_HINTS):
+            return True
+
         # 指代开头：如“该报警信号...”。
         if any(current.startswith(h) for h in self.CONTEXT_PRONOUN_HINTS):
             return True
diff --git a/src/settings.py b/src/settings.py
index 55e7fc0..abe3adc 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -60,6 +60,46 @@ class AppSettings:
         "other": "OR",
     }
 
+    DEFAULT_INTERFACE_SECTION_HINTS = [
+        "接口描述",
+        "接口需求",
+        "接口要求",
+        "外部接口",
+        "内部接口",
+        "i/o",
+    ]
+
+    DEFAULT_INTERFACE_TITLE_EXCLUDES = [
+        "计算机通信需求",
+        "通信需求",
+        "通信要求",
+    ]
+
+    DEFAULT_FUNCTIONAL_SECTION_HINTS = [
+        "功能需求",
+        "功能要求",
+    ]
+
+    DEFAULT_OTHER_SECTION_HINTS = [
+        "安全性需求",
+        "保密性需求",
+        "适应性需求",
+        "环境需求",
+        "资源需求",
+        "质量",
+        "设计约束",
+        "培训需求",
+        "软件保障",
+        "验收",
+        "交付",
+        "包装",
+        "通信需求",
+        "计算机通信需求",
+        "硬件环境",
+        "软件环境",
+        "运行环境",
+    ]
+
     def __init__(self, config: Dict[str, Any] = None):
         self.config = config or {}
 
@@ -75,6 +115,20 @@ class AppSettings:
         self.type_prefix = self._build_type_prefix(req_types_cfg)
         self.type_chinese = self._build_type_chinese(req_types_cfg)
 
+        semantic_type_cfg = extraction_cfg.get("semantic_type_policy", {})
+        self.interface_section_hints = [
+            str(x).lower() for x in semantic_type_cfg.get("interface_section_hints", self.DEFAULT_INTERFACE_SECTION_HINTS)
+        ]
+        self.interface_title_excludes = [
+            str(x).lower() for x in semantic_type_cfg.get("interface_title_excludes", self.DEFAULT_INTERFACE_TITLE_EXCLUDES)
+        ]
+        self.functional_section_hints = [
+            str(x).lower() for x in semantic_type_cfg.get("functional_section_hints", self.DEFAULT_FUNCTIONAL_SECTION_HINTS)
+        ]
+        self.other_section_hints = [
+            str(x).lower() for x in semantic_type_cfg.get("other_section_hints", self.DEFAULT_OTHER_SECTION_HINTS)
+        ]
+
         splitter_cfg = extraction_cfg.get("splitter", {})
         self.splitter_max_sentence_len = int(splitter_cfg.get("max_sentence_len", 120))
         self.splitter_min_clause_len = int(splitter_cfg.get("min_clause_len", 12))
@@ -91,16 +145,61 @@ class AppSettings:
         self.table_llm_semantic_enabled = bool(table_cfg.get("llm_semantic_enabled", True))
         self.sequence_table_merge = table_cfg.get("sequence_table_merge", "single_requirement")
         self.merge_time_series_rows_min = int(table_cfg.get("merge_time_series_rows_min", 3))
+        self.table_skip_keywords = list(
+            table_cfg.get(
+                "skip_keywords",
+                ["系统功能要求", "性能要求", "功能矩阵", "能力对照", "性能指标对照"],
+            )
+        )
+        self.table_interface_keywords = list(
+            table_cfg.get(
+                "interface_keywords",
+                ["接口", "interface", "输入输出", "I/O", "数据来源", "数据目的地", "来源", "目的地"],
+            )
+        )
+        self.table_single_requirement_keywords = list(
+            table_cfg.get(
+                "single_requirement_keywords",
+                ["硬件要求", "软件要求", "运行环境", "环境需求", "资源需求", "计算机资源"],
+            )
+        )
 
         rewrite_cfg = extraction_cfg.get("rewrite_policy", {})
         self.llm_light_rewrite_enabled = bool(rewrite_cfg.get("llm_light_rewrite_enabled", True))
         self.preserve_ratio_min = float(rewrite_cfg.get("preserve_ratio_min", 0.65))
         self.max_length_growth_ratio = float(rewrite_cfg.get("max_length_growth_ratio", 1.25))
+        self.non_interface_max_edit_distance = int(rewrite_cfg.get("non_interface_max_edit_distance", 20))
+
+        self.system_description_hints = list(
+            extraction_cfg.get(
+                "system_description_hints",
+                ["系统描述", "功能描述", "概述", "示意图", "组成", "架构", "原理"],
+            )
+        )
 
         renumber_cfg = extraction_cfg.get("renumber_policy", {})
         self.renumber_enabled = bool(renumber_cfg.get("enabled", True))
         self.renumber_mode = renumber_cfg.get("mode", "section_continuous")
 
+        dedup_cfg = extraction_cfg.get("dedup_policy", {})
+        self.dedup_similarity_threshold = float(dedup_cfg.get("similarity_threshold", 0.88))
+        self.enable_cross_section_dedup = bool(dedup_cfg.get("enable_cross_section_dedup", True))
+        self.prefer_text_over_table = bool(dedup_cfg.get("prefer_text_over_table", True))
+
+        interface_cfg = extraction_cfg.get("interface_policy", {})
+        self.interface_unknown_fallback = str(interface_cfg.get("unknown_fallback", "未知"))
+
+        normalization_cfg = extraction_cfg.get("normalization_policy", {})
+        self.ocr_spacing_normalize = bool(normalization_cfg.get("ocr_spacing_normalize", True))
+
+        fidelity_cfg = extraction_cfg.get("fidelity_policy", {})
+        self.preserve_source_text_for_text_blocks = bool(
+            fidelity_cfg.get("preserve_source_text_for_text_blocks", True)
+        )
+
+        punctuation_cfg = extraction_cfg.get("punctuation_policy", {})
+        self.ensure_terminal_period = bool(punctuation_cfg.get("ensure_terminal_period", True))
+
     def _build_rules(self, req_types_cfg: Dict[str, Dict[str, Any]]) -> List[RequirementTypeRule]:
         rules: List[RequirementTypeRule] = []
         if not req_types_cfg:
@@ -153,10 +252,45 @@ class AppSettings:
     def is_non_requirement_section(self, title: str) -> bool:
         return any(keyword in title for keyword in self.non_requirement_sections)
 
+    def is_interface_semantic_title(self, title: str) -> bool:
+        t = (title or "").strip().lower()
+        if not t:
+            return False
+
+        excluded = any(x in t for x in self.interface_title_excludes)
+        if excluded and "接口" not in t:
+            return False
+
+        return any(h in t for h in self.interface_section_hints)
+
+    def is_functional_semantic_title(self, title: str) -> bool:
+        t = (title or "").strip().lower()
+        if not t:
+            return False
+        return any(h in t for h in self.functional_section_hints)
+
+    def is_other_semantic_title(self, title: str) -> bool:
+        t = (title or "").strip().lower()
+        if not t:
+            return False
+        return any(h in t for h in self.other_section_hints)
+
     def detect_requirement_type(self, title: str, content: str) -> str:
+        # 章节语义优先：接口仅由接口类章节触发；安全/保密/适应性等统一归其他需求。
+        if self.is_interface_semantic_title(title):
+            return "interface"
+        if self.is_functional_semantic_title(title):
+            return "functional"
+        if self.is_other_semantic_title(title):
+            return "other"
+
         combined_text = f"{title} {(content or '')[:500]}".lower()
         for rule in self.requirement_rules:
+            if rule.key == "interface" and not self.is_interface_semantic_title(title):
+                continue
             for keyword in rule.keywords:
                 if keyword.lower() in combined_text:
+                    if rule.key in {"performance", "security", "reliability", "other"}:
+                        return "other"
                     return rule.key
         return "functional"