project init.

2026-02-03 22:48:22 +08:00
commit bf730377eb
14 changed files with 3141 additions and 0 deletions
--- a/src/document_parser.py
+++ b/src/document_parser.py
@@ -0,0 +1,597 @@
+# -*- coding: utf-8 -*-
+"""
+文档解析模块 - LLM增强版
+支持PDF和Docx格式，针对GJB438B标准SRS文档优化
+"""
+
+import os
+import re
+import logging
+from abc import ABC, abstractmethod
+from typing import List, Dict, Tuple, Optional
+from pathlib import Path
+
+try:
+    from docx import Document
+    HAS_DOCX = True
+except ImportError:
+    HAS_DOCX = False
+
+try:
+    import PyPDF2
+    HAS_PDF = True
+except ImportError:
+    HAS_PDF = False
+
+logger = logging.getLogger(__name__)
+
+
+class Section:
+    """One section of a parsed document: a heading plus the body below it.
+
+    Sections form a tree through ``parent`` / ``children``. All attributes are
+    plain public fields that the parsers read and write directly.
+    """
+
+    def __init__(self, level: int, title: str, number: Optional[str] = None,
+                 content: str = "", uid: str = ""):
+        """Create a detached section.
+
+        Args:
+            level: Heading depth as supplied by the caller.
+            title: Heading text.
+            number: Section number string; ``None`` or ``""`` means the
+                section has no number yet.
+            content: Initial body text.
+            uid: Identifier string supplied by the caller.
+        """
+        self.level = level
+        self.title = title
+        self.number = number
+        self.content = content
+        self.uid = uid
+        # Set by the parent's add_child(); stays None for detached sections.
+        self.parent: Optional['Section'] = None
+        self.children: List['Section'] = []
+        # Each table is a list of rows, each row a list of cell strings.
+        self.tables: List[List[List[str]]] = []
+
+    def add_child(self, child: 'Section') -> None:
+        """Append ``child`` to ``children`` and point its ``parent`` at this section."""
+        self.children.append(child)
+        child.parent = self
+
+    def add_content(self, text: str) -> None:
+        """Append ``text`` to the body, separated from existing content by a newline."""
+        self.content = f"{self.content}\n{text}" if self.content else text
+
+    def add_table(self, table_data: List[List[str]]) -> None:
+        """Append one table (a list of rows) to ``tables``."""
+        self.tables.append(table_data)
+
+    def generate_auto_number(self, parent_number: str = "", sibling_index: int = 1) -> None:
+        """Fill in ``number`` when the section does not have one yet.
+
+        A section that already has a non-empty number is left untouched.
+
+        Args:
+            parent_number: Number of the parent section; when empty the
+                generated number is just ``sibling_index``.
+            sibling_index: Position among the siblings (1-based by default).
+        """
+        if self.number:
+            return
+        if parent_number:
+            self.number = f"{parent_number}.{sibling_index}"
+        else:
+            self.number = str(sibling_index)
+
+    def __repr__(self) -> str:
+        # !r keeps an unset number visible as None (not the string 'None') and
+        # stays unambiguous for titles that contain quote characters.
+        return f"Section(level={self.level}, number={self.number!r}, title={self.title!r})"
+
+
+class DocumentParser(ABC):
+    """Abstract base class for the format-specific parsers.
+
+    Holds the state shared by all parsers (source path, parsed top-level
+    sections, document title, raw text, optional LLM handle) and the section
+    auto-numbering logic. Subclasses implement ``parse``.
+    """
+
+    # Title fragments that mark the first body chapter when the top level is
+    # numbered; everything before that chapter is treated as front matter.
+    _BODY_TITLE_KEYWORDS = (
+        '外部接口', '接口', '软件需求', '需求', '功能', '性能', '设计', '概述', '标识', '引言',
+    )
+
+    # Characters accepted as a Chinese-numeral section number.
+    _CHINESE_NUMERALS = '一二三四五六七八九十百千万'
+
+    def __init__(self, file_path: str):
+        """Store ``file_path`` and initialise empty parse state."""
+        self.file_path = file_path
+        self.sections: List[Section] = []
+        self.document_title = ""
+        self.raw_text = ""
+        self.llm = None
+        self._uid_counter = 0
+
+    def set_llm(self, llm) -> None:
+        """Attach the LLM instance that subclasses may consult while parsing."""
+        self.llm = llm
+
+    @abstractmethod
+    def parse(self) -> List[Section]:
+        """Parse the document and return its top-level sections."""
+
+    def get_document_title(self) -> str:
+        """Return the document title recorded by the parser."""
+        return self.document_title
+
+    def _next_uid(self) -> str:
+        """Return the next identifier of the form ``sec-<n>`` (n starts at 1)."""
+        self._uid_counter += 1
+        return f"sec-{self._uid_counter}"
+
+    def _auto_number_sections(self, sections: List[Section], parent_number: str = "") -> None:
+        """Assign section numbers in place, recursing through ``children``.
+
+        With an empty ``parent_number`` the list is handled as the top level:
+        the first level-1 section whose title contains one of the body
+        keywords marks the start of the body (index 0 when none matches).
+        Sections before it get an empty number; level-1 sections from there on
+        are renumbered ``1, 2, 3, ...``, replacing any number they already had.
+        Sections of another level keep their number.
+
+        With a non-empty ``parent_number`` every section that has no number,
+        or only a Chinese-numeral one, receives ``<parent_number>.<position>``
+        where position is the 1-based index in ``sections``; other numbers are
+        kept.
+
+        Note that the recursion passes each section's own number down, so the
+        children of a section whose number is empty are processed with the
+        top-level rules.
+
+        Args:
+            sections: Sibling sections to number.
+            parent_number: Number of their parent; empty for the top level.
+        """
+        if parent_number:
+            for position, section in enumerate(sections, 1):
+                if self._is_chinese_number(section.number):
+                    # generate_auto_number() only fills in an empty number, so
+                    # a Chinese-numeral number has to be cleared first.
+                    section.number = ""
+                if not section.number:
+                    section.generate_auto_number(parent_number, position)
+                if section.children:
+                    self._auto_number_sections(section.children, section.number)
+            return
+
+        # Locate the first body chapter; default to the very first section.
+        start_index = next(
+            (
+                idx
+                for idx, section in enumerate(sections)
+                if section.level == 1
+                and any(kw in section.title for kw in self._BODY_TITLE_KEYWORDS)
+            ),
+            0,
+        )
+
+        counter = 1
+        for idx, section in enumerate(sections):
+            if idx < start_index:
+                # Front matter carries no number.
+                section.number = ""
+            elif section.level == 1:
+                section.number = str(counter)
+                counter += 1
+
+            if section.children:
+                self._auto_number_sections(section.children, section.number)
+
+    def _is_chinese_number(self, text: str) -> bool:
+        """Return True when ``text`` is non-empty and made only of Chinese numerals."""
+        return bool(text) and all(ch in self._CHINESE_NUMERALS for ch in text)
+
+
+class DocxParser(DocumentParser):
+    """Parser for .docx files, built on python-docx."""
+
+    def __init__(self, file_path: str):
+        """Remember ``file_path``; the file itself is opened by ``parse``.
+
+        Raises:
+            ImportError: If python-docx could not be imported.
+        """
+        if not HAS_DOCX:
+            raise ImportError("python-docx库未安装，请运行: pip install python-docx")
+        super().__init__(file_path)
+        self.document = None
+
+    def parse(self) -> List[Section]:
+        """Parse the document into a tree of sections.
+
+        Paragraphs recognised as headings open a new section; other non-empty
+        paragraphs and non-empty tables are attached to the most recently
+        opened section. Content that appears before any heading goes into a
+        default section titled "未命名章节". Sections are auto-numbered
+        before returning.
+
+        Returns:
+            The top-level sections (also stored in ``self.sections``).
+
+        Raises:
+            Exception: Any error raised while loading or walking the document
+                is logged with its traceback and re-raised unchanged.
+        """
+        try:
+            # Imported once here instead of on every loop iteration.
+            from docx.text.paragraph import Paragraph
+            from docx.table import Table
+
+            self.document = Document(self.file_path)
+            self.document_title = self.document.core_properties.title or "SRS Document"
+
+            # Start from a clean list so calling parse() again does not append
+            # a second copy of every section.
+            self.sections = []
+            # Maps heading level -> the currently open section at that level.
+            section_stack: Dict[int, Section] = {}
+
+            for block in self._iter_block_items(self.document):
+                if isinstance(block, Paragraph):
+                    text = block.text.strip()
+                    if not text:
+                        continue
+
+                    heading_info = self._parse_heading(block, text)
+                    if heading_info:
+                        number, title, level = heading_info
+                        section = Section(level=level, title=title, number=number, uid=self._next_uid())
+                        self._attach_heading(section, section_stack)
+                    else:
+                        self._current_section(section_stack).add_content(text)
+                elif isinstance(block, Table):
+                    table_data = self._extract_table_data(block)
+                    if table_data:
+                        self._current_section(section_stack).add_table(table_data)
+
+            self._auto_number_sections(self.sections)
+
+            logger.info(f"完成Docx解析，提取{len(self.sections)}个顶级章节")
+            return self.sections
+
+        except Exception as e:
+            # logger.exception keeps the traceback that logger.error dropped.
+            logger.exception(f"解析Docx文档失败: {e}")
+            raise
+
+    def _attach_heading(self, section: Section, section_stack: Dict[int, Section]) -> None:
+        """Insert a new heading section into the tree and update ``section_stack``."""
+        level = section.level
+        if level == 1 or not section_stack:
+            # A level-1 heading, or the very first heading of any level,
+            # starts a new top-level section.
+            self.sections.append(section)
+            section_stack.clear()
+            section_stack[1] = section
+        else:
+            # Walk up to the nearest open ancestor level.
+            parent_level = level - 1
+            while parent_level >= 1 and parent_level not in section_stack:
+                parent_level -= 1
+
+            if parent_level >= 1 and parent_level in section_stack:
+                section_stack[parent_level].add_child(section)
+            elif self.sections:
+                self.sections[-1].add_child(section)
+
+        section_stack[level] = section
+        # Sections deeper than the new heading are closed.
+        for deeper in [lvl for lvl in section_stack if lvl > level]:
+            del section_stack[deeper]
+
+    def _current_section(self, section_stack: Dict[int, Section]) -> Section:
+        """Return the deepest open section, creating the default one if none is open."""
+        if section_stack:
+            return section_stack[max(section_stack)]
+
+        default_section = Section(level=1, title="未命名章节", number="", uid=self._next_uid())
+        self.sections.append(default_section)
+        section_stack[1] = default_section
+        return default_section
+
+    def _is_valid_heading(self, text: str) -> bool:
+        """Return True when ``text`` is plausible as a heading title.
+
+        Rejects text longer than 120 characters, text containing ``...``,
+        text without any Chinese character or ASCII letter, and text ending
+        in dots/whitespace followed by digits (table-of-contents entries such
+        as "概述     2" or "概述 . . . . 2").
+        """
+        if len(text) > 120 or '...' in text:
+            return False
+        if not re.search(r'[\u4e00-\u9fa5A-Za-z]', text):
+            return False
+        # This single pattern also covers the "two or more spaces then digits"
+        # case, which is a strict subset of it.
+        if re.search(r'[\.。\s]+\d+$', text):
+            return False
+        return True
+
+    def _parse_heading(self, paragraph, text: str) -> Optional[Tuple[str, str, int]]:
+        """Classify a paragraph as a heading.
+
+        Tried in order: a leading dotted decimal number (level = number of
+        components), a leading Chinese numeral followed by "、" or "."
+        (level 1), then a paragraph style whose name starts with "heading"
+        (level = first integer in the style name, 1 if none; empty number).
+
+        Args:
+            paragraph: The paragraph object; only ``style.name`` is read.
+            text: The paragraph text, already stripped.
+
+        Returns:
+            ``(number, title, level)`` or ``None`` when not a heading.
+        """
+        style_name = paragraph.style.name if paragraph.style else ""
+        is_heading_style = style_name.lower().startswith('heading') if style_name else False
+
+        # Decimal-numbered heading.
+        match = re.match(r'^(\d+(?:\.\d+)*)\s*[\.、]?\s*(.+)$', text)
+        if match and self._is_valid_heading(match.group(2)):
+            number = match.group(1)
+            return number, match.group(2).strip(), len(number.split('.'))
+
+        # Chinese-numeral heading.
+        match = re.match(r'^([一二三四五六七八九十]+)[、\.]+\s*(.+)$', text)
+        if match and self._is_valid_heading(match.group(2)):
+            return match.group(1), match.group(2).strip(), 1
+
+        # Heading identified by paragraph style only.
+        if is_heading_style and self._is_valid_heading(text):
+            level_match = re.search(r'(\d+)', style_name)
+            level = int(level_match.group(1)) if level_match else 1
+            return "", text, level
+
+        return None
+
+    def _iter_block_items(self, parent):
+        """Yield the paragraphs and tables of ``parent``'s body in document order."""
+        from docx.text.paragraph import Paragraph
+        from docx.table import Table
+        from docx.oxml.text.paragraph import CT_P
+        from docx.oxml.table import CT_Tbl
+
+        for child in parent.element.body.iterchildren():
+            if isinstance(child, CT_P):
+                yield Paragraph(child, parent)
+            elif isinstance(child, CT_Tbl):
+                yield Table(child, parent)
+
+    def _extract_table_data(self, table) -> List[List[str]]:
+        """Return the table as rows of cell strings.
+
+        Whitespace runs inside a cell (including newlines) are collapsed to a
+        single space; rows whose cells are all empty are skipped.
+        """
+        table_data = []
+        for row in table.rows:
+            row_data = [" ".join(cell.text.split()) for cell in row.cells]
+            if any(row_data):
+                table_data.append(row_data)
+        return table_data
+
+
+class PDFParser(DocumentParser):
+    """Parser for PDF files: PyPDF2 text extraction plus rule-based section detection."""
+
+    # Keywords for valid section titles of GJB438B-style SRS documents.
+    VALID_TITLE_KEYWORDS = [
+        '范围', '标识', '概述', '引用', '文档',
+        '需求', '功能', '接口', '性能', '安全', '保密',
+        '环境', '资源', '质量', '设计', '约束',
+        '人员', '培训', '保障', '验收', '交付', '包装',
+        '优先', '关键', '合格', '追踪', '注释',
+        'CSCI', '计算机', '软件', '硬件', '通信', '通讯',
+        '数据', '适应', '可靠', '内部', '外部',
+        '描述', '要求', '规定', '说明', '定义',
+        '电场', '防护', '装置', '控制', '监控', '显控'
+    ]
+
+    # Substrings that mark a candidate title as noise rather than a heading.
+    INVALID_TITLE_PATTERNS = [
+        '本文档可作为', '参比电位', '补偿电流', '以太网',
+        '电源', '软件接', '功能\\', '性能 \\', '输入/输出 \\',
+        '数据处理要求 \\', '固件 \\', '质量控制要求',
+        '信安科技', '浙江', '公司'
+    ]
+
+    def __init__(self, file_path: str):
+        """Remember ``file_path``; the file itself is opened by ``parse``.
+
+        Raises:
+            ImportError: If PyPDF2 could not be imported.
+        """
+        if not HAS_PDF:
+            raise ImportError("PyPDF2库未安装，请运行: pip install PyPDF2")
+        super().__init__(file_path)
+        self.document_title = "SRS Document"
+
+    def parse(self) -> List[Section]:
+        """Parse the PDF into a tree of sections.
+
+        Steps: extract the text of all pages, clean it, detect numbered
+        section headings, apply the extra section filter when an LLM instance
+        has been set, then auto-number the sections.
+
+        Returns:
+            The top-level sections (also stored in ``self.sections``).
+
+        Raises:
+            Exception: Any error raised along the way is logged with its
+                traceback and re-raised unchanged.
+        """
+        try:
+            self.raw_text = self._extract_all_text()
+            cleaned_text = self._clean_text(self.raw_text)
+            self.sections = self._parse_sections(cleaned_text)
+
+            if self.llm:
+                self.sections = self._llm_validate_sections(self.sections)
+
+            self._auto_number_sections(self.sections)
+
+            logger.info(f"完成PDF解析，提取{len(self.sections)}个顶级章节")
+            return self.sections
+
+        except Exception as e:
+            # logger.exception keeps the traceback that logger.error dropped.
+            logger.exception(f"解析PDF文档失败: {e}")
+            raise
+
+    def _extract_all_text(self) -> str:
+        """Return the text of every page, joined by newlines; empty pages are skipped."""
+        page_texts = []
+        with open(self.file_path, 'rb') as f:
+            pdf_reader = PyPDF2.PdfReader(f)
+            for page in pdf_reader.pages:
+                text = page.extract_text()
+                if text:
+                    page_texts.append(text)
+        return '\n'.join(page_texts)
+
+    def _clean_text(self, text: str) -> str:
+        """Strip each line and drop blank lines, page numbers and dotted TOC lines."""
+        cleaned_lines = []
+        for line in text.split('\n'):
+            line = line.strip()
+            if not line:
+                continue
+            # A line consisting of 1-3 digits only is taken to be a page number.
+            if re.match(r'^\d{1,3}$', line):
+                continue
+            # Table-of-contents line with dot leaders.
+            if line.count('.') > 10 and '...' in line:
+                continue
+            cleaned_lines.append(line)
+        return '\n'.join(cleaned_lines)
+
+    def _parse_sections(self, text: str) -> List[Section]:
+        """Split cleaned text into a tree of sections.
+
+        Each line accepted by ``_match_section_header`` opens a new section
+        whose level is the number of components in its section number. Other
+        lines that are not noise become the content of the most recently
+        opened section. Lines that appear before the first heading belong to
+        no section and are not collected (they remain available in
+        ``self.raw_text``).
+
+        Returns:
+            The top-level sections.
+        """
+        sections: List[Section] = []
+        # Maps heading level -> the currently open section at that level.
+        section_stack: Dict[int, Section] = {}
+        current_section: Optional[Section] = None
+        content_buffer: List[str] = []
+        found_sections = set()
+
+        for line in text.split('\n'):
+            line = line.strip()
+            if not line:
+                continue
+
+            section_info = self._match_section_header(line, found_sections)
+            if not section_info:
+                # Only buffer once a section is open; otherwise text preceding
+                # the first heading would be flushed into the first section.
+                if current_section is not None and not self._is_noise(line):
+                    content_buffer.append(line)
+                continue
+
+            number, title = section_info
+            level = len(number.split('.'))
+
+            # Flush the body collected for the previous section.
+            if current_section is not None and content_buffer:
+                current_section.add_content('\n'.join(content_buffer))
+                content_buffer = []
+
+            section = Section(level=level, title=title, number=number, uid=self._next_uid())
+            found_sections.add(number)
+
+            if level == 1:
+                sections.append(section)
+                section_stack = {1: section}
+            else:
+                # Walk up to the nearest open ancestor level.
+                parent_level = level - 1
+                while parent_level >= 1 and parent_level not in section_stack:
+                    parent_level -= 1
+
+                if parent_level >= 1 and parent_level in section_stack:
+                    section_stack[parent_level].add_child(section)
+                elif sections:
+                    sections[-1].add_child(section)
+                else:
+                    # A sub-section with nothing above it becomes top-level.
+                    sections.append(section)
+                    section_stack = {1: section}
+
+            section_stack[level] = section
+            # Sections deeper than the new heading are closed.
+            for deeper in [lvl for lvl in section_stack if lvl > level]:
+                del section_stack[deeper]
+
+            current_section = section
+
+        # Flush the body of the last section.
+        if current_section is not None and content_buffer:
+            current_section.add_content('\n'.join(content_buffer))
+
+        return sections
+
+    def _match_section_header(self, line: str, found_sections: set) -> Optional[Tuple[str, str]]:
+        """Decide whether ``line`` is a numbered section heading.
+
+        The line must start with a dotted decimal number (for example
+        "3.1功能需求" or "3.1 功能需求"). It is rejected when the title looks
+        like a table-of-contents entry, the first number component is outside
+        1-30, a later component exceeds 20, the number was already seen, the
+        title is shorter than 2 or longer than 60 characters, contains no
+        Chinese character, contains an invalid pattern, starts with a digit,
+        is more than 30% digits, or contains a backslash without "需求".
+
+        Args:
+            line: Candidate line.
+            found_sections: Section numbers already accepted; read only.
+
+        Returns:
+            ``(number, title)`` or ``None``.
+        """
+        match = re.match(r'^(\d+(?:\.\d+)*)\s*(.+)$', line)
+        if not match:
+            return None
+
+        number = match.group(1)
+        title = match.group(2).strip()
+
+        # Table-of-contents entry.
+        if '...' in title or title.count('.') > 5:
+            return None
+
+        parts = number.split('.')
+        if not 1 <= int(parts[0]) <= 30:
+            return None
+        if any(int(part) > 20 for part in parts[1:]):
+            return None
+
+        if number in found_sections:
+            return None
+
+        if len(title) > 60 or len(title) < 2:
+            return None
+
+        # The title must contain at least one Chinese character. (A second,
+        # looser "Chinese or ASCII letter" test used to follow; it could never
+        # fail after this one and has been removed.)
+        if not re.search(r'[\u4e00-\u9fa5]', title):
+            return None
+
+        if any(pattern in title for pattern in self.INVALID_TITLE_PATTERNS):
+            return None
+
+        if title[0].isdigit():
+            return None
+
+        digit_ratio = sum(c.isdigit() for c in title) / max(len(title), 1)
+        if digit_ratio > 0.3:
+            return None
+
+        # A backslash in the title usually comes from table residue.
+        if '\\' in title and '需求' not in title:
+            return None
+
+        return (number, title)
+
+    def _is_noise(self, line: str) -> bool:
+        """Return True for lines excluded from section content.
+
+        That is: lines made only of digits, whitespace, commas and dots;
+        lines shorter than 3 characters; lines made only of the letters
+        i/v/x in either case (roman numerals).
+        """
+        if re.match(r'^[\d\s,.]+$', line):
+            return True
+        if len(line) < 3:
+            return True
+        if re.match(r'^[ivxIVX]+$', line):
+            return True
+        return False
+
+    def _llm_validate_sections(self, sections: List[Section]) -> List[Section]:
+        """Filter the section tree when an LLM instance is set.
+
+        Without an LLM the list is returned unchanged. Otherwise sections
+        rejected by ``_is_valid_section_with_llm`` are removed together with
+        their subtrees.
+        """
+        if not self.llm:
+            return sections
+        return self._validate_children(sections)
+
+    def _validate_children(self, children: List[Section]) -> List[Section]:
+        """Return the accepted sections of ``children``, filtering their subtrees recursively."""
+        validated = []
+        for child in children:
+            if self._is_valid_section_with_llm(child):
+                child.children = self._validate_children(child.children)
+                validated.append(child)
+        return validated
+
+    def _is_valid_section_with_llm(self, section: Section) -> bool:
+        """Return True when ``section`` passes the title filters.
+
+        Despite the name, this method currently applies fixed rules only and
+        does not call ``self.llm``.
+        """
+        invalid_titles = [
+            '本文档可作为', '故障', '实时', '输入/输出',
+            '固件', '功能\\', '\\4.', '\\3.'
+        ]
+        for invalid in invalid_titles:
+            if invalid in section.title:
+                logger.debug(f"过滤无效章节: {section.number} {section.title}")
+                return False
+
+        # Sections whose number starts with "3" (the requirements chapter) get
+        # an extra check: a slash or backslash in the title is only accepted
+        # together with an input/output/interface keyword.
+        if section.number and section.number.startswith('3'):
+            if '\\' in section.title or '/' in section.title:
+                if not any(kw in section.title for kw in ['输入', '输出', '接口']):
+                    return False
+
+        return True
+
+
+def create_parser(file_path: str) -> DocumentParser:
+    """Build the parser that matches the extension of ``file_path``.
+
+    The extension is compared case-insensitively: ``.docx`` yields a
+    ``DocxParser`` and ``.pdf`` a ``PDFParser``.
+
+    Raises:
+        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
+        ImportError: Raised by the parser constructor when its backing
+            library is not installed.
+    """
+    suffix = Path(file_path).suffix.lower()
+
+    if suffix == '.pdf':
+        return PDFParser(file_path)
+    if suffix == '.docx':
+        return DocxParser(file_path)
+
+    raise ValueError(f"不支持的文件格式: {suffix}")