# src/document_parser.py

# -*- coding: utf-8 -*-
"""
文档解析模块 - LLM增强版
支持PDF和Docx格式，针对GJB438B标准SRS文档优化
"""

import os
import re
import logging
import importlib
from abc import ABC, abstractmethod
from typing import List, Dict, Tuple, Optional, Any
from pathlib import Path

try:
    from docx import Document
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

try:
    import PyPDF2
    HAS_PDF = True
except ImportError:
    HAS_PDF = False

# A bare `import importlib` does not guarantee that the `importlib.util`
# submodule is loaded; import it explicitly before probing for pdfplumber.
import importlib.util

# True when the optional pdfplumber package can be located (it is not imported here).
HAS_PDF_TABLE = importlib.util.find_spec("pdfplumber") is not None

logger = logging.getLogger(__name__)


class Section:
    """One chapter/section node in the parsed document tree."""

    def __init__(self, level: int, title: str, number: str = None, content: str = "", uid: str = ""):
        # Heading metadata supplied by the parser.
        self.level = level
        self.title = title
        self.number = number
        # Body text; add_content() appends further text separated by newlines.
        self.content = content
        self.uid = uid
        # Tree links, maintained by add_child().
        self.parent = None
        self.children = []
        # Raw tables, plus an ordered log of the text/table blocks as they were added.
        self.tables = []
        self.blocks = []

    def add_child(self, child: 'Section') -> None:
        """Attach ``child`` beneath this section and point its parent back here."""
        child.parent = self
        self.children.append(child)

    def add_content(self, text: str) -> None:
        """Append stripped ``text`` to the body; empty or ``None`` input is ignored."""
        stripped = (text or "").strip()
        if stripped:
            self.content = f"{self.content}\n{stripped}" if self.content else stripped
            self.blocks.append({"type": "text", "text": stripped})

    def add_table(self, table_data: List[List[str]]) -> None:
        """Store a table (rows of cell strings) and log it in ``blocks``; empty tables are ignored."""
        if table_data:
            self.tables.append(table_data)
            self.blocks.append(
                {"type": "table", "table_index": len(self.tables) - 1, "table": table_data}
            )

    def generate_auto_number(self, parent_number: str = "", sibling_index: int = 1) -> None:
        """
        Fill in a section number, but only when the section has none yet.

        Args:
            parent_number: Number of the parent section; empty for a top-level section.
            sibling_index: 1-based position of this section among its siblings.
        """
        if self.number:
            return
        self.number = f"{parent_number}.{sibling_index}" if parent_number else str(sibling_index)

    def __repr__(self) -> str:
        return "Section(level={}, number='{}', title='{}')".format(self.level, self.number, self.title)


class DocumentParser(ABC):
    """文档解析器基类"""
    
    def __init__(self, file_path: str):
        self.file_path = file_path
        self.sections: List[Section] = []
        self.document_title = ""
        self.raw_text = ""
        self.llm = None
        self._uid_counter = 0
    
    def set_llm(self, llm) -> None:
        """设置LLM实例"""
        self.llm = llm
    
    @abstractmethod
    def parse(self) -> List[Section]:
        pass
    
    def get_document_title(self) -> str:
        return self.document_title

    def _next_uid(self) -> str:
        self._uid_counter += 1
        return f"sec-{self._uid_counter}"
    
    def _auto_number_sections(self, sections: List[Section], parent_number: str = "") -> None:
        """
        为没有编号的章节自动生成编号
        
        规则：使用Word样式确定级别，跳过前置章节（目录、概述等），
        从第一个正文章节（如"外部接口"）开始编号为1
        
        Args:
            sections: 章节列表
            parent_number: 父章节编号
        """
        # 仅在顶级章节重编号
        if not parent_number:
            # 前置章节关键词（需要跳过的）
            skip_keywords = ['目录', '封面', '扉页', '未命名', '年', '月']
            # 正文章节关键词（遇到这些说明正文开始）
            content_keywords = ['外部接口', '接口', '软件需求', '需求', '功能', '性能', '设计', '概述', '标识', '引言']
            
            start_index = 0
            for idx, section in enumerate(sections):
                # 优先检查是否是正文章节
                is_content = any(kw in section.title for kw in content_keywords)
                if is_content and section.level == 1:
                    start_index = idx
                    break
            
            # 重新编号所有章节
            counter = 1
            for i, section in enumerate(sections):
                if i < start_index:
                    # 前置章节不编号
                    section.number = ""
                else:
                    # 正文章节：顶级章节从1开始编号
                    if section.level == 1:
                        section.number = str(counter)
                        counter += 1
                
                # 递归处理子章节
                if section.children:
                    self._auto_number_sections(section.children, section.number)
        else:
            # 子章节编号
            for i, section in enumerate(sections, 1):
                if not section.number or self._is_chinese_number(section.number):
                    section.generate_auto_number(parent_number, i)
                if section.children:
                    self._auto_number_sections(section.children, section.number)
    
    def _is_chinese_number(self, text: str) -> bool:
        """检查是否是中文数字编号"""
        chinese_numbers = '一二三四五六七八九十百千万'
        return text and all(c in chinese_numbers for c in text)


class DocxParser(DocumentParser):
    """DOCX格式文档解析器"""
    
    def __init__(self, file_path: str):
        """Create a parser for a .docx file.

        Args:
            file_path: Path of the .docx document to parse.

        Raises:
            ImportError: If python-docx could not be imported at module load.
        """
        if not HAS_DOCX:
            message = "python-docx库未安装，请运行: pip install python-docx"
            raise ImportError(message)
        super().__init__(file_path)
        # Replaced by the loaded document object in parse().
        self.document = None
    
    def parse(self) -> List[Section]:
        """Parse the .docx file into a tree of sections.

        Paragraphs and tables are visited in document order. A paragraph
        recognised by ``_parse_heading`` opens a new section, nested under the
        closest open section of a shallower level. Any other non-empty
        paragraph and every non-empty table is appended to the deepest open
        section. Content that appears before the first heading is collected
        in a placeholder level-1 section. Finally the tree is passed to
        ``_auto_number_sections``.

        Returns:
            The top-level sections (also stored on ``self.sections``).

        Raises:
            Exception: Any error raised while loading or walking the document
                is logged and re-raised unchanged.
        """
        try:
            # Imported once per call rather than once per block inside the loop.
            from docx.text.paragraph import Paragraph
            from docx.table import Table

            self.document = Document(self.file_path)
            self.document_title = self.document.core_properties.title or "SRS Document"
            # Start from an empty tree so that calling parse() again does not
            # append a second copy of every section.
            self.sections = []

            # Maps heading level -> most recent open section at that level.
            section_stack: Dict[int, Section] = {}

            for block in self._iter_block_items(self.document):
                if isinstance(block, Paragraph):
                    text = block.text.strip()
                    if not text:
                        continue

                    heading_info = self._parse_heading(block, text)
                    if not heading_info:
                        # Body text belongs to the deepest open section.
                        self._open_section(section_stack).add_content(text)
                        continue

                    number, title, level = heading_info
                    section = Section(level=level, title=title, number=number, uid=self._next_uid())

                    if level == 1 or not section_stack:
                        self.sections.append(section)
                        section_stack = {1: section}
                    else:
                        # Walk up to the nearest shallower level that is open.
                        parent_level = level - 1
                        while parent_level >= 1 and parent_level not in section_stack:
                            parent_level -= 1

                        if parent_level >= 1:
                            # The loop only stops at >= 1 on a level that is open.
                            section_stack[parent_level].add_child(section)
                        elif self.sections:
                            self.sections[-1].add_child(section)

                    # The new section closes every deeper level that was open.
                    section_stack[level] = section
                    section_stack = {
                        open_level: open_section
                        for open_level, open_section in section_stack.items()
                        if open_level <= level
                    }
                elif isinstance(block, Table):
                    table_data = self._extract_table_data(block)
                    if table_data:
                        self._open_section(section_stack).add_table(table_data)

            # Fill in / normalise section numbers.
            self._auto_number_sections(self.sections)

            logger.info(f"完成Docx解析，提取{len(self.sections)}个顶级章节")
            return self.sections

        except Exception as e:
            logger.error(f"解析Docx文档失败: {e}")
            raise

    def _open_section(self, section_stack: Dict[int, Section]) -> Section:
        """Return the deepest open section, creating a placeholder when none is open.

        When ``section_stack`` is empty a level-1 placeholder section is
        created, appended to ``self.sections`` and registered in
        ``section_stack`` (which is mutated in place).
        """
        if not section_stack:
            placeholder = Section(level=1, title="未命名章节", number="", uid=self._next_uid())
            self.sections.append(placeholder)
            section_stack[1] = placeholder
        return section_stack[max(section_stack)]
    
    def _is_valid_heading(self, text: str) -> bool:
        """检查是否是有效的标题"""
        if len(text) > 120 or '...' in text:
            return False
        # 标题应包含中文或字母
        if not re.search(r'[\u4e00-\u9fa5A-Za-z]', text):
            return False
        # 过滤目录项（标题后跟页码，如"概述     2"或"概述 . . . . 2"）
        if re.search(r'\s{2,}\d+$', text):  # 多个空格后跟数字结尾
            return False
        if re.search(r'[\.。\s]+\d+$', text):  # 点号或空格后跟数字结尾
            return False
        return True

    def _parse_heading(self, paragraph, text: str) -> Optional[Tuple[str, str, int]]:
        """解析标题，返回(编号, 标题, 级别)"""
        style_name = paragraph.style.name if paragraph.style else ""
        is_heading_style = style_name.lower().startswith('heading') if style_name else False
        
        # 数字编号标题
        match = re.match(r'^(\d+(?:\.\d+)*)\s*[\.、]?\s*(.+)$', text)
        if match and self._is_valid_heading(match.group(2)):
            number = match.group(1)
            title = match.group(2).strip()
            level = len(number.split('.'))
            return number, title, level
        
        # 中文编号标题
        match = re.match(r'^([一二三四五六七八九十]+)[、\.]+\s*(.+)$', text)
        if match and self._is_valid_heading(match.group(2)):
            number = match.group(1)
            title = match.group(2).strip()
            level = 1
            return number, title, level
        
        # 样式标题
        if is_heading_style and self._is_valid_heading(text):
            level = 1
            level_match = re.search(r'(\d+)', style_name)
            if level_match:
                level = int(level_match.group(1))
            return "", text, level
        
        return None

    def _iter_block_items(self, parent):
        """Yield the paragraphs and tables of ``parent``'s body in document order.

        Each direct child of ``parent.element.body`` is wrapped as a
        ``Paragraph`` or ``Table``; children of any other type are skipped.
        """
        from docx.text.paragraph import Paragraph
        from docx.table import Table
        from docx.oxml.text.paragraph import CT_P
        from docx.oxml.table import CT_Tbl

        body = parent.element.body
        for element in body.iterchildren():
            if isinstance(element, CT_Tbl):
                yield Table(element, parent)
            elif isinstance(element, CT_P):
                yield Paragraph(element, parent)

    def _extract_table_data(self, table) -> List[List[str]]:
        """提取表格数据"""
        table_data = []
        for row in table.rows:
            row_data = []
            for cell in row.cells:
                text = cell.text.replace('\n', ' ').strip()
                text = re.sub(r'\s+', ' ', text)
                row_data.append(text)
            if any(cell for cell in row_data):
                table_data.append(row_data)
        return table_data


class PDFParser(DocumentParser):
    """PDF格式文档解析器 - LLM增强版"""
    
    # GJB438B标准SRS文档的有效章节标题关键词
    VALID_TITLE_KEYWORDS = [
        '范围', '标识', '概述', '引用', '文档',
        '需求', '功能', '接口', '性能', '安全', '保密',
        '环境', '资源', '质量', '设计', '约束',
        '人员', '培训', '保障', '验收', '交付', '包装',
        '优先', '关键', '合格', '追踪', '注释',
        'CSCI', '计算机', '软件', '硬件', '通信', '通讯',
        '数据', '适应', '可靠', '内部', '外部',
        '描述', '要求', '规定', '说明', '定义',
        '电场', '防护', '装置', '控制', '监控', '显控'
    ]
    
    # 明显无效的章节标题模式（噪声）
    INVALID_TITLE_PATTERNS = [
        '本文档可作为', '参比电位', '补偿电流', '以太网',
        '电源', '软件接', '功能\\', '性能 \\', '输入/输出 \\',
        '数据处理要求 \\', '固件 \\', '质量控制要求',
        '信安科技', '浙江', '公司'
    ]
    
    def __init__(self, file_path: str):
        """Create a parser for a PDF file.

        Args:
            file_path: Path of the PDF document to parse.

        Raises:
            ImportError: If PyPDF2 could not be imported at module load.
        """
        if not HAS_PDF:
            message = "PyPDF2库未安装，请运行: pip install PyPDF2"
            raise ImportError(message)
        super().__init__(file_path)
        # Fixed default title for PDF documents.
        self.document_title = "SRS Document"
        # Per-page text cache, filled by _extract_all_text().
        self._page_texts: List[str] = []
    
    def parse(self) -> List[Section]:
        """Parse the PDF into a tree of sections.

        Pipeline: extract text, clean it, detect sections, filter them through
        ``_llm_validate_sections`` when an LLM is set, fall back to a single
        placeholder section if none was found, attach extracted tables, and
        finally run ``_auto_number_sections``.

        Returns:
            The top-level sections (also stored on ``self.sections``).

        Raises:
            Exception: Any error from the steps above is logged and re-raised.
        """
        try:
            # Steps 1-3: raw text -> cleaned text -> section tree.
            self.raw_text = self._extract_all_text()
            body_text = self._clean_text(self.raw_text)
            self.sections = self._parse_sections(body_text)

            # Step 4: optional validation pass.
            if self.llm:
                self.sections = self._llm_validate_sections(self.sections)

            # Without any section the tables extracted below would have no
            # home, so keep everything in one placeholder section.
            if not self.sections:
                placeholder = Section(level=1, title="未命名章节", number="1", uid=self._next_uid())
                if body_text:
                    placeholder.add_content(body_text)
                self.sections = [placeholder]

            # Step 5: tables (an empty list when extraction is unavailable or failed).
            extracted = self._extract_pdf_tables()
            if extracted:
                self._attach_pdf_tables_to_sections(extracted)

            # Step 6: section numbering.
            self._auto_number_sections(self.sections)

            logger.info(f"完成PDF解析，提取{len(self.sections)}个顶级章节")
            return self.sections

        except Exception as e:
            logger.error(f"解析PDF文档失败: {e}")
            raise
    
    def _extract_all_text(self) -> str:
        """Extract the text of every page and return it joined by newlines.

        ``self._page_texts`` receives exactly one entry per page (an empty
        string for a page without extractable text), so that index ``i``
        always refers to page ``i``. ``_extract_pdf_tables`` looks pages up
        by index; skipping empty pages here would shift every later lookup
        onto the wrong page.

        Returns:
            The non-empty page texts joined with ``"\\n"``.
        """
        page_texts: List[str] = []
        with open(self.file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                page_texts.append(page.extract_text() or "")
        self._page_texts = page_texts
        # Empty pages are left out of the joined text only.
        return '\n'.join(text for text in page_texts if text)

    def _extract_pdf_tables(self) -> List[Dict[str, Any]]:
        """Extract the tables of the PDF with pdfplumber.

        Returns:
            One dict per non-empty table with the keys ``page_idx``,
            ``table_idx``, ``page_text`` (text cached for that page index, or
            "") and ``data`` (rows of whitespace-normalised cell strings,
            all-empty rows removed). An empty list is returned when
            pdfplumber is not installed or when extraction raises.
        """
        if not HAS_PDF_TABLE:
            logger.warning("未安装pdfplumber，跳过PDF表格提取。可执行: pip install pdfplumber")
            return []

        collected: List[Dict[str, Any]] = []
        try:
            pdfplumber = importlib.import_module("pdfplumber")
            with pdfplumber.open(self.file_path) as pdf:
                for page_idx, page in enumerate(pdf.pages):
                    known_page = page_idx < len(self._page_texts)
                    page_text = self._page_texts[page_idx] if known_page else ""

                    for table_idx, raw_table in enumerate(page.extract_tables() or []):
                        rows: List[List[str]] = []
                        for raw_row in raw_table or []:
                            # None cells become "", whitespace runs collapse to one space.
                            cells = [re.sub(r'\s+', ' ', str(cell or '')).strip() for cell in raw_row]
                            if any(cells):
                                rows.append(cells)

                        if not rows:
                            continue
                        collected.append(
                            {
                                "page_idx": page_idx,
                                "table_idx": table_idx,
                                "page_text": page_text,
                                "data": rows,
                            }
                        )
        except Exception as e:
            # Best effort: table extraction must not break the text pipeline.
            logger.warning(f"PDF表格提取失败，继续纯文本流程: {e}")
            return []

        logger.info(f"PDF表格提取完成，共{len(collected)}个表格")
        return collected

    def _attach_pdf_tables_to_sections(self, tables: List[Dict[str, Any]]) -> None:
        """将提取出的PDF表格挂接到最匹配的章节。"""
        flat_sections = self._flatten_sections(self.sections)
        if not flat_sections:
            return

        last_section: Optional[Section] = None
        for table in tables:
            matched = self._match_table_section(table.get("page_text", ""), flat_sections)
            target = matched or last_section or flat_sections[0]
            target.add_table(table["data"])
            last_section = target

    def _flatten_sections(self, sections: List[Section]) -> List[Section]:
        """按文档顺序拉平章节树。"""
        result: List[Section] = []
        for section in sections:
            result.append(section)
            if section.children:
                result.extend(self._flatten_sections(section.children))
        return result

    def _match_table_section(self, page_text: str, sections: List[Section]) -> Optional[Section]:
        """基于页文本匹配表格归属章节。"""
        normalized_page = re.sub(r"\s+", "", (page_text or "")).lower()
        if not normalized_page:
            return None

        matched: Optional[Section] = None
        matched_score = -1
        for section in sections:
            title = (section.title or "").strip()
            if not title:
                continue

            number = (section.number or "").strip()
            candidates = [title]
            if number:
                candidates.append(f"{number}{title}")
                candidates.append(f"{number} {title}")

            for candidate in candidates:
                normalized_candidate = re.sub(r"\s+", "", candidate).lower()
                if normalized_candidate and normalized_candidate in normalized_page:
                    score = len(normalized_candidate)
                    if score > matched_score:
                        matched = section
                        matched_score = score

        return matched
    
    def _clean_text(self, text: str) -> str:
        """清洗PDF提取的文本"""
        lines = text.split('\n')
        cleaned_lines = []
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            # 跳过页码（通常是1-3位数字单独一行）
            if re.match(r'^\d{1,3}$', line):
                continue
            # 跳过目录行
            if line.count('.') > 10 and '...' in line:
                continue
            
            cleaned_lines.append(line)
        
        return '\n'.join(cleaned_lines)
    
    def _parse_sections(self, text: str) -> List[Section]:
        """Build the section tree from cleaned text.

        Every line accepted by ``_match_section_header`` starts a new section
        whose level is the number of dotted parts in its number. All other
        lines that ``_is_noise`` does not reject are buffered and flushed
        into the current section when the next header (or the end of the
        text) is reached. Lines seen before the first header stay in the
        buffer and therefore end up in the first section's content.

        Returns:
            The top-level sections; empty when no header was recognised.
        """
        top_level: List[Section] = []
        open_by_level = {}
        seen_numbers = set()
        active = None
        pending_lines = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            header = self._match_section_header(line, seen_numbers)
            if not header:
                # Ordinary body line: buffer it unless it is noise.
                if not self._is_noise(line):
                    pending_lines.append(line)
                continue

            number, title = header
            level = len(number.split('.'))

            # Flush buffered body text into the section that is ending.
            if active and pending_lines:
                active.add_content('\n'.join(pending_lines))
                pending_lines = []

            section = Section(level=level, title=title, number=number, uid=self._next_uid())
            seen_numbers.add(number)

            if level == 1:
                top_level.append(section)
                open_by_level = {1: section}
            else:
                # Nearest open ancestor level, walking upwards.
                parent_level = level - 1
                while parent_level >= 1 and parent_level not in open_by_level:
                    parent_level -= 1

                if parent_level >= 1 and parent_level in open_by_level:
                    open_by_level[parent_level].add_child(section)
                elif top_level:
                    top_level[-1].add_child(section)
                else:
                    # No section exists yet: promote this one to the top level.
                    top_level.append(section)
                    open_by_level = {1: section}

            # Register the new section and close every deeper level.
            open_by_level[level] = section
            open_by_level = {
                open_level: open_section
                for open_level, open_section in open_by_level.items()
                if open_level <= level
            }
            active = section

        # Flush whatever is left into the last section.
        if active and pending_lines:
            active.add_content('\n'.join(pending_lines))

        return top_level
    
    def _match_section_header(self, line: str, found_sections: set) -> Optional[Tuple[str, str]]:
        """
        匹配章节标题
        
        Returns:
            (章节编号, 章节标题) 或 None
        """
        # 模式: "3.1功能需求" 或 "3.1 功能需求"
        match = re.match(r'^(\d+(?:\.\d+)*)\s*(.+)$', line)
        if not match:
            return None
        
        number = match.group(1)
        title = match.group(2).strip()
        
        # 排除目录行
        if '...' in title or title.count('.') > 5:
            return None
        
        # 验证章节编号
        parts = number.split('.')
        first_part = int(parts[0])
        
        # 放宽一级章节编号范围（非严格GJB结构）
        if first_part < 1 or first_part > 30:
            return None
        
        # 检查子部分是否合理
        for part in parts[1:]:
            if int(part) > 20:
                return None
        
        # 避免重复
        if number in found_sections:
            return None
        
        # 标题长度检查
        if len(title) > 60 or len(title) < 2:
            return None
        
        # 放宽标题字符要求（兼容部分PDF字体导致中文抽取异常的情况）
        if not re.search(r'[\u4e00-\u9fa5A-Za-z]', title):
            return None
        
        # 检查是否包含无效模式
        for invalid_pattern in self.INVALID_TITLE_PATTERNS:
            if invalid_pattern in title:
                return None
        
        # 标题不能以数字开头
        if title[0].isdigit():
            return None
        
        # 数字比例检查
        digit_ratio = sum(c.isdigit() for c in title) / max(len(title), 1)
        if digit_ratio > 0.3:
            return None
        
        # 检查标题是否包含反斜杠（通常是表格噪声）
        if '\\' in title and '需求' not in title:
            return None
        
        return (number, title)
    
    def _is_noise(self, line: str) -> bool:
        """检查是否是噪声行"""
        # 纯数字行
        if re.match(r'^[\d\s,.]+$', line):
            return True
        # 非常短的行
        if len(line) < 3:
            return True
        # 罗马数字
        if re.match(r'^[ivxIVX]+$', line):
            return True
        return False
    
    def _llm_validate_sections(self, sections: List[Section]) -> List[Section]:
        """使用LLM验证章节是否有效"""
        if not self.llm:
            return sections
        
        validated_sections = []
        
        for section in sections:
            # 验证顶级章节
            if self._is_valid_section_with_llm(section):
                # 递归验证子章节
                section.children = self._validate_children(section.children)
                validated_sections.append(section)
        
        return validated_sections
    
    def _validate_children(self, children: List[Section]) -> List[Section]:
        """递归验证子章节"""
        validated = []
        for child in children:
            if self._is_valid_section_with_llm(child):
                child.children = self._validate_children(child.children)
                validated.append(child)
        return validated
    
    def _is_valid_section_with_llm(self, section: Section) -> bool:
        """使用LLM判断章节是否有效"""
        # 先用规则快速过滤明显无效的章节
        invalid_titles = [
            '本文档可作为', '故障', '实时', '输入/输出',
            '固件', '功能\\', '\\4.', '\\3.'
        ]
        for invalid in invalid_titles:
            if invalid in section.title:
                logger.debug(f"过滤无效章节: {section.number} {section.title}")
                return False
        
        # 对于需求相关章节（第3章），额外验证
        if section.number and section.number.startswith('3'):
            # 检查标题是否看起来像是有效的需求章节标题
            # 有效的标题应该是完整的中文短语
            if '\\' in section.title or '/' in section.title:
                if not any(kw in section.title for kw in ['输入', '输出', '接口']):
                    return False
        
        return True


def create_parser(file_path: str) -> DocumentParser:
    """
    工厂函数：根据文件扩展名创建相应的解析器
    """
    ext = Path(file_path).suffix.lower()
    
    if ext == '.docx':
        return DocxParser(file_path)
    elif ext == '.pdf':
        return PDFParser(file_path)
    else:
        raise ValueError(f"不支持的文件格式: {ext}")