# test_item_gen/process_doc_file.py

import json
import re
from docx import Document
# from docx.oml import CT_OMath
from docx.table import Table
from lxml import etree


def extract_text_from_paragraph(paragraph):
    """Return the concatenated text of a paragraph's runs and hyperlinks.

    Only the direct children of ``paragraph._element`` are examined. Text
    nodes (tags ending in ``}t``) are collected from run children (tags
    ending in ``}r``) and from the grandchildren of hyperlink children
    (tags ending in ``}hyperlink``). Every other child is ignored. The
    joined text is returned with surrounding whitespace stripped.
    """
    def text_of(container):
        # Non-empty text of the ``t`` elements directly inside ``container``.
        return [
            node.text
            for node in container
            if node.tag.endswith('}t') and node.text
        ]

    pieces = []
    for node in paragraph._element:
        tag = node.tag
        if tag.endswith('}r'):
            # Plain text run.
            pieces.extend(text_of(node))
        elif tag.endswith('}hyperlink'):
            # A hyperlink wraps its own runs, one level deeper.
            for inner in node:
                pieces.extend(text_of(inner))
    return ''.join(pieces).strip()


def is_image_paragraph(paragraph):
    """Report whether the paragraph contains an image-related element.

    An element counts as an image when its tag is a DrawingML ``blip``, a
    DrawingML picture ``pic`` or a WordprocessingML ``drawing``. Each run's
    element tree is searched first, then the paragraph's own element tree.

    Returns:
        ``True`` if any such element is found, otherwise ``False``.
    """
    image_tags = {
        '{http://schemas.openxmlformats.org/drawingml/2006/main}blip',
        '{http://schemas.openxmlformats.org/drawingml/2006/picture}pic',
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing',
    }

    def contains_image(root):
        # Walk the subtree with an explicit stack; visiting order does not
        # matter because only the existence of a match is reported.
        pending = [root]
        while pending:
            node = pending.pop()
            if node.tag in image_tags:
                return True
            pending.extend(node)
        return False

    if any(contains_image(run._element) for run in paragraph.runs):
        return True
    # Also search from the paragraph element itself.
    return contains_image(paragraph._element)


def get_heading_level(paragraph):
    """Return the heading level of a paragraph, or 0 if it is not a heading.

    The level is resolved in this order:

    1. A style name starting with ``Heading`` whose remainder parses as an
       integer (for example ``Heading 2``) yields that integer.
    2. A style name containing ``标题`` yields the first run of digits found
       in the style name.
    3. Otherwise the paragraph text is inspected for a leading outline
       number: ``1. xxx`` is level 1, ``1.1 xxx`` is level 2 and
       ``1.1.1 xxx`` is level 3.

    Empty text, or text matching none of the patterns, yields 0.
    """
    style = paragraph.style
    style_name = style.name if style else ""

    # English built-in heading styles.
    if style_name.startswith('Heading'):
        suffix = style_name.replace('Heading', '').strip()
        try:
            return int(suffix)
        except ValueError:
            # Not a plain "Heading N" name; fall through to the other checks.
            pass

    # Chinese heading styles.
    if '标题' in style_name:
        digits = re.search(r'(\d+)', style_name)
        if digits:
            return int(digits.group(1))

    # Fall back to the text, for headings written as numbered list items.
    text = extract_text_from_paragraph(paragraph)
    if not text:
        return 0

    numbered_patterns = (
        (r'^[1-9]\d*\.\s+\S', 1),                      # "1. xxx"
        (r'^[1-9]\d*\.[1-9]\d*\s+\S', 2),              # "1.1 xxx"
        (r'^[1-9]\d*\.[1-9]\d*\.[1-9]\d*\s+\S', 3),    # "1.1.1 xxx"
    )
    for pattern, level in numbered_patterns:
        if re.match(pattern, text):
            return level
    return 0


def parse_heading_text(text):
    """Split a heading into its outline number and its title.

    Recognised forms are ``"1. Title"``, ``"1.1 Title"``, ``"1.1.1 Title"``
    and the same without whitespace between number and title.

    Args:
        text: The heading text.

    Returns:
        A dict with ``number`` (the outline number without a trailing dot,
        or ``''`` when none is recognised) and ``title`` (the stripped
        remainder, or the whole stripped text when no number is recognised).
        Text that consists of a number only, such as ``"12"`` or ``"1.1"``,
        has no title part and is therefore returned entirely as the title.
    """
    number_match = re.match(r'[1-9]\d*(?:\.[1-9]\d*)*\.?', text)
    if number_match:
        # The title is matched in a second step on purpose: with a single
        # pattern the regex engine would backtrack into the number to find a
        # title, turning "12" into number "1" / title "2".
        title_match = re.match(r'\s*(.+)$', text[number_match.end():])
        if title_match:
            return {
                'number': number_match.group(0).rstrip('.'),
                'title': title_match.group(1).strip(),
            }
    return {
        'number': '',
        'title': text.strip(),
    }


def extract_table_data(table):
    """Return the table's cell texts as a list of rows.

    Each row of ``table.rows`` becomes a list holding the stripped ``text``
    of every cell in ``row.cells``, in order.
    """
    return [
        [cell.text.strip() for cell in row.cells]
        for row in table.rows
    ]


def parse_document(doc_path):
    """Parse a Word document into a dict of metadata and nested sections.

    The first ten paragraphs are scanned for the document title, version and
    date. The body is then walked paragraph by paragraph. Paragraphs are
    skipped until the start of the body is detected (normally the first
    level-1 heading). After that, level-1 headings open a section, level-2
    headings open a subsection, and every other non-empty paragraph is stored
    as content of the innermost open subsection or section. Empty paragraphs
    that contain an image are stored as image placeholders.

    Args:
        doc_path: Passed unchanged to ``Document``.

    Returns:
        A dict with the keys ``document_title``, ``version``, ``date``,
        ``table_of_contents`` (never filled by this function) and
        ``sections``. Each section holds ``number``, ``title``, ``content``
        and ``subsections``; each subsection holds ``number``, ``title`` and
        ``content``. A subsection that appears before any section has no
        parent and is discarded.

    Raises:
        Exception: If the document cannot be opened. The underlying error is
            chained as ``__cause__``.
    """
    try:
        doc = Document(doc_path)
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback.
        raise Exception(f"无法打开文档: {e}") from e

    result = {
        'document_title': '',
        'version': '',
        'date': '',
        'table_of_contents': [],
        'sections': []
    }

    # Read the paragraph list once and reuse it for both passes.
    paragraphs = doc.paragraphs

    current_section = None      # open level-1 section
    current_subsection = None   # open level-2 subsection
    toc_section = False

    def add_content(item):
        # Attach ``item`` to the innermost open container; drop it if no
        # section has been opened yet.
        if current_subsection is not None:
            current_subsection['content'].append(item)
        elif current_section is not None:
            current_section['content'].append(item)

    # Pass 1: metadata (title, version, date) from the first ten paragraphs.
    for para in paragraphs[:10]:
        text = extract_text_from_paragraph(para)
        if not text:
            continue

        if 'VDES' in text and '使用说明书' in text and not result['document_title']:
            result['document_title'] = text
        elif text.startswith('版本'):
            result['version'] = text.replace('版本：', '').replace('版本:', '').strip()
        elif '日期' in text or re.match(r'.*\d{4}\s*年', text):
            result['date'] = text.replace('日期：', '').replace('日期:', '').strip()
        elif text == '目录':
            # This flag deliberately carries over into the second pass.
            toc_section = True

    # Pass 2: body.
    skip_until_content = True
    body_keywords = ('船舶', '卫星', '气象', '辅助', '运维')

    for para in paragraphs:
        text = extract_text_from_paragraph(para)

        # Empty paragraphs are skipped, but an image is recorded first.
        if not text:
            if is_image_paragraph(para):
                add_content({
                    'type': 'image',
                    'description': '[图片]'
                })
            continue

        # The table-of-contents title itself is never content.
        if text == '目录':
            toc_section = True
            continue

        # Compute the level once; it is needed by every check below.
        heading_level = get_heading_level(para)

        if toc_section:
            # A numbered or level-1 heading that mentions one of the known
            # chapter keywords marks the end of the table of contents.
            if re.match(r'^[1-9]\.\s+', text) or heading_level == 1:
                if any(keyword in text for keyword in body_keywords):
                    toc_section = False
                    skip_until_content = False

        # Skip the front matter that precedes the body.
        if skip_until_content:
            if result['document_title'] and text == result['document_title']:
                continue
            if '版本' in text or '日期' in text:
                continue
            if heading_level == 1 or re.match(r'^1\.\s+船舶', text):
                skip_until_content = False
                toc_section = False
            else:
                continue

        if heading_level == 1:
            # Level-1 heading: close whatever is open and start a new section.
            parsed = parse_heading_text(text)

            if current_section is not None:
                if current_subsection is not None:
                    current_section['subsections'].append(current_subsection)
                result['sections'].append(current_section)

            # Reset unconditionally: a subsection opened before any section
            # must not capture the new section's content or be attached to it.
            current_subsection = None

            current_section = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': [],
                'subsections': []
            }

        elif heading_level == 2:
            # Level-2 heading: close the open subsection and start a new one.
            parsed = parse_heading_text(text)

            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)

            current_subsection = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': []
            }

        else:
            # Ordinary paragraph; a bare UUID is treated as an image reference.
            if re.match(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$', text, re.IGNORECASE):
                content_item = {
                    'type': 'image_reference',
                    'reference_id': text
                }
            else:
                content_item = {
                    'type': 'text',
                    'content': text
                }
            add_content(content_item)

    # Close the last open subsection and section.
    if current_subsection is not None and current_section is not None:
        current_section['subsections'].append(current_subsection)
    if current_section is not None:
        result['sections'].append(current_section)

    return result


def clean_json_output(data):
    """Recursively strip strings and drop empty content from parsed data.

    Rules by type:

    * ``dict``: each value is cleaned; keys whose cleaned value is ``None``
      are dropped. Empty lists and dicts are kept so the structure stays
      intact.
    * ``list``: falsy items are dropped, the rest are cleaned, and items
      that clean to ``None`` (for example whitespace-only strings) are
      dropped as well.
    * ``str``: returned stripped, or ``None`` when nothing remains.
    * anything else: returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned_value = clean_json_output(value)
            if cleaned_value is not None:
                cleaned[key] = cleaned_value
        return cleaned
    if isinstance(data, list):
        cleaned_items = (clean_json_output(item) for item in data if item)
        # Filter again after cleaning so that whitespace-only strings, which
        # are truthy but clean to None, do not leave None in the list.
        return [item for item in cleaned_items if item is not None]
    if isinstance(data, str):
        stripped = data.strip()
        return stripped or None
    return data


def convert_docx_to_json(input_path, output_path=None, indent=2, ensure_ascii=False):
    """Convert a Word document to cleaned, JSON-serialisable data.

    The document is parsed with ``parse_document`` and the result is passed
    through ``clean_json_output``. When ``output_path`` is given, the data is
    also written there as UTF-8 JSON and a confirmation line is printed.

    Args:
        input_path: Path of the Word document.
        output_path: Path of the JSON file to write (optional).
        indent: Number of spaces used for JSON indentation.
        ensure_ascii: Whether non-ASCII characters are escaped in the output.

    Returns:
        The cleaned, parsed data.
    """
    cleaned_data = clean_json_output(parse_document(input_path))

    # Nothing to write: hand the data straight back.
    if not output_path:
        return cleaned_data

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(cleaned_data, out_file, ensure_ascii=ensure_ascii, indent=indent)
    print(f"JSON已保存至: {output_path}")

    return cleaned_data


def main():
    """Command-line entry point.

    The first command-line argument, if present, is the input document and
    the second is the output JSON file; both have built-in defaults. The
    process exits with status 1 when the input file does not exist or the
    conversion fails. On success a short preview of the result is printed.
    """
    import os
    import sys

    # Positional arguments override the default input and output paths.
    cli_args = sys.argv[1:]
    input_file = cli_args[0] if cli_args else "VDES软件使用说明书.docx"
    output_file = cli_args[1] if len(cli_args) > 1 else "VDES软件使用说明书.json"

    if not os.path.exists(input_file):
        print(f"错误: 找不到文件 '{input_file}'")
        sys.exit(1)

    try:
        result = convert_docx_to_json(input_file, output_file)

        # Preview: metadata first, then the section outline.
        print("\n=== 转换结果预览 ===")
        print(f"文档标题: {result.get('document_title', 'N/A')}")
        print(f"版本: {result.get('version', 'N/A')}")
        print(f"日期: {result.get('date', 'N/A')}")
        sections = result.get('sections', [])
        print(f"章节数量: {len(sections)}")

        for section in sections:
            print(f"\n  {section.get('number', '')} {section.get('title', '')}")
            for subsection in section.get('subsections', []):
                print(f"    {subsection.get('number', '')} {subsection.get('title', '')}")

        print("\n转换完成!")

    except Exception as e:
        import traceback

        print(f"转换失败: {e}")
        traceback.print_exc()
        sys.exit(1)


# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()