init rep

2026-02-04 14:38:52 +08:00
commit a5147b1429
29 changed files with 4489 additions and 0 deletions
--- a/process_doc_file.py
+++ b/process_doc_file.py
@@ -0,0 +1,371 @@
+import json
+import re
+from docx import Document
+# from docx.oml import CT_OMath
+from docx.table import Table
+from lxml import etree
+
+
+def extract_text_from_paragraph(paragraph):
+    """Return the visible text of a paragraph, including hyperlink text.
+
+    Walks the direct children of ``paragraph._element``. Text is taken from
+    ``t`` elements that sit directly inside ``r`` (run) children, and from
+    ``t`` elements inside the children of ``hyperlink`` elements. The pieces
+    are joined in document order and the result is stripped of leading and
+    trailing whitespace.
+    """
+    def collect_text(container):
+        # Non-empty text of every ``t`` child of the given element.
+        return [
+            node.text
+            for node in container
+            if node.tag.endswith('}t') and node.text
+        ]
+
+    pieces = []
+    for node in paragraph._element:
+        tag = node.tag
+        if tag.endswith('}r'):  # plain text run
+            pieces.extend(collect_text(node))
+        elif tag.endswith('}hyperlink'):  # runs nested inside a hyperlink
+            for nested in node:
+                pieces.extend(collect_text(nested))
+    return ''.join(pieces).strip()
+
+
+def is_image_paragraph(paragraph):
+    """Return True when the paragraph contains picture markup.
+
+    A paragraph counts as an image paragraph when any element in one of its
+    runs, or anywhere under ``paragraph._element``, is a DrawingML ``blip``,
+    a DrawingML ``pic`` or a WordprocessingML ``drawing`` element.
+    """
+    ns_a = 'http://schemas.openxmlformats.org/drawingml/2006/main'
+    ns_pic = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
+    ns_w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
+    image_tags = frozenset((
+        f'{{{ns_a}}}blip',
+        f'{{{ns_pic}}}pic',
+        f'{{{ns_w}}}drawing',
+    ))
+
+    def contains_image(root):
+        # ``iter()`` yields the element itself followed by all descendants.
+        return any(node.tag in image_tags for node in root.iter())
+
+    # Look at each run first, then at the paragraph element as a whole.
+    if any(contains_image(run._element) for run in paragraph.runs):
+        return True
+    return contains_image(paragraph._element)
+
+
+def get_heading_level(paragraph):
+    """Return the heading level of a paragraph, or 0 when it is not a heading.
+
+    The level is determined in this order:
+
+    1. A style name starting with ``Heading`` whose remainder parses as an
+       integer (e.g. ``Heading 2``).
+    2. A style name containing ``标题`` followed somewhere by digits.
+    3. The paragraph text itself, for headings written as numbered text:
+       ``1. xxx`` is level 1, ``1.1 xxx`` is level 2, ``1.1.1 xxx`` is level 3.
+    """
+    style = paragraph.style
+    style_name = style.name if style else ""
+
+    # English built-in heading styles.
+    if style_name.startswith('Heading'):
+        suffix = style_name.replace('Heading', '').strip()
+        try:
+            return int(suffix)
+        except ValueError:
+            pass  # e.g. a custom style that merely starts with "Heading"
+
+    # Chinese heading styles: take the first number found in the style name.
+    if '标题' in style_name:
+        digits = re.search(r'(\d+)', style_name)
+        if digits:
+            return int(digits.group(1))
+
+    # Fall back to the text, for headings typed as numbered list items.
+    text = extract_text_from_paragraph(paragraph)
+    if not text:
+        return 0
+
+    numbered_patterns = (
+        (r'^[1-9]\d*\.\s+\S', 1),                      # "1. xxx"
+        (r'^[1-9]\d*\.[1-9]\d*\s+\S', 2),              # "1.1 xxx"
+        (r'^[1-9]\d*\.[1-9]\d*\.[1-9]\d*\s+\S', 3),    # "1.1.1 xxx"
+    )
+    for pattern, level in numbered_patterns:
+        if re.match(pattern, text):
+            return level
+    return 0
+
+
+def parse_heading_text(text):
+    """Split heading text into its numbering and its title.
+
+    Recognises a leading dotted number such as ``1.``, ``1.1`` or ``1.1.1``.
+    Returns a dict with ``number`` (without a trailing dot, or an empty string
+    when the text has no leading number) and ``title`` (stripped).
+    """
+    numbered = re.match(r'^([1-9]\d*(?:\.[1-9]\d*)*\.?)\s*(.+)$', text)
+    if numbered is None:
+        return {'number': '', 'title': text.strip()}
+
+    number, title = numbered.groups()
+    return {'number': number.rstrip('.'), 'title': title.strip()}
+
+
+def extract_table_data(table):
+    """Return the table as a list of rows, each a list of stripped cell texts."""
+    return [
+        [cell.text.strip() for cell in row.cells]
+        for row in table.rows
+    ]
+
+
+def parse_document(doc_path):
+    """Parse a Word document into a nested dictionary of sections.
+
+    Args:
+        doc_path: Path of the .docx file, passed straight to ``Document``.
+
+    Returns:
+        A dict with the keys ``document_title``, ``version``, ``date``,
+        ``table_of_contents`` and ``sections``. Each section is a dict with
+        ``number``, ``title``, ``content`` and ``subsections``; each
+        subsection has ``number``, ``title`` and ``content``. Content items
+        are dicts whose ``type`` is ``text``, ``image`` or ``image_reference``.
+
+    Raises:
+        Exception: If the document cannot be opened; the original error is
+            attached as the cause.
+
+    Notes:
+        * Only ``doc.paragraphs`` is visited, so tables are not part of the
+          output, and ``table_of_contents`` is always returned empty.
+        * Headings of level 3 and deeper are stored as ordinary text items.
+        * A level-2 heading that appears before any level-1 heading has no
+          parent section; it is discarded when the next level-1 heading (or
+          the end of the document) is reached.
+    """
+    try:
+        doc = Document(doc_path)
+    except Exception as e:
+        # Same exception type and message as before, now with the cause chained.
+        raise Exception(f"无法打开文档: {e}") from e
+
+    result = {
+        'document_title': '',
+        'version': '',
+        'date': '',
+        'table_of_contents': [],
+        'sections': []
+    }
+
+    current_section = None      # open level-1 section
+    current_subsection = None   # open level-2 subsection
+    toc_section = False         # True after a "目录" paragraph has been seen
+
+    def add_content(item):
+        # Append to the innermost open container; drop the item when neither
+        # a subsection nor a section is open yet.
+        if current_subsection is not None:
+            current_subsection['content'].append(item)
+        elif current_section is not None:
+            current_section['content'].append(item)
+
+    paragraphs = doc.paragraphs
+
+    # Metadata (title, version, date) is looked up in the first 10 paragraphs.
+    for para in paragraphs[:10]:
+        text = extract_text_from_paragraph(para)
+        if not text:
+            continue
+
+        if 'VDES' in text and '使用说明书' in text and not result['document_title']:
+            result['document_title'] = text
+        elif text.startswith('版本'):
+            result['version'] = text.replace('版本：', '').replace('版本:', '').strip()
+        elif '日期' in text or re.match(r'.*\d{4}\s*年', text):
+            result['date'] = text.replace('日期：', '').replace('日期:', '').strip()
+        elif text == '目录':
+            toc_section = True
+
+    # Everything before the first real chapter heading is skipped.
+    skip_until_content = True
+    content_keywords = ('船舶', '卫星', '气象', '辅助', '运维')
+
+    for para in paragraphs:
+        text = extract_text_from_paragraph(para)
+
+        # Paragraphs without text are either image placeholders or noise.
+        if not text:
+            if is_image_paragraph(para):
+                add_content({
+                    'type': 'image',
+                    'description': '[图片]'
+                })
+            continue
+
+        # The table-of-contents caption itself is never emitted.
+        if text == '目录':
+            toc_section = True
+            continue
+
+        if toc_section:
+            # The table of contents ends at the first chapter-like heading
+            # that mentions one of the known chapter keywords.
+            if re.match(r'^[1-9]\.\s+', text) or get_heading_level(para) == 1:
+                if any(keyword in text for keyword in content_keywords):
+                    toc_section = False
+                    skip_until_content = False
+
+        heading_level = None
+        if skip_until_content:
+            # Front-matter lines already captured as metadata.
+            if result['document_title'] and text == result['document_title']:
+                continue
+            if '版本' in text or '日期' in text or text == '目录':
+                continue
+            # The first level-1 heading marks the start of the real content.
+            heading_level = get_heading_level(para)
+            if heading_level == 1 or re.match(r'^1\.\s+船舶', text):
+                skip_until_content = False
+                toc_section = False
+            else:
+                continue
+
+        if heading_level is None:
+            heading_level = get_heading_level(para)
+
+        if heading_level == 1:
+            # Level-1 heading: close whatever is open and start a new section.
+            parsed = parse_heading_text(text)
+
+            if current_section is not None:
+                if current_subsection is not None:
+                    current_section['subsections'].append(current_subsection)
+                result['sections'].append(current_section)
+
+            # Always drop the open subsection, even when it had no parent
+            # section; otherwise it would swallow the new section's content
+            # and later be attached to the wrong section.
+            current_subsection = None
+
+            current_section = {
+                'number': parsed['number'],
+                'title': parsed['title'],
+                'content': [],
+                'subsections': []
+            }
+
+        elif heading_level == 2:
+            # Level-2 heading: close the previous subsection, start a new one.
+            parsed = parse_heading_text(text)
+
+            if current_subsection is not None and current_section is not None:
+                current_section['subsections'].append(current_subsection)
+
+            current_subsection = {
+                'number': parsed['number'],
+                'title': parsed['title'],
+                'content': []
+            }
+
+        else:
+            # Ordinary paragraph. A paragraph consisting solely of a UUID is
+            # recorded as a reference to an image rather than as text.
+            if re.match(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$', text, re.IGNORECASE):
+                content_item = {
+                    'type': 'image_reference',
+                    'reference_id': text
+                }
+            else:
+                content_item = {
+                    'type': 'text',
+                    'content': text
+                }
+            add_content(content_item)
+
+    # Flush the last open subsection and section.
+    if current_subsection is not None and current_section is not None:
+        current_section['subsections'].append(current_subsection)
+    if current_section is not None:
+        result['sections'].append(current_section)
+
+    return result
+
+
+def clean_json_output(data):
+    """Recursively tidy parsed data before it is serialised to JSON.
+
+    * Strings are stripped; a string that is empty after stripping becomes
+      ``None``.
+    * Dict entries whose cleaned value is ``None`` are removed. Empty lists
+      and dicts are kept as dict values so the structure stays intact.
+    * List items that are ``None``, an empty string, an empty list or an
+      empty dict are removed, as are items that clean to ``None`` (for
+      example whitespace-only strings). Other falsy scalars such as ``0`` or
+      ``False`` are real data and are kept.
+    * Any other value is returned unchanged.
+    """
+    if isinstance(data, dict):
+        cleaned = {}
+        for key, value in data.items():
+            cleaned_value = clean_json_output(value)
+            if cleaned_value is not None:
+                cleaned[key] = cleaned_value
+        return cleaned
+
+    if isinstance(data, list):
+        cleaned_items = []
+        for item in data:
+            # Skip items that are empty to begin with.
+            if item is None or (isinstance(item, (str, list, dict)) and not item):
+                continue
+            cleaned_item = clean_json_output(item)
+            # Skip items that only turned out empty after cleaning, so that
+            # no ``null`` entries leak into the output list.
+            if cleaned_item is not None:
+                cleaned_items.append(cleaned_item)
+        return cleaned_items
+
+    if isinstance(data, str):
+        return data.strip() or None
+
+    return data
+
+
+def convert_docx_to_json(input_path, output_path=None, indent=2, ensure_ascii=False):
+    """Convert a Word document to cleaned, JSON-ready data.
+
+    Args:
+        input_path: Path of the Word document to parse.
+        output_path: Optional path of the JSON file to write. When falsy,
+            nothing is written.
+        indent: Number of spaces used to indent the JSON output.
+        ensure_ascii: Whether non-ASCII characters are escaped in the output.
+
+    Returns:
+        The parsed and cleaned data as a dictionary.
+    """
+    cleaned_data = clean_json_output(parse_document(input_path))
+
+    if not output_path:
+        return cleaned_data
+
+    # Persist the result and tell the user where it went.
+    with open(output_path, 'w', encoding='utf-8') as json_file:
+        json.dump(cleaned_data, json_file, ensure_ascii=ensure_ascii, indent=indent)
+    print(f"JSON已保存至: {output_path}")
+    return cleaned_data
+
+
+def main():
+    """Command-line entry point.
+
+    Usage: ``process_doc_file.py [input.docx [output.json]]``. Both arguments
+    are optional and fall back to the built-in default file names. Exits with
+    status 1 when the input file is missing or the conversion fails.
+    """
+    import os
+    import sys
+
+    # Positional arguments override the default input/output paths.
+    argv = sys.argv
+    input_file = argv[1] if len(argv) >= 2 else "VDES软件使用说明书.docx"
+    output_file = argv[2] if len(argv) >= 3 else "VDES软件使用说明书.json"
+
+    if not os.path.exists(input_file):
+        print(f"错误: 找不到文件 '{input_file}'")
+        sys.exit(1)
+
+    try:
+        result = convert_docx_to_json(input_file, output_file)
+
+        # Short preview of what was extracted.
+        print("\n=== 转换结果预览 ===")
+        summary_fields = (
+            ("文档标题", 'document_title'),
+            ("版本", 'version'),
+            ("日期", 'date'),
+        )
+        for label, key in summary_fields:
+            print(f"{label}: {result.get(key, 'N/A')}")
+        sections = result.get('sections', [])
+        print(f"章节数量: {len(sections)}")
+
+        # Outline: one line per section, indented lines for its subsections.
+        for section in sections:
+            print(f"\n  {section.get('number', '')} {section.get('title', '')}")
+            for subsection in section.get('subsections', []):
+                print(f"    {subsection.get('number', '')} {subsection.get('title', '')}")
+
+        print("\n转换完成!")
+
+    except Exception as e:
+        # Top-level boundary: report the failure with a traceback and exit.
+        print(f"转换失败: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()