"""Convert a Word (.docx) user manual into a structured JSON document.

The parser walks the paragraphs of the document, detects first- and
second-level headings (by style name or by a leading "1." / "1.1" number),
and groups the remaining paragraphs and images under the heading they
follow.
"""

import json
import os
import re
import sys
import traceback

from docx import Document
# from docx.oml import CT_OMath
from docx.table import Table
from lxml import etree

# Namespace URIs of the OOXML elements that indicate an embedded picture.
NS_A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
NS_PIC = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
NS_W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

_IMAGE_TAGS = frozenset({
    f'{{{NS_A}}}blip',
    f'{{{NS_PIC}}}pic',
    f'{{{NS_W}}}drawing',
})

# Heading detection by leading number: "1. Title", "1.1 Title", "1.1.1 Title".
_LEVEL1_RE = re.compile(r'^[1-9]\d*\.\s+\S')
_LEVEL2_RE = re.compile(r'^[1-9]\d*\.[1-9]\d*\s+\S')
_LEVEL3_RE = re.compile(r'^[1-9]\d*\.[1-9]\d*\.[1-9]\d*\s+\S')

_STYLE_DIGITS_RE = re.compile(r'(\d+)')
_HEADING_TEXT_RE = re.compile(r'^([1-9]\d*(?:\.[1-9]\d*)*\.?)\s*(.+)$')
_DATE_RE = re.compile(r'.*\d{4}\s*年')
_TOC_END_RE = re.compile(r'^[1-9]\.\s+')
_FIRST_CHAPTER_RE = re.compile(r'^1\.\s+船舶')
_UUID_RE = re.compile(
    r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$',
    re.IGNORECASE,
)

# A numbered line only ends the table of contents if it names a real chapter.
_CHAPTER_KEYWORDS = ('船舶', '卫星', '气象', '辅助', '运维')


def _tag_endswith(element, suffix):
    """Return True if *element* has a string tag ending with *suffix*.

    Comment and processing-instruction nodes carry a non-string ``tag``,
    so they are reported as non-matching instead of raising.
    """
    tag = element.tag
    return isinstance(tag, str) and tag.endswith(suffix)


def _run_text_parts(run_element):
    """Yield the non-empty text of every ``t`` child of *run_element*."""
    for node in run_element:
        if _tag_endswith(node, '}t') and node.text:
            yield node.text


def extract_text_from_paragraph(paragraph) -> str:
    """Return the visible text of a paragraph, including hyperlink text.

    Only the direct children of the paragraph element are inspected: text
    runs (``r``) and hyperlinks (``hyperlink``), whose children are treated
    as runs.

    Args:
        paragraph: An object exposing the paragraph XML as ``_element``.

    Returns:
        The concatenated text with surrounding whitespace stripped.
    """
    text_parts = []
    for child in paragraph._element:
        if _tag_endswith(child, '}r'):
            text_parts.extend(_run_text_parts(child))
        elif _tag_endswith(child, '}hyperlink'):
            for run in child:
                text_parts.extend(_run_text_parts(run))
    return ''.join(text_parts).strip()


def is_image_paragraph(paragraph) -> bool:
    """Return True if the paragraph contains a picture-related element.

    The paragraph element and all of its descendants are searched for a
    DrawingML ``blip``/``pic`` element or a WordprocessingML ``drawing``
    element. Runs are descendants of the paragraph element, so a single
    walk covers them as well.
    """
    return any(node.tag in _IMAGE_TAGS for node in paragraph._element.iter())


def get_heading_level(paragraph) -> int:
    """Return the heading level of a paragraph.

    The style name is checked first ("Heading N", or a name containing
    "标题" followed by a number). If the style gives no answer, the text is
    matched against the "1. ", "1.1 " and "1.1.1 " numbering patterns.

    Returns:
        0 for a non-heading paragraph, otherwise the detected level.
    """
    # ``style.name`` may be missing; treat that the same as "no style".
    style_name = (paragraph.style.name if paragraph.style else "") or ""

    # English built-in heading styles, e.g. "Heading 1".
    if style_name.startswith('Heading'):
        try:
            return int(style_name.replace('Heading', '').strip())
        except ValueError:
            # e.g. "Heading 1 Char": fall through to the other checks.
            pass

    # Chinese heading styles, e.g. "标题 1".
    if '标题' in style_name:
        match = _STYLE_DIGITS_RE.search(style_name)
        if match:
            return int(match.group(1))

    # Fall back to the text itself (headings written as numbered lines).
    text = extract_text_from_paragraph(paragraph)
    if not text:
        return 0
    if _LEVEL1_RE.match(text):
        return 1
    if _LEVEL2_RE.match(text):
        return 2
    if _LEVEL3_RE.match(text):
        return 3
    return 0


def parse_heading_text(text: str) -> dict:
    """Split a heading into its number and its title.

    Handles "1. Title", "1.1 Title" and "1.1.1 Title".

    Returns:
        A dict with keys ``number`` (without a trailing dot; empty when the
        text has no leading number) and ``title``.
    """
    match = _HEADING_TEXT_RE.match(text)
    if match:
        return {
            'number': match.group(1).rstrip('.'),
            'title': match.group(2).strip(),
        }
    return {
        'number': '',
        'title': text.strip(),
    }


def extract_table_data(table) -> list:
    """Return the table as a list of rows, each a list of stripped cell texts."""
    return [[cell.text.strip() for cell in row.cells] for row in table.rows]


def _extract_metadata(paragraphs, result) -> bool:
    """Fill title, version and date in *result* from the given paragraphs.

    Returns:
        True if a table-of-contents marker ("目录") was seen.
    """
    toc_seen = False
    for para in paragraphs:
        text = extract_text_from_paragraph(para)
        if not text:
            continue
        if 'VDES' in text and '使用说明书' in text and not result['document_title']:
            result['document_title'] = text
        elif text.startswith('版本'):
            result['version'] = text.replace('版本:', '').replace('版本:', '').strip()
        elif '日期' in text or _DATE_RE.match(text):
            result['date'] = text.replace('日期:', '').replace('日期:', '').strip()
        elif text == '目录':
            toc_seen = True
    return toc_seen


def _append_content(section, subsection, item) -> None:
    """Append *item* to the current subsection, else to the current section."""
    target = subsection if subsection is not None else section
    if target is not None:
        target['content'].append(item)


def parse_document(doc_path) -> dict:
    """Parse a Word document into a structured dict.

    Args:
        doc_path: Path of the .docx file.

    Returns:
        A dict with keys ``document_title``, ``version``, ``date``,
        ``table_of_contents`` (always left empty by this function) and
        ``sections``. Each section has ``number``, ``title``, ``content``
        and ``subsections``; each subsection has ``number``, ``title`` and
        ``content``.

    Raises:
        Exception: If the document cannot be opened; the underlying error
            is chained as the cause.
    """
    try:
        doc = Document(doc_path)
    except Exception as e:
        raise Exception(f"无法打开文档: {e}") from e

    result = {
        'document_title': '',
        'version': '',
        'date': '',
        'table_of_contents': [],
        'sections': [],
    }

    # ``doc.paragraphs`` is fetched once and reused for both passes.
    paragraphs = doc.paragraphs

    # Metadata (title, version, date) is looked up in the first paragraphs.
    toc_section = _extract_metadata(paragraphs[:10], result)

    current_section = None     # current first-level chapter
    current_subsection = None  # current second-level chapter
    skip_until_content = True  # True while still in the front matter

    for para in paragraphs:
        text = extract_text_from_paragraph(para)

        # Paragraphs without text are skipped; a picture is recorded first.
        if not text:
            if is_image_paragraph(para):
                _append_content(current_section, current_subsection, {
                    'type': 'image',
                    'description': '[图片]',
                })
            continue

        # Start of the table of contents.
        if text == '目录':
            toc_section = True
            continue

        if toc_section:
            # The table of contents ends at the first real chapter heading.
            if _TOC_END_RE.match(text) or get_heading_level(para) == 1:
                if any(keyword in text for keyword in _CHAPTER_KEYWORDS):
                    toc_section = False
                    skip_until_content = False

        if skip_until_content:
            # Skip the front matter (title, version, date, TOC marker).
            if result['document_title'] and text == result['document_title']:
                continue
            if '版本' in text or '日期' in text or text == '目录':
                continue
            # The body starts at the first level-1 heading.
            heading_level = get_heading_level(para)
            if heading_level != 1 and not _FIRST_CHAPTER_RE.match(text):
                continue
            skip_until_content = False
            toc_section = False
        else:
            heading_level = get_heading_level(para)

        if heading_level == 1:
            # First-level heading: close the open subsection and section.
            parsed = parse_heading_text(text)
            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            current_subsection = None
            if current_section is not None:
                result['sections'].append(current_section)
            current_section = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': [],
                'subsections': [],
            }
        elif heading_level == 2:
            # Second-level heading: close the open subsection.
            parsed = parse_heading_text(text)
            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            current_subsection = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': [],
            }
        else:
            # Ordinary paragraph; a bare UUID is kept as an image reference.
            if _UUID_RE.match(text):
                content_item = {
                    'type': 'image_reference',
                    'reference_id': text,
                }
            else:
                content_item = {
                    'type': 'text',
                    'content': text,
                }
            _append_content(current_section, current_subsection, content_item)

    # Flush whatever is still open at the end of the document.
    if current_subsection is not None and current_section is not None:
        current_section['subsections'].append(current_subsection)
    if current_section is not None:
        result['sections'].append(current_section)

    return result


def clean_json_output(data):
    """Recursively strip strings and remove empty content.

    * dict: values are cleaned; keys whose cleaned value is ``None`` are
      dropped. Empty lists are kept so the structure stays complete.
    * list: falsy items are dropped, the rest are cleaned, and items that
      clean to ``None`` (e.g. whitespace-only strings) are dropped too.
    * str: stripped; an empty result becomes ``None``.
    * anything else is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned_value = clean_json_output(value)
            if cleaned_value is not None:
                cleaned[key] = cleaned_value
        return cleaned
    if isinstance(data, list):
        cleaned_items = (clean_json_output(item) for item in data if item)
        # A truthy item such as "  " cleans to None and must not be kept.
        return [item for item in cleaned_items if item is not None]
    if isinstance(data, str):
        stripped = data.strip()
        return stripped if stripped else None
    return data


def convert_docx_to_json(input_path, output_path=None, indent=2, ensure_ascii=False):
    """Convert a Word document to JSON.

    Args:
        input_path: Path of the Word document.
        output_path: Path of the JSON file to write (optional). Nothing is
            written when it is falsy.
        indent: Number of spaces used for JSON indentation.
        ensure_ascii: Whether non-ASCII characters are escaped.

    Returns:
        The parsed and cleaned data as a dict.
    """
    parsed_data = parse_document(input_path)
    cleaned_data = clean_json_output(parsed_data)

    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, ensure_ascii=ensure_ascii, indent=indent)
        print(f"JSON已保存至: {output_path}")

    return cleaned_data


def main():
    """Command-line entry point: ``script.py [input.docx [output.json]]``."""
    # Default input and output paths.
    input_file = "VDES软件使用说明书.docx"
    output_file = "VDES软件使用说明书.json"

    # Optional command-line overrides.
    if len(sys.argv) >= 2:
        input_file = sys.argv[1]
    if len(sys.argv) >= 3:
        output_file = sys.argv[2]

    if not os.path.exists(input_file):
        print(f"错误: 找不到文件 '{input_file}'")
        sys.exit(1)

    try:
        result = convert_docx_to_json(input_file, output_file)

        # Print a short preview of the result.
        print("\n=== 转换结果预览 ===")
        print(f"文档标题: {result.get('document_title', 'N/A')}")
        print(f"版本: {result.get('version', 'N/A')}")
        print(f"日期: {result.get('date', 'N/A')}")
        print(f"章节数量: {len(result.get('sections', []))}")
        for section in result.get('sections', []):
            print(f"\n {section.get('number', '')} {section.get('title', '')}")
            for subsection in section.get('subsections', []):
                print(f" {subsection.get('number', '')} {subsection.get('title', '')}")
        print("\n转换完成!")
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"转换失败: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()