371 lines
12 KiB
Python
371 lines
12 KiB
Python
import json
|
||
import re
|
||
from docx import Document
|
||
# from docx.oml import CT_OMath
|
||
from docx.table import Table
|
||
from lxml import etree
|
||
|
||
|
||
def extract_text_from_paragraph(paragraph):
    """Return the text of a paragraph, including text inside hyperlinks.

    Only the direct children of ``paragraph._element`` are inspected: for a
    run (tag ending in ``}r``) the text of its ``}t`` children is collected,
    and for a hyperlink (tag ending in ``}hyperlink``) the same is done for
    each of the hyperlink's children. Everything else is ignored.

    Args:
        paragraph: Object exposing its XML element as ``_element``
            (for example a python-docx ``Paragraph``).

    Returns:
        The collected text joined without separators and stripped of
        leading/trailing whitespace; ``''`` when nothing was collected.
    """

    def _tag_ends_with(node, suffix):
        # XML comment / processing-instruction nodes carry a callable, not a
        # string, as ``tag``; treat them as non-matching instead of raising.
        tag = node.tag
        return isinstance(tag, str) and tag.endswith(suffix)

    def _text_pieces(run):
        # Non-empty text of every ``t`` element directly inside ``run``.
        return [
            sub.text
            for sub in run
            if _tag_ends_with(sub, '}t') and sub.text
        ]

    text_parts = []
    for child in paragraph._element:
        if _tag_ends_with(child, '}r'):  # text run
            text_parts.extend(_text_pieces(child))
        elif _tag_ends_with(child, '}hyperlink'):  # hyperlink wrapping runs
            for run in child:
                text_parts.extend(_text_pieces(run))
    return ''.join(text_parts).strip()
|
||
|
||
|
||
def is_image_paragraph(paragraph):
    """Report whether a paragraph contains an image-related element.

    The paragraph's XML element and all of its descendants are searched for
    a DrawingML ``blip``, a DrawingML ``pic`` or a WordprocessingML
    ``drawing`` element.

    Args:
        paragraph: Object exposing its XML element as ``_element``
            (for example a python-docx ``Paragraph``).

    Returns:
        ``True`` if any such element is found, otherwise ``False``.
    """
    # Namespace URIs of the elements that mark an image.
    ns_a = 'http://schemas.openxmlformats.org/drawingml/2006/main'
    ns_pic = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
    ns_w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    image_tags = {
        f'{{{ns_a}}}blip',
        f'{{{ns_pic}}}pic',
        f'{{{ns_w}}}drawing',
    }

    # iter() yields the element itself followed by every descendant, so the
    # paragraph's runs are covered without a separate pass over them and
    # without recursion.
    return any(node.tag in image_tags for node in paragraph._element.iter())
|
||
|
||
|
||
def get_heading_level(paragraph):
    """Determine the heading level of a paragraph.

    The level is taken, in order, from:

    1. a style name starting with ``Heading`` whose remainder is an integer;
    2. a style name containing ``标题`` followed somewhere by digits;
    3. the paragraph text, when it starts with list-style numbering
       (``"1. x"`` -> 1, ``"1.1 x"`` -> 2, ``"1.1.1 x"`` -> 3).

    Args:
        paragraph: Object with a ``style`` attribute (whose ``name`` is read)
            that is also accepted by ``extract_text_from_paragraph``.

    Returns:
        ``0`` when the paragraph is not recognised as a heading, otherwise
        the detected level.
    """
    style = paragraph.style
    # A style may have no name (None); normalise to '' so the string checks
    # below cannot raise.
    style_name = (style.name if style else "") or ""

    # English built-in heading styles.
    if style_name.startswith('Heading'):
        try:
            return int(style_name.replace('Heading', '').strip())
        except ValueError:
            # Non-numeric remainder: fall through to the other checks.
            pass

    # Chinese heading styles.
    if '标题' in style_name:
        match = re.search(r'(\d+)', style_name)
        if match:
            return int(match.group(1))

    # Fall back to the text itself (headings typed as numbered list items).
    text = extract_text_from_paragraph(paragraph)
    if not text:
        return 0

    numbering_patterns = (
        (r'^[1-9]\d*\.\s+\S', 1),                        # "1. Title"
        (r'^[1-9]\d*\.[1-9]\d*\s+\S', 2),                # "1.1 Title"
        (r'^[1-9]\d*\.[1-9]\d*\.[1-9]\d*\s+\S', 3),      # "1.1.1 Title"
    )
    for pattern, level in numbering_patterns:
        if re.match(pattern, text):
            return level

    return 0
|
||
|
||
|
||
def parse_heading_text(text):
    """Split heading text into its leading number and its title.

    Recognises a dotted number prefix such as ``"1."``, ``"1.1"`` or
    ``"1.1.1"`` at the very start of ``text``.

    Args:
        text: The heading text.

    Returns:
        A dict with ``'number'`` (the prefix without trailing dots, or ``''``
        when no prefix is recognised) and ``'title'`` (the remaining text,
        stripped).
    """
    numbered = re.match(r'^([1-9]\d*(?:\.[1-9]\d*)*\.?)\s*(.+)$', text)
    if numbered is None:
        # No numeric prefix: the whole text is the title.
        return {'number': '', 'title': text.strip()}

    raw_number, raw_title = numbered.groups()
    return {'number': raw_number.rstrip('.'), 'title': raw_title.strip()}
|
||
|
||
|
||
def extract_table_data(table):
    """Return the cell texts of a table as a list of rows.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text`` (for example a python-docx ``Table``).

    Returns:
        A list with one inner list per row, holding each cell's text with
        surrounding whitespace stripped.
    """
    return [
        [cell.text.strip() for cell in row.cells]
        for row in table.rows
    ]
|
||
|
||
|
||
def _append_content(section, subsection, item):
    """Append ``item`` to the open subsection, else to the open section."""
    if subsection is not None:
        subsection['content'].append(item)
    elif section is not None:
        section['content'].append(item)


def parse_document(doc_path):
    """Parse a Word document into a nested dict of sections.

    Args:
        doc_path: Path passed to ``Document`` to open the file.

    Returns:
        A dict with the keys ``'document_title'``, ``'version'``, ``'date'``,
        ``'table_of_contents'`` (never filled by this function, always ``[]``)
        and ``'sections'``. Each section is a dict with ``'number'``,
        ``'title'``, ``'content'`` and ``'subsections'``; each subsection has
        ``'number'``, ``'title'`` and ``'content'``. Content items are dicts
        whose ``'type'`` is ``'text'``, ``'image'`` or ``'image_reference'``.

    Raises:
        Exception: If the document cannot be opened; the original error is
            attached as the cause.
    """
    try:
        doc = Document(doc_path)
    except Exception as e:
        raise Exception(f"无法打开文档: {e}") from e

    result = {
        'document_title': '',
        'version': '',
        'date': '',
        'table_of_contents': [],
        'sections': []
    }

    # Parsing position.
    current_section = None      # open level-1 section
    current_subsection = None   # open level-2 subsection
    toc_section = False

    # Metadata (title, version, date) is looked for in the first 10 paragraphs.
    for para in doc.paragraphs[:10]:
        text = extract_text_from_paragraph(para)
        if not text:
            continue

        if 'VDES' in text and '使用说明书' in text and not result['document_title']:
            result['document_title'] = text
        elif text.startswith('版本'):
            # Strip the label with either an ASCII or a full-width colon.
            result['version'] = text.replace('版本:', '').replace('版本:', '').strip()
        elif '日期' in text or re.match(r'.*\d{4}\s*年', text):
            result['date'] = text.replace('日期:', '').replace('日期:', '').strip()
        elif text == '目录':
            toc_section = True

    # Main pass: everything before the first real chapter heading is skipped.
    skip_until_content = True

    for para in doc.paragraphs:
        text = extract_text_from_paragraph(para)

        # Paragraphs without text are skipped; those holding an image leave a
        # placeholder in the currently open (sub)section.
        if not text:
            if is_image_paragraph(para):
                _append_content(current_section, current_subsection, {
                    'type': 'image',
                    'description': '[图片]'
                })
            continue

        # The table-of-contents heading itself is never content.
        if text == '目录':
            toc_section = True
            continue

        if toc_section:
            # The table of contents ends at a chapter-like line that mentions
            # one of the known chapter keywords.
            if re.match(r'^[1-9]\.\s+', text) or get_heading_level(para) == 1:
                if any(keyword in text
                       for keyword in ('船舶', '卫星', '气象', '辅助', '运维')):
                    toc_section = False
                    skip_until_content = False

        # Skip the front matter at the start of the document.
        if skip_until_content:
            if result['document_title'] and text == result['document_title']:
                continue
            if '版本' in text or '日期' in text:
                continue
            # Real content starts at the first level-1 heading.
            heading_level = get_heading_level(para)
            if heading_level == 1 or re.match(r'^1\.\s+船舶', text):
                skip_until_content = False
                toc_section = False
            else:
                continue

        heading_level = get_heading_level(para)

        if heading_level == 1:
            # Level-1 heading: close what is open and start a new section.
            parsed = parse_heading_text(text)

            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            # A new section never inherits an open subsection.
            current_subsection = None

            if current_section is not None:
                result['sections'].append(current_section)

            current_section = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': [],
                'subsections': []
            }

        elif heading_level == 2:
            # Level-2 heading: close the open subsection and start a new one.
            parsed = parse_heading_text(text)

            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)

            current_subsection = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': []
            }

        else:
            # Ordinary paragraph (this includes headings of level 3 and up).
            content_item = {
                'type': 'text',
                'content': text
            }

            # A paragraph that is exactly a UUID is recorded as an image
            # reference instead of text.
            if re.match(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$', text, re.IGNORECASE):
                content_item = {
                    'type': 'image_reference',
                    'reference_id': text
                }

            _append_content(current_section, current_subsection, content_item)

    # Flush whatever is still open at the end of the document.
    if current_subsection is not None and current_section is not None:
        current_section['subsections'].append(current_subsection)
    if current_section is not None:
        result['sections'].append(current_section)

    return result
|
||
|
||
|
||
def clean_json_output(data):
    """Recursively strip strings and drop empty content before JSON output.

    * dict: every value is cleaned; keys whose cleaned value is ``None``
      (for example empty or whitespace-only strings) are removed. Empty
      lists are kept so the overall structure stays intact.
    * list: falsy items are dropped, the rest are cleaned, and items that
      clean to ``None`` are dropped as well so no ``null`` entries remain.
    * str: returned stripped, or ``None`` when nothing is left.
    * anything else: returned unchanged.

    Args:
        data: The value to clean.

    Returns:
        The cleaned value, or ``None`` for a blank string.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned_value = clean_json_output(value)
            # Empty lists are not None and therefore survive on purpose.
            if cleaned_value is not None:
                cleaned[key] = cleaned_value
        return cleaned

    if isinstance(data, list):
        cleaned_items = (clean_json_output(item) for item in data if item)
        # Whitespace-only strings are truthy but clean to None; filter them
        # after cleaning instead of leaving None entries behind.
        return [item for item in cleaned_items if item is not None]

    if isinstance(data, str):
        stripped = data.strip()
        return stripped or None

    return data
|
||
|
||
|
||
def convert_docx_to_json(input_path, output_path=None, indent=2, ensure_ascii=False):
    """Convert a Word document to cleaned, JSON-ready data.

    Args:
        input_path: Path of the Word document to parse.
        output_path: Where to write the JSON file; nothing is written when
            this is empty or ``None``.
        indent: Indentation passed to ``json.dump``.
        ensure_ascii: Whether ``json.dump`` escapes non-ASCII characters.

    Returns:
        The parsed document data after cleaning.
    """
    document_data = clean_json_output(parse_document(input_path))

    # No output path requested: just hand the data back.
    if not output_path:
        return document_data

    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(document_data, out_file, ensure_ascii=ensure_ascii, indent=indent)
    print(f"JSON已保存至: {output_path}")

    return document_data
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Usage: ``script [input.docx [output.json]]``. Missing arguments fall
    back to the built-in default file names. Exits with status 1 when the
    input file does not exist or the conversion raises; otherwise prints a
    short summary of the converted document.
    """
    import os
    import sys
    import traceback

    # Positional arguments override the default paths.
    cli_args = sys.argv[1:]
    input_file = cli_args[0] if len(cli_args) >= 1 else "VDES软件使用说明书.docx"
    output_file = cli_args[1] if len(cli_args) >= 2 else "VDES软件使用说明书.json"

    if not os.path.exists(input_file):
        print(f"错误: 找不到文件 '{input_file}'")
        sys.exit(1)

    try:
        result = convert_docx_to_json(input_file, output_file)

        # Summary preview of the conversion result.
        print("\n=== 转换结果预览 ===")
        print(f"文档标题: {result.get('document_title', 'N/A')}")
        print(f"版本: {result.get('version', 'N/A')}")
        print(f"日期: {result.get('date', 'N/A')}")
        print(f"章节数量: {len(result.get('sections', []))}")

        for section in result.get('sections', []):
            print(f"\n {section.get('number', '')} {section.get('title', '')}")
            for subsection in section.get('subsections', []):
                print(f" {subsection.get('number', '')} {subsection.get('title', '')}")

        print("\n转换完成!")
    except Exception as e:
        print(f"转换失败: {e}")
        traceback.print_exc()
        sys.exit(1)
|
||
|
||
|
||
# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()