Files
test_item_gen/process_doc_file.py
2026-02-04 14:38:52 +08:00

371 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import re
from docx import Document
# from docx.oml import CT_OMath
from docx.table import Table
from lxml import etree
def extract_text_from_paragraph(paragraph):
    """Return the visible text of a paragraph, stripped of outer whitespace.

    Walks the direct children of ``paragraph._element``. Text is collected
    from the ``t`` children of every run (tag ending in ``}r``) and from the
    ``t`` grandchildren of every hyperlink (tag ending in ``}hyperlink``).
    Any other element is ignored.
    """
    def text_nodes(container):
        # Non-empty text of every direct child whose tag ends with '}t'.
        return [node.text for node in container
                if node.tag.endswith('}t') and node.text]

    pieces = []
    for node in paragraph._element:
        if node.tag.endswith('}r'):  # plain text run
            pieces.extend(text_nodes(node))
        elif node.tag.endswith('}hyperlink'):  # runs nested in a hyperlink
            for inner in node:
                pieces.extend(text_nodes(inner))
    return ''.join(pieces).strip()
def is_image_paragraph(paragraph):
    """Return True if the paragraph contains an image-related element.

    An element counts as an image when its tag is DrawingML ``blip``,
    DrawingML-picture ``pic`` or WordprocessingML ``drawing``. Every run's
    element tree is searched first, then the paragraph element's own tree.
    """
    # Namespace URIs of the tags that mark an embedded picture.
    NS_A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
    NS_PIC = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
    NS_W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    image_tags = (
        f'{{{NS_A}}}blip',
        f'{{{NS_PIC}}}pic',
        f'{{{NS_W}}}drawing',
    )

    def contains_image(root):
        """Search ``root`` and all of its descendants for an image tag."""
        pending = [root]
        while pending:
            node = pending.pop()
            if node.tag in image_tags:
                return True
            pending.extend(node)  # queue the children
        return False

    if any(contains_image(run._element) for run in paragraph.runs):
        return True
    # Also inspect the paragraph element itself.
    return contains_image(paragraph._element)
def get_heading_level(paragraph):
    """Return the heading level of a paragraph, or 0 when it is not a heading.

    Detection is attempted in this order:

    1. A style name starting with ``Heading`` whose remainder parses as an
       integer, e.g. ``Heading 2`` -> 2.
    2. A style name containing ``标题`` and a number, e.g. ``标题 3`` -> 3.
    3. The numbering at the start of the paragraph text:
       ``1. xxx`` -> 1, ``1.1 xxx`` -> 2, ``1.1.1 xxx`` -> 3.
    """
    style = paragraph.style
    style_name = style.name if style else ""

    # English built-in heading styles.
    if style_name.startswith('Heading'):
        suffix = style_name.replace('Heading', '').strip()
        try:
            return int(suffix)
        except ValueError:
            # Non-numeric remainder: fall through to the other checks.
            pass

    # Chinese heading styles.
    if '标题' in style_name:
        digits = re.search(r'(\d+)', style_name)
        if digits:
            return int(digits.group(1))

    # Fall back to the text itself (headings typed as numbered lines).
    text = extract_text_from_paragraph(paragraph)
    if not text:
        return 0

    numbered_patterns = (
        (r'^[1-9]\d*\.\s+\S', 1),                      # "1. xxx"
        (r'^[1-9]\d*\.[1-9]\d*\s+\S', 2),              # "1.1 xxx"
        (r'^[1-9]\d*\.[1-9]\d*\.[1-9]\d*\s+\S', 3),    # "1.1.1 xxx"
    )
    for pattern, level in numbered_patterns:
        if re.match(pattern, text):
            return level
    return 0
def parse_heading_text(text):
    """Split heading text into its numbering and its title.

    Accepts forms such as ``1. Title``, ``1.1 Title`` and ``1.1.1 Title``.
    Returns a dict with keys ``number`` (without a trailing dot) and
    ``title``. When the text has no leading numbering, ``number`` is the
    empty string and ``title`` is the stripped text.
    """
    found = re.match(r'^([1-9]\d*(?:\.[1-9]\d*)*\.?)\s*(.+)$', text)
    if found is None:
        return {'number': '', 'title': text.strip()}

    numbering, heading = found.groups()
    return {'number': numbering.rstrip('.'), 'title': heading.strip()}
def extract_table_data(table):
    """Return the table as a list of rows, each a list of stripped cell texts."""
    return [
        [cell.text.strip() for cell in row.cells]
        for row in table.rows
    ]
def _strip_label(text, label):
    """Remove every ``<label>:`` marker from *text* and strip the result.

    Both the ASCII colon and the full-width colon (U+FF1A) are recognised.
    """
    for colon in (':', '\uff1a'):
        text = text.replace(label + colon, '')
    return text.strip()


def _append_content(section, subsection, item):
    """Append *item* to the open subsection, else to the open section.

    When neither is open the item is dropped.
    """
    if subsection is not None:
        subsection['content'].append(item)
    elif section is not None:
        section['content'].append(item)


def parse_document(doc_path):
    """Parse a Word document into a nested dict of sections and subsections.

    Args:
        doc_path: Path of the .docx file, passed to ``Document``.

    Returns:
        A dict with the keys ``document_title``, ``version``, ``date``,
        ``table_of_contents`` (always left empty by this function) and
        ``sections``. Each section is a dict with ``number``, ``title``,
        ``content`` and ``subsections``; each subsection has ``number``,
        ``title`` and ``content``. Content items are dicts whose ``type`` is
        ``text``, ``image`` or ``image_reference``.

    Raises:
        Exception: If the document cannot be opened; the original error is
            chained as the cause.

    Only ``doc.paragraphs`` is walked; ``extract_table_data`` is not called
    from here. Headings of level 3 and deeper are stored as plain text.
    """
    try:
        doc = Document(doc_path)
    except Exception as e:
        raise Exception(f"无法打开文档: {e}") from e

    result = {
        'document_title': '',
        'version': '',
        'date': '',
        'table_of_contents': [],
        'sections': []
    }

    # Parser state.
    current_section = None      # open level-1 section
    current_subsection = None   # open level-2 subsection
    toc_section = False

    # Metadata (title, version, date) is looked for in the first 10 paragraphs.
    for para in doc.paragraphs[:10]:
        text = extract_text_from_paragraph(para)
        if not text:
            continue
        if 'VDES' in text and '使用说明书' in text and not result['document_title']:
            result['document_title'] = text
        elif text.startswith('版本'):
            result['version'] = _strip_label(text, '版本')
        elif '日期' in text or re.match(r'.*\d{4}\s*年', text):
            result['date'] = _strip_label(text, '日期')
        elif text == '目录':
            toc_section = True

    # Main pass over every paragraph.
    # NOTE(review): toc_section never skips a paragraph by itself; skipping is
    # governed solely by skip_until_content. Table-of-contents entries that
    # look like numbered headings may therefore be parsed as sections --
    # confirm against a real document.
    skip_until_content = True
    for para in doc.paragraphs:
        text = extract_text_from_paragraph(para)

        # Empty paragraphs are skipped; those holding a picture leave a marker.
        if not text:
            if is_image_paragraph(para):
                _append_content(current_section, current_subsection, {
                    'type': 'image',
                    'description': '[图片]'
                })
            continue

        # The table-of-contents title itself.
        if text == '目录':
            toc_section = True
            continue

        if toc_section:
            # A level-1 looking heading that mentions one of the known chapter
            # keywords is taken as the start of the real content.
            if re.match(r'^[1-9]\.\s+', text) or get_heading_level(para) == 1:
                if '船舶' in text or '卫星' in text or '气象' in text or '辅助' in text or '运维' in text:
                    toc_section = False
                    skip_until_content = False

        # Skip the front matter until the first level-1 heading.
        if skip_until_content:
            if result['document_title'] and text == result['document_title']:
                continue
            if '版本' in text or '日期' in text:
                continue
            heading_level = get_heading_level(para)
            if heading_level == 1 or re.match(r'^1\.\s+船舶', text):
                skip_until_content = False
                toc_section = False
            else:
                continue

        heading_level = get_heading_level(para)
        if heading_level == 1:
            # Level-1 heading: close whatever is open and start a new section.
            parsed = parse_heading_text(text)
            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            # Reset unconditionally so a subsection opened before any section
            # cannot swallow the content of the section that starts here.
            current_subsection = None
            if current_section is not None:
                result['sections'].append(current_section)
            current_section = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': [],
                'subsections': []
            }
        elif heading_level == 2:
            # Level-2 heading: close the open subsection and start a new one.
            parsed = parse_heading_text(text)
            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            current_subsection = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': []
            }
        else:
            # Ordinary paragraph. A paragraph that is exactly one UUID is
            # recorded as an image reference instead of text.
            if re.match(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$', text, re.IGNORECASE):
                content_item = {
                    'type': 'image_reference',
                    'reference_id': text
                }
            else:
                content_item = {
                    'type': 'text',
                    'content': text
                }
            _append_content(current_section, current_subsection, content_item)

    # Flush the last open subsection and section.
    if current_subsection is not None and current_section is not None:
        current_section['subsections'].append(current_subsection)
    if current_section is not None:
        result['sections'].append(current_section)
    return result
def clean_json_output(data):
    """Recursively strip strings and remove empty content from parsed data.

    * dict: every value is cleaned; keys whose cleaned value is ``None`` are
      dropped. Empty lists and dicts are kept to preserve the structure.
    * list: empty items (``None``, ``''``, ``[]``, ``{}``) and strings that
      are blank after stripping are removed; the remaining items are cleaned.
      Numbers, including ``0`` and ``False``, are kept.
    * str: returned stripped, or ``None`` when nothing is left.
    * anything else: returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned_value = clean_json_output(value)
            # Empty lists/dicts are not None, so they stay in the output.
            if cleaned_value is not None:
                cleaned[key] = cleaned_value
        return cleaned
    elif isinstance(data, list):
        cleaned = []
        for item in data:
            # Numbers are real data even when falsy; everything else that is
            # falsy (None, '', [], {}) is empty content.
            if not item and not isinstance(item, (int, float)):
                continue
            cleaned_item = clean_json_output(item)
            # A whitespace-only string cleans to None; drop it instead of
            # leaving a null entry in the list.
            if cleaned_item is not None:
                cleaned.append(cleaned_item)
        return cleaned
    elif isinstance(data, str):
        stripped = data.strip()
        return stripped if stripped else None
    else:
        return data
def convert_docx_to_json(input_path, output_path=None, indent=2, ensure_ascii=False):
    """Convert a Word document to cleaned, JSON-ready data.

    Args:
        input_path: Path of the Word document.
        output_path: Optional path of the JSON file to write. Nothing is
            written when it is falsy.
        indent: Number of spaces used to indent the JSON output.
        ensure_ascii: Whether non-ASCII characters are escaped in the JSON.

    Returns:
        The parsed data after it has been passed through clean_json_output.
    """
    # Parse, then clean.
    cleaned_data = clean_json_output(parse_document(input_path))

    if not output_path:
        return cleaned_data

    # Persist to disk and report where the file went.
    with open(output_path, 'w', encoding='utf-8') as out:
        json.dump(cleaned_data, out, ensure_ascii=ensure_ascii, indent=indent)
    print(f"JSON已保存至: {output_path}")
    return cleaned_data
def main():
    """Command-line entry point: convert a .docx file and print a preview.

    Usage: ``script [input.docx [output.json]]``. Missing arguments fall back
    to the built-in default file names. Exits with status 1 when the input
    file does not exist or when the conversion fails.
    """
    import os
    import sys
    import traceback

    # Positional arguments override the default input/output paths.
    cli_args = sys.argv[1:]
    input_file = cli_args[0] if len(cli_args) >= 1 else "VDES软件使用说明书.docx"
    output_file = cli_args[1] if len(cli_args) >= 2 else "VDES软件使用说明书.json"

    # Bail out early when there is nothing to convert.
    if not os.path.exists(input_file):
        print(f"错误: 找不到文件 '{input_file}'")
        sys.exit(1)

    try:
        result = convert_docx_to_json(input_file, output_file)

        # Short summary of what was extracted.
        sections = result.get('sections', [])
        print("\n=== 转换结果预览 ===")
        print(f"文档标题: {result.get('document_title', 'N/A')}")
        print(f"版本: {result.get('version', 'N/A')}")
        print(f"日期: {result.get('date', 'N/A')}")
        print(f"章节数量: {len(sections)}")
        for section in sections:
            print(f"\n {section.get('number', '')} {section.get('title', '')}")
            for subsection in section.get('subsections', []):
                print(f" {subsection.get('number', '')} {subsection.get('title', '')}")
        print("\n转换完成!")
    except Exception as e:
        # Top-level boundary: report the failure with its traceback and exit.
        print(f"转换失败: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()