Files
test_item_gen/process_doc_file.py

371 lines
12 KiB
Python
Raw Permalink Normal View History

2026-02-04 14:38:52 +08:00
import json
import re
from docx import Document
# from docx.oml import CT_OMath
from docx.table import Table
from lxml import etree
def extract_text_from_paragraph(paragraph):
    """Return the paragraph's text with surrounding whitespace stripped.

    The paragraph's underlying XML element is walked directly. Text is
    collected from ``t`` elements that are children of a direct run child
    (tag ending in ``}r``) or grandchildren of a direct hyperlink child
    (tag ending in ``}hyperlink``). All other elements are ignored.
    """

    def text_nodes(container):
        # Yield the non-empty text of each ``t`` child of ``container``.
        for node in container:
            if node.tag.endswith('}t') and node.text:
                yield node.text

    pieces = []
    for element in paragraph._element:
        tag = element.tag
        if tag.endswith('}r'):  # text run
            pieces.extend(text_nodes(element))
        elif tag.endswith('}hyperlink'):  # hyperlink wrapping one or more runs
            for inner in element:
                pieces.extend(text_nodes(inner))
    return ''.join(pieces).strip()
def is_image_paragraph(paragraph):
    """Report whether the paragraph contains an image-related XML element.

    An element counts as an image when its tag is DrawingML ``blip``,
    DrawingML picture ``pic``, or WordprocessingML ``drawing``. Each run's
    element tree is searched first, then the paragraph's own element tree.
    """
    # Namespace URIs used to build the fully-qualified tags.
    NS_A = 'http://schemas.openxmlformats.org/drawingml/2006/main'
    NS_PIC = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
    NS_W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    image_tags = (
        '{' + NS_A + '}blip',
        '{' + NS_PIC + '}pic',
        '{' + NS_W + '}drawing',
    )

    def contains_image(root):
        # Depth-first search over root and all of its descendants.
        pending = [root]
        while pending:
            node = pending.pop()
            if node.tag in image_tags:
                return True
            pending.extend(node)
        return False

    if any(contains_image(run._element) for run in paragraph.runs):
        return True
    return contains_image(paragraph._element)
def get_heading_level(paragraph):
    """Return the heading level of a paragraph, or 0 if it is not a heading.

    Detection order:
      1. Style names starting with ``Heading`` followed by an integer.
      2. Style names containing ``标题`` and a number.
      3. The paragraph text itself (via ``extract_text_from_paragraph``):
         ``"1. xxx"`` -> 1, ``"1.1 xxx"`` -> 2, ``"1.1.1 xxx"`` -> 3.

    Returns:
        The detected level as a positive int, or 0 for non-headings and
        paragraphs without text.
    """
    style = paragraph.style
    # The style may be missing, and a style may have no name; normalise both
    # to "" so the string checks below cannot raise on None.
    style_name = (style.name if style else "") or ""

    # English built-in heading styles, e.g. "Heading 1".
    if style_name.startswith('Heading'):
        try:
            return int(style_name.replace('Heading', '').strip())
        except ValueError:
            # Not a plain "Heading N" name (e.g. "Heading 1 Char"); keep checking.
            pass

    # Chinese heading styles, e.g. "标题 1".
    if '标题' in style_name:
        match = re.search(r'(\d+)', style_name)
        if match:
            return int(match.group(1))

    # Fall back to the text: headings typed as numbered list items.
    text = extract_text_from_paragraph(paragraph)
    if not text:
        return 0

    numbered_patterns = (
        (r'^[1-9]\d*\.\s+\S', 1),                      # "1. 船舶图面展示"
        (r'^[1-9]\d*\.[1-9]\d*\s+\S', 2),              # "1.1 地图图面"
        (r'^[1-9]\d*\.[1-9]\d*\.[1-9]\d*\s+\S', 3),    # "1.1.1 xxx"
    )
    for pattern, level in numbered_patterns:
        if re.match(pattern, text):
            return level
    return 0
def parse_heading_text(text):
    """Split heading text into its section number and its title.

    Accepts prefixes such as ``"1. "``, ``"1.1 "`` or ``"1.1.1 "``.

    Returns:
        A dict with ``'number'`` (trailing dot removed, or ``''`` when the
        text has no numeric prefix) and ``'title'`` (stripped).
    """
    numbered = re.match(r'^([1-9]\d*(?:\.[1-9]\d*)*\.?)\s*(.+)$', text)
    if numbered is None:
        # No numeric prefix: the whole text is the title.
        return {'number': '', 'title': text.strip()}

    raw_number, raw_title = numbered.groups()
    return {'number': raw_number.rstrip('.'), 'title': raw_title.strip()}
def extract_table_data(table):
    """Return the table as a list of rows, each a list of stripped cell texts."""
    return [
        [cell.text.strip() for cell in row.cells]
        for row in table.rows
    ]
def _append_content(section, subsection, item):
    """Append item to the open subsection, else the open section, else drop it."""
    if subsection is not None:
        subsection['content'].append(item)
    elif section is not None:
        section['content'].append(item)


def parse_document(doc_path):
    """Parse a Word document into structured data.

    Args:
        doc_path: Path of the .docx file to open.

    Returns:
        A dict with the keys ``document_title``, ``version``, ``date``,
        ``table_of_contents`` (never populated here, always ``[]``) and
        ``sections``. Each section has ``number``, ``title``, ``content`` and
        ``subsections``; each subsection has ``number``, ``title`` and
        ``content``. Content items are dicts of type ``text``, ``image`` or
        ``image_reference``. Headings deeper than level 2 are stored as
        ordinary text items.

    Raises:
        Exception: if the document cannot be opened; the original error is
            chained as the cause.

    Note:
        A level-2 heading that appears before any level-1 heading has no
        parent section, so it and its content are discarded.
    """
    try:
        doc = Document(doc_path)
    except Exception as e:
        # Same exception type as before so existing callers still catch it.
        raise Exception(f"无法打开文档: {e}") from e

    result = {
        'document_title': '',
        'version': '',
        'date': '',
        'table_of_contents': [],
        'sections': []
    }

    # Parser state.
    current_section = None      # open level-1 section
    current_subsection = None   # open level-2 subsection
    toc_section = False

    paragraphs = doc.paragraphs

    # Metadata (title, version, date) is looked for in the first 10 paragraphs.
    for para in paragraphs[:10]:
        text = extract_text_from_paragraph(para)
        if not text:
            continue
        if 'VDES' in text and '使用说明书' in text and not result['document_title']:
            result['document_title'] = text
        elif text.startswith('版本'):
            # Strip the label with either an ASCII or a full-width colon.
            result['version'] = text.replace('版本:', '').replace('版本:', '').strip()
        elif '日期' in text or re.match(r'.*\d{4}\s*年', text):
            result['date'] = text.replace('日期:', '').replace('日期:', '').strip()
        elif text == '目录':
            toc_section = True

    # Main pass over every paragraph.
    skip_until_content = True
    toc_end_keywords = ('船舶', '卫星', '气象', '辅助', '运维')
    for para in paragraphs:
        text = extract_text_from_paragraph(para)

        # Paragraphs without text are skipped; image-only ones leave a marker.
        if not text:
            if is_image_paragraph(para):
                _append_content(current_section, current_subsection, {
                    'type': 'image',
                    'description': '[图片]'
                })
            continue

        # The table-of-contents heading itself is never content.
        if text == '目录':
            toc_section = True
            continue

        if toc_section:
            # The TOC ends at a level-1 style heading naming a known chapter.
            if re.match(r'^[1-9]\.\s+', text) or get_heading_level(para) == 1:
                if any(keyword in text for keyword in toc_end_keywords):
                    toc_section = False
                    skip_until_content = False

        heading_level = None  # computed at most once per paragraph

        # Skip the front matter until the first real chapter heading.
        if skip_until_content:
            if result['document_title'] and text == result['document_title']:
                continue
            if '版本' in text or '日期' in text:
                continue
            heading_level = get_heading_level(para)
            if heading_level == 1 or re.match(r'^1\.\s+船舶', text):
                skip_until_content = False
                toc_section = False
            else:
                continue

        if heading_level is None:
            heading_level = get_heading_level(para)

        if heading_level == 1:
            # Level-1 heading: close the open subsection and section, open a new one.
            parsed = parse_heading_text(text)
            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            current_subsection = None
            if current_section is not None:
                result['sections'].append(current_section)
            current_section = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': [],
                'subsections': []
            }
        elif heading_level == 2:
            # Level-2 heading: close the open subsection, open a new one.
            parsed = parse_heading_text(text)
            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            current_subsection = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': []
            }
        else:
            # Ordinary paragraph; a bare UUID is treated as an image reference.
            if re.match(r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$', text, re.IGNORECASE):
                content_item = {
                    'type': 'image_reference',
                    'reference_id': text
                }
            else:
                content_item = {
                    'type': 'text',
                    'content': text
                }
            _append_content(current_section, current_subsection, content_item)

    # Flush whatever is still open at the end of the document.
    if current_subsection is not None and current_section is not None:
        current_section['subsections'].append(current_subsection)
    if current_section is not None:
        result['sections'].append(current_section)
    return result
def clean_json_output(data):
    """Recursively clean parsed data before it is serialised to JSON.

    - Strings are stripped; empty or whitespace-only strings become ``None``.
    - Dict entries whose cleaned value is ``None`` are removed. Empty lists
      are kept so the overall structure stays intact.
    - Falsy list items are dropped, and so are items that clean to ``None``
      (for example whitespace-only strings), so no ``null`` entries remain.
    - Any other value is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned_value = clean_json_output(value)
            # Empty lists are not None, so they are deliberately kept.
            if cleaned_value is not None:
                cleaned[key] = cleaned_value
        return cleaned

    if isinstance(data, list):
        cleaned_items = (clean_json_output(item) for item in data if item)
        # Filter again after cleaning: an item may only become empty once stripped.
        return [item for item in cleaned_items if item is not None]

    if isinstance(data, str):
        return data.strip() or None

    return data
def convert_docx_to_json(input_path, output_path=None, indent=2, ensure_ascii=False):
    """Convert a Word document to cleaned, JSON-ready data.

    Args:
        input_path: Path of the Word document.
        output_path: Optional path of the JSON file to write; nothing is
            written when it is falsy.
        indent: Number of spaces used to indent the JSON output.
        ensure_ascii: Whether non-ASCII characters are escaped in the output.

    Returns:
        The parsed and cleaned data as a dict.
    """
    cleaned_data = clean_json_output(parse_document(input_path))

    if not output_path:
        return cleaned_data

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, ensure_ascii=ensure_ascii, indent=indent)
    print(f"JSON已保存至: {output_path}")
    return cleaned_data
def main():
    """Command-line entry point: convert a document and print a short preview.

    Usage: ``script [input.docx [output.json]]``. Both paths have built-in
    defaults. Exits with status 1 when the input file does not exist or the
    conversion raises.
    """
    import os
    import sys

    # Positional arguments override the default paths.
    cli_args = sys.argv[1:]
    input_file = cli_args[0] if len(cli_args) >= 1 else "VDES软件使用说明书.docx"
    output_file = cli_args[1] if len(cli_args) >= 2 else "VDES软件使用说明书.json"

    if not os.path.exists(input_file):
        print(f"错误: 找不到文件 '{input_file}'")
        sys.exit(1)

    try:
        result = convert_docx_to_json(input_file, output_file)

        # Preview of what was extracted.
        print("\n=== 转换结果预览 ===")
        print(f"文档标题: {result.get('document_title', 'N/A')}")
        print(f"版本: {result.get('version', 'N/A')}")
        print(f"日期: {result.get('date', 'N/A')}")
        print(f"章节数量: {len(result.get('sections', []))}")
        for chapter in result.get('sections', []):
            print(f"\n {chapter.get('number', '')} {chapter.get('title', '')}")
            for part in chapter.get('subsections', []):
                print(f" {part.get('number', '')} {part.get('title', '')}")
        print("\n转换完成!")
    except Exception as e:
        print(f"转换失败: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()