init rep
This commit is contained in:
371
process_doc_file.py
Normal file
371
process_doc_file.py
Normal file
@@ -0,0 +1,371 @@
|
||||
import json
|
||||
import re
|
||||
from docx import Document
|
||||
# from docx.oml import CT_OMath
|
||||
from docx.table import Table
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def extract_text_from_paragraph(paragraph):
    """Return the visible text of a paragraph, including hyperlink text.

    Only the direct children of ``paragraph._element`` are inspected:

    * a child whose tag ends in ``}r`` (a run) contributes the text of each
      of its children whose tag ends in ``}t``;
    * a child whose tag ends in ``}hyperlink`` contributes the ``}t`` text of
      every element nested directly inside it.

    All other children are ignored.  The collected pieces are joined in
    document order and the result is stripped of surrounding whitespace.
    """
    def text_pieces(run_like):
        # Yield the non-empty text of every text node directly under a run.
        for node in run_like:
            if node.tag.endswith('}t') and node.text:
                yield node.text

    pieces = []
    for child in paragraph._element:
        tag = child.tag
        if tag.endswith('}r'):
            pieces.extend(text_pieces(child))
        elif tag.endswith('}hyperlink'):
            for nested in child:
                pieces.extend(text_pieces(nested))
    return ''.join(pieces).strip()
|
||||
|
||||
|
||||
def is_image_paragraph(paragraph):
    """Return True if the paragraph's XML contains an image-related element.

    The paragraph element and all of its descendants are checked for any of
    these fully-qualified tags:

    * ``a:blip``     (DrawingML main namespace)
    * ``pic:pic``    (DrawingML picture namespace)
    * ``w:drawing``  (WordprocessingML main namespace)

    Args:
        paragraph: An object exposing the paragraph XML as ``_element``.

    Returns:
        bool: True when at least one matching element is found.
    """
    image_tags = {
        '{http://schemas.openxmlformats.org/drawingml/2006/main}blip',
        '{http://schemas.openxmlformats.org/drawingml/2006/picture}pic',
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing',
    }
    # iter() yields the element itself followed by every descendant, so a
    # single walk also covers the runs; scanning the runs separately first
    # would only repeat part of the same traversal.
    return any(node.tag in image_tags for node in paragraph._element.iter())
|
||||
|
||||
|
||||
def get_heading_level(paragraph):
    """Work out the heading level of a paragraph.

    Checks are applied in this order and the first hit wins:

    1. Style name starting with ``Heading`` whose remainder parses as an
       integer -> that integer.
    2. Style name containing ``标题`` and at least one digit sequence -> the
       first digit sequence as an integer.
    3. Paragraph text shaped like a manually numbered heading:
       ``"1. xxx"`` -> 1, ``"1.1 xxx"`` -> 2, ``"1.1.1 xxx"`` -> 3.

    Returns:
        int: 0 for a non-heading paragraph, otherwise the detected level.
    """
    style = paragraph.style
    style_name = style.name if style else ""

    # English built-in heading styles, e.g. "Heading 1".
    if style_name.startswith('Heading'):
        remainder = style_name.replace('Heading', '').strip()
        try:
            return int(remainder)
        except ValueError:
            # Not a plain "Heading N" name; keep trying the other checks.
            pass

    # Chinese heading styles, e.g. "标题 2".
    if '标题' in style_name:
        digits = re.search(r'(\d+)', style_name)
        if digits:
            return int(digits.group(1))

    # Fall back to the text itself for headings typed with explicit numbers.
    text = extract_text_from_paragraph(paragraph)
    if not text:
        return 0

    numbered_heading_patterns = (
        (r'^[1-9]\d*\.\s+\S', 1),                      # "1. Title"
        (r'^[1-9]\d*\.[1-9]\d*\s+\S', 2),              # "1.1 Title"
        (r'^[1-9]\d*\.[1-9]\d*\.[1-9]\d*\s+\S', 3),    # "1.1.1 Title"
    )
    for pattern, level in numbered_heading_patterns:
        if re.match(pattern, text):
            return level
    return 0
|
||||
|
||||
|
||||
def parse_heading_text(text):
    """Split a heading string into its section number and its title.

    Accepts forms such as ``"1. Title"``, ``"1.1 Title"`` and
    ``"1.1.1 Title"``.  Whitespace between number and title is optional.

    Returns:
        dict: ``{'number': ..., 'title': ...}``.  ``number`` has any trailing
        dot removed and is ``''`` when the text does not start with a
        number; ``title`` is stripped of surrounding whitespace.
    """
    found = re.match(r'^([1-9]\d*(?:\.[1-9]\d*)*\.?)\s*(.+)$', text)
    if found is None:
        # No leading section number: the whole text is the title.
        return {'number': '', 'title': text.strip()}

    raw_number, raw_title = found.groups()
    return {'number': raw_number.rstrip('.'), 'title': raw_title.strip()}
|
||||
|
||||
|
||||
def extract_table_data(table):
    """Return the table as a list of rows, each a list of stripped cell texts.

    Rows are taken from ``table.rows`` and cells from ``row.cells``, both in
    iteration order; each cell contributes ``cell.text.strip()``.
    """
    return [
        [cell.text.strip() for cell in row.cells]
        for row in table.rows
    ]
|
||||
|
||||
|
||||
def _strip_field_label(text, label):
    """Remove ``label`` plus an ASCII or full-width colon from ``text``."""
    # Chinese documents commonly use the full-width colon (U+FF1A), so both
    # colon forms have to be stripped.
    for colon in (':', '\uff1a'):
        text = text.replace(label + colon, '')
    return text.strip()


def _scan_front_matter(paragraphs, result):
    """Fill title/version/date in ``result`` from the leading paragraphs.

    Returns True when a table-of-contents heading ('目录') was seen.
    """
    toc_seen = False
    for para in paragraphs:
        text = extract_text_from_paragraph(para)
        if not text:
            continue
        if 'VDES' in text and '使用说明书' in text and not result['document_title']:
            result['document_title'] = text
        elif text.startswith('版本'):
            result['version'] = _strip_field_label(text, '版本')
        elif '日期' in text or re.match(r'.*\d{4}\s*年', text):
            result['date'] = _strip_field_label(text, '日期')
        elif text == '目录':
            toc_seen = True
    return toc_seen


def _append_to_current(section, subsection, item):
    """Append ``item`` to the open subsection, else the open section, if any."""
    target = subsection if subsection is not None else section
    if target is not None:
        target['content'].append(item)


def parse_document(doc_path):
    """Parse a Word document into a nested dict of sections and subsections.

    Args:
        doc_path: Path (or file-like object) handed to ``Document``.

    Returns:
        dict with the keys:
            ``document_title``, ``version``, ``date`` (str, taken from the
            first ten paragraphs), ``table_of_contents`` (list, never
            populated by this function) and ``sections``.  Each section is
            ``{'number', 'title', 'content', 'subsections'}`` and each
            subsection is ``{'number', 'title', 'content'}``.  ``content``
            items are ``{'type': 'text', 'content': ...}``,
            ``{'type': 'image', 'description': '[图片]'}`` or
            ``{'type': 'image_reference', 'reference_id': ...}``.

    Raises:
        Exception: If the document cannot be opened; the original error is
            attached as ``__cause__``.
    """
    try:
        doc = Document(doc_path)
    except Exception as e:
        raise Exception(f"无法打开文档: {e}") from e

    result = {
        'document_title': '',
        'version': '',
        'date': '',
        'table_of_contents': [],
        'sections': []
    }

    paragraphs = doc.paragraphs

    # Metadata (title, version, date) is looked for in the first ten paragraphs.
    toc_section = _scan_front_matter(paragraphs[:10], result)

    current_section = None      # open level-1 section
    current_subsection = None   # open level-2 subsection
    skip_until_content = True   # still inside title page / table of contents

    # Words expected in a real top-level chapter title of this manual.
    content_keywords = ('船舶', '卫星', '气象', '辅助', '运维')
    uuid_pattern = re.compile(
        r'^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$',
        re.IGNORECASE,
    )

    for para in paragraphs:
        text = extract_text_from_paragraph(para)

        # Paragraphs without text are either image holders or blank lines.
        if not text:
            if is_image_paragraph(para):
                _append_to_current(
                    current_section,
                    current_subsection,
                    {'type': 'image', 'description': '[图片]'},
                )
            continue

        # The table-of-contents heading itself is never content.
        if text == '目录':
            toc_section = True
            continue

        # Computed once per paragraph and reused by every check below.
        heading_level = get_heading_level(para)

        if toc_section:
            # The TOC ends at the first top-level heading that looks like a
            # real chapter title.
            if re.match(r'^[1-9]\.\s+', text) or heading_level == 1:
                if any(keyword in text for keyword in content_keywords):
                    toc_section = False
                    skip_until_content = False

        if skip_until_content:
            # Skip the front-matter lines until the first chapter heading.
            if result['document_title'] and text == result['document_title']:
                continue
            if '版本' in text or '日期' in text:
                continue
            if heading_level == 1 or re.match(r'^1\.\s+船舶', text):
                skip_until_content = False
                toc_section = False
            else:
                continue

        if heading_level == 1:
            # Level-1 heading: close whatever is open and start a new section.
            parsed = parse_heading_text(text)
            if current_section is not None:
                if current_subsection is not None:
                    current_section['subsections'].append(current_subsection)
                result['sections'].append(current_section)
            # Always reset, so a subsection opened before any section cannot
            # leak into (and later be attached to) the new section.
            current_subsection = None
            current_section = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': [],
                'subsections': []
            }

        elif heading_level == 2:
            # Level-2 heading: close the open subsection and start a new one.
            parsed = parse_heading_text(text)
            if current_subsection is not None and current_section is not None:
                current_section['subsections'].append(current_subsection)
            current_subsection = {
                'number': parsed['number'],
                'title': parsed['title'],
                'content': []
            }

        else:
            # Ordinary paragraph; a bare UUID is recorded as an image reference.
            if uuid_pattern.match(text):
                content_item = {'type': 'image_reference', 'reference_id': text}
            else:
                content_item = {'type': 'text', 'content': text}
            _append_to_current(current_section, current_subsection, content_item)

    # Flush the last open subsection and section.
    if current_section is not None:
        if current_subsection is not None:
            current_section['subsections'].append(current_subsection)
        result['sections'].append(current_section)

    return result
|
||||
|
||||
|
||||
def clean_json_output(data):
    """Recursively strip strings and drop empty values before JSON export.

    Rules:
        * ``str``  -> stripped; a blank string becomes ``None``.
        * ``dict`` -> values are cleaned recursively; keys whose cleaned
          value is ``None`` are dropped.  Empty lists/dicts are kept as
          values so the overall structure stays intact.
        * ``list`` -> items are cleaned recursively first, then items that
          are ``None`` or an empty dict/list are dropped.  Filtering after
          cleaning keeps whitespace-only strings from turning into ``null``
          entries, and keeps legitimate falsy items such as ``0``.
        * anything else is returned unchanged.

    Returns:
        The cleaned copy of ``data`` (``None`` for a blank string).
    """
    if isinstance(data, str):
        stripped = data.strip()
        return stripped if stripped else None

    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            cleaned_value = clean_json_output(value)
            if cleaned_value is not None:
                cleaned[key] = cleaned_value
        return cleaned

    if isinstance(data, list):
        cleaned_items = []
        for item in data:
            cleaned_item = clean_json_output(item)
            if cleaned_item is None:
                continue
            if isinstance(cleaned_item, (dict, list)) and not cleaned_item:
                continue
            cleaned_items.append(cleaned_item)
        return cleaned_items

    return data
|
||||
|
||||
|
||||
def convert_docx_to_json(input_path, output_path=None, indent=2, ensure_ascii=False):
    """Convert a Word document to cleaned, JSON-serialisable data.

    Args:
        input_path: Path of the Word document to parse.
        output_path: Optional path of the JSON file to write; nothing is
            written when it is falsy.
        indent: Indentation width passed to ``json.dump``.
        ensure_ascii: Whether ``json.dump`` escapes non-ASCII characters.

    Returns:
        The parsed and cleaned data.
    """
    document_data = clean_json_output(parse_document(input_path))

    # No destination given: hand the data back without touching the disk.
    if not output_path:
        return document_data

    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(document_data, json_file, ensure_ascii=ensure_ascii, indent=indent)
    print(f"JSON已保存至: {output_path}")

    return document_data
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Usage: ``script [input.docx [output.json]]``.  Both paths fall back to
    built-in defaults.  Exits with status 1 when the input file does not
    exist or the conversion raises; otherwise prints a short preview of the
    converted document.
    """
    import os
    import sys
    import traceback

    cli_args = sys.argv[1:]
    # Positional arguments override the default input/output paths.
    input_file = cli_args[0] if len(cli_args) >= 1 else "VDES软件使用说明书.docx"
    output_file = cli_args[1] if len(cli_args) >= 2 else "VDES软件使用说明书.json"

    if not os.path.exists(input_file):
        print(f"错误: 找不到文件 '{input_file}'")
        sys.exit(1)

    try:
        result = convert_docx_to_json(input_file, output_file)

        # Preview: metadata first, then the section/subsection outline.
        sections = result.get('sections', [])
        print("\n=== 转换结果预览 ===")
        print(f"文档标题: {result.get('document_title', 'N/A')}")
        print(f"版本: {result.get('version', 'N/A')}")
        print(f"日期: {result.get('date', 'N/A')}")
        print(f"章节数量: {len(sections)}")

        for section in sections:
            print(f"\n {section.get('number', '')} {section.get('title', '')}")
            for subsection in section.get('subsections', []):
                print(f" {subsection.get('number', '')} {subsection.get('title', '')}")

        print("\n转换完成!")

    except Exception as e:
        print(f"转换失败: {e}")
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user