# apps/project/tool/xq_parse.py

import json
import re
import docx
import base64

from docx.document import Document

from docx.text.paragraph import Paragraph
from docx.parts.image import ImagePart
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P

from collections import OrderedDict

class DocxChapterExtractor(object):
    def __init__(self, docx_path):
        """Parse the Word document and keep the result on ``self.doc``.

        Args:
            docx_path: Value handed straight to ``docx.Document``.
        """
        parsed_document = docx.Document(docx_path)
        self.doc = parsed_document

    def extract_chapter_info(self, text):
        """提取章节编号和标题"""
        pattern = r'^(\d+(?:\.\d+)*)\s+(.*?)(?:\s*\d+)?\s*$'
        match = re.match(pattern, text)
        chapter_num = None
        content = None
        if match:
            chapter_num = match.group(1)  # '4.1' or '4'
            content = match.group(2).strip()  # '外部接口需求'
        else:
            print(f"'{text}' no match")
        return chapter_num, content

    def if_valid_match(self, chaptera_name, text):
        pattern = r'^(\d+(?:\.\d+)*)\s+' + chaptera_name + r'(?:\s*\d+)?\s*$'
        return re.match(pattern, text) is not None

    def get_chapter_number(self, chapter_name):
        """获取目录结构"""
        directory = []
        chapter_num = ''
        flag = False
        for paragraph in self.doc.paragraphs:
            if self.if_valid_match(chapter_name, paragraph.text) and 'toc' in paragraph.style.name:
                chapter_num, content = self.extract_chapter_info(paragraph.text)
                directory.append((chapter_num, content))
                flag = True
                continue
            if flag and paragraph.text.startswith(chapter_num) and 'toc' in paragraph.style.name:
                num, content = self.extract_chapter_info(paragraph.text)
                directory.append((num, content))
        return directory

    def build_hierarchy(self, chapter_body_list):
        """将线性章节列表转换为嵌套结构"""
        hierarchy = {}
        path = []  # 当前路径栈，存储章节号的字符串部分（如 ["4", "2"]）
        for item in chapter_body_list:
            # 处理不同格式的输入数据
            if len(item) == 3:
                num, content, _ = item  # 忽略第三个元素
            elif len(item) == 2:
                num, content = item
            else:
                continue  # 跳过无效数据
            # 切割章节号为字符串列表（如 '4.2.1' -> ["4", "2", "1"]）
            parts = num.split('.')
            # 1. 回溯路径找到当前层级
            while len(path) >= len(parts):
                path.pop()
            # 2. 逐级构建或定位父节点
            current_level = hierarchy
            for i in range(len(path)):
                part = path[i]
                # 如果父节点不存在，自动创建占位节点
                if part not in current_level:
                    current_level[part] = {
                        "number": ".".join(parts[:i + 1]),
                        "title": "[未命名章节]",  # 占位节点标题
                        "children": {}
                    }
                current_level = current_level[part]["children"]
            # 3. 插入当前节点
            current_part = parts[len(path)]  # 当前层级的部分（如 "1"）
            if current_part not in current_level:
                current_level[current_part] = {
                    "number": num,
                    "title": content,
                    "children": {}
                }
            # 4. 更新路径栈
            path = parts.copy()
        return hierarchy

    def extract_title_ordinal(self, s):
        # 正则表达式匹配以括号结尾的字符串
        pattern = r'^(.*?)\s*[(（](.*?)[)）]$'
        match = re.match(pattern, s)
        if match:
            # 提取标题并去除前后空格
            title = match.group(1).strip()
            # 提取序号并去除前后空格
            ordinal = match.group(2).strip()
        else:
            title = s
            ordinal = None
        return title, ordinal

    def build_json_tree(self, chapter_body_list):
        """直接生成树形JSON结构"""
        root = {"number": "", "title": "ROOT", "content": "", "children": []}
        node_map = OrderedDict()
        node_map[""] = root  # 初始化根节点映射
        for item in chapter_body_list:
            # 处理不同格式的输入数据
            if len(item) == 3:
                num, chapter_name, chapter_content = item
                title, ordinal = self.extract_title_ordinal(chapter_name)
            elif len(item) == 2:
                num, chapter_name = item
                title, ordinal = self.extract_title_ordinal(chapter_name)
                chapter_content = ""
            else:
                continue  # 跳过无效数据
            parts = num.split('.')
            parent_node = root  # 始终从根节点开始查找父级
            for depth in range(len(parts)):
                current_num = ".".join(parts[:depth + 1])
                if current_num not in node_map:
                    new_node = {
                        "number": current_num,
                        "title": title if (depth == len(parts) - 1) else "[未命名章节]",
                        "ordinal": ordinal if (depth == len(parts) - 1) else "",
                        "content": chapter_content if (depth == len(parts) - 1) else "",
                        "children": []
                    }
                    parent_num = ".".join(parts[:depth])
                    parent_node = node_map[parent_num]
                    parent_node["children"].append(new_node)
                    node_map[current_num] = new_node
                parent_node = node_map[current_num]
            # 确保最终标题和内容正确
            node_map[num]["title"] = title
            node_map[num]["ordinal"] = ordinal
            node_map[num]["content"] = chapter_content
        return root["children"][0] if root["children"] else {}

    def is_image(self, graph: Paragraph, doc: Document):
        """判断段落是否图片"""
        images = graph._element.xpath('.//pic:pic')  # 获取所有图片
        for image in images:
            for img_id in image.xpath('.//a:blip/@r:embed'):  # 获取图片id
                part = doc.part.related_parts[img_id]  # 根据图片id获取对应的图片
                if isinstance(part, ImagePart):
                    return True
        return False

    def get_ImagePart(self, graph: Paragraph, doc: Document):  # 一行只能获取一个图片
        """获取图片字节流，类型为bytes"""
        images = graph._element.xpath('.//pic:pic')  # 获取所有图片
        for image in images:
            for img_id in image.xpath('.//a:blip/@r:embed'):  # 获取图片id
                part = doc.part.related_parts[img_id]  # 根据图片id获取对应的图片
                if isinstance(part, ImagePart):
                    return part.blob
        return None

    def iter_block_items(self, parent, directory):
        """Collect the body content that follows each heading in ``directory``.

        The children of ``parent`` are walked in document order.  Collection
        starts at the heading paragraph (style name containing ``'Heading'``)
        whose text equals the first directory title.  Each time the heading of
        the next directory entry is met, the content gathered so far is
        attached to the current entry.  After the last entry, the first
        further heading ends the scan; if the document ends first, the
        content gathered so far is still attached to the entry in progress.

        Args:
            parent: Parsed container to walk, normally ``self.doc``; a table
                cell (``_Cell``) is accepted as well.
            directory: Chapter tuples in document order, for example
                ``[('4', '工程需求'), ('4.1', '外部接口需求'), ('4.2', '功能需求')]``.

        Returns:
            A list with one tuple per chapter that was located: the directory
            tuple extended by a JSON string.  The JSON encodes a list whose
            items are paragraph texts, image objects
            (``{"__type__": "image", "format": "base64", "data": ...}``) and,
            for tables, a list of rows with cells joined by tabs.

        Raises:
            ValueError: If ``parent`` is neither a ``Document`` nor a ``_Cell``.
        """
        def custom_serializer(obj):
            # json.dumps hook: image bytes are the only non-JSON values that
            # are collected, and they are emitted as base64 text.
            if isinstance(obj, bytes):
                return {
                    '__type__': 'image',
                    'format': 'base64',
                    'data': base64.b64encode(obj).decode('utf-8')
                }
            # Returning the object unchanged would only make json re-enter
            # this hook; report unsupported types the way json itself does.
            raise TypeError(
                f'Object of type {type(obj).__name__} is not JSON serializable'
            )

        def pack(entry, content):
            # Directory tuple + serialized content of that chapter.
            return entry + (
                json.dumps(
                    content,
                    default=custom_serializer,
                    ensure_ascii=False,
                ),
            )

        if isinstance(parent, Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("something's not right")

        body_list = []
        if not directory:
            return body_list

        last = len(directory) - 1
        i = 0            # index of the directory entry being collected
        body = []        # content gathered for directory[i]
        started = False  # becomes True at the first chapter's heading

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_Tbl):
                if started:
                    # One string per row, cells separated by tabs.
                    body.append([
                        "\t".join(cell.text for cell in row.cells)
                        for row in Table(child, parent).rows
                    ])
                continue
            if not isinstance(child, CT_P):
                continue

            paragraph = Paragraph(child, parent)
            text = paragraph.text
            if 'Heading' in paragraph.style.name:
                # Directory titles are stripped, so compare stripped text.
                heading = text.strip()
                if not started:
                    started = heading == directory[0][1]
                    continue
                if i == last:
                    # First heading after the last chapter ends the scan.
                    break
                if heading == directory[i + 1][1]:
                    body_list.append(pack(directory[i], body))
                    body = []
                    i += 1
                    continue
                if heading == directory[i][1]:
                    continue  # repeated heading of the current chapter
                # Any other heading is kept as ordinary text below.
            if not started:
                continue

            blob = self.get_ImagePart(paragraph, parent)
            if blob is not None:
                body.append(blob)
            elif text != '':
                body.append(text)

        # Flush the chapter in progress, whether the scan stopped at a
        # heading or simply ran out of document.
        if started:
            body_list.append(pack(directory[i], body))
        return body_list

    def main(self, chapter_name):
        """Extract one chapter of the document as a JSON-ready tree.

        Looks up the chapter's directory entries, gathers the content under
        each of them from ``self.doc`` and assembles the result into a tree.

        Args:
            chapter_name: Title of the chapter to extract.

        Returns:
            dict: The tree produced by ``build_json_tree``.
        """
        toc_entries = self.get_chapter_number(chapter_name)
        sections = self.iter_block_items(self.doc, toc_entries)
        return self.build_json_tree(sections)

if __name__ == '__main__':
    # Manual run against a local sample document; the resulting tree is
    # built but not printed.
    sample_extractor = DocxChapterExtractor('test - 副本.docx')
    sample_extractor.main('工程需求')