需求解析功能完善

This commit is contained in:
2025-04-30 17:44:52 +08:00
parent 4faed52de5
commit 9a716bb730
6 changed files with 255 additions and 112 deletions

View File

@@ -234,7 +234,7 @@ class UploadController(ControllerBase):
# 上传需求规格说明.docx进行解析
@route.post("/upload_xq_docx/", url_name='dut-xq-docx')
def upload_xq_docx(self, dut_key: str, project_id: int, file: File[UploadedFile]):
def upload_xq_docx(self, parseChapter: str, file: File[UploadedFile]):
# 构建临时目录
with tempfile.TemporaryDirectory() as tmp_dir:
# 保存到临时目录
@@ -242,5 +242,5 @@ class UploadController(ControllerBase):
with open(docx_path, 'wb') as f:
for chunk in file.chunks():
f.write(chunk)
extractor = DocxChapterExtractor(docx_path)
extractor.main('需求')
extracter = DocxChapterExtractor(docx_path)
return extracter.main(parseChapter)

File diff suppressed because one or more lines are too long

View File

@@ -1,11 +1,16 @@
import json
import re
import docx
import base64
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.parts.image import ImagePart
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from collections import OrderedDict
class DocxChapterExtractor(object):
@@ -160,6 +165,15 @@ class DocxChapterExtractor(object):
return None
def iter_block_items(self, parent, directory):
def custom_serializer(obj):
if isinstance(obj, bytes):
return {
'__type__': 'image',
'format': 'base64',
'data': base64.b64encode(obj).decode('utf-8')
}
return obj
"""
根据目录匹配章节内容
parent: docx解析内容, 传入self.doc
@@ -185,7 +199,13 @@ class DocxChapterExtractor(object):
continue
if paragraph.text == directory[i + 1][1] and 'Heading' in paragraph.style.name:
# body_list.append(body)
new_tuple = directory[i] + (repr(body),)
new_tuple = directory[i] + (
json.dumps(
body,
default=custom_serializer,
ensure_ascii=False,
),
)
body_list.append(new_tuple)
# print(new_tuple)
body = []
@@ -199,7 +219,13 @@ class DocxChapterExtractor(object):
body.append(paragraph.text)
elif i == len(directory) - 1:
if 'Heading' in paragraph.style.name:
new_tuple = directory[i] + (repr(body),)
new_tuple = directory[i] + (
json.dumps(
body,
default=custom_serializer,
ensure_ascii=False,
),
)
body_list.append(new_tuple)
break
if self.is_image(paragraph, parent):
@@ -223,14 +249,14 @@ class DocxChapterExtractor(object):
def main(self, chapter_name):
directory = self.get_chapter_number(chapter_name)
print(directory)
# print(directory)
chapter_body_list = self.iter_block_items(self.doc, directory)
print(chapter_body_list)
# print(chapter_body_list)
# 构建层级结构
# hierarchy = self.build_hierarchy(chapter_body_list)
# print(hierarchy)
json_tree = self.build_json_tree(chapter_body_list)
print(json_tree)
return json_tree
if __name__ == '__main__':
docx_path = 'test - 副本.docx'