需求解析功能完善
This commit is contained in:
Binary file not shown.
@@ -234,7 +234,7 @@ class UploadController(ControllerBase):
|
||||
|
||||
# 上传需求规格说明.docx进行解析
|
||||
@route.post("/upload_xq_docx/", url_name='dut-xq-docx')
|
||||
def upload_xq_docx(self, dut_key: str, project_id: int, file: File[UploadedFile]):
|
||||
def upload_xq_docx(self, parseChapter: str, file: File[UploadedFile]):
|
||||
# 构建临时目录
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
# 保存到临时目录
|
||||
@@ -242,5 +242,5 @@ class UploadController(ControllerBase):
|
||||
with open(docx_path, 'wb') as f:
|
||||
for chunk in file.chunks():
|
||||
f.write(chunk)
|
||||
extractor = DocxChapterExtractor(docx_path)
|
||||
extractor.main('需求')
|
||||
extracter = DocxChapterExtractor(docx_path)
|
||||
return extracter.main(parseChapter)
|
||||
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -1,11 +1,16 @@
|
||||
import json
|
||||
import re
|
||||
import docx
|
||||
import base64
|
||||
|
||||
from docx.document import Document
|
||||
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.parts.image import ImagePart
|
||||
from docx.table import _Cell, Table
|
||||
from docx.oxml.table import CT_Tbl
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
class DocxChapterExtractor(object):
|
||||
@@ -160,6 +165,15 @@ class DocxChapterExtractor(object):
|
||||
return None
|
||||
|
||||
def iter_block_items(self, parent, directory):
|
||||
def custom_serializer(obj):
    """Fallback encoder for ``json.dumps(..., default=custom_serializer)``.

    The JSON encoder calls this hook only for values it cannot serialize
    natively. Raw ``bytes`` are wrapped in a small tagged dict whose payload
    is base64 text, so the binary content survives a JSON round trip.
    NOTE(review): the ``'image'`` tag presumes every bytes value in the
    parsed body is picture data -- confirm against the code that fills it.

    Args:
        obj: The value the JSON encoder could not handle on its own.

    Returns:
        A dict with the keys ``'__type__'``, ``'format'`` and ``'data'``,
        where ``'data'`` holds the base64-encoded content as ``str``.

    Raises:
        TypeError: If ``obj`` is not ``bytes``.
    """
    if isinstance(obj, bytes):
        return {
            '__type__': 'image',
            'format': 'base64',
            'data': base64.b64encode(obj).decode('utf-8'),
        }
    # Handing the object back unchanged makes the encoder call this hook
    # again with the same value and fail with the misleading
    # "ValueError: Circular reference detected". Raising TypeError is the
    # documented way for a ``default`` hook to reject a value.
    raise TypeError(
        f'Object of type {type(obj).__name__} is not JSON serializable'
    )
|
||||
|
||||
"""
|
||||
根据目录匹配章节内容
|
||||
parent: docx解析内容, 传入self.doc
|
||||
@@ -185,7 +199,13 @@ class DocxChapterExtractor(object):
|
||||
continue
|
||||
if paragraph.text == directory[i + 1][1] and 'Heading' in paragraph.style.name:
|
||||
# body_list.append(body)
|
||||
new_tuple = directory[i] + (repr(body),)
|
||||
new_tuple = directory[i] + (
|
||||
json.dumps(
|
||||
body,
|
||||
default=custom_serializer,
|
||||
ensure_ascii=False,
|
||||
),
|
||||
)
|
||||
body_list.append(new_tuple)
|
||||
# print(new_tuple)
|
||||
body = []
|
||||
@@ -199,7 +219,13 @@ class DocxChapterExtractor(object):
|
||||
body.append(paragraph.text)
|
||||
elif i == len(directory) - 1:
|
||||
if 'Heading' in paragraph.style.name:
|
||||
new_tuple = directory[i] + (repr(body),)
|
||||
new_tuple = directory[i] + (
|
||||
json.dumps(
|
||||
body,
|
||||
default=custom_serializer,
|
||||
ensure_ascii=False,
|
||||
),
|
||||
)
|
||||
body_list.append(new_tuple)
|
||||
break
|
||||
if self.is_image(paragraph, parent):
|
||||
@@ -223,14 +249,14 @@ class DocxChapterExtractor(object):
|
||||
|
||||
def main(self, chapter_name):
|
||||
directory = self.get_chapter_number(chapter_name)
|
||||
print(directory)
|
||||
# print(directory)
|
||||
chapter_body_list = self.iter_block_items(self.doc, directory)
|
||||
print(chapter_body_list)
|
||||
# print(chapter_body_list)
|
||||
# 构建层级结构
|
||||
# hierarchy = self.build_hierarchy(chapter_body_list)
|
||||
# print(hierarchy)
|
||||
json_tree = self.build_json_tree(chapter_body_list)
|
||||
print(json_tree)
|
||||
return json_tree
|
||||
|
||||
if __name__ == '__main__':
|
||||
docx_path = 'test - 副本.docx'
|
||||
|
||||
Reference in New Issue
Block a user