Files
linux_format_docs_check/tests/test_docx_parser.py

120 lines
4.9 KiB
Python

from pathlib import Path
from docx import Document
from app.docx_parser import parse_docx
from scripts.docx_full_parser import DocxPackage
def test_parse_docx_extracts_headings_paragraphs_and_tables(tmp_path: Path) -> None:
    """Check that ``parse_docx`` exposes the heading, text and table of a generated file.

    A small document (one level-1 heading, one paragraph, one 1x2 table) is
    written into ``tmp_path`` and then parsed; the assertions inspect the
    ``filename``, ``text``, ``headings`` and ``tables`` attributes of the result.
    """
    # Arrange: build the fixture document in the same order it should be read back.
    source = Document()
    source.add_heading("软件需求规格说明", level=1)
    source.add_paragraph("本文档描述 CSCI 的能力需求和接口需求。")
    grid = source.add_table(rows=1, cols=2)
    for column_index, cell_text in enumerate(("需求编号", "REQ-001")):
        grid.cell(0, column_index).text = cell_text
    sample_file = tmp_path / "sample.docx"
    source.save(sample_file)

    # Act
    result = parse_docx(sample_file)

    # Assert
    assert result.filename == "sample.docx"
    for expected_fragment in ("软件需求规格说明", "REQ-001"):
        assert expected_fragment in result.text
    first_heading = result.headings[0]
    assert first_heading.text == "软件需求规格说明"
    first_table_row = result.tables[0][0]
    assert first_table_row == ["需求编号", "REQ-001"]
def test_docx_package_extracts_elements_across_parts_and_replaces_text(tmp_path: Path) -> None:
    """Exercise ``DocxPackage.extract`` and ``replace_text`` on body, header, footer and table.

    The fixture document carries the marker "原始" in a heading, a paragraph,
    the first section's header and footer, and a table cell.  After extraction
    is checked, the marker is replaced, the package is saved to a second file,
    and that file is reopened to confirm each location now holds the new text.
    """
    source_file = tmp_path / "full.docx"
    target_file = tmp_path / "modified.docx"

    # Arrange: place the marker text in every kind of location under test.
    source = Document()
    source.add_heading("原始标题", level=1)
    source.add_paragraph("正文原始内容")
    first_section = source.sections[0]
    first_section.header.paragraphs[0].text = "页眉原始内容"
    first_section.footer.paragraphs[0].text = "页脚原始内容"
    source.add_table(rows=1, cols=1).cell(0, 0).text = "表格原始内容"
    source.save(source_file)

    # Act (extraction)
    package = DocxPackage(source_file)
    extraction = package.extract()
    combined_text = "\n".join(element.text for element in extraction.elements)

    # Assert (extraction)
    assert any(part.name == "word/document.xml" for part in extraction.parts)
    for expected_fragment in ("原始标题", "页眉原始内容", "页脚原始内容"):
        assert expected_fragment in combined_text
    assert any(element.kind == "table" for element in extraction.elements)

    # Act (replacement): save before asserting so the output file always exists.
    replaced_count = package.replace_text("原始", "修改后")
    package.save(target_file)

    # Assert (replacement)
    assert replaced_count >= 4
    reopened = Document(target_file)
    body_text = "\n".join(paragraph.text for paragraph in reopened.paragraphs)
    assert "修改后标题" in body_text
    reopened_section = reopened.sections[0]
    assert reopened_section.header.paragraphs[0].text == "页眉修改后内容"
    assert reopened_section.footer.paragraphs[0].text == "页脚修改后内容"
    assert reopened.tables[0].cell(0, 0).text == "表格修改后内容"
def test_docx_package_replaces_text_split_across_runs(tmp_path: Path) -> None:
    """Check ``replace_text`` when the search string spans several runs of one paragraph.

    The paragraph is assembled from three separate runs whose concatenation is
    the search string; the test expects exactly one replacement and the full
    replacement text when the saved file is reopened.
    """
    source_file = tmp_path / "split.docx"
    target_file = tmp_path / "split-modified.docx"

    # Arrange: one paragraph whose text is spread over three runs.
    source = Document()
    split_paragraph = source.add_paragraph()
    for fragment in ("附录", "A ", "文档审查单"):
        split_paragraph.add_run(fragment)
    source.save(source_file)

    # Act
    package = DocxPackage(source_file)
    replaced_count = package.replace_text("附录A 文档审查单", "附录A 文档检查单")
    package.save(target_file)
    reopened = Document(target_file)

    # Assert
    assert replaced_count == 1
    first_paragraph = reopened.paragraphs[0]
    assert first_paragraph.text == "附录A 文档检查单"
def test_docx_package_fills_review_result_columns(tmp_path: Path) -> None:
    """Exercise ``DocxPackage.fill_review_results`` on a 5x7 review-checklist table.

    The fixture has a heading paragraph containing "A.3" followed by a table
    with a title row, two header rows and two item rows (sequences "1" and
    "2").  Item row 3 starts with "旧值" in column 4.  The test calls
    ``fill_review_results(heading_contains="A.3", result="通过")``, saves the
    package, and checks the returned updates and the reopened table.
    """
    source_file = tmp_path / "review.docx"
    target_file = tmp_path / "review-modified.docx"

    # Row index -> {column index -> cell text}; insertion order is the write order.
    cell_values = {
        0: {0: "文档名称"},
        1: {0: "序号", 1: "审查项", 2: "审查内容", 3: "审查结果(填√)", 6: "备注"},
        2: {
            0: "序号",
            1: "审查项",
            2: "审查内容",
            3: "通过",
            4: "未通过",
            5: "不适用",
            6: "备注",
        },
        3: {
            0: "1",
            1: "完整性",
            2: "标识描述本文档所适用系统和软件的完整标识。",
            4: "旧值",
        },
        4: {0: "2", 1: "完整性", 2: "系统概述本文档适用的系统和软件的用途。"},
    }

    # Arrange
    source = Document()
    source.add_paragraph("A.3软件设计文档审查单")
    grid = source.add_table(rows=5, cols=7)
    for row_index, columns in cell_values.items():
        row_cells = grid.rows[row_index].cells
        for column_index, cell_text in columns.items():
            row_cells[column_index].text = cell_text
    source.save(source_file)

    # Act
    package = DocxPackage(source_file)
    updates = package.fill_review_results(heading_contains="A.3", result="通过")
    package.save(target_file)

    # Assert: one update per item row, carrying that row's review content.
    assert [update.sequence for update in updates] == ["1", "2"]
    assert updates[0].review_content == "标识描述本文档所适用系统和软件的完整标识。"

    reopened = Document(target_file)
    result_rows = reopened.tables[0].rows
    # NOTE(review): every result cell is expected to read back as "" here, even
    # column 3 ("通过"), the result being filled. Possibly the marker is not
    # visible through ``.text`` or a glyph was lost from these literals; confirm
    # against the implementation.
    first_item_cells = result_rows[3].cells
    for column_index in (3, 4, 5):
        assert first_item_cells[column_index].text == ""
    assert result_rows[4].cells[3].text == ""