# tests/test_docx_parser.py

from pathlib import Path

from docx import Document

from app.docx_parser import parse_docx
from scripts.docx_full_parser import DocxPackage


def test_parse_docx_extracts_headings_paragraphs_and_tables(tmp_path: Path) -> None:
    """Parse a generated .docx and check its filename, text, first heading and first table row.

    The fixture document holds one level-1 heading, one paragraph and a
    1x2 table; it is written into ``tmp_path`` and then read back through
    ``parse_docx``.
    """
    heading_text = "软件需求规格说明"
    table_row = ["需求编号", "REQ-001"]
    source_file = tmp_path / "sample.docx"

    # Build the fixture document on disk.
    fixture = Document()
    fixture.add_heading(heading_text, level=1)
    fixture.add_paragraph("本文档描述 CSCI 的能力需求和接口需求。")
    grid = fixture.add_table(rows=1, cols=2)
    for column, value in enumerate(table_row):
        grid.cell(0, column).text = value
    fixture.save(source_file)

    result = parse_docx(source_file)

    assert result.filename == "sample.docx"
    # Both the heading and the table cell value must appear in the flat text.
    assert heading_text in result.text
    assert table_row[1] in result.text
    assert result.headings[0].text == heading_text
    assert result.tables[0][0] == table_row


def test_docx_package_extracts_elements_across_parts_and_replaces_text(tmp_path: Path) -> None:
    """Extract text from body, header, footer and table, then replace it everywhere.

    The fixture places the marker "原始" in a heading, a body paragraph, the
    first section's header and footer, and a table cell. After
    ``replace_text`` and ``save`` the output file is reopened with
    python-docx to confirm the heading, header, footer and table cell changed.
    """
    source_file = tmp_path / "full.docx"
    target_file = tmp_path / "modified.docx"

    # Build a document whose marker text lives in several package parts.
    fixture = Document()
    fixture.add_heading("原始标题", level=1)
    fixture.add_paragraph("正文原始内容")
    first_section = fixture.sections[0]
    first_section.header.paragraphs[0].text = "页眉原始内容"
    first_section.footer.paragraphs[0].text = "页脚原始内容"
    single_cell_table = fixture.add_table(rows=1, cols=1)
    single_cell_table.cell(0, 0).text = "表格原始内容"
    fixture.save(source_file)

    package = DocxPackage(source_file)
    extraction = package.extract()
    combined_text = "\n".join(element.text for element in extraction.elements)

    assert any(part.name == "word/document.xml" for part in extraction.parts)
    for expected_fragment in ("原始标题", "页眉原始内容", "页脚原始内容"):
        assert expected_fragment in combined_text
    assert any(element.kind == "table" for element in extraction.elements)

    replaced_count = package.replace_text("原始", "修改后")
    package.save(target_file)

    assert replaced_count >= 4

    # Reopen the saved file independently of DocxPackage to verify the result.
    reopened = Document(target_file)
    body_text = "\n".join(paragraph.text for paragraph in reopened.paragraphs)
    assert "修改后标题" in body_text
    reopened_section = reopened.sections[0]
    assert reopened_section.header.paragraphs[0].text == "页眉修改后内容"
    assert reopened_section.footer.paragraphs[0].text == "页脚修改后内容"
    assert reopened.tables[0].cell(0, 0).text == "表格修改后内容"


def test_docx_package_replaces_text_split_across_runs(tmp_path: Path) -> None:
    """Replace a phrase whose characters are spread over three runs of one paragraph.

    The search phrase is the concatenation of the three run texts, so no
    single run contains it. The test expects exactly one replacement and
    the paragraph text to equal the new phrase after saving.
    """
    search_phrase = "附录A 文档审查单"
    new_phrase = "附录A 文档检查单"
    source_file = tmp_path / "split.docx"
    target_file = tmp_path / "split-modified.docx"

    # One paragraph, three runs that together spell the search phrase.
    fixture = Document()
    split_paragraph = fixture.add_paragraph()
    for run_text in ("附录", "A ", "文档审查单"):
        split_paragraph.add_run(run_text)
    fixture.save(source_file)

    package = DocxPackage(source_file)
    hit_count = package.replace_text(search_phrase, new_phrase)
    package.save(target_file)

    reopened = Document(target_file)
    assert hit_count == 1
    assert reopened.paragraphs[0].text == new_phrase


def test_docx_package_fills_review_result_columns(tmp_path: Path) -> None:
    """Fill the result columns of a review checklist table located under an "A.3" paragraph.

    The fixture is a 5x7 table: row 0 is a title row, rows 1-2 form a
    two-level header (columns 3-5 are labelled 通过 / 未通过 / 不适用 in row 2),
    and rows 3-4 are the numbered review items. Row 3 starts with a stale
    value in the 未通过 column. After ``fill_review_results(..., result="通过")``
    the test expects "✔" in column 3 of both item rows and empty text in
    columns 4 and 5 of row 3.
    """
    source_file = tmp_path / "review.docx"
    target_file = tmp_path / "review-modified.docx"

    # (row, column, text) triples, listed in the order they are written.
    cell_texts = [
        (0, 0, "文档名称"),
        (1, 0, "序号"),
        (1, 1, "审查项"),
        (1, 2, "审查内容"),
        (1, 3, "审查结果（填√）"),
        (1, 6, "备注"),
        (2, 0, "序号"),
        (2, 1, "审查项"),
        (2, 2, "审查内容"),
        (2, 3, "通过"),
        (2, 4, "未通过"),
        (2, 5, "不适用"),
        (2, 6, "备注"),
        (3, 0, "1"),
        (3, 1, "完整性"),
        (3, 2, "标识描述本文档所适用系统和软件的完整标识。"),
        (3, 4, "旧值"),  # stale mark the fill is expected to clear
        (4, 0, "2"),
        (4, 1, "完整性"),
        (4, 2, "系统概述本文档适用的系统和软件的用途。"),
    ]

    fixture = Document()
    fixture.add_paragraph("A.3软件设计文档审查单")
    checklist = fixture.add_table(rows=5, cols=7)
    for row_index, column_index, content in cell_texts:
        checklist.rows[row_index].cells[column_index].text = content
    fixture.save(source_file)

    package = DocxPackage(source_file)
    updates = package.fill_review_results(heading_contains="A.3", result="通过")
    package.save(target_file)

    assert [update.sequence for update in updates] == ["1", "2"]
    assert updates[0].review_content == "标识描述本文档所适用系统和软件的完整标识。"

    # Reopen the saved file and inspect the two item rows.
    reopened = Document(target_file)
    result_table = reopened.tables[0]
    first_item_cells = result_table.rows[3].cells
    assert first_item_cells[3].text == "✔"
    assert first_item_cells[4].text == ""
    assert first_item_cells[5].text == ""
    assert result_table.rows[4].cells[3].text == "✔"
finish app develop 2026-05-18 15:50:43 +08:00			`from pathlib import Path`

			`from docx import Document`

			`from app.docx_parser import parse_docx`
test edit question table docx based on table image 2026-05-26 15:08:34 +08:00			`from scripts.docx_full_parser import DocxPackage`
finish app develop 2026-05-18 15:50:43 +08:00

			`def test_parse_docx_extracts_headings_paragraphs_and_tables(tmp_path: Path) -> None:`
			`docx_path = tmp_path / "sample.docx"`
			`document = Document()`
			`document.add_heading("软件需求规格说明", level=1)`
			`document.add_paragraph("本文档描述 CSCI 的能力需求和接口需求。")`
			`table = document.add_table(rows=1, cols=2)`
			`table.cell(0, 0).text = "需求编号"`
			`table.cell(0, 1).text = "REQ-001"`
			`document.save(docx_path)`

			`parsed = parse_docx(docx_path)`

			`assert parsed.filename == "sample.docx"`
			`assert "软件需求规格说明" in parsed.text`
			`assert "REQ-001" in parsed.text`
			`assert parsed.headings[0].text == "软件需求规格说明"`
			`assert parsed.tables[0][0] == ["需求编号", "REQ-001"]`
test edit question table docx based on table image 2026-05-26 15:08:34 +08:00

			`def test_docx_package_extracts_elements_across_parts_and_replaces_text(tmp_path: Path) -> None:`
			`docx_path = tmp_path / "full.docx"`
			`output_path = tmp_path / "modified.docx"`
			`document = Document()`
			`document.add_heading("原始标题", level=1)`
			`document.add_paragraph("正文原始内容")`
			`document.sections[0].header.paragraphs[0].text = "页眉原始内容"`
			`document.sections[0].footer.paragraphs[0].text = "页脚原始内容"`
			`table = document.add_table(rows=1, cols=1)`
			`table.cell(0, 0).text = "表格原始内容"`
			`document.save(docx_path)`

			`package = DocxPackage(docx_path)`
			`extraction = package.extract()`
			`text = "\n".join(element.text for element in extraction.elements)`

			`assert any(part.name == "word/document.xml" for part in extraction.parts)`
			`assert "原始标题" in text`
			`assert "页眉原始内容" in text`
			`assert "页脚原始内容" in text`
			`assert any(element.kind == "table" for element in extraction.elements)`

			`replacements = package.replace_text("原始", "修改后")`
			`package.save(output_path)`

			`assert replacements >= 4`
			`modified = Document(output_path)`
			`assert "修改后标题" in "\n".join(paragraph.text for paragraph in modified.paragraphs)`
			`assert modified.sections[0].header.paragraphs[0].text == "页眉修改后内容"`
			`assert modified.sections[0].footer.paragraphs[0].text == "页脚修改后内容"`
			`assert modified.tables[0].cell(0, 0).text == "表格修改后内容"`


			`def test_docx_package_replaces_text_split_across_runs(tmp_path: Path) -> None:`
			`docx_path = tmp_path / "split.docx"`
			`output_path = tmp_path / "split-modified.docx"`
			`document = Document()`
			`paragraph = document.add_paragraph()`
			`paragraph.add_run("附录")`
			`paragraph.add_run("A ")`
			`paragraph.add_run("文档审查单")`
			`document.save(docx_path)`

			`package = DocxPackage(docx_path)`
			`replacements = package.replace_text("附录A 文档审查单", "附录A 文档检查单")`
			`package.save(output_path)`

			`modified = Document(output_path)`
			`assert replacements == 1`
			`assert modified.paragraphs[0].text == "附录A 文档检查单"`


			`def test_docx_package_fills_review_result_columns(tmp_path: Path) -> None:`
			`docx_path = tmp_path / "review.docx"`
			`output_path = tmp_path / "review-modified.docx"`
			`document = Document()`
			`document.add_paragraph("A.3软件设计文档审查单")`
			`table = document.add_table(rows=5, cols=7)`
			`table.rows[0].cells[0].text = "文档名称"`
			`table.rows[1].cells[0].text = "序号"`
			`table.rows[1].cells[1].text = "审查项"`
			`table.rows[1].cells[2].text = "审查内容"`
			`table.rows[1].cells[3].text = "审查结果（填√）"`
			`table.rows[1].cells[6].text = "备注"`
			`table.rows[2].cells[0].text = "序号"`
			`table.rows[2].cells[1].text = "审查项"`
			`table.rows[2].cells[2].text = "审查内容"`
			`table.rows[2].cells[3].text = "通过"`
			`table.rows[2].cells[4].text = "未通过"`
			`table.rows[2].cells[5].text = "不适用"`
			`table.rows[2].cells[6].text = "备注"`
			`table.rows[3].cells[0].text = "1"`
			`table.rows[3].cells[1].text = "完整性"`
			`table.rows[3].cells[2].text = "标识描述本文档所适用系统和软件的完整标识。"`
			`table.rows[3].cells[4].text = "旧值"`
			`table.rows[4].cells[0].text = "2"`
			`table.rows[4].cells[1].text = "完整性"`
			`table.rows[4].cells[2].text = "系统概述本文档适用的系统和软件的用途。"`
			`document.save(docx_path)`

			`package = DocxPackage(docx_path)`
			`updates = package.fill_review_results(heading_contains="A.3", result="通过")`
			`package.save(output_path)`

			`assert [update.sequence for update in updates] == ["1", "2"]`
			`assert updates[0].review_content == "标识描述本文档所适用系统和软件的完整标识。"`
			`modified = Document(output_path)`
			`modified_table = modified.tables[0]`
			`assert modified_table.rows[3].cells[3].text == "✔"`
			`assert modified_table.rows[3].cells[4].text == ""`
			`assert modified_table.rows[3].cells[5].text == ""`
			`assert modified_table.rows[4].cells[3].text == "✔"`