Files

68 lines
1.7 KiB
Python
Raw Permalink Normal View History

2026-05-18 15:50:43 +08:00
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from docx import Document
@dataclass(frozen=True)
class Heading:
    """Immutable record describing one heading paragraph of a document."""

    # Numeric heading level (parse_docx derives it from the paragraph's style name).
    level: int
    # Heading text; parse_docx stores it with surrounding whitespace stripped.
    text: str
@dataclass(frozen=True)
class ParsedDocument:
    """Immutable container for the content extracted from one document.

    Attribute rebinding is blocked by ``frozen=True``; the list values
    themselves remain ordinary mutable lists.
    """

    # Name reported for the document (caller-supplied or the file's own name).
    filename: str
    # Newline-joined text of all kept paragraphs followed by all table rows.
    text: str
    # Non-empty paragraph texts, whitespace-stripped, in document order.
    paragraphs: list[str]
    # Paragraphs whose style name identifies them as headings.
    headings: list[Heading]
    # One entry per table: a list of rows, each row a list of cell texts.
    tables: list[list[list[str]]]
def _heading_level(style_name: str) -> int | None:
if not style_name.lower().startswith("heading"):
return None
parts = style_name.split()
if parts and parts[-1].isdigit():
return int(parts[-1])
return 1
def parse_docx(path: Path | str, display_filename: str | None = None) -> ParsedDocument:
    """Read a .docx file and collect its paragraphs, headings and tables.

    Args:
        path: Location of the .docx file.
        display_filename: Name to record in the result; when omitted (or
            empty) the file's own name is used.

    Returns:
        A ParsedDocument. Its ``text`` joins, with newlines, every kept
        paragraph followed by every kept table row (cells joined by
        ``" | "``); table rows therefore always come after all paragraphs,
        regardless of where the tables sit in the document.
    """
    docx_path = Path(path)
    document = Document(docx_path)

    paragraphs: list[str] = []
    headings: list[Heading] = []
    tables: list[list[list[str]]] = []
    text_parts: list[str] = []

    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if not text:
            # Whitespace-only paragraphs are dropped entirely.
            continue
        paragraphs.append(text)
        text_parts.append(text)

        # A style object may be present yet have no name (None); treat that
        # the same as a missing style so the heading lookup gets a string.
        style = paragraph.style
        style_name = (style.name or "") if style else ""
        level = _heading_level(style_name)
        if level is not None:
            headings.append(Heading(level=level, text=text))

    for table in document.tables:
        rows: list[list[str]] = []
        for row in table.rows:
            values = [cell.text.strip() for cell in row.cells]
            # Keep a row only when at least one of its cells has text.
            if any(values):
                rows.append(values)
                text_parts.append(" | ".join(values))
        # Tables with no kept rows are omitted.
        if rows:
            tables.append(rows)

    return ParsedDocument(
        filename=display_filename or docx_path.name,
        text="\n".join(text_parts),
        paragraphs=paragraphs,
        headings=headings,
        tables=tables,
    )