"""Parse ``.docx`` files into plain text, headings and tables using python-docx."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

from docx import Document


@dataclass(frozen=True)
class Heading:
    """A heading paragraph extracted from a document."""

    # Heading depth as derived from the paragraph's style name.
    level: int
    # Whitespace-stripped text of the heading paragraph.
    text: str


@dataclass(frozen=True)
class ParsedDocument:
    """The extracted content of one ``.docx`` file."""

    # Caller-supplied display name, or the file name of the parsed path.
    filename: str
    # Newline-joined text: all non-empty paragraphs, then all non-empty
    # table rows (cells joined with " | ").
    text: str
    # Non-empty paragraph texts (headings included), in document order.
    paragraphs: list[str]
    # The subset of paragraphs whose style name marks them as headings.
    headings: list[Heading]
    # One entry per non-empty table: rows of stripped cell texts.
    tables: list[list[list[str]]]


def _heading_level(style_name: str | None) -> int | None:
    """Return the heading level encoded in a paragraph style name.

    Args:
        style_name: Style name such as ``"Heading 2"``; may be empty or
            ``None`` when the style carries no name.

    Returns:
        ``None`` when the name does not start with "heading"
        (case-insensitive); otherwise the trailing integer of the name, or
        ``1`` when the name has no numeric suffix (e.g. ``"Heading"``).
    """
    if not style_name or not style_name.lower().startswith("heading"):
        return None

    parts = style_name.split()
    # isdecimal() rather than isdigit(): isdigit() is also true for
    # characters such as superscripts ("²") that int() refuses to parse.
    if parts and parts[-1].isdecimal():
        return int(parts[-1])
    return 1


def parse_docx(path: Path | str, display_filename: str | None = None) -> ParsedDocument:
    """Read a ``.docx`` file and collect its paragraphs, headings and tables.

    Args:
        path: Location of the document to open.
        display_filename: Name to store in the result; when ``None`` or
            empty, the file name component of ``path`` is used.

    Returns:
        A ``ParsedDocument``. Blank paragraphs and table rows whose cells are
        all blank are skipped. ``text`` lists every paragraph first and every
        table row afterwards, so tables do not appear at their original
        position between paragraphs.
    """
    docx_path = Path(path)
    document = Document(docx_path)

    paragraphs: list[str] = []
    headings: list[Heading] = []
    tables: list[list[list[str]]] = []
    text_parts: list[str] = []

    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue
        paragraphs.append(text)
        text_parts.append(text)

        # Both the style and its name may be missing; _heading_level treats
        # a missing name as "not a heading".
        style = paragraph.style
        level = _heading_level(style.name if style else None)
        if level is not None:
            headings.append(Heading(level=level, text=text))

    for table in document.tables:
        rows: list[list[str]] = []
        for row in table.rows:
            # NOTE(review): python-docx appears to report a merged cell once
            # per grid column it spans, which would repeat its text here --
            # confirm whether de-duplication is wanted.
            values = [cell.text.strip() for cell in row.cells]
            if any(values):
                rows.append(values)
                text_parts.append(" | ".join(values))
        if rows:
            tables.append(rows)

    return ParsedDocument(
        filename=display_filename or docx_path.name,
        text="\n".join(text_parts),
        paragraphs=paragraphs,
        headings=headings,
        tables=tables,
    )