68 lines
1.7 KiB
Python
68 lines
1.7 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from docx import Document
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True)
class Heading:
    """Immutable record pairing a heading's level with its text."""

    # Numeric heading level.
    level: int
    # Text content of the heading.
    text: str
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True)
class ParsedDocument:
    """Immutable container for the content extracted from one document."""

    # Name recorded for the document.
    filename: str
    # All extracted text as a single string.
    text: str
    # Individual paragraph strings.
    paragraphs: list[str]
    # Headings found in the document.
    headings: list[Heading]
    # Tables as nested lists: table -> row -> cell text.
    tables: list[list[list[str]]]
|
||
|
|
|
||
|
|
|
||
|
|
def _heading_level(style_name: str) -> int | None:
|
||
|
|
if not style_name.lower().startswith("heading"):
|
||
|
|
return None
|
||
|
|
parts = style_name.split()
|
||
|
|
if parts and parts[-1].isdigit():
|
||
|
|
return int(parts[-1])
|
||
|
|
return 1
|
||
|
|
|
||
|
|
|
||
|
|
def parse_docx(path: Path | str, display_filename: str | None = None) -> ParsedDocument:
    """Open a .docx file and collect its paragraphs, headings and tables.

    Args:
        path: Location of the .docx file to open.
        display_filename: Name to record on the result. When omitted or
            empty, the final component of ``path`` is used instead.

    Returns:
        A ParsedDocument where:
        - ``paragraphs`` holds every non-empty paragraph, stripped;
        - ``headings`` holds those paragraphs whose style name is
          recognised by ``_heading_level``;
        - ``tables`` holds, per table, its rows that contain at least one
          non-empty cell (tables with no such rows are omitted);
        - ``text`` joins all paragraph strings and then all kept table
          rows (cells joined with " | ") using newlines.
    """
    docx_path = Path(path)
    document = Document(docx_path)
    paragraphs: list[str] = []
    headings: list[Heading] = []
    tables: list[list[list[str]]] = []
    text_parts: list[str] = []

    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue
        paragraphs.append(text)
        text_parts.append(text)
        # Both the style and the style's name may be missing; normalise
        # either case to "" so the heading check never receives None.
        style = paragraph.style
        style_name = (style.name if style else None) or ""
        level = _heading_level(style_name)
        if level is not None:
            headings.append(Heading(level=level, text=text))

    # Table rows are appended to the text after all paragraph text.
    for table in document.tables:
        rows: list[list[str]] = []
        for row in table.rows:
            values = [cell.text.strip() for cell in row.cells]
            # Skip rows in which every cell is empty.
            if any(values):
                rows.append(values)
                text_parts.append(" | ".join(values))
        if rows:
            tables.append(rows)

    return ParsedDocument(
        filename=display_filename or docx_path.name,
        text="\n".join(text_parts),
        paragraphs=paragraphs,
        headings=headings,
        tables=tables,
    )
|