finish app develop
This commit is contained in:
67
app/docx_parser.py
Normal file
67
app/docx_parser.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from docx import Document
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Heading:
    """A single heading found in a document.

    Instances are immutable: the dataclass is declared with ``frozen=True``,
    so attributes cannot be reassigned after construction.
    """

    # Numeric heading level (1 for a top-level heading).
    level: int
    # Text of the heading paragraph.
    text: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParsedDocument:
    """Immutable container for the content extracted from one document.

    ``frozen=True`` only blocks attribute reassignment; the list-valued
    fields themselves remain ordinary mutable lists.
    """

    # Name under which the document is reported.
    filename: str
    # Extracted text as one newline-joined string.
    text: str
    # Paragraph texts, one entry per paragraph.
    paragraphs: list[str]
    # Headings found among the paragraphs.
    headings: list[Heading]
    # Tables as table -> rows -> cell texts.
    tables: list[list[list[str]]]
|
||||
|
||||
|
||||
def _heading_level(style_name: str) -> int | None:
|
||||
if not style_name.lower().startswith("heading"):
|
||||
return None
|
||||
parts = style_name.split()
|
||||
if parts and parts[-1].isdigit():
|
||||
return int(parts[-1])
|
||||
return 1
|
||||
|
||||
|
||||
def parse_docx(path: Path | str, display_filename: str | None = None) -> ParsedDocument:
    """Extract paragraphs, headings, tables and plain text from a .docx file.

    Args:
        path: Location of the document to open.
        display_filename: Name to report in the result. When omitted or
            empty, the final component of ``path`` is used instead.

    Returns:
        A ``ParsedDocument``. Its ``text`` holds, one per line, every
        non-empty paragraph followed by every non-empty table row (cells
        joined with ``" | "``). Paragraphs and tables are collected in two
        separate passes, so the text is not interleaved in document order.
    """
    docx_path = Path(path)
    document = Document(docx_path)

    paragraphs: list[str] = []
    headings: list[Heading] = []
    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if not text:
            # Skip empty / whitespace-only paragraphs entirely.
            continue
        paragraphs.append(text)

        style = paragraph.style
        # A style's name may be None (python-docx returns None for a style
        # without a name); coalesce so the heading check never sees None.
        style_name = (style.name or "") if style else ""
        level = _heading_level(style_name)
        if level is not None:
            headings.append(Heading(level=level, text=text))

    tables: list[list[list[str]]] = []
    table_lines: list[str] = []
    for table in document.tables:
        rows: list[list[str]] = []
        for row in table.rows:
            values = [cell.text.strip() for cell in row.cells]
            # Keep a row only if at least one cell has text.
            if any(values):
                rows.append(values)
                table_lines.append(" | ".join(values))
        # Tables with no non-empty rows are dropped.
        if rows:
            tables.append(rows)

    return ParsedDocument(
        filename=display_filename or docx_path.name,
        text="\n".join([*paragraphs, *table_lines]),
        paragraphs=paragraphs,
        headings=headings,
        tables=tables,
    )
|
||||
Reference in New Issue
Block a user