finish app develop

This commit is contained in:
kuangji
2026-05-18 15:50:43 +08:00
parent 8f23a841f0
commit 17decab2fc
20 changed files with 2447 additions and 0 deletions

67
app/docx_parser.py Normal file
View File

@@ -0,0 +1,67 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from docx import Document
@dataclass(frozen=True)
class Heading:
    """Immutable record of a single heading: its level and its text."""

    # Numeric heading depth.
    level: int

    # Text content of the heading.
    text: str
@dataclass(frozen=True)
class ParsedDocument:
    """Immutable bundle of the content extracted from one document.

    The instance itself is frozen (attributes cannot be rebound), but the
    list-valued fields are ordinary mutable lists.
    """

    # Name recorded for the document.
    filename: str

    # Extracted text as one string.
    text: str

    # Paragraph texts, one entry per paragraph.
    paragraphs: list[str]

    # Headings found in the document.
    headings: list[Heading]

    # Tables as nested lists: table -> row -> cell text.
    tables: list[list[list[str]]]
def _heading_level(style_name: str) -> int | None:
if not style_name.lower().startswith("heading"):
return None
parts = style_name.split()
if parts and parts[-1].isdigit():
return int(parts[-1])
return 1
def parse_docx(path: Path | str, display_filename: str | None = None) -> ParsedDocument:
    """Extract paragraphs, headings and tables from a .docx file.

    All paragraphs are processed first and all tables afterwards, so the
    combined ``text`` lists paragraph text before any table rows.

    Args:
        path: Location of the .docx file.
        display_filename: Name to record on the result. When omitted or
            empty, the final component of ``path`` is used instead.

    Returns:
        A ParsedDocument where:
        - ``paragraphs`` holds the stripped text of every non-blank paragraph;
        - ``headings`` holds one Heading per non-blank paragraph whose style
          name is recognised by ``_heading_level``;
        - ``tables`` holds, per table, the rows that have at least one
          non-empty cell (tables with no such rows are omitted);
        - ``text`` joins the kept paragraphs and the kept table rows
          (cells separated by " | ") with newlines.
    """
    docx_path = Path(path)
    document = Document(docx_path)

    paragraphs: list[str] = []
    headings: list[Heading] = []
    tables: list[list[list[str]]] = []
    text_parts: list[str] = []

    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue
        paragraphs.append(text)
        text_parts.append(text)

        # A paragraph may have no style, and a style may have no name;
        # normalise both cases to "" so the heading check never sees None.
        style = paragraph.style
        style_name = (style.name or "") if style else ""
        level = _heading_level(style_name)
        if level is not None:
            headings.append(Heading(level=level, text=text))

    for table in document.tables:
        rows: list[list[str]] = []
        for row in table.rows:
            values = [cell.text.strip() for cell in row.cells]
            # Skip rows whose cells are all blank.
            if any(values):
                rows.append(values)
                text_parts.append(" | ".join(values))
        if rows:
            tables.append(rows)

    return ParsedDocument(
        filename=display_filename or docx_path.name,
        text="\n".join(text_parts),
        paragraphs=paragraphs,
        headings=headings,
        tables=tables,
    )