from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional


@dataclass
class ChunkVectorMetadata:
    """Metadata payload for vector DB and graph linkage.

    One instance describes a single document chunk. The first eight fields
    are required; every other field has a default.
    """

    # Required fields (no defaults).
    chunk_id: str
    kb_id: int
    document_id: int
    document_name: str
    document_path: str
    chunk_index: int
    chunk_text: str
    token_count: int

    # Descriptive attributes with defaults.
    language: str = "zh"
    source_type: str = "document"
    mission_phase: Optional[str] = None
    section_title: Optional[str] = None
    publish_time: Optional[str] = None

    # Extraction results; each instance gets its own empty container.
    extracted_entities: List[str] = field(default_factory=list)
    extracted_entity_types: List[str] = field(default_factory=list)
    extracted_relations: List[Dict[str, Any]] = field(default_factory=list)

    # Graph linkage identifiers; each instance gets its own empty list.
    graph_node_ids: List[str] = field(default_factory=list)
    graph_edge_ids: List[str] = field(default_factory=list)
    community_ids: List[str] = field(default_factory=list)

    # Embedding description.
    embedding_model: str = ""
    embedding_dim: int = 0

    # Defaults to the current UTC time as an ISO-8601 string, evaluated when
    # the instance is created.
    ingest_time: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    def to_payload(self) -> Dict[str, Any]:
        """Return all fields as a plain dict keyed by field name.

        Keys appear in field-declaration order. Lists and dicts are copied,
        so mutating the returned payload does not modify this instance.

        Returns:
            A new dict mapping every field name to its current value.
        """
        # asdict walks the declared fields, which keeps the payload in sync
        # with the class definition and copies nested containers.
        return asdict(self)


def qdrant_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Qdrant collection and payload index recommendations.

    Only builds and returns a description dict; no client call is made here.

    Args:
        collection_name: Name stored under ``"collection_name"``.
        vector_size: Vector dimensionality stored under ``vectors["size"]``.

    Returns:
        A dict with ``"collection_name"``, ``"vectors"`` (size and ``"Cosine"``
        distance) and ``"payload_indexes"`` (field name / schema pairs).
    """
    return {
        "collection_name": collection_name,
        "vectors": {
            "size": vector_size,
            "distance": "Cosine",
        },
        "payload_indexes": [
            {"field_name": "kb_id", "field_schema": "integer"},
            {"field_name": "document_id", "field_schema": "integer"},
            {"field_name": "document_name", "field_schema": "keyword"},
            {"field_name": "chunk_id", "field_schema": "keyword"},
            {"field_name": "mission_phase", "field_schema": "keyword"},
            {"field_name": "community_ids", "field_schema": "keyword"},
            {"field_name": "extracted_entities", "field_schema": "keyword"},
            {"field_name": "ingest_time", "field_schema": "datetime"},
        ],
    }


def milvus_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Milvus field design for vector+graph linkage.

    Only builds and returns a description dict; no client call is made here.

    Args:
        collection_name: Name stored under ``"collection_name"``.
        vector_size: Dimensionality used as ``dim`` of the ``embedding`` field.

    Returns:
        A dict with ``"collection_name"``, ``"fields"`` (scalar fields plus
        the ``embedding`` vector field) and ``"index"`` (HNSW with COSINE
        metric on ``embedding``).
    """
    return {
        "collection_name": collection_name,
        "fields": [
            {"name": "id", "type": "VARCHAR", "max_length": 64, "is_primary": True},
            {"name": "kb_id", "type": "INT64"},
            {"name": "document_id", "type": "INT64"},
            {"name": "chunk_index", "type": "INT32"},
            {"name": "document_name", "type": "VARCHAR", "max_length": 255},
            {"name": "mission_phase", "type": "VARCHAR", "max_length": 64},
            # NOTE(review): list-valued metadata is declared as VARCHAR here;
            # presumably callers serialize the lists before insert - confirm.
            {"name": "community_ids", "type": "VARCHAR", "max_length": 512},
            {"name": "extracted_entities", "type": "VARCHAR", "max_length": 2048},
            {"name": "ingest_time", "type": "VARCHAR", "max_length": 64},
            {"name": "embedding", "type": "FLOAT_VECTOR", "dim": vector_size},
        ],
        "index": {
            "field_name": "embedding",
            "index_type": "HNSW",
            "metric_type": "COSINE",
            "params": {"M": 16, "efConstruction": 200},
        },
    }


# ALTER TABLE statement that adds the chunk metadata columns to the
# ``document_chunks`` table, each guarded by IF NOT EXISTS.
# NOTE(review): ``ADD COLUMN IF NOT EXISTS`` is not accepted by every SQL
# dialect (e.g. stock MySQL rejects it) - confirm the target database.
DOCUMENT_CHUNK_METADATA_DDL = """
ALTER TABLE document_chunks
    ADD COLUMN IF NOT EXISTS chunk_index INT NULL,
    ADD COLUMN IF NOT EXISTS token_count INT NULL,
    ADD COLUMN IF NOT EXISTS language VARCHAR(16) DEFAULT 'zh',
    ADD COLUMN IF NOT EXISTS mission_phase VARCHAR(64) NULL,
    ADD COLUMN IF NOT EXISTS extracted_entities JSON NULL,
    ADD COLUMN IF NOT EXISTS extracted_entity_types JSON NULL,
    ADD COLUMN IF NOT EXISTS extracted_relations JSON NULL,
    ADD COLUMN IF NOT EXISTS graph_node_ids JSON NULL,
    ADD COLUMN IF NOT EXISTS graph_edge_ids JSON NULL,
    ADD COLUMN IF NOT EXISTS community_ids JSON NULL,
    ADD COLUMN IF NOT EXISTS embedding_model VARCHAR(128) NULL,
    ADD COLUMN IF NOT EXISTS embedding_dim INT NULL;
""".strip()