# Files
#
# 123 lines
# 4.9 KiB
# Python
# Raw Permalink Normal View History
#
# 2026-04-13 11:34:23 +08:00
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


@dataclass
class ChunkVectorMetadata:
    """Per-chunk metadata record used as a vector-DB payload with graph links.

    The first eight fields are required; every other field has a default.
    List-valued fields each get their own fresh empty list per instance.
    """

    # Required fields (no defaults).
    chunk_id: str
    kb_id: int
    document_id: int
    document_name: str
    document_path: str
    chunk_index: int
    chunk_text: str
    token_count: int

    # Optional descriptive fields.
    language: str = "zh"
    source_type: str = "document"
    mission_phase: Optional[str] = None
    section_title: Optional[str] = None
    publish_time: Optional[str] = None

    # Extraction results and graph linkage identifiers.
    extracted_entities: List[str] = field(default_factory=list)
    extracted_entity_types: List[str] = field(default_factory=list)
    extracted_relations: List[Dict[str, Any]] = field(default_factory=list)
    graph_node_ids: List[str] = field(default_factory=list)
    graph_edge_ids: List[str] = field(default_factory=list)
    community_ids: List[str] = field(default_factory=list)

    # Embedding bookkeeping.
    embedding_model: str = ""
    embedding_dim: int = 0

    # Filled with the current UTC timestamp when the instance is created.
    ingest_time: str = field(default_factory=_utc_now_iso)

    def to_payload(self) -> Dict[str, Any]:
        """Return the record as a flat dict keyed by field name.

        Keys appear in the order listed below. Values are the attribute
        objects themselves (no copies are made), so list values are shared
        with this instance.
        """
        payload_keys = (
            "chunk_id",
            "kb_id",
            "document_id",
            "document_name",
            "document_path",
            "chunk_index",
            "chunk_text",
            "token_count",
            "language",
            "source_type",
            "mission_phase",
            "section_title",
            "publish_time",
            "extracted_entities",
            "extracted_entity_types",
            "extracted_relations",
            "graph_node_ids",
            "graph_edge_ids",
            "community_ids",
            "embedding_model",
            "embedding_dim",
            "ingest_time",
        )
        return {key: getattr(self, key) for key in payload_keys}
def qdrant_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Build the recommended Qdrant collection layout as a plain dict.

    Args:
        collection_name: Stored under the ``"collection_name"`` key.
        vector_size: Stored under ``"vectors" -> "size"``.

    Returns:
        A dict with ``collection_name``, ``vectors`` (size plus ``"Cosine"``
        distance) and ``payload_indexes`` (one ``field_name``/``field_schema``
        entry per payload field to index).
    """
    # (payload field, index schema) pairs, in the order they are emitted.
    indexed_fields = (
        ("kb_id", "integer"),
        ("document_id", "integer"),
        ("document_name", "keyword"),
        ("chunk_id", "keyword"),
        ("mission_phase", "keyword"),
        ("community_ids", "keyword"),
        ("extracted_entities", "keyword"),
        ("ingest_time", "datetime"),
    )
    payload_indexes = [
        {"field_name": name, "field_schema": schema}
        for name, schema in indexed_fields
    ]
    vectors = {"size": vector_size, "distance": "Cosine"}

    return {
        "collection_name": collection_name,
        "vectors": vectors,
        "payload_indexes": payload_indexes,
    }
def milvus_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Build the Milvus field/index design for vector+graph linkage as a dict.

    Args:
        collection_name: Stored under the ``"collection_name"`` key.
        vector_size: Used as ``dim`` of the ``embedding`` FLOAT_VECTOR field.

    Returns:
        A dict with ``collection_name``, ``fields`` (a VARCHAR primary key
        ``id``, scalar metadata fields, and the ``embedding`` vector field)
        and ``index`` (HNSW over ``embedding`` with the COSINE metric).
    """

    def scalar(name: str, type_name: str) -> Dict[str, Any]:
        # Field spec that carries only a name and a type.
        return {"name": name, "type": type_name}

    def varchar(name: str, max_length: int) -> Dict[str, Any]:
        # VARCHAR field spec; key order is name, type, max_length.
        return {"name": name, "type": "VARCHAR", "max_length": max_length}

    fields = [
        {**varchar("id", 64), "is_primary": True},
        scalar("kb_id", "INT64"),
        scalar("document_id", "INT64"),
        scalar("chunk_index", "INT32"),
        varchar("document_name", 255),
        varchar("mission_phase", 64),
        varchar("community_ids", 512),
        varchar("extracted_entities", 2048),
        varchar("ingest_time", 64),
        {"name": "embedding", "type": "FLOAT_VECTOR", "dim": vector_size},
    ]
    index = {
        "field_name": "embedding",
        "index_type": "HNSW",
        "metric_type": "COSINE",
        "params": {"M": 16, "efConstruction": 200},
    }

    return {
        "collection_name": collection_name,
        "fields": fields,
        "index": index,
    }
# SQL statement that extends the `document_chunks` table with chunk metadata
# columns. Every column name here is also a field of ChunkVectorMetadata
# (chunk_index, token_count, language, mission_phase, the extracted_* and
# graph_*/community id lists, and the embedding model/dimension).
# Each addition is guarded by IF NOT EXISTS, and the list-like values are
# declared as JSON columns.
# The trailing .strip() removes the newlines adjacent to the triple quotes so
# the constant starts at "ALTER" and ends at the final ";".
# NOTE(review): support for `ADD COLUMN IF NOT EXISTS` differs between SQL
# dialects — confirm it against the database this is executed on.
DOCUMENT_CHUNK_METADATA_DDL = """
ALTER TABLE document_chunks
ADD COLUMN IF NOT EXISTS chunk_index INT NULL,
ADD COLUMN IF NOT EXISTS token_count INT NULL,
ADD COLUMN IF NOT EXISTS language VARCHAR(16) DEFAULT 'zh',
ADD COLUMN IF NOT EXISTS mission_phase VARCHAR(64) NULL,
ADD COLUMN IF NOT EXISTS extracted_entities JSON NULL,
ADD COLUMN IF NOT EXISTS extracted_entity_types JSON NULL,
ADD COLUMN IF NOT EXISTS extracted_relations JSON NULL,
ADD COLUMN IF NOT EXISTS graph_node_ids JSON NULL,
ADD COLUMN IF NOT EXISTS graph_edge_ids JSON NULL,
ADD COLUMN IF NOT EXISTS community_ids JSON NULL,
ADD COLUMN IF NOT EXISTS embedding_model VARCHAR(128) NULL,
ADD COLUMN IF NOT EXISTS embedding_dim INT NULL;
""".strip()