123 lines
4.9 KiB
Python
123 lines
4.9 KiB
Python
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
|
|
@dataclass
class ChunkVectorMetadata:
    """Metadata payload for vector DB and graph linkage.

    One instance describes a single document chunk. The first eight fields
    are required; every other field has a default. ``to_payload`` flattens
    the instance into a plain dictionary keyed by field name.
    """

    # --- Required: identifiers and chunk content -------------------------
    chunk_id: str
    kb_id: int
    document_id: int
    document_name: str
    document_path: str
    chunk_index: int
    chunk_text: str
    token_count: int

    # --- Optional descriptive attributes ---------------------------------
    language: str = "zh"
    source_type: str = "document"
    mission_phase: Optional[str] = None
    section_title: Optional[str] = None
    publish_time: Optional[str] = None

    # --- Extraction results and graph linkage ----------------------------
    # default_factory gives every instance its own list instead of a shared
    # mutable default.
    extracted_entities: List[str] = field(default_factory=list)
    extracted_entity_types: List[str] = field(default_factory=list)
    extracted_relations: List[Dict[str, Any]] = field(default_factory=list)
    graph_node_ids: List[str] = field(default_factory=list)
    graph_edge_ids: List[str] = field(default_factory=list)
    community_ids: List[str] = field(default_factory=list)

    # --- Embedding bookkeeping -------------------------------------------
    embedding_model: str = ""
    embedding_dim: int = 0

    # Timezone-aware UTC timestamp in ISO-8601 form, evaluated when the
    # instance is created (not when the module is imported).
    ingest_time: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    def to_payload(self) -> Dict[str, Any]:
        """Return all fields as a dictionary keyed by field name.

        Keys appear in field-declaration order. List-valued entries are the
        instance's own list objects (no copy is made), so mutating them in
        the returned dictionary also mutates this instance.

        Returns:
            A dictionary with one entry per dataclass field.
        """
        return {
            "chunk_id": self.chunk_id,
            "kb_id": self.kb_id,
            "document_id": self.document_id,
            "document_name": self.document_name,
            "document_path": self.document_path,
            "chunk_index": self.chunk_index,
            "chunk_text": self.chunk_text,
            "token_count": self.token_count,
            "language": self.language,
            "source_type": self.source_type,
            "mission_phase": self.mission_phase,
            "section_title": self.section_title,
            "publish_time": self.publish_time,
            "extracted_entities": self.extracted_entities,
            "extracted_entity_types": self.extracted_entity_types,
            "extracted_relations": self.extracted_relations,
            "graph_node_ids": self.graph_node_ids,
            "graph_edge_ids": self.graph_edge_ids,
            "community_ids": self.community_ids,
            "embedding_model": self.embedding_model,
            "embedding_dim": self.embedding_dim,
            "ingest_time": self.ingest_time,
        }
|
|
|
|
|
|
def qdrant_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Qdrant collection and payload index recommendations.

    Only builds and returns a plain description dictionary; nothing is sent
    to a Qdrant server from here.

    Args:
        collection_name: Name stored under the ``"collection_name"`` key.
        vector_size: Vector dimensionality stored under ``vectors["size"]``.

    Returns:
        A dictionary with three keys: ``"collection_name"``, ``"vectors"``
        (size plus the ``"Cosine"`` distance) and ``"payload_indexes"``
        (a list of ``{"field_name", "field_schema"}`` entries).
    """
    return {
        "collection_name": collection_name,
        "vectors": {
            "size": vector_size,
            "distance": "Cosine",
        },
        "payload_indexes": [
            {"field_name": "kb_id", "field_schema": "integer"},
            {"field_name": "document_id", "field_schema": "integer"},
            {"field_name": "document_name", "field_schema": "keyword"},
            {"field_name": "chunk_id", "field_schema": "keyword"},
            {"field_name": "mission_phase", "field_schema": "keyword"},
            {"field_name": "community_ids", "field_schema": "keyword"},
            {"field_name": "extracted_entities", "field_schema": "keyword"},
            {"field_name": "ingest_time", "field_schema": "datetime"},
        ],
    }
|
|
|
|
|
|
def milvus_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Milvus field design for vector+graph linkage.

    Only builds and returns a plain description dictionary; nothing is sent
    to a Milvus server from here.

    Args:
        collection_name: Name stored under the ``"collection_name"`` key.
        vector_size: Dimensionality assigned to the ``embedding`` field's
            ``"dim"`` entry.

    Returns:
        A dictionary with three keys: ``"collection_name"``, ``"fields"``
        (a list of field descriptors, with ``id`` flagged as primary) and
        ``"index"`` (HNSW / COSINE settings for the ``embedding`` field).
    """
    return {
        "collection_name": collection_name,
        "fields": [
            {"name": "id", "type": "VARCHAR", "max_length": 64, "is_primary": True},
            {"name": "kb_id", "type": "INT64"},
            {"name": "document_id", "type": "INT64"},
            {"name": "chunk_index", "type": "INT32"},
            {"name": "document_name", "type": "VARCHAR", "max_length": 255},
            {"name": "mission_phase", "type": "VARCHAR", "max_length": 64},
            # List-like metadata is declared as VARCHAR here.
            # NOTE(review): presumably callers serialize the lists into a
            # single string before insert — confirm against the writer.
            {"name": "community_ids", "type": "VARCHAR", "max_length": 512},
            {"name": "extracted_entities", "type": "VARCHAR", "max_length": 2048},
            {"name": "ingest_time", "type": "VARCHAR", "max_length": 64},
            {"name": "embedding", "type": "FLOAT_VECTOR", "dim": vector_size},
        ],
        "index": {
            "field_name": "embedding",
            "index_type": "HNSW",
            "metric_type": "COSINE",
            "params": {"M": 16, "efConstruction": 200},
        },
    }
|
|
|
|
|
|
# SQL statement that adds the chunk-metadata columns to the
# ``document_chunks`` table. The text is only defined here; nothing in this
# block executes it.
# NOTE(review): ``ADD COLUMN IF NOT EXISTS`` is accepted by PostgreSQL and
# MariaDB but not by stock MySQL — confirm the target database.
DOCUMENT_CHUNK_METADATA_DDL = """
ALTER TABLE document_chunks
ADD COLUMN IF NOT EXISTS chunk_index INT NULL,
ADD COLUMN IF NOT EXISTS token_count INT NULL,
ADD COLUMN IF NOT EXISTS language VARCHAR(16) DEFAULT 'zh',
ADD COLUMN IF NOT EXISTS mission_phase VARCHAR(64) NULL,
ADD COLUMN IF NOT EXISTS extracted_entities JSON NULL,
ADD COLUMN IF NOT EXISTS extracted_entity_types JSON NULL,
ADD COLUMN IF NOT EXISTS extracted_relations JSON NULL,
ADD COLUMN IF NOT EXISTS graph_node_ids JSON NULL,
ADD COLUMN IF NOT EXISTS graph_edge_ids JSON NULL,
ADD COLUMN IF NOT EXISTS community_ids JSON NULL,
ADD COLUMN IF NOT EXISTS embedding_model VARCHAR(128) NULL,
ADD COLUMN IF NOT EXISTS embedding_dim INT NULL;
""".strip()
|