init. project
This commit is contained in:
122
rag-web-ui/backend/app/services/vector_schema.py
Normal file
122
rag-web-ui/backend/app/services/vector_schema.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
class ChunkVectorMetadata:
    """Metadata payload for vector DB and graph linkage.

    The first eight fields are required; every other field has a default.
    Use :meth:`to_payload` to obtain a plain ``dict`` keyed by field name.
    """

    # Required fields (no defaults).
    chunk_id: str
    kb_id: int
    document_id: int
    document_name: str
    document_path: str
    chunk_index: int
    chunk_text: str
    token_count: int

    # Optional descriptive fields.
    language: str = "zh"
    source_type: str = "document"
    mission_phase: Optional[str] = None
    section_title: Optional[str] = None
    publish_time: Optional[str] = None

    # Collection-valued fields; each instance gets its own fresh empty list.
    extracted_entities: List[str] = field(default_factory=list)
    extracted_entity_types: List[str] = field(default_factory=list)
    extracted_relations: List[Dict[str, Any]] = field(default_factory=list)
    graph_node_ids: List[str] = field(default_factory=list)
    graph_edge_ids: List[str] = field(default_factory=list)
    community_ids: List[str] = field(default_factory=list)

    embedding_model: str = ""
    embedding_dim: int = 0

    # Defaults to the current UTC time as an ISO 8601 string, evaluated when
    # the instance is constructed.
    ingest_time: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    def to_payload(self) -> Dict[str, Any]:
        """Return the metadata as a plain dict keyed by field name.

        List-valued fields are copied (and each relation dict is shallow
        copied) so that mutating the returned payload never alters this
        instance. A list field that was set to ``None`` is emitted as an
        empty list.

        Returns:
            A new ``dict`` containing every field, in declaration order.
        """
        return {
            "chunk_id": self.chunk_id,
            "kb_id": self.kb_id,
            "document_id": self.document_id,
            "document_name": self.document_name,
            "document_path": self.document_path,
            "chunk_index": self.chunk_index,
            "chunk_text": self.chunk_text,
            "token_count": self.token_count,
            "language": self.language,
            "source_type": self.source_type,
            "mission_phase": self.mission_phase,
            "section_title": self.section_title,
            "publish_time": self.publish_time,
            # Copies below decouple the payload from the instance's own lists.
            "extracted_entities": list(self.extracted_entities or []),
            "extracted_entity_types": list(self.extracted_entity_types or []),
            "extracted_relations": [
                dict(relation) for relation in (self.extracted_relations or [])
            ],
            "graph_node_ids": list(self.graph_node_ids or []),
            "graph_edge_ids": list(self.graph_edge_ids or []),
            "community_ids": list(self.community_ids or []),
            "embedding_model": self.embedding_model,
            "embedding_dim": self.embedding_dim,
            "ingest_time": self.ingest_time,
        }
|
||||
|
||||
|
||||
def qdrant_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Describe the recommended Qdrant collection and its payload indexes.

    Args:
        collection_name: Value placed under the ``collection_name`` key.
        vector_size: Value placed under ``vectors.size``.

    Returns:
        A dict with ``collection_name``, a ``vectors`` config using the
        ``Cosine`` distance, and a ``payload_indexes`` list of
        ``{"field_name", "field_schema"}`` entries.
    """
    # (payload field, index schema) pairs, in the order they are emitted.
    indexed_fields = (
        ("kb_id", "integer"),
        ("document_id", "integer"),
        ("document_name", "keyword"),
        ("chunk_id", "keyword"),
        ("mission_phase", "keyword"),
        ("community_ids", "keyword"),
        ("extracted_entities", "keyword"),
        ("ingest_time", "datetime"),
    )
    vector_config = {"size": vector_size, "distance": "Cosine"}
    payload_indexes = [
        {"field_name": name, "field_schema": schema}
        for name, schema in indexed_fields
    ]
    return {
        "collection_name": collection_name,
        "vectors": vector_config,
        "payload_indexes": payload_indexes,
    }
|
||||
|
||||
|
||||
def milvus_collection_schema(collection_name: str, vector_size: int) -> Dict[str, Any]:
    """Describe the Milvus field layout used for vector+graph linkage.

    Args:
        collection_name: Value placed under the ``collection_name`` key.
        vector_size: Dimension assigned to the ``embedding`` vector field.

    Returns:
        A dict with ``collection_name``, a ``fields`` list of field
        descriptors, and an ``index`` descriptor (HNSW / COSINE) for the
        ``embedding`` field.
    """

    def scalar(name: str, type_name: str) -> Dict[str, Any]:
        # Descriptor for a field that needs only a name and a type.
        return {"name": name, "type": type_name}

    def varchar(name: str, max_length: int, **extra: Any) -> Dict[str, Any]:
        # Descriptor for a VARCHAR field; extra keys are appended after max_length.
        return {"name": name, "type": "VARCHAR", "max_length": max_length, **extra}

    field_specs = [
        varchar("id", 64, is_primary=True),
        scalar("kb_id", "INT64"),
        scalar("document_id", "INT64"),
        scalar("chunk_index", "INT32"),
        varchar("document_name", 255),
        varchar("mission_phase", 64),
        varchar("community_ids", 512),
        varchar("extracted_entities", 2048),
        varchar("ingest_time", 64),
        {"name": "embedding", "type": "FLOAT_VECTOR", "dim": vector_size},
    ]
    index_spec = {
        "field_name": "embedding",
        "index_type": "HNSW",
        "metric_type": "COSINE",
        "params": {"M": 16, "efConstruction": 200},
    }
    return {
        "collection_name": collection_name,
        "fields": field_specs,
        "index": index_spec,
    }
|
||||
|
||||
|
||||
DOCUMENT_CHUNK_METADATA_DDL = """
|
||||
ALTER TABLE document_chunks
|
||||
ADD COLUMN IF NOT EXISTS chunk_index INT NULL,
|
||||
ADD COLUMN IF NOT EXISTS token_count INT NULL,
|
||||
ADD COLUMN IF NOT EXISTS language VARCHAR(16) DEFAULT 'zh',
|
||||
ADD COLUMN IF NOT EXISTS mission_phase VARCHAR(64) NULL,
|
||||
ADD COLUMN IF NOT EXISTS extracted_entities JSON NULL,
|
||||
ADD COLUMN IF NOT EXISTS extracted_entity_types JSON NULL,
|
||||
ADD COLUMN IF NOT EXISTS extracted_relations JSON NULL,
|
||||
ADD COLUMN IF NOT EXISTS graph_node_ids JSON NULL,
|
||||
ADD COLUMN IF NOT EXISTS graph_edge_ids JSON NULL,
|
||||
ADD COLUMN IF NOT EXISTS community_ids JSON NULL,
|
||||
ADD COLUMN IF NOT EXISTS embedding_model VARCHAR(128) NULL,
|
||||
ADD COLUMN IF NOT EXISTS embedding_dim INT NULL;
|
||||
""".strip()
|
||||
Reference in New Issue
Block a user