增加代码知识库;修复文档处理内容;增加API设置
This commit is contained in:
@@ -6,6 +6,7 @@ import traceback
|
||||
import json
|
||||
from app.db.session import SessionLocal
|
||||
from io import BytesIO
|
||||
from types import SimpleNamespace
|
||||
from typing import Optional, List, Dict, Any
|
||||
from fastapi import UploadFile
|
||||
from langchain_community.document_loaders import (
|
||||
@@ -26,6 +27,7 @@ from minio.error import MinioException
|
||||
from minio.commonconfig import CopySource
|
||||
from app.services.vector_store import VectorStoreFactory
|
||||
from app.services.embedding.embedding_factory import EmbeddingsFactory
|
||||
from app.services.model_config import ModelConfigService
|
||||
|
||||
class UploadResult(BaseModel):
|
||||
file_path: str
|
||||
@@ -120,7 +122,45 @@ def _sanitize_metadata_for_vector_store(metadata: Optional[Dict[str, Any]]) -> D
|
||||
|
||||
return sanitized
|
||||
|
||||
async def process_document(file_path: str, file_name: str, kb_id: int, document_id: int, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
|
||||
def _resolve_model_profile(db: Session, user_id: Optional[int]) -> Any:
    """Look up the active model configuration for ``user_id``.

    Args:
        db: Open database session, passed through to ``ModelConfigService``.
        user_id: Owner of the configuration, or ``None`` when the caller has
            no user context.

    Returns:
        ``None`` when ``user_id`` is ``None``; otherwise whatever
        ``ModelConfigService.require_active_config`` returns.
    """
    # NOTE(review): require_active_config presumably raises when the user has
    # no active configuration -- confirm against ModelConfigService.
    if user_id is not None:
        return ModelConfigService.require_active_config(db, user_id)
    return None
|
||||
|
||||
|
||||
def _model_profile_snapshot(model_profile: Any) -> Any:
    """Copy the model-related fields of ``model_profile`` into a plain object.

    Args:
        model_profile: Object exposing ``provider``, ``api_key``,
            ``api_base``, ``chat_model`` and ``embedding_model`` attributes,
            or ``None``.

    Returns:
        ``None`` when ``model_profile`` is ``None``; otherwise a
        ``SimpleNamespace`` carrying exactly those five attributes.
    """
    if model_profile is None:
        return None

    # NOTE(review): presumably a detached copy is taken so the values remain
    # readable after the originating DB session is closed -- confirm.
    copied_fields = (
        "provider",
        "api_key",
        "api_base",
        "chat_model",
        "embedding_model",
    )
    return SimpleNamespace(
        **{name: getattr(model_profile, name) for name in copied_fields}
    )
|
||||
|
||||
|
||||
def _load_model_profile_for_user(user_id: Optional[int]) -> Any:
    """Fetch a user's active model configuration using a short-lived session.

    Opens its own ``SessionLocal`` session, resolves the active configuration
    through ``ModelConfigService``, records the usage via
    ``touch_last_used`` and returns a snapshot of the configuration.

    Args:
        user_id: Owner of the configuration, or ``None`` when the caller has
            no user context.

    Returns:
        ``None`` when ``user_id`` is ``None``; otherwise the result of
        ``_model_profile_snapshot`` for the active configuration.
    """
    if user_id is None:
        return None

    session = SessionLocal()
    try:
        active_config = ModelConfigService.require_active_config(session, user_id)
        ModelConfigService.touch_last_used(session, active_config)
        # Take the snapshot while the session is still open; the session is
        # closed before the value is handed back to the caller.
        snapshot = _model_profile_snapshot(active_config)
    finally:
        session.close()
    return snapshot
|
||||
|
||||
|
||||
async def process_document(
|
||||
file_path: str,
|
||||
file_name: str,
|
||||
kb_id: int,
|
||||
document_id: int,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200,
|
||||
user_id: Optional[int] = None,
|
||||
) -> None:
|
||||
"""Process document and store in vector database with incremental updates"""
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -129,7 +169,8 @@ async def process_document(file_path: str, file_name: str, kb_id: int, document_
|
||||
|
||||
# Initialize embeddings
|
||||
logger.info("Initializing OpenAI embeddings...")
|
||||
embeddings = EmbeddingsFactory.create()
|
||||
model_profile = _load_model_profile_for_user(user_id)
|
||||
embeddings = EmbeddingsFactory.create(model_profile=model_profile)
|
||||
|
||||
logger.info(f"Initializing vector store with collection: kb_{kb_id}")
|
||||
vector_store = VectorStoreFactory.create(
|
||||
@@ -202,7 +243,7 @@ async def process_document(file_path: str, file_name: str, kb_id: int, document_
|
||||
try:
|
||||
from app.services.graph.graphrag_adapter import GraphRAGAdapter
|
||||
|
||||
graph_adapter = GraphRAGAdapter()
|
||||
graph_adapter = GraphRAGAdapter(model_profile=model_profile)
|
||||
source_texts = [doc.page_content for doc in documents_to_update if doc.page_content.strip()]
|
||||
await graph_adapter.ingest_texts(kb_id, source_texts)
|
||||
logger.info("GraphRAG ingestion completed in incremental processing")
|
||||
@@ -323,7 +364,8 @@ async def process_document_background(
|
||||
task_id: int,
|
||||
db: Session = None,
|
||||
chunk_size: int = 1000,
|
||||
chunk_overlap: int = 200
|
||||
chunk_overlap: int = 200,
|
||||
user_id: Optional[int] = None,
|
||||
) -> None:
|
||||
"""Process document in background"""
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -348,6 +390,9 @@ async def process_document_background(
|
||||
logger.info(f"Task {task_id}: Setting status to processing")
|
||||
task.status = "processing"
|
||||
db.commit()
|
||||
model_profile = _resolve_model_profile(db, user_id)
|
||||
if model_profile is not None:
|
||||
ModelConfigService.touch_last_used(db, model_profile)
|
||||
|
||||
# 1. 从临时目录下载文件
|
||||
minio_client = get_minio_client()
|
||||
@@ -416,7 +461,7 @@ async def process_document_background(
|
||||
|
||||
# 3. 创建向量存储
|
||||
logger.info(f"Task {task_id}: Initializing vector store")
|
||||
embeddings = EmbeddingsFactory.create()
|
||||
embeddings = EmbeddingsFactory.create(model_profile=model_profile)
|
||||
|
||||
vector_store = VectorStoreFactory.create(
|
||||
store_type=settings.VECTOR_STORE_TYPE,
|
||||
@@ -520,7 +565,7 @@ async def process_document_background(
|
||||
from app.services.graph.graphrag_adapter import GraphRAGAdapter
|
||||
|
||||
logger.info(f"Task {task_id}: Starting GraphRAG ingestion")
|
||||
graph_adapter = GraphRAGAdapter()
|
||||
graph_adapter = GraphRAGAdapter(model_profile=model_profile)
|
||||
source_texts = [doc.page_content for doc in documents if doc.page_content.strip()]
|
||||
await graph_adapter.ingest_texts(kb_id, source_texts)
|
||||
logger.info(f"Task {task_id}: GraphRAG ingestion completed")
|
||||
|
||||
Reference in New Issue
Block a user