增加代码知识库;修复文档处理内容;增加API设置

This commit is contained in:
2026-05-16 20:20:10 +08:00
parent 69b49d28b2
commit 7aa3ce3294
119 changed files with 182273 additions and 793 deletions

View File

@@ -6,6 +6,7 @@ import traceback
import json
from app.db.session import SessionLocal
from io import BytesIO
from types import SimpleNamespace
from typing import Optional, List, Dict, Any
from fastapi import UploadFile
from langchain_community.document_loaders import (
@@ -26,6 +27,7 @@ from minio.error import MinioException
from minio.commonconfig import CopySource
from app.services.vector_store import VectorStoreFactory
from app.services.embedding.embedding_factory import EmbeddingsFactory
from app.services.model_config import ModelConfigService
class UploadResult(BaseModel):
file_path: str
@@ -120,7 +122,45 @@ def _sanitize_metadata_for_vector_store(metadata: Optional[Dict[str, Any]]) -> D
return sanitized
async def process_document(file_path: str, file_name: str, kb_id: int, document_id: int, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
def _resolve_model_profile(db: Session, user_id: Optional[int]) -> Any:
    """Look up the active model configuration for a user.

    Args:
        db: Database session handed through to ``ModelConfigService``.
        user_id: Owner of the configuration, or ``None`` when no user is known.

    Returns:
        ``None`` when ``user_id`` is ``None``; otherwise whatever
        ``ModelConfigService.require_active_config`` returns for that user.
    """
    if user_id is not None:
        return ModelConfigService.require_active_config(db, user_id)
    # No user means there is no per-user configuration to look up.
    return None
def _model_profile_snapshot(model_profile: Any) -> Any:
if model_profile is None:
return None
return SimpleNamespace(
provider=model_profile.provider,
api_key=model_profile.api_key,
api_base=model_profile.api_base,
chat_model=model_profile.chat_model,
embedding_model=model_profile.embedding_model,
)
def _load_model_profile_for_user(user_id: Optional[int]) -> Any:
if user_id is None:
return None
db = SessionLocal()
try:
model_profile = ModelConfigService.require_active_config(db, user_id)
ModelConfigService.touch_last_used(db, model_profile)
return _model_profile_snapshot(model_profile)
finally:
db.close()
async def process_document(
file_path: str,
file_name: str,
kb_id: int,
document_id: int,
chunk_size: int = 1000,
chunk_overlap: int = 200,
user_id: Optional[int] = None,
) -> None:
"""Process document and store in vector database with incremental updates"""
logger = logging.getLogger(__name__)
@@ -129,7 +169,8 @@ async def process_document(file_path: str, file_name: str, kb_id: int, document_
# Initialize embeddings
logger.info("Initializing OpenAI embeddings...")
embeddings = EmbeddingsFactory.create()
model_profile = _load_model_profile_for_user(user_id)
embeddings = EmbeddingsFactory.create(model_profile=model_profile)
logger.info(f"Initializing vector store with collection: kb_{kb_id}")
vector_store = VectorStoreFactory.create(
@@ -202,7 +243,7 @@ async def process_document(file_path: str, file_name: str, kb_id: int, document_
try:
from app.services.graph.graphrag_adapter import GraphRAGAdapter
graph_adapter = GraphRAGAdapter()
graph_adapter = GraphRAGAdapter(model_profile=model_profile)
source_texts = [doc.page_content for doc in documents_to_update if doc.page_content.strip()]
await graph_adapter.ingest_texts(kb_id, source_texts)
logger.info("GraphRAG ingestion completed in incremental processing")
@@ -323,7 +364,8 @@ async def process_document_background(
task_id: int,
db: Session = None,
chunk_size: int = 1000,
chunk_overlap: int = 200
chunk_overlap: int = 200,
user_id: Optional[int] = None,
) -> None:
"""Process document in background"""
logger = logging.getLogger(__name__)
@@ -348,6 +390,9 @@ async def process_document_background(
logger.info(f"Task {task_id}: Setting status to processing")
task.status = "processing"
db.commit()
model_profile = _resolve_model_profile(db, user_id)
if model_profile is not None:
ModelConfigService.touch_last_used(db, model_profile)
# 1. 从临时目录下载文件
minio_client = get_minio_client()
@@ -416,7 +461,7 @@ async def process_document_background(
# 3. 创建向量存储
logger.info(f"Task {task_id}: Initializing vector store")
embeddings = EmbeddingsFactory.create()
embeddings = EmbeddingsFactory.create(model_profile=model_profile)
vector_store = VectorStoreFactory.create(
store_type=settings.VECTOR_STORE_TYPE,
@@ -520,7 +565,7 @@ async def process_document_background(
from app.services.graph.graphrag_adapter import GraphRAGAdapter
logger.info(f"Task {task_id}: Starting GraphRAG ingestion")
graph_adapter = GraphRAGAdapter()
graph_adapter = GraphRAGAdapter(model_profile=model_profile)
source_texts = [doc.page_content for doc in documents if doc.page_content.strip()]
await graph_adapter.ingest_texts(kb_id, source_texts)
logger.info(f"Task {task_id}: GraphRAG ingestion completed")