init. project

This commit is contained in:
2026-04-13 11:34:23 +08:00
commit c7c0659a85
202 changed files with 31196 additions and 0 deletions

View File

@@ -0,0 +1,582 @@
import logging
import os
import hashlib
import tempfile
import traceback
import json
from app.db.session import SessionLocal
from io import BytesIO
from typing import Optional, List, Dict, Any
from fastapi import UploadFile
from langchain_community.document_loaders import (
PyPDFLoader,
Docx2txtLoader,
UnstructuredMarkdownLoader,
TextLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangchainDocument
from pydantic import BaseModel
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.minio import get_minio_client
from app.models.knowledge import ProcessingTask, Document, DocumentChunk
from app.services.chunk_record import ChunkRecord
from minio.error import MinioException
from minio.commonconfig import CopySource
from app.services.vector_store import VectorStoreFactory
from app.services.embedding.embedding_factory import EmbeddingsFactory
class UploadResult(BaseModel):
    """Result of uploading a document to MinIO (step 1 of ingestion)."""
    # Object path inside the MinIO bucket, e.g. "kb_<kb_id>/<file_name>".
    file_path: str
    # Sanitized file name (non-alphanumeric characters removed on upload).
    file_name: str
    # Size of the uploaded content in bytes.
    file_size: int
    # MIME type inferred from the file extension.
    content_type: str
    # SHA-256 hex digest of the raw file content.
    file_hash: str
class TextChunk(BaseModel):
    """One chunk of split document text plus the loader-supplied metadata."""
    content: str
    metadata: Optional[Dict] = None
class PreviewResult(BaseModel):
    """Chunked preview of a document (step 2 of ingestion)."""
    chunks: List[TextChunk]
    total_chunks: int
def _estimate_token_count(text: str) -> int:
# Lightweight estimation without adding tokenizer dependencies.
return len(text)
def _build_enriched_chunk_metadata(
*,
source_metadata: Optional[Dict[str, Any]],
chunk_id: str,
file_name: str,
file_path: str,
kb_id: int,
document_id: int,
chunk_index: int,
chunk_text: str,
) -> Dict[str, Any]:
source_metadata = source_metadata or {}
token_count = _estimate_token_count(chunk_text)
return {
**source_metadata,
"source": file_name,
"chunk_id": chunk_id,
"file_name": file_name,
"file_path": file_path,
"kb_id": kb_id,
"document_id": document_id,
"chunk_index": chunk_index,
"chunk_text": chunk_text,
"token_count": token_count,
"language": source_metadata.get("language", "zh"),
"source_type": "document",
"mission_phase": source_metadata.get("mission_phase"),
"section_title": source_metadata.get("section_title"),
"publish_time": source_metadata.get("publish_time"),
# Keep graph-linked fields for future graph/vector federation.
"extracted_entities": source_metadata.get("extracted_entities", []),
"extracted_entity_types": source_metadata.get("extracted_entity_types", []),
"extracted_relations": source_metadata.get("extracted_relations", []),
"graph_node_ids": source_metadata.get("graph_node_ids", []),
"graph_edge_ids": source_metadata.get("graph_edge_ids", []),
"community_ids": source_metadata.get("community_ids", []),
}
def _sanitize_metadata_for_vector_store(metadata: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""Normalize metadata to satisfy Chroma's strict metadata constraints."""
if not metadata:
return {}
sanitized: Dict[str, Any] = {}
scalar_types = (str, int, float, bool)
for key, value in metadata.items():
if value is None:
continue
if isinstance(value, scalar_types):
sanitized[key] = value
continue
if isinstance(value, list):
primitive_items = [item for item in value if isinstance(item, scalar_types)]
if primitive_items:
sanitized[key] = primitive_items
elif value:
sanitized[key] = json.dumps(value, ensure_ascii=False)
continue
if isinstance(value, dict):
sanitized[key] = json.dumps(value, ensure_ascii=False)
continue
sanitized[key] = str(value)
return sanitized
async def process_document(file_path: str, file_name: str, kb_id: int, document_id: int, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
    """Process document and store in vector database with incremental updates.

    Re-chunks the MinIO object at ``file_path`` and diffs the chunk hashes
    against those previously recorded for ``file_name``: unchanged chunks are
    skipped, new/changed chunks are embedded and added, and chunks that no
    longer exist are deleted from both the chunk record and the vector store.

    Args:
        file_path: Object path of the document inside the MinIO bucket.
        file_name: Stored file name; the key used by the chunk-record manager.
        kb_id: Knowledge base id; selects the "kb_<kb_id>" vector collection.
        document_id: Database id of the owning Document row.
        chunk_size: Target chunk size for the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Raises:
        Exception: Any error from loading, embedding or storage is logged
            and re-raised so the caller can handle the failure.
    """
    logger = logging.getLogger(__name__)
    try:
        preview_result = await preview_document(file_path, chunk_size, chunk_overlap)
        # Initialize embeddings
        logger.info("Initializing OpenAI embeddings...")
        embeddings = EmbeddingsFactory.create()
        logger.info(f"Initializing vector store with collection: kb_{kb_id}")
        vector_store = VectorStoreFactory.create(
            store_type=settings.VECTOR_STORE_TYPE,
            collection_name=f"kb_{kb_id}",
            embedding_function=embeddings,
        )
        # Initialize chunk record manager
        chunk_manager = ChunkRecord(kb_id)
        # Get existing chunk hashes for this file
        existing_hashes = chunk_manager.list_chunks(file_name)
        # Prepare new chunks
        new_chunks = []
        current_hashes = set()
        documents_to_update = []
        for i, chunk in enumerate(preview_result.chunks):
            # Calculate chunk hash over content + metadata so any change is detected
            chunk_hash = hashlib.sha256(
                (chunk.content + str(chunk.metadata)).encode()
            ).hexdigest()
            current_hashes.add(chunk_hash)
            # Skip if chunk hasn't changed
            if chunk_hash in existing_hashes:
                continue
            # Create unique ID for the chunk, scoped by kb and file
            chunk_id = hashlib.sha256(
                f"{kb_id}:{file_name}:{chunk_hash}".encode()
            ).hexdigest()
            metadata = _build_enriched_chunk_metadata(
                source_metadata=chunk.metadata,
                chunk_id=chunk_id,
                file_name=file_name,
                file_path=file_path,
                kb_id=kb_id,
                document_id=document_id,
                chunk_index=i,
                chunk_text=chunk.content,
            )
            # Chroma only accepts scalar/simple metadata values
            vector_metadata = _sanitize_metadata_for_vector_store(metadata)
            new_chunks.append({
                "id": chunk_id,
                "kb_id": kb_id,
                "document_id": document_id,
                "file_name": file_name,
                "metadata": metadata,
                "hash": chunk_hash
            })
            # Prepare document for vector store
            doc = LangchainDocument(
                page_content=chunk.content,
                metadata=vector_metadata
            )
            documents_to_update.append(doc)
        # Add new chunks to database and vector store
        if new_chunks:
            logger.info(f"Adding {len(new_chunks)} new/updated chunks")
            chunk_manager.add_chunks(new_chunks)
            vector_store.add_documents(documents_to_update)
            if settings.GRAPHRAG_ENABLED:
                try:
                    # Local import keeps GraphRAG an optional dependency at module load time
                    from app.services.graph.graphrag_adapter import GraphRAGAdapter
                    graph_adapter = GraphRAGAdapter()
                    source_texts = [doc.page_content for doc in documents_to_update if doc.page_content.strip()]
                    await graph_adapter.ingest_texts(kb_id, source_texts)
                    logger.info("GraphRAG ingestion completed in incremental processing")
                except Exception as graph_exc:
                    # Best-effort: a graph-ingestion failure must not fail the document update
                    logger.error(f"GraphRAG ingestion failed in incremental processing: {graph_exc}")
        # Delete removed chunks
        chunks_to_delete = chunk_manager.get_deleted_chunks(current_hashes, file_name)
        if chunks_to_delete:
            logger.info(f"Removing {len(chunks_to_delete)} deleted chunks")
            chunk_manager.delete_chunks(chunks_to_delete)
            vector_store.delete(chunks_to_delete)
        logger.info("Document processing completed successfully")
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        raise
async def upload_document(file: UploadFile, kb_id: int) -> UploadResult:
    """Step 1: Upload document to MinIO"""
    raw = await file.read()
    size = len(raw)
    digest = hashlib.sha256(raw).hexdigest()
    # Keep only alphanumerics plus "-", "_", "." so the object name is safe.
    safe_name = "".join(ch for ch in file.filename if ch.isalnum() or ch in ('-', '_', '.')).strip()
    object_path = f"kb_{kb_id}/{safe_name}"
    # Map known extensions to MIME types; anything else is a generic blob.
    known_types = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".md": "text/markdown",
        ".txt": "text/plain"
    }
    extension = os.path.splitext(safe_name)[1].lower()
    content_type = known_types.get(extension, "application/octet-stream")
    # Push the bytes to MinIO under the knowledge base's prefix.
    client = get_minio_client()
    try:
        client.put_object(
            bucket_name=settings.MINIO_BUCKET_NAME,
            object_name=object_path,
            data=BytesIO(raw),
            length=size,
            content_type=content_type
        )
    except Exception as exc:
        logging.error(f"Failed to upload file to MinIO: {str(exc)}")
        raise
    return UploadResult(
        file_path=object_path,
        file_name=safe_name,
        file_size=size,
        content_type=content_type,
        file_hash=digest
    )
async def preview_document(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> PreviewResult:
    """Step 2: Generate preview chunks"""
    client = get_minio_client()
    extension = os.path.splitext(file_path)[1].lower()
    # Download the object into a local temp file so the loaders can read it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
        client.fget_object(
            bucket_name=settings.MINIO_BUCKET_NAME,
            object_name=file_path,
            file_path=handle.name
        )
        local_path = handle.name
    try:
        # Pick a loader by extension; unknown types fall back to plain text.
        loader_classes = {
            ".pdf": PyPDFLoader,
            ".docx": Docx2txtLoader,
            ".md": UnstructuredMarkdownLoader,
        }
        loader = loader_classes.get(extension, TextLoader)(local_path)
        # Load the document and split it into overlapping chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        pieces = splitter.split_documents(loader.load())
        preview = [
            TextChunk(content=piece.page_content, metadata=piece.metadata)
            for piece in pieces
        ]
        return PreviewResult(chunks=preview, total_chunks=len(pieces))
    finally:
        # Always remove the temp file, even when loading/splitting failed.
        os.unlink(local_path)
async def process_document_background(
    temp_path: str,
    file_name: str,
    kb_id: int,
    task_id: int,
    db: Session = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> None:
    """Process document in background.

    Full (non-incremental) ingestion pipeline driven by a ProcessingTask row:
    downloads the temporary object from MinIO, chunks it, creates Document and
    DocumentChunk rows, embeds the chunks into the "kb_<kb_id>" vector
    collection, promotes the temp object to permanent storage, and updates the
    task/upload statuses. Any failure rolls back the session and marks the
    task (and its upload record) as "failed".

    Args:
        temp_path: MinIO object path of the temporary upload.
        file_name: Stored file name (used in chunk ids and metadata).
        kb_id: Target knowledge base id.
        task_id: ProcessingTask primary key used for status reporting.
        db: Optional SQLAlchemy session; a new one is created (and closed)
            when omitted.
        chunk_size: Target chunk size for the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starting background processing for task {task_id}, file: {file_name}")
    # if we don't pass in db, create a new database session
    if db is None:
        db = SessionLocal()
        should_close_db = True
    else:
        should_close_db = False
    task = db.query(ProcessingTask).get(task_id)
    if not task:
        logger.error(f"Task {task_id} not found")
        return
    minio_client = None
    local_temp_path = None
    try:
        logger.info(f"Task {task_id}: Setting status to processing")
        task.status = "processing"
        db.commit()
        # 1. Download the file from its temporary MinIO location
        minio_client = get_minio_client()
        try:
            local_temp_path = f"/tmp/temp_{task_id}_{file_name}"  # use the system temp directory
            logger.info(f"Task {task_id}: Downloading file from MinIO: {temp_path} to {local_temp_path}")
            minio_client.fget_object(
                bucket_name=settings.MINIO_BUCKET_NAME,
                object_name=temp_path,
                file_path=local_temp_path
            )
            logger.info(f"Task {task_id}: File downloaded successfully")
        except MinioException as e:
            # Idempotent fallback: temp object may already be consumed by another task.
            # If the final document is already created, treat current task as completed.
            if "NoSuchKey" in str(e) and task.document_upload:
                existing_document = db.query(Document).filter(
                    Document.knowledge_base_id == kb_id,
                    Document.file_name == file_name,
                    Document.file_hash == task.document_upload.file_hash,
                ).first()
                if existing_document:
                    logger.warning(
                        f"Task {task_id}: Temp object missing but document already exists, "
                        f"marking task as completed (document_id={existing_document.id})"
                    )
                    task.status = "completed"
                    task.document_id = existing_document.id
                    task.error_message = None
                    task.document_upload.status = "completed"
                    task.document_upload.error_message = None
                    db.commit()
                    return
            error_msg = f"Failed to download temp file: {str(e)}"
            logger.error(f"Task {task_id}: {error_msg}")
            raise Exception(error_msg)
        try:
            # 2. Load and chunk the document
            _, ext = os.path.splitext(file_name)
            ext = ext.lower()
            logger.info(f"Task {task_id}: Loading document with extension {ext}")
            # Select the loader that matches the file type
            if ext == ".pdf":
                loader = PyPDFLoader(local_temp_path)
            elif ext == ".docx":
                loader = Docx2txtLoader(local_temp_path)
            elif ext == ".md":
                loader = UnstructuredMarkdownLoader(local_temp_path)
            else:  # fall back to the plain-text loader
                loader = TextLoader(local_temp_path)
            logger.info(f"Task {task_id}: Loading document content")
            documents = loader.load()
            logger.info(f"Task {task_id}: Document loaded successfully")
            logger.info(f"Task {task_id}: Splitting document into chunks")
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            chunks = text_splitter.split_documents(documents)
            logger.info(f"Task {task_id}: Document split into {len(chunks)} chunks")
            # 3. Create the vector store
            logger.info(f"Task {task_id}: Initializing vector store")
            embeddings = EmbeddingsFactory.create()
            vector_store = VectorStoreFactory.create(
                store_type=settings.VECTOR_STORE_TYPE,
                collection_name=f"kb_{kb_id}",
                embedding_function=embeddings,
            )
            # 4. Move the temporary object to its permanent location
            permanent_path = f"kb_{kb_id}/{file_name}"
            try:
                logger.info(f"Task {task_id}: Moving file to permanent storage")
                # Copy to the permanent path first (copy-then-delete, no atomic move)
                source = CopySource(settings.MINIO_BUCKET_NAME, temp_path)
                minio_client.copy_object(
                    bucket_name=settings.MINIO_BUCKET_NAME,
                    object_name=permanent_path,
                    source=source
                )
                logger.info(f"Task {task_id}: File moved to permanent storage")
                # Delete the temporary object
                logger.info(f"Task {task_id}: Removing temporary file from MinIO")
                minio_client.remove_object(
                    bucket_name=settings.MINIO_BUCKET_NAME,
                    object_name=temp_path
                )
                logger.info(f"Task {task_id}: Temporary file removed")
            except MinioException as e:
                error_msg = f"Failed to move file to permanent storage: {str(e)}"
                logger.error(f"Task {task_id}: {error_msg}")
                raise Exception(error_msg)
            # 5. Create the document record
            logger.info(f"Task {task_id}: Creating document record")
            document = Document(
                file_name=file_name,
                file_path=permanent_path,
                file_hash=task.document_upload.file_hash,
                file_size=task.document_upload.file_size,
                content_type=task.document_upload.content_type,
                knowledge_base_id=kb_id
            )
            db.add(document)
            db.flush()
            db.refresh(document)
            logger.info(f"Task {task_id}: Document record created with ID {document.id}")
            # 6. Store the document chunks
            logger.info(f"Task {task_id}: Storing document chunks")
            for i, chunk in enumerate(chunks):
                # Generate a deterministic, unique ID for each chunk
                chunk_id = hashlib.sha256(
                    f"{kb_id}:{file_name}:{chunk.page_content}".encode()
                ).hexdigest()
                metadata = _build_enriched_chunk_metadata(
                    source_metadata=chunk.metadata,
                    chunk_id=chunk_id,
                    file_name=file_name,
                    file_path=permanent_path,
                    kb_id=kb_id,
                    document_id=document.id,
                    chunk_index=i,
                    chunk_text=chunk.page_content,
                )
                chunk.metadata = metadata
                doc_chunk = DocumentChunk(
                    id=chunk_id,  # include the ID field
                    document_id=document.id,
                    kb_id=kb_id,
                    file_name=file_name,
                    chunk_metadata={
                        "page_content": chunk.page_content,
                        **metadata
                    },
                    hash=hashlib.sha256(
                        (chunk.page_content + str(metadata)).encode()
                    ).hexdigest()
                )
                db.add(doc_chunk)
                # Flush every 100 chunks to keep the session's pending set small
                if i > 0 and i % 100 == 0:
                    logger.info(f"Task {task_id}: Stored {i} chunks")
                    db.flush()
            # 7. Add chunks to the vector store
            logger.info(f"Task {task_id}: Adding chunks to vector store")
            vector_chunks = [
                LangchainDocument(
                    page_content=chunk.page_content,
                    metadata=_sanitize_metadata_for_vector_store(chunk.metadata),
                )
                for chunk in chunks
            ]
            vector_store.add_documents(vector_chunks)
            # persist() call removed; newer vector-store versions do not need it
            logger.info(f"Task {task_id}: Chunks added to vector store")
            if settings.GRAPHRAG_ENABLED:
                try:
                    # Local import keeps GraphRAG an optional dependency at module load time
                    from app.services.graph.graphrag_adapter import GraphRAGAdapter
                    logger.info(f"Task {task_id}: Starting GraphRAG ingestion")
                    graph_adapter = GraphRAGAdapter()
                    source_texts = [doc.page_content for doc in documents if doc.page_content.strip()]
                    await graph_adapter.ingest_texts(kb_id, source_texts)
                    logger.info(f"Task {task_id}: GraphRAG ingestion completed")
                except Exception as graph_exc:
                    # Best-effort: a graph-ingestion failure must not fail the task
                    logger.error(f"Task {task_id}: GraphRAG ingestion failed: {graph_exc}")
            # 8. Update the task status
            logger.info(f"Task {task_id}: Updating task status to completed")
            task.status = "completed"
            task.document_id = document.id  # point at the newly created document
            # 9. Update the upload record status
            upload = task.document_upload  # fetched directly via the relationship
            if upload:
                logger.info(f"Task {task_id}: Updating upload record status to completed")
                upload.status = "completed"
            db.commit()
            logger.info(f"Task {task_id}: Processing completed successfully")
        finally:
            # Clean up the local temp file
            try:
                if os.path.exists(local_temp_path):
                    logger.info(f"Task {task_id}: Cleaning up local temp file")
                    os.remove(local_temp_path)
                    logger.info(f"Task {task_id}: Local temp file cleaned up")
            except Exception as e:
                logger.warning(f"Task {task_id}: Failed to clean up local temp file: {str(e)}")
    except Exception as e:
        logger.error(f"Task {task_id}: Error processing document: {str(e)}")
        logger.error(f"Task {task_id}: Stack trace: {traceback.format_exc()}")
        db.rollback()
        # Re-fetch the task after rollback so the failure status survives the commit
        failed_task = db.query(ProcessingTask).get(task_id)
        if failed_task:
            failed_task.status = "failed"
            failed_task.error_message = str(e)
            if failed_task.document_upload:
                failed_task.document_upload.status = "failed"
                failed_task.document_upload.error_message = str(e)
            db.commit()
        # Clean up the temporary MinIO object
        try:
            logger.info(f"Task {task_id}: Cleaning up temporary file after error")
            if minio_client is not None:
                minio_client.remove_object(
                    bucket_name=settings.MINIO_BUCKET_NAME,
                    object_name=temp_path
                )
                logger.info(f"Task {task_id}: Temporary file cleaned up after error")
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit —
            # consider narrowing to `except Exception`
            logger.warning(f"Task {task_id}: Failed to clean up temporary file after error")
    finally:
        # if we create the db session, we need to close it
        if should_close_db and db:
            db.close()