init. project

This commit is contained in:
2026-04-13 11:34:23 +08:00
commit c7c0659a85
202 changed files with 31196 additions and 0 deletions

View File

@@ -0,0 +1,582 @@
import logging
import os
import hashlib
import tempfile
import traceback
import json
from app.db.session import SessionLocal
from io import BytesIO
from typing import Optional, List, Dict, Any
from fastapi import UploadFile
from langchain_community.document_loaders import (
PyPDFLoader,
Docx2txtLoader,
UnstructuredMarkdownLoader,
TextLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangchainDocument
from pydantic import BaseModel
from sqlalchemy.orm import Session
from app.core.config import settings
from app.core.minio import get_minio_client
from app.models.knowledge import ProcessingTask, Document, DocumentChunk
from app.services.chunk_record import ChunkRecord
from minio.error import MinioException
from minio.commonconfig import CopySource
from app.services.vector_store import VectorStoreFactory
from app.services.embedding.embedding_factory import EmbeddingsFactory
class UploadResult(BaseModel):
    """Result of uploading a document to MinIO (step 1 of ingestion)."""
    # Object path inside the MinIO bucket, e.g. "kb_<kb_id>/<file_name>".
    file_path: str
    # Sanitized file name (non-alphanumeric characters removed on upload).
    file_name: str
    # Size of the uploaded content in bytes.
    file_size: int
    # MIME type inferred from the file extension.
    content_type: str
    # SHA-256 hex digest of the raw file content.
    file_hash: str
class TextChunk(BaseModel):
    """One chunk of split document text plus the loader-supplied metadata."""
    content: str
    metadata: Optional[Dict] = None
class PreviewResult(BaseModel):
    """Chunked preview of a document (step 2 of ingestion)."""
    chunks: List[TextChunk]
    total_chunks: int
def _estimate_token_count(text: str) -> int:
# Lightweight estimation without adding tokenizer dependencies.
return len(text)
def _build_enriched_chunk_metadata(
*,
source_metadata: Optional[Dict[str, Any]],
chunk_id: str,
file_name: str,
file_path: str,
kb_id: int,
document_id: int,
chunk_index: int,
chunk_text: str,
) -> Dict[str, Any]:
source_metadata = source_metadata or {}
token_count = _estimate_token_count(chunk_text)
return {
**source_metadata,
"source": file_name,
"chunk_id": chunk_id,
"file_name": file_name,
"file_path": file_path,
"kb_id": kb_id,
"document_id": document_id,
"chunk_index": chunk_index,
"chunk_text": chunk_text,
"token_count": token_count,
"language": source_metadata.get("language", "zh"),
"source_type": "document",
"mission_phase": source_metadata.get("mission_phase"),
"section_title": source_metadata.get("section_title"),
"publish_time": source_metadata.get("publish_time"),
# Keep graph-linked fields for future graph/vector federation.
"extracted_entities": source_metadata.get("extracted_entities", []),
"extracted_entity_types": source_metadata.get("extracted_entity_types", []),
"extracted_relations": source_metadata.get("extracted_relations", []),
"graph_node_ids": source_metadata.get("graph_node_ids", []),
"graph_edge_ids": source_metadata.get("graph_edge_ids", []),
"community_ids": source_metadata.get("community_ids", []),
}
def _sanitize_metadata_for_vector_store(metadata: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""Normalize metadata to satisfy Chroma's strict metadata constraints."""
if not metadata:
return {}
sanitized: Dict[str, Any] = {}
scalar_types = (str, int, float, bool)
for key, value in metadata.items():
if value is None:
continue
if isinstance(value, scalar_types):
sanitized[key] = value
continue
if isinstance(value, list):
primitive_items = [item for item in value if isinstance(item, scalar_types)]
if primitive_items:
sanitized[key] = primitive_items
elif value:
sanitized[key] = json.dumps(value, ensure_ascii=False)
continue
if isinstance(value, dict):
sanitized[key] = json.dumps(value, ensure_ascii=False)
continue
sanitized[key] = str(value)
return sanitized
async def process_document(file_path: str, file_name: str, kb_id: int, document_id: int, chunk_size: int = 1000, chunk_overlap: int = 200) -> None:
    """Process document and store in vector database with incremental updates.

    Re-chunks the MinIO object at ``file_path`` and diffs the chunk hashes
    against those previously recorded for ``file_name``: unchanged chunks are
    skipped, new/changed chunks are embedded and added, and chunks that no
    longer exist are deleted from both the chunk record and the vector store.

    Args:
        file_path: Object path of the document inside the MinIO bucket.
        file_name: Stored file name; the key used by the chunk-record manager.
        kb_id: Knowledge base id; selects the "kb_<kb_id>" vector collection.
        document_id: Database id of the owning Document row.
        chunk_size: Target chunk size for the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Raises:
        Exception: Any error from loading, embedding or storage is logged
            and re-raised so the caller can handle the failure.
    """
    logger = logging.getLogger(__name__)
    try:
        preview_result = await preview_document(file_path, chunk_size, chunk_overlap)
        # Initialize embeddings
        logger.info("Initializing OpenAI embeddings...")
        embeddings = EmbeddingsFactory.create()
        logger.info(f"Initializing vector store with collection: kb_{kb_id}")
        vector_store = VectorStoreFactory.create(
            store_type=settings.VECTOR_STORE_TYPE,
            collection_name=f"kb_{kb_id}",
            embedding_function=embeddings,
        )
        # Initialize chunk record manager
        chunk_manager = ChunkRecord(kb_id)
        # Get existing chunk hashes for this file
        existing_hashes = chunk_manager.list_chunks(file_name)
        # Prepare new chunks
        new_chunks = []
        current_hashes = set()
        documents_to_update = []
        for i, chunk in enumerate(preview_result.chunks):
            # Calculate chunk hash over content + metadata so any change is detected
            chunk_hash = hashlib.sha256(
                (chunk.content + str(chunk.metadata)).encode()
            ).hexdigest()
            current_hashes.add(chunk_hash)
            # Skip if chunk hasn't changed
            if chunk_hash in existing_hashes:
                continue
            # Create unique ID for the chunk, scoped by kb and file
            chunk_id = hashlib.sha256(
                f"{kb_id}:{file_name}:{chunk_hash}".encode()
            ).hexdigest()
            metadata = _build_enriched_chunk_metadata(
                source_metadata=chunk.metadata,
                chunk_id=chunk_id,
                file_name=file_name,
                file_path=file_path,
                kb_id=kb_id,
                document_id=document_id,
                chunk_index=i,
                chunk_text=chunk.content,
            )
            # Chroma only accepts scalar/simple metadata values
            vector_metadata = _sanitize_metadata_for_vector_store(metadata)
            new_chunks.append({
                "id": chunk_id,
                "kb_id": kb_id,
                "document_id": document_id,
                "file_name": file_name,
                "metadata": metadata,
                "hash": chunk_hash
            })
            # Prepare document for vector store
            doc = LangchainDocument(
                page_content=chunk.content,
                metadata=vector_metadata
            )
            documents_to_update.append(doc)
        # Add new chunks to database and vector store
        if new_chunks:
            logger.info(f"Adding {len(new_chunks)} new/updated chunks")
            chunk_manager.add_chunks(new_chunks)
            vector_store.add_documents(documents_to_update)
            if settings.GRAPHRAG_ENABLED:
                try:
                    # Local import keeps GraphRAG an optional dependency at module load time
                    from app.services.graph.graphrag_adapter import GraphRAGAdapter
                    graph_adapter = GraphRAGAdapter()
                    source_texts = [doc.page_content for doc in documents_to_update if doc.page_content.strip()]
                    await graph_adapter.ingest_texts(kb_id, source_texts)
                    logger.info("GraphRAG ingestion completed in incremental processing")
                except Exception as graph_exc:
                    # Best-effort: a graph-ingestion failure must not fail the document update
                    logger.error(f"GraphRAG ingestion failed in incremental processing: {graph_exc}")
        # Delete removed chunks
        chunks_to_delete = chunk_manager.get_deleted_chunks(current_hashes, file_name)
        if chunks_to_delete:
            logger.info(f"Removing {len(chunks_to_delete)} deleted chunks")
            chunk_manager.delete_chunks(chunks_to_delete)
            vector_store.delete(chunks_to_delete)
        logger.info("Document processing completed successfully")
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        raise
async def upload_document(file: UploadFile, kb_id: int) -> UploadResult:
    """Step 1: Upload document to MinIO"""
    raw = await file.read()
    size = len(raw)
    digest = hashlib.sha256(raw).hexdigest()
    # Keep only alphanumerics plus "-", "_", "." so the object name is safe.
    safe_name = "".join(ch for ch in file.filename if ch.isalnum() or ch in ('-', '_', '.')).strip()
    object_path = f"kb_{kb_id}/{safe_name}"
    # Map known extensions to MIME types; anything else is a generic blob.
    known_types = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".md": "text/markdown",
        ".txt": "text/plain"
    }
    extension = os.path.splitext(safe_name)[1].lower()
    content_type = known_types.get(extension, "application/octet-stream")
    # Push the bytes to MinIO under the knowledge base's prefix.
    client = get_minio_client()
    try:
        client.put_object(
            bucket_name=settings.MINIO_BUCKET_NAME,
            object_name=object_path,
            data=BytesIO(raw),
            length=size,
            content_type=content_type
        )
    except Exception as exc:
        logging.error(f"Failed to upload file to MinIO: {str(exc)}")
        raise
    return UploadResult(
        file_path=object_path,
        file_name=safe_name,
        file_size=size,
        content_type=content_type,
        file_hash=digest
    )
async def preview_document(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> PreviewResult:
    """Step 2: Generate preview chunks"""
    client = get_minio_client()
    extension = os.path.splitext(file_path)[1].lower()
    # Download the object into a local temp file so the loaders can read it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
        client.fget_object(
            bucket_name=settings.MINIO_BUCKET_NAME,
            object_name=file_path,
            file_path=handle.name
        )
        local_path = handle.name
    try:
        # Pick a loader by extension; unknown types fall back to plain text.
        loader_classes = {
            ".pdf": PyPDFLoader,
            ".docx": Docx2txtLoader,
            ".md": UnstructuredMarkdownLoader,
        }
        loader = loader_classes.get(extension, TextLoader)(local_path)
        # Load the document and split it into overlapping chunks.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        pieces = splitter.split_documents(loader.load())
        preview = [
            TextChunk(content=piece.page_content, metadata=piece.metadata)
            for piece in pieces
        ]
        return PreviewResult(chunks=preview, total_chunks=len(pieces))
    finally:
        # Always remove the temp file, even when loading/splitting failed.
        os.unlink(local_path)
async def process_document_background(
    temp_path: str,
    file_name: str,
    kb_id: int,
    task_id: int,
    db: Session = None,
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> None:
    """Process document in background.

    Full (non-incremental) ingestion pipeline driven by a ProcessingTask row:
    downloads the temporary object from MinIO, chunks it, creates Document and
    DocumentChunk rows, embeds the chunks into the "kb_<kb_id>" vector
    collection, promotes the temp object to permanent storage, and updates the
    task/upload statuses. Any failure rolls back the session and marks the
    task (and its upload record) as "failed".

    Args:
        temp_path: MinIO object path of the temporary upload.
        file_name: Stored file name (used in chunk ids and metadata).
        kb_id: Target knowledge base id.
        task_id: ProcessingTask primary key used for status reporting.
        db: Optional SQLAlchemy session; a new one is created (and closed)
            when omitted.
        chunk_size: Target chunk size for the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Starting background processing for task {task_id}, file: {file_name}")
    # if we don't pass in db, create a new database session
    if db is None:
        db = SessionLocal()
        should_close_db = True
    else:
        should_close_db = False
    task = db.query(ProcessingTask).get(task_id)
    if not task:
        logger.error(f"Task {task_id} not found")
        return
    minio_client = None
    local_temp_path = None
    try:
        logger.info(f"Task {task_id}: Setting status to processing")
        task.status = "processing"
        db.commit()
        # 1. Download the file from its temporary MinIO location
        minio_client = get_minio_client()
        try:
            local_temp_path = f"/tmp/temp_{task_id}_{file_name}"  # use the system temp directory
            logger.info(f"Task {task_id}: Downloading file from MinIO: {temp_path} to {local_temp_path}")
            minio_client.fget_object(
                bucket_name=settings.MINIO_BUCKET_NAME,
                object_name=temp_path,
                file_path=local_temp_path
            )
            logger.info(f"Task {task_id}: File downloaded successfully")
        except MinioException as e:
            # Idempotent fallback: temp object may already be consumed by another task.
            # If the final document is already created, treat current task as completed.
            if "NoSuchKey" in str(e) and task.document_upload:
                existing_document = db.query(Document).filter(
                    Document.knowledge_base_id == kb_id,
                    Document.file_name == file_name,
                    Document.file_hash == task.document_upload.file_hash,
                ).first()
                if existing_document:
                    logger.warning(
                        f"Task {task_id}: Temp object missing but document already exists, "
                        f"marking task as completed (document_id={existing_document.id})"
                    )
                    task.status = "completed"
                    task.document_id = existing_document.id
                    task.error_message = None
                    task.document_upload.status = "completed"
                    task.document_upload.error_message = None
                    db.commit()
                    return
            error_msg = f"Failed to download temp file: {str(e)}"
            logger.error(f"Task {task_id}: {error_msg}")
            raise Exception(error_msg)
        try:
            # 2. Load and chunk the document
            _, ext = os.path.splitext(file_name)
            ext = ext.lower()
            logger.info(f"Task {task_id}: Loading document with extension {ext}")
            # Select the loader that matches the file type
            if ext == ".pdf":
                loader = PyPDFLoader(local_temp_path)
            elif ext == ".docx":
                loader = Docx2txtLoader(local_temp_path)
            elif ext == ".md":
                loader = UnstructuredMarkdownLoader(local_temp_path)
            else:  # fall back to the plain-text loader
                loader = TextLoader(local_temp_path)
            logger.info(f"Task {task_id}: Loading document content")
            documents = loader.load()
            logger.info(f"Task {task_id}: Document loaded successfully")
            logger.info(f"Task {task_id}: Splitting document into chunks")
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            chunks = text_splitter.split_documents(documents)
            logger.info(f"Task {task_id}: Document split into {len(chunks)} chunks")
            # 3. Create the vector store
            logger.info(f"Task {task_id}: Initializing vector store")
            embeddings = EmbeddingsFactory.create()
            vector_store = VectorStoreFactory.create(
                store_type=settings.VECTOR_STORE_TYPE,
                collection_name=f"kb_{kb_id}",
                embedding_function=embeddings,
            )
            # 4. Move the temporary object to its permanent location
            permanent_path = f"kb_{kb_id}/{file_name}"
            try:
                logger.info(f"Task {task_id}: Moving file to permanent storage")
                # Copy to the permanent path first (copy-then-delete, no atomic move)
                source = CopySource(settings.MINIO_BUCKET_NAME, temp_path)
                minio_client.copy_object(
                    bucket_name=settings.MINIO_BUCKET_NAME,
                    object_name=permanent_path,
                    source=source
                )
                logger.info(f"Task {task_id}: File moved to permanent storage")
                # Delete the temporary object
                logger.info(f"Task {task_id}: Removing temporary file from MinIO")
                minio_client.remove_object(
                    bucket_name=settings.MINIO_BUCKET_NAME,
                    object_name=temp_path
                )
                logger.info(f"Task {task_id}: Temporary file removed")
            except MinioException as e:
                error_msg = f"Failed to move file to permanent storage: {str(e)}"
                logger.error(f"Task {task_id}: {error_msg}")
                raise Exception(error_msg)
            # 5. Create the document record
            logger.info(f"Task {task_id}: Creating document record")
            document = Document(
                file_name=file_name,
                file_path=permanent_path,
                file_hash=task.document_upload.file_hash,
                file_size=task.document_upload.file_size,
                content_type=task.document_upload.content_type,
                knowledge_base_id=kb_id
            )
            db.add(document)
            db.flush()
            db.refresh(document)
            logger.info(f"Task {task_id}: Document record created with ID {document.id}")
            # 6. Store the document chunks
            logger.info(f"Task {task_id}: Storing document chunks")
            for i, chunk in enumerate(chunks):
                # Generate a deterministic, unique ID for each chunk
                chunk_id = hashlib.sha256(
                    f"{kb_id}:{file_name}:{chunk.page_content}".encode()
                ).hexdigest()
                metadata = _build_enriched_chunk_metadata(
                    source_metadata=chunk.metadata,
                    chunk_id=chunk_id,
                    file_name=file_name,
                    file_path=permanent_path,
                    kb_id=kb_id,
                    document_id=document.id,
                    chunk_index=i,
                    chunk_text=chunk.page_content,
                )
                chunk.metadata = metadata
                doc_chunk = DocumentChunk(
                    id=chunk_id,  # include the ID field
                    document_id=document.id,
                    kb_id=kb_id,
                    file_name=file_name,
                    chunk_metadata={
                        "page_content": chunk.page_content,
                        **metadata
                    },
                    hash=hashlib.sha256(
                        (chunk.page_content + str(metadata)).encode()
                    ).hexdigest()
                )
                db.add(doc_chunk)
                # Flush every 100 chunks to keep the session's pending set small
                if i > 0 and i % 100 == 0:
                    logger.info(f"Task {task_id}: Stored {i} chunks")
                    db.flush()
            # 7. Add chunks to the vector store
            logger.info(f"Task {task_id}: Adding chunks to vector store")
            vector_chunks = [
                LangchainDocument(
                    page_content=chunk.page_content,
                    metadata=_sanitize_metadata_for_vector_store(chunk.metadata),
                )
                for chunk in chunks
            ]
            vector_store.add_documents(vector_chunks)
            # persist() call removed; newer vector-store versions do not need it
            logger.info(f"Task {task_id}: Chunks added to vector store")
            if settings.GRAPHRAG_ENABLED:
                try:
                    # Local import keeps GraphRAG an optional dependency at module load time
                    from app.services.graph.graphrag_adapter import GraphRAGAdapter
                    logger.info(f"Task {task_id}: Starting GraphRAG ingestion")
                    graph_adapter = GraphRAGAdapter()
                    source_texts = [doc.page_content for doc in documents if doc.page_content.strip()]
                    await graph_adapter.ingest_texts(kb_id, source_texts)
                    logger.info(f"Task {task_id}: GraphRAG ingestion completed")
                except Exception as graph_exc:
                    # Best-effort: a graph-ingestion failure must not fail the task
                    logger.error(f"Task {task_id}: GraphRAG ingestion failed: {graph_exc}")
            # 8. Update the task status
            logger.info(f"Task {task_id}: Updating task status to completed")
            task.status = "completed"
            task.document_id = document.id  # point at the newly created document
            # 9. Update the upload record status
            upload = task.document_upload  # fetched directly via the relationship
            if upload:
                logger.info(f"Task {task_id}: Updating upload record status to completed")
                upload.status = "completed"
            db.commit()
            logger.info(f"Task {task_id}: Processing completed successfully")
        finally:
            # Clean up the local temp file
            try:
                if os.path.exists(local_temp_path):
                    logger.info(f"Task {task_id}: Cleaning up local temp file")
                    os.remove(local_temp_path)
                    logger.info(f"Task {task_id}: Local temp file cleaned up")
            except Exception as e:
                logger.warning(f"Task {task_id}: Failed to clean up local temp file: {str(e)}")
    except Exception as e:
        logger.error(f"Task {task_id}: Error processing document: {str(e)}")
        logger.error(f"Task {task_id}: Stack trace: {traceback.format_exc()}")
        db.rollback()
        # Re-fetch the task after rollback so the failure status survives the commit
        failed_task = db.query(ProcessingTask).get(task_id)
        if failed_task:
            failed_task.status = "failed"
            failed_task.error_message = str(e)
            if failed_task.document_upload:
                failed_task.document_upload.status = "failed"
                failed_task.document_upload.error_message = str(e)
            db.commit()
        # Clean up the temporary MinIO object
        try:
            logger.info(f"Task {task_id}: Cleaning up temporary file after error")
            if minio_client is not None:
                minio_client.remove_object(
                    bucket_name=settings.MINIO_BUCKET_NAME,
                    object_name=temp_path
                )
                logger.info(f"Task {task_id}: Temporary file cleaned up after error")
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit —
            # consider narrowing to `except Exception`
            logger.warning(f"Task {task_id}: Failed to clean up temporary file after error")
    finally:
        # if we create the db session, we need to close it
        if should_close_db and db:
            db.close()