feat: implement RAG Service MVP with PARSER + Memory integration

RAG Service Implementation: - Create rag-service/ with full structure (config, document_store, embedding, pipelines) - Document Store: PostgreSQL + pgvector via Haystack - Embedding: BAAI/bge-m3 (multilingual, 1024 dim) - Ingest Pipeline: Convert ParsedDocument to Haystack Documents, embed, index - Query Pipeline: Retrieve documents, generate answers via DAGI Router - FastAPI endpoints: /ingest, /query, /health Tests: - Unit tests for ingest and query pipelines - E2E test with example parsed JSON - Test fixtures with real PARSER output example Router Integration: - Add mode='rag_query' routing rule in router-config.yml - Priority 7, uses local_qwen3_8b for RAG queries Docker: - Add rag-service to docker-compose.yml - Configure dependencies (router, city-db) - Add model cache volume Documentation: - Complete README with API examples - Integration guides for PARSER and Router
2025-11-16 04:41:53 -08:00
parent d3c701f3ff
commit 9b86f9a694
19 changed files with 1275 additions and 97 deletions
--- a/services/rag-service/app/init.py
+++ b/services/rag-service/app/init.py
@@ -0,0 +1,5 @@
+"""
+RAG Service - Retrieval-Augmented Generation for MicroDAO
+Integrates PARSER + Memory + Vector Search
+"""
+
--- a/services/rag-service/app/core/init.py
+++ b/services/rag-service/app/core/init.py
--- a/services/rag-service/app/core/config.py
+++ b/services/rag-service/app/core/config.py
@@ -0,0 +1,51 @@
+"""
+Configuration for RAG Service
+"""
+
+import os
+from typing import Literal
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    """Application settings"""
+    
+    # Service
+    API_HOST: str = "0.0.0.0"
+    API_PORT: int = 9500
+    
+    # PostgreSQL + pgvector
+    PG_DSN: str = os.getenv(
+        "PG_DSN",
+        "postgresql+psycopg2://postgres:postgres@city-db:5432/daarion_city"
+    )
+    
+    # Embedding model
+    EMBED_MODEL_NAME: str = os.getenv("EMBED_MODEL_NAME", "BAAI/bge-m3")
+    EMBED_DEVICE: Literal["cuda", "cpu", "mps"] = os.getenv("EMBED_DEVICE", "cpu")
+    EMBED_DIM: int = int(os.getenv("EMBED_DIM", "1024"))  # BAAI/bge-m3 = 1024
+    
+    # Document Store
+    RAG_TABLE_NAME: str = os.getenv("RAG_TABLE_NAME", "rag_documents")
+    SEARCH_STRATEGY: Literal["approximate", "exact"] = os.getenv("SEARCH_STRATEGY", "approximate")
+    
+    # Chunking
+    CHUNK_SIZE: int = int(os.getenv("CHUNK_SIZE", "500"))
+    CHUNK_OVERLAP: int = int(os.getenv("CHUNK_OVERLAP", "50"))
+    
+    # Retrieval
+    TOP_K: int = int(os.getenv("TOP_K", "5"))
+    
+    # LLM (for query pipeline)
+    LLM_PROVIDER: str = os.getenv("LLM_PROVIDER", "router")  # router, openai, local
+    ROUTER_BASE_URL: str = os.getenv("ROUTER_BASE_URL", "http://router:9102")
+    OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
+    OPENAI_MODEL: str = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+    
+    class Config:
+        env_file = ".env"
+        case_sensitive = True
+
+
+settings = Settings()
+
--- a/services/rag-service/app/document_store.py
+++ b/services/rag-service/app/document_store.py
@@ -0,0 +1,57 @@
+"""
+Document Store for RAG Service
+Uses PostgreSQL + pgvector via Haystack
+"""
+
+import logging
+from typing import Optional
+
+from haystack.document_stores import PGVectorDocumentStore
+
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+# Global document store instance
+_document_store: Optional[PGVectorDocumentStore] = None
+
+
+def get_document_store() -> PGVectorDocumentStore:
+    """
+    Get or create PGVectorDocumentStore instance
+    
+    Returns:
+        PGVectorDocumentStore configured with pgvector
+    """
+    global _document_store
+    
+    if _document_store is not None:
+        return _document_store
+    
+    logger.info(f"Initializing PGVectorDocumentStore: table={settings.RAG_TABLE_NAME}")
+    logger.info(f"Connection: {settings.PG_DSN.split('@')[1] if '@' in settings.PG_DSN else 'hidden'}")
+    
+    try:
+        _document_store = PGVectorDocumentStore(
+            connection_string=settings.PG_DSN,
+            embedding_dim=settings.EMBED_DIM,
+            table_name=settings.RAG_TABLE_NAME,
+            search_strategy=settings.SEARCH_STRATEGY,
+            # Additional options
+            recreate_table=False,  # Don't drop existing table
+            similarity="cosine",  # Cosine similarity for embeddings
+        )
+        
+        logger.info("PGVectorDocumentStore initialized successfully")
+        return _document_store
+        
+    except Exception as e:
+        logger.error(f"Failed to initialize DocumentStore: {e}", exc_info=True)
+        raise RuntimeError(f"DocumentStore initialization failed: {e}") from e
+
+
+def reset_document_store():
+    """Reset global document store instance (for testing)"""
+    global _document_store
+    _document_store = None
+
--- a/services/rag-service/app/embedding.py
+++ b/services/rag-service/app/embedding.py
@@ -0,0 +1,52 @@
+"""
+Embedding service for RAG
+Uses SentenceTransformers via Haystack
+"""
+
+import logging
+from typing import Optional
+
+from haystack.components.embedders import SentenceTransformersTextEmbedder
+
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+# Global embedder instance
+_text_embedder: Optional[SentenceTransformersTextEmbedder] = None
+
+
+def get_text_embedder() -> SentenceTransformersTextEmbedder:
+    """
+    Get or create SentenceTransformersTextEmbedder instance
+    
+    Returns:
+        SentenceTransformersTextEmbedder configured with embedding model
+    """
+    global _text_embedder
+    
+    if _text_embedder is not None:
+        return _text_embedder
+    
+    logger.info(f"Loading embedding model: {settings.EMBED_MODEL_NAME}")
+    logger.info(f"Device: {settings.EMBED_DEVICE}")
+    
+    try:
+        _text_embedder = SentenceTransformersTextEmbedder(
+            model=settings.EMBED_MODEL_NAME,
+            device=settings.EMBED_DEVICE,
+        )
+        
+        logger.info("Text embedder initialized successfully")
+        return _text_embedder
+        
+    except Exception as e:
+        logger.error(f"Failed to initialize TextEmbedder: {e}", exc_info=True)
+        raise RuntimeError(f"TextEmbedder initialization failed: {e}") from e
+
+
+def reset_embedder():
+    """Reset global embedder instance (for testing)"""
+    global _text_embedder
+    _text_embedder = None
+
--- a/services/rag-service/app/ingest_pipeline.py
+++ b/services/rag-service/app/ingest_pipeline.py
@@ -0,0 +1,191 @@
+"""
+Ingest Pipeline: PARSER → RAG
+Converts ParsedDocument to Haystack Documents and indexes them
+"""
+
+import logging
+from typing import List, Dict, Any, Optional
+
+from haystack import Pipeline
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack.schema import Document
+
+from app.document_store import get_document_store
+from app.embedding import get_text_embedder
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+def ingest_parsed_document(
+    dao_id: str,
+    doc_id: str,
+    parsed_json: Dict[str, Any],
+    user_id: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Ingest parsed document from PARSER service into RAG
+    
+    Args:
+        dao_id: DAO identifier
+        doc_id: Document identifier
+        parsed_json: ParsedDocument JSON from PARSER service
+        user_id: Optional user identifier
+    
+    Returns:
+        Dictionary with ingest results (doc_count, status)
+    """
+    logger.info(f"Ingesting document: dao_id={dao_id}, doc_id={doc_id}")
+    
+    try:
+        # Convert parsed_json to Haystack Documents
+        documents = _parsed_json_to_documents(parsed_json, dao_id, doc_id, user_id)
+        
+        if not documents:
+            logger.warning(f"No documents to ingest for doc_id={doc_id}")
+            return {
+                "status": "error",
+                "message": "No documents to ingest",
+                "doc_count": 0
+            }
+        
+        logger.info(f"Converted {len(documents)} blocks to Haystack Documents")
+        
+        # Create ingest pipeline
+        pipeline = _create_ingest_pipeline()
+        
+        # Run pipeline
+        result = pipeline.run({"documents": documents})
+        
+        # Extract results
+        written_docs = result.get("documents_writer", {}).get("documents_written", 0)
+        
+        logger.info(f"Ingested {written_docs} documents for doc_id={doc_id}")
+        
+        return {
+            "status": "success",
+            "doc_count": written_docs,
+            "dao_id": dao_id,
+            "doc_id": doc_id
+        }
+        
+    except Exception as e:
+        logger.error(f"Failed to ingest document: {e}", exc_info=True)
+        return {
+            "status": "error",
+            "message": str(e),
+            "doc_count": 0
+        }
+
+
+def _parsed_json_to_documents(
+    parsed_json: Dict[str, Any],
+    dao_id: str,
+    doc_id: str,
+    user_id: Optional[str] = None
+) -> List[Document]:
+    """
+    Convert ParsedDocument JSON to Haystack Documents
+    
+    Args:
+        parsed_json: ParsedDocument JSON structure
+        dao_id: DAO identifier
+        doc_id: Document identifier
+        user_id: Optional user identifier
+    
+    Returns:
+        List of Haystack Document objects
+    """
+    documents = []
+    
+    # Extract pages from parsed_json
+    pages = parsed_json.get("pages", [])
+    
+    for page_data in pages:
+        page_num = page_data.get("page_num", 1)
+        blocks = page_data.get("blocks", [])
+        
+        for block in blocks:
+            # Skip empty blocks
+            text = block.get("text", "").strip()
+            if not text:
+                continue
+            
+            # Build metadata (must-have для RAG)
+            meta = {
+                "dao_id": dao_id,
+                "doc_id": doc_id,
+                "page": page_num,
+                "block_type": block.get("type", "paragraph"),
+                "reading_order": block.get("reading_order", 0)
+            }
+            
+            # Add optional fields
+            if block.get("bbox"):
+                bbox = block["bbox"]
+                meta.update({
+                    "bbox_x": bbox.get("x", 0),
+                    "bbox_y": bbox.get("y", 0),
+                    "bbox_width": bbox.get("width", 0),
+                    "bbox_height": bbox.get("height", 0)
+                })
+            
+            # Add section if heading
+            if block.get("type") == "heading":
+                meta["section"] = text[:100]  # First 100 chars as section name
+            
+            # Add user_id if provided
+            if user_id:
+                meta["user_id"] = user_id
+            
+            # Add document-level metadata
+            if parsed_json.get("metadata"):
+                meta.update({
+                    k: v for k, v in parsed_json["metadata"].items()
+                    if k not in ["dao_id"]  # Already added
+                })
+            
+            # Create Haystack Document
+            doc = Document(
+                content=text,
+                meta=meta
+            )
+            
+            documents.append(doc)
+    
+    return documents
+
+
+def _create_ingest_pipeline() -> Pipeline:
+    """
+    Create Haystack ingest pipeline
+    
+    Pipeline: DocumentSplitter → Embedder → DocumentWriter
+    """
+    # Get components
+    embedder = get_text_embedder()
+    document_store = get_document_store()
+    
+    # Create splitter (optional, if chunks are too large)
+    splitter = DocumentSplitter(
+        split_by="sentence",
+        split_length=settings.CHUNK_SIZE,
+        split_overlap=settings.CHUNK_OVERLAP
+    )
+    
+    # Create writer
+    writer = DocumentWriter(document_store)
+    
+    # Build pipeline
+    pipeline = Pipeline()
+    pipeline.add_component("splitter", splitter)
+    pipeline.add_component("embedder", embedder)
+    pipeline.add_component("documents_writer", writer)
+    
+    # Connect components
+    pipeline.connect("splitter", "embedder")
+    pipeline.connect("embedder", "documents_writer")
+    
+    return pipeline
+
--- a/services/rag-service/app/main.py
+++ b/services/rag-service/app/main.py
@@ -0,0 +1,105 @@
+"""
+RAG Service - FastAPI application
+Retrieval-Augmented Generation for MicroDAO
+"""
+
+import logging
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+
+from app.models import IngestRequest, IngestResponse, QueryRequest, QueryResponse
+from app.ingest_pipeline import ingest_parsed_document
+from app.query_pipeline import answer_query
+
+logger = logging.getLogger(__name__)
+
+# FastAPI app
+app = FastAPI(
+    title="RAG Service",
+    description="Retrieval-Augmented Generation service for MicroDAO",
+    version="1.0.0"
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.get("/health")
+async def health():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "service": "rag-service",
+        "version": "1.0.0"
+    }
+
+
+@app.post("/ingest", response_model=IngestResponse)
+async def ingest_endpoint(request: IngestRequest):
+    """
+    Ingest parsed document from PARSER service into RAG
+    
+    Body:
+    - dao_id: DAO identifier
+    - doc_id: Document identifier
+    - parsed_json: ParsedDocument JSON from PARSER service
+    - user_id: Optional user identifier
+    """
+    try:
+        result = ingest_parsed_document(
+            dao_id=request.dao_id,
+            doc_id=request.doc_id,
+            parsed_json=request.parsed_json,
+            user_id=request.user_id
+        )
+        
+        return IngestResponse(**result)
+        
+    except Exception as e:
+        logger.error(f"Ingest endpoint error: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/query", response_model=QueryResponse)
+async def query_endpoint(request: QueryRequest):
+    """
+    Answer query using RAG pipeline
+    
+    Body:
+    - dao_id: DAO identifier
+    - question: User question
+    - top_k: Optional number of documents to retrieve
+    - user_id: Optional user identifier
+    """
+    try:
+        result = await answer_query(
+            dao_id=request.dao_id,
+            question=request.question,
+            top_k=request.top_k,
+            user_id=request.user_id
+        )
+        
+        return QueryResponse(**result)
+        
+    except Exception as e:
+        logger.error(f"Query endpoint error: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+if __name__ == "__main__":
+    import uvicorn
+    from app.core.config import settings
+    
+    uvicorn.run(
+        "app.main:app",
+        host=settings.API_HOST,
+        port=settings.API_PORT,
+        reload=True
+    )
+
--- a/services/rag-service/app/models.py
+++ b/services/rag-service/app/models.py
@@ -0,0 +1,47 @@
+"""
+Pydantic models for RAG Service API
+"""
+
+from typing import Optional, List, Dict, Any
+from pydantic import BaseModel, Field
+
+
+class IngestRequest(BaseModel):
+    """Request for document ingestion"""
+    dao_id: str = Field(..., description="DAO identifier")
+    doc_id: str = Field(..., description="Document identifier")
+    parsed_json: Dict[str, Any] = Field(..., description="ParsedDocument JSON from PARSER service")
+    user_id: Optional[str] = Field(None, description="User identifier")
+
+
+class IngestResponse(BaseModel):
+    """Response from document ingestion"""
+    status: str = Field(..., description="Status: success or error")
+    doc_count: int = Field(..., description="Number of documents ingested")
+    dao_id: str = Field(..., description="DAO identifier")
+    doc_id: str = Field(..., description="Document identifier")
+    message: Optional[str] = Field(None, description="Error message if status=error")
+
+
+class QueryRequest(BaseModel):
+    """Request for RAG query"""
+    dao_id: str = Field(..., description="DAO identifier")
+    question: str = Field(..., description="User question")
+    top_k: Optional[int] = Field(None, description="Number of documents to retrieve")
+    user_id: Optional[str] = Field(None, description="User identifier")
+
+
+class Citation(BaseModel):
+    """Citation from retrieved document"""
+    doc_id: str = Field(..., description="Document identifier")
+    page: int = Field(..., description="Page number")
+    section: Optional[str] = Field(None, description="Section name")
+    excerpt: str = Field(..., description="Document excerpt")
+
+
+class QueryResponse(BaseModel):
+    """Response from RAG query"""
+    answer: str = Field(..., description="Generated answer")
+    citations: List[Citation] = Field(..., description="List of citations")
+    documents: List[Dict[str, Any]] = Field(..., description="Retrieved documents (for debugging)")
+
--- a/services/rag-service/app/query_pipeline.py
+++ b/services/rag-service/app/query_pipeline.py
@@ -0,0 +1,250 @@
+"""
+Query Pipeline: RAG → LLM
+Retrieves relevant documents and generates answers
+"""
+
+import logging
+from typing import List, Dict, Any, Optional
+import httpx
+
+from haystack import Pipeline
+from haystack.components.embedders import SentenceTransformersTextEmbedder
+from haystack.components.retrievers import InMemoryEmbeddingRetriever
+from haystack.document_stores import PGVectorDocumentStore
+
+from app.document_store import get_document_store
+from app.embedding import get_text_embedder
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+async def answer_query(
+    dao_id: str,
+    question: str,
+    top_k: Optional[int] = None,
+    user_id: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Answer query using RAG pipeline
+    
+    Args:
+        dao_id: DAO identifier (for filtering)
+        question: User question
+        top_k: Number of documents to retrieve (default from settings)
+        user_id: Optional user identifier
+    
+    Returns:
+        Dictionary with answer, citations, and retrieved documents
+    """
+    logger.info(f"Answering query: dao_id={dao_id}, question={question[:50]}...")
+    
+    top_k = top_k or settings.TOP_K
+    
+    try:
+        # Retrieve relevant documents
+        documents = _retrieve_documents(dao_id, question, top_k)
+        
+        if not documents:
+            logger.warning(f"No documents found for dao_id={dao_id}")
+            return {
+                "answer": "На жаль, я не знайшов релевантної інформації в базі знань.",
+                "citations": [],
+                "documents": []
+            }
+        
+        logger.info(f"Retrieved {len(documents)} documents")
+        
+        # Generate answer using LLM
+        answer = await _generate_answer(question, documents, dao_id, user_id)
+        
+        # Build citations
+        citations = _build_citations(documents)
+        
+        return {
+            "answer": answer,
+            "citations": citations,
+            "documents": [
+                {
+                    "content": doc.content[:200] + "..." if len(doc.content) > 200 else doc.content,
+                    "meta": doc.meta
+                }
+                for doc in documents
+            ]
+        }
+        
+    except Exception as e:
+        logger.error(f"Failed to answer query: {e}", exc_info=True)
+        return {
+            "answer": f"Помилка при обробці запиту: {str(e)}",
+            "citations": [],
+            "documents": []
+        }
+
+
+def _retrieve_documents(
+    dao_id: str,
+    question: str,
+    top_k: int
+) -> List[Any]:
+    """
+    Retrieve relevant documents from DocumentStore
+    
+    Args:
+        dao_id: DAO identifier for filtering
+        question: Query text
+        top_k: Number of documents to retrieve
+    
+    Returns:
+        List of Haystack Document objects
+    """
+    # Get components
+    embedder = get_text_embedder()
+    document_store = get_document_store()
+    
+    # Embed query
+    embedding_result = embedder.run(question)
+    query_embedding = embedding_result["embedding"][0] if isinstance(embedding_result["embedding"], list) else embedding_result["embedding"]
+    
+    # Retrieve with filters using vector similarity search
+    filters = {"dao_id": [dao_id]}
+    
+    try:
+        documents = document_store.search(
+            query_embedding=query_embedding,
+            filters=filters,
+            top_k=top_k,
+            return_embedding=False
+        )
+    except Exception as e:
+        logger.warning(f"Vector search failed: {e}, trying filter_documents")
+        # Fallback to filter_documents
+        documents = document_store.filter_documents(
+            filters=filters,
+            top_k=top_k,
+            return_embedding=False
+        )
+    
+    # If no documents with filter, try without filter (fallback)
+    if not documents:
+        logger.warning(f"No documents found with dao_id={dao_id}, trying without filter")
+        try:
+            documents = document_store.search(
+                query_embedding=query_embedding,
+                filters=None,
+                top_k=top_k,
+                return_embedding=False
+            )
+        except Exception:
+            documents = document_store.filter_documents(
+                filters=None,
+                top_k=top_k,
+                return_embedding=False
+            )
+    
+    return documents
+
+
+async def _generate_answer(
+    question: str,
+    documents: List[Any],
+    dao_id: str,
+    user_id: Optional[str] = None
+) -> str:
+    """
+    Generate answer using LLM (via DAGI Router or OpenAI)
+    
+    Args:
+        question: User question
+        documents: Retrieved documents
+        dao_id: DAO identifier
+        user_id: Optional user identifier
+    
+    Returns:
+        Generated answer text
+    """
+    # Build context from documents
+    context = "\n\n".join([
+        f"[Документ {idx+1}, сторінка {doc.meta.get('page', '?')}]: {doc.content[:500]}"
+        for idx, doc in enumerate(documents[:3])  # Limit to first 3 documents
+    ])
+    
+    # Build prompt
+    prompt = (
+        "Тобі надано контекст з бази знань та питання користувача.\n"
+        "Відповідай на основі наданого контексту. Якщо в контексті немає відповіді, "
+        "скажи що не знаєш.\n\n"
+        f"Контекст:\n{context}\n\n"
+        f"Питання: {question}\n\n"
+        "Відповідь:"
+    )
+    
+    # Call LLM based on provider
+    if settings.LLM_PROVIDER == "router":
+        return await _call_router_llm(prompt, dao_id, user_id)
+    elif settings.LLM_PROVIDER == "openai" and settings.OPENAI_API_KEY:
+        return await _call_openai_llm(prompt)
+    else:
+        # Fallback: simple answer
+        return f"Знайдено {len(documents)} релевантних документів. Перший фрагмент: {documents[0].content[:200]}..."
+
+
+async def _call_router_llm(
+    prompt: str,
+    dao_id: str,
+    user_id: Optional[str] = None
+) -> str:
+    """Call DAGI Router LLM"""
+    router_url = f"{settings.ROUTER_BASE_URL.rstrip('/')}/route"
+    
+    payload = {
+        "mode": "chat",
+        "dao_id": dao_id,
+        "user_id": user_id or "rag-service",
+        "payload": {
+            "message": prompt
+        }
+    }
+    
+    try:
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            resp = await client.post(router_url, json=payload)
+            resp.raise_for_status()
+            data = resp.json()
+            
+            return data.get("data", {}).get("text", "Не вдалося отримати відповідь")
+            
+    except Exception as e:
+        logger.error(f"Router LLM call failed: {e}")
+        return f"Помилка при виклику LLM: {str(e)}"
+
+
+async def _call_openai_llm(prompt: str) -> str:
+    """Call OpenAI LLM"""
+    # TODO: Implement OpenAI client
+    return "OpenAI integration not yet implemented"
+
+
+def _build_citations(documents: List[Any]) -> List[Dict[str, Any]]:
+    """
+    Build citations from retrieved documents
+    
+    Args:
+        documents: List of Haystack Documents
+    
+    Returns:
+        List of citation dictionaries
+    """
+    citations = []
+    
+    for doc in documents:
+        meta = doc.meta
+        citations.append({
+            "doc_id": meta.get("doc_id", "unknown"),
+            "page": meta.get("page", 0),
+            "section": meta.get("section"),
+            "excerpt": doc.content[:200] + "..." if len(doc.content) > 200 else doc.content
+        })
+    
+    return citations
+