feat: complete RAG pipeline integration (ingest + query + Memory)
Parser Service: - Add /ocr/ingest endpoint (PARSER → RAG in one call) - Add RAG_BASE_URL and RAG_TIMEOUT to config - Add OcrIngestResponse schema - Create file_converter utility for PDF/image → PNG bytes - Endpoint accepts file, dao_id, doc_id, user_id - Automatically parses with dots.ocr and sends to RAG Service Router Integration: - Add _handle_rag_query() method in RouterApp - Combines Memory + RAG → LLM pipeline - Get Memory context (facts, events, summaries) - Query RAG Service for documents - Build prompt with Memory + RAG documents - Call LLM provider with combined context - Return answer with citations Clients: - Create rag_client.py for Router (query RAG Service) - Create memory_client.py for Router (get Memory context) E2E Tests: - Create e2e_rag_pipeline.sh script for full pipeline test - Test ingest → query → router query flow - Add E2E_RAG_README.md with usage examples Docker: - Add RAG_SERVICE_URL and MEMORY_SERVICE_URL to router environment
This commit is contained in:
@@ -4,14 +4,17 @@ API endpoints for PARSER Service
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
|
||||
from fastapi.responses import JSONResponse
|
||||
import httpx
|
||||
|
||||
from app.schemas import (
|
||||
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse
|
||||
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse,
|
||||
OcrIngestResponse
|
||||
)
|
||||
from app.core.config import settings
|
||||
from app.runtime.inference import parse_document_from_images
|
||||
@@ -22,6 +25,7 @@ from app.runtime.postprocessing import (
|
||||
build_chunks, build_qa_pairs, build_markdown
|
||||
)
|
||||
from app.runtime.qa_builder import build_qa_pairs_via_router
|
||||
from app.utils.file_converter import pdf_or_image_to_png_bytes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -242,3 +246,101 @@ async def parse_chunks_endpoint(
|
||||
dao_id=dao_id
|
||||
)
|
||||
|
||||
|
||||
@router.post("/ocr/ingest", response_model=OcrIngestResponse)
async def ocr_ingest_endpoint(
    file: UploadFile = File(...),
    dao_id: str = Form(...),
    doc_id: Optional[str] = Form(None),
    user_id: Optional[str] = Form(None)
):
    """
    Parse a document with dots.ocr and ingest it into the RAG Service in one call.

    Flow:
        1. Accept a PDF or image upload.
        2. Parse it in raw_json mode.
        3. POST the parsed JSON to the RAG Service /ingest endpoint.
        4. Return the doc_id together with the raw parsed JSON.

    Args:
        file: PDF or image file.
        dao_id: DAO identifier (required).
        doc_id: Document identifier; defaults to the uploaded filename.
        user_id: Optional user identifier forwarded to the RAG Service.

    Raises:
        HTTPException: 502 when the RAG Service ingest call fails,
            500 for any other unexpected error.
    """
    try:
        # Fall back to the filename (or a random id) when no doc_id was supplied.
        doc_id = doc_id or file.filename or f"doc-{uuid.uuid4().hex[:8]}"

        file_bytes = await file.read()
        validate_file_size(file_bytes)

        doc_type = detect_file_type(file_bytes, file.filename)

        # Normalise the upload into a list of page images regardless of type.
        page_images = (
            convert_pdf_to_images(file_bytes)
            if doc_type == "pdf"
            else [load_image(file_bytes)]
        )
        num_pages = len(page_images)
        logger.info(f"Ingesting document: dao_id={dao_id}, doc_id={doc_id}, pages={num_pages}")

        # Run the OCR model in raw_json mode.
        document = parse_document_from_images(
            images=page_images,
            output_mode="raw_json",
            doc_id=doc_id,
            doc_type=doc_type
        )
        document_json = document.model_dump(mode="json")

        # Build the RAG ingest payload; user_id is only included when present.
        payload = {
            "dao_id": dao_id,
            "doc_id": doc_id,
            "parsed_json": document_json,
        }
        if user_id:
            payload["user_id"] = user_id

        ingest_url = f"{settings.RAG_BASE_URL.rstrip('/')}/ingest"
        logger.info(f"Sending to RAG Service: {ingest_url}")

        try:
            async with httpx.AsyncClient(timeout=settings.RAG_TIMEOUT) as client:
                response = await client.post(ingest_url, json=payload)
                response.raise_for_status()
                ingest_result = response.json()

            logger.info(f"RAG ingest successful: {ingest_result.get('doc_count', 0)} documents indexed")
        except httpx.HTTPError as e:
            # Surface upstream RAG failures as a 502 rather than a generic 500.
            logger.error(f"RAG ingest failed: {e}")
            raise HTTPException(
                status_code=502,
                detail=f"RAG Service ingest failed: {str(e)}"
            )

        return OcrIngestResponse(
            dao_id=dao_id,
            doc_id=doc_id,
            pages_processed=num_pages,
            rag_ingested=True,
            raw_json=document_json
        )

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched (including the 502 above).
        raise
    except Exception as e:
        logger.error(f"Error in ocr_ingest: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Ingest failed: {str(e)}")
|
||||
|
||||
|
||||
@@ -47,6 +47,10 @@ class Settings(BaseSettings):
|
||||
    # Router service endpoint used for QA-pair generation callbacks.
    ROUTER_BASE_URL: str = os.getenv("ROUTER_BASE_URL", "http://router:9102")
    ROUTER_TIMEOUT: int = int(os.getenv("ROUTER_TIMEOUT", "60"))

    # RAG Service configuration (for ingest pipeline)
    RAG_BASE_URL: str = os.getenv("RAG_BASE_URL", "http://rag-service:9500")
    # Ingest of large documents can be slow, hence the longer 120 s default.
    RAG_TIMEOUT: int = int(os.getenv("RAG_TIMEOUT", "120"))

    class Config:
        # Pydantic settings behaviour: read overrides from a local .env file
        # and match environment variable names case-sensitively.
        env_file = ".env"
        case_sensitive = True
|
||||
|
||||
@@ -141,3 +141,12 @@ class ChunksResponse(BaseModel):
|
||||
doc_id: str = Field(..., description="Document ID")
|
||||
dao_id: str = Field(..., description="DAO ID")
|
||||
|
||||
|
||||
class OcrIngestResponse(BaseModel):
    """Response from /ocr/ingest endpoint"""
    # DAO the document was ingested under.
    dao_id: str = Field(..., description="DAO identifier")
    # Identifier assigned to the document (client-supplied or derived from filename).
    doc_id: str = Field(..., description="Document identifier")
    # Number of pages/images that were parsed by the OCR model.
    pages_processed: int = Field(..., description="Number of pages processed")
    # True when the parsed JSON was successfully posted to the RAG Service.
    rag_ingested: bool = Field(..., description="Whether document was ingested into RAG")
    # Full parsed-document payload, identical to what was sent to RAG.
    # NOTE(review): requires Dict and Any from typing — confirm they are
    # imported at the top of schemas.py (not visible in this diff).
    raw_json: Dict[str, Any] = Field(..., description="Parsed document JSON")
|
||||
|
||||
|
||||
59
services/parser-service/app/utils/file_converter.py
Normal file
59
services/parser-service/app/utils/file_converter.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Helper functions for file conversion (PDF/image → PNG bytes)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Tuple, Optional
|
||||
from io import BytesIO
|
||||
|
||||
from PIL import Image
|
||||
from app.runtime.preprocessing import convert_pdf_to_images, load_image, detect_file_type
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _image_to_png_bytes(image) -> bytes:
    """Encode a PIL image as PNG bytes, converting to RGB first."""
    buf = BytesIO()
    image.convert("RGB").save(buf, format="PNG")
    return buf.getvalue()


async def pdf_or_image_to_png_bytes(
    filename: Optional[str],
    file_bytes: bytes
) -> Tuple[bytes, int]:
    """
    Convert PDF or image file to PNG bytes

    For PDFs, only the first page is encoded to PNG; the returned page
    count reflects the whole document so callers can decide whether to
    process additional pages via convert_pdf_to_images.

    Args:
        filename: Original filename (for type detection)
        file_bytes: File content as bytes

    Returns:
        Tuple of (PNG bytes, number of pages)

    Raises:
        ValueError: If file type is not supported or conversion fails
    """
    doc_type = detect_file_type(file_bytes, filename)

    if doc_type == "pdf":
        images = convert_pdf_to_images(file_bytes)
        if not images:
            raise ValueError("PDF conversion produced no images")
        # First page only (see docstring); shared PNG encoding via helper
        # removes the duplicated BytesIO/save boilerplate of both branches.
        return _image_to_png_bytes(images[0]), len(images)

    # Single image upload: exactly one page.
    return _image_to_png_bytes(load_image(file_bytes)), 1
|
||||
|
||||
Reference in New Issue
Block a user