feat: complete RAG pipeline integration (ingest + query + Memory)

Parser Service:
- Add /ocr/ingest endpoint (PARSER → RAG in one call)
- Add RAG_BASE_URL and RAG_TIMEOUT to config
- Add OcrIngestResponse schema
- Create file_converter utility for PDF/image → PNG bytes
- Endpoint accepts file, dao_id, doc_id, user_id
- Automatically parses with dots.ocr and sends to RAG Service

Router Integration:
- Add _handle_rag_query() method in RouterApp
- Combines Memory + RAG → LLM pipeline
- Get Memory context (facts, events, summaries)
- Query RAG Service for documents
- Build prompt with Memory + RAG documents
- Call LLM provider with combined context
- Return answer with citations

Clients:
- Create rag_client.py for Router (query RAG Service)
- Create memory_client.py for Router (get Memory context)

E2E Tests:
- Create e2e_rag_pipeline.sh script for full pipeline test
- Test ingest → query → router query flow
- Add E2E_RAG_README.md with usage examples

Docker:
- Add RAG_SERVICE_URL and MEMORY_SERVICE_URL to router environment
This commit is contained in:
Apple
2025-11-16 05:02:14 -08:00
parent 6d69f901f7
commit 382e661f1f
10 changed files with 719 additions and 1 deletions

View File

@@ -4,14 +4,17 @@ API endpoints for PARSER Service
import logging
import uuid
import json
from pathlib import Path
from typing import Optional
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
from fastapi.responses import JSONResponse
import httpx
from app.schemas import (
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse,
OcrIngestResponse
)
from app.core.config import settings
from app.runtime.inference import parse_document_from_images
@@ -22,6 +25,7 @@ from app.runtime.postprocessing import (
build_chunks, build_qa_pairs, build_markdown
)
from app.runtime.qa_builder import build_qa_pairs_via_router
from app.utils.file_converter import pdf_or_image_to_png_bytes
logger = logging.getLogger(__name__)
@@ -242,3 +246,101 @@ async def parse_chunks_endpoint(
dao_id=dao_id
)
@router.post("/ocr/ingest", response_model=OcrIngestResponse)
async def ocr_ingest_endpoint(
    file: UploadFile = File(...),
    dao_id: str = Form(...),
    doc_id: Optional[str] = Form(None),
    user_id: Optional[str] = Form(None)
):
    """
    Parse document and ingest into RAG in one call

    Flow:
    1. Accept PDF/image file
    2. Parse with dots.ocr (raw_json mode)
    3. Send parsed_json to RAG Service /ingest
    4. Return doc_id + raw_json

    Args:
        file: PDF or image file
        dao_id: DAO identifier (required)
        doc_id: Document identifier (optional, defaults to the uploaded
            filename, or to a random ``doc-<8 hex chars>`` id when the
            upload carries no filename)
        user_id: User identifier (optional; only forwarded to the RAG
            Service when provided)

    Returns:
        OcrIngestResponse with dao_id, doc_id, the number of pages
        processed, rag_ingested=True and the parsed document as raw_json.

    Raises:
        HTTPException: 502 when the request to the RAG Service fails
            (transport error or non-2xx status); 500 for any other
            unexpected error. HTTPExceptions raised by the helpers called
            here propagate unchanged.
    """
    try:
        # Generate doc_id if not provided
        if not doc_id:
            doc_id = file.filename or f"doc-{uuid.uuid4().hex[:8]}"

        # Read and validate file
        content = await file.read()
        validate_file_size(content)

        # Detect file type
        doc_type = detect_file_type(content, file.filename)

        # Convert to images: a PDF yields one image per page, anything else
        # is treated as a single image.
        if doc_type == "pdf":
            images = convert_pdf_to_images(content)
        else:
            image = load_image(content)
            images = [image]
        pages_count = len(images)
        logger.info(f"Ingesting document: dao_id={dao_id}, doc_id={doc_id}, pages={pages_count}")

        # Parse document (raw_json mode)
        parsed_doc = parse_document_from_images(
            images=images,
            output_mode="raw_json",
            doc_id=doc_id,
            doc_type=doc_type
        )

        # Convert to JSON
        parsed_json = parsed_doc.model_dump(mode="json")

        # Send to RAG Service
        ingest_payload = {
            "dao_id": dao_id,
            "doc_id": doc_id,
            "parsed_json": parsed_json,
        }
        if user_id:
            ingest_payload["user_id"] = user_id

        rag_url = f"{settings.RAG_BASE_URL.rstrip('/')}/ingest"
        logger.info(f"Sending to RAG Service: {rag_url}")

        try:
            async with httpx.AsyncClient(timeout=settings.RAG_TIMEOUT) as client:
                resp = await client.post(rag_url, json=ingest_payload)
                resp.raise_for_status()
                # The 2xx status already confirms the ingest. The body is
                # only used for the log line below, so an empty or non-JSON
                # body must not turn a successful ingest into an error.
                try:
                    rag_result = resp.json()
                except ValueError:
                    rag_result = None
        except httpx.HTTPError as e:
            logger.error(f"RAG ingest failed: {e}")
            raise HTTPException(
                status_code=502,
                detail=f"RAG Service ingest failed: {str(e)}"
            ) from e

        if isinstance(rag_result, dict):
            logger.info(f"RAG ingest successful: {rag_result.get('doc_count', 0)} documents indexed")
        else:
            logger.warning("RAG ingest succeeded but the response body was not a JSON object")

        return OcrIngestResponse(
            dao_id=dao_id,
            doc_id=doc_id,
            pages_processed=pages_count,
            rag_ingested=True,
            raw_json=parsed_json
        )
    except HTTPException:
        # Preserve deliberate HTTP errors (e.g. the 502 above) instead of
        # re-wrapping them as a generic 500.
        raise
    except Exception as e:
        logger.error(f"Error in ocr_ingest: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Ingest failed: {str(e)}")