feat: complete RAG pipeline integration (ingest + query + Memory)
Parser Service: - Add /ocr/ingest endpoint (PARSER → RAG in one call) - Add RAG_BASE_URL and RAG_TIMEOUT to config - Add OcrIngestResponse schema - Create file_converter utility for PDF/image → PNG bytes - Endpoint accepts file, dao_id, doc_id, user_id - Automatically parses with dots.ocr and sends to RAG Service Router Integration: - Add _handle_rag_query() method in RouterApp - Combines Memory + RAG → LLM pipeline - Get Memory context (facts, events, summaries) - Query RAG Service for documents - Build prompt with Memory + RAG documents - Call LLM provider with combined context - Return answer with citations Clients: - Create rag_client.py for Router (query RAG Service) - Create memory_client.py for Router (get Memory context) E2E Tests: - Create e2e_rag_pipeline.sh script for full pipeline test - Test ingest → query → router query flow - Add E2E_RAG_README.md with usage examples Docker: - Add RAG_SERVICE_URL and MEMORY_SERVICE_URL to router environment
This commit is contained in:
@@ -4,14 +4,17 @@ API endpoints for PARSER Service
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Form
|
||||
from fastapi.responses import JSONResponse
|
||||
import httpx
|
||||
|
||||
from app.schemas import (
|
||||
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse
|
||||
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse,
|
||||
OcrIngestResponse
|
||||
)
|
||||
from app.core.config import settings
|
||||
from app.runtime.inference import parse_document_from_images
|
||||
@@ -22,6 +25,7 @@ from app.runtime.postprocessing import (
|
||||
build_chunks, build_qa_pairs, build_markdown
|
||||
)
|
||||
from app.runtime.qa_builder import build_qa_pairs_via_router
|
||||
from app.utils.file_converter import pdf_or_image_to_png_bytes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -242,3 +246,101 @@ async def parse_chunks_endpoint(
|
||||
dao_id=dao_id
|
||||
)
|
||||
|
||||
|
||||
@router.post("/ocr/ingest", response_model=OcrIngestResponse)
async def ocr_ingest_endpoint(
    file: UploadFile = File(...),
    dao_id: str = Form(...),
    doc_id: Optional[str] = Form(None),
    user_id: Optional[str] = Form(None)
):
    """
    Parse a document with dots.ocr and ingest it into the RAG Service in one call.

    Flow:
        1. Accept a PDF or image upload.
        2. Parse it in raw_json mode.
        3. POST the parsed JSON to the RAG Service /ingest endpoint.
        4. Return the doc_id together with the raw parsed JSON.

    Args:
        file: PDF or image file.
        dao_id: DAO identifier (required).
        doc_id: Document identifier; defaults to the uploaded filename.
        user_id: Optional user identifier forwarded to the RAG Service.

    Raises:
        HTTPException: 502 when the RAG Service ingest call fails,
            500 for any other unexpected error.
    """
    try:
        # Fall back to the filename (or a random id) when no doc_id was supplied.
        doc_id = doc_id or file.filename or f"doc-{uuid.uuid4().hex[:8]}"

        file_bytes = await file.read()
        validate_file_size(file_bytes)

        doc_type = detect_file_type(file_bytes, file.filename)

        # Normalise the upload into a list of page images regardless of type.
        page_images = (
            convert_pdf_to_images(file_bytes)
            if doc_type == "pdf"
            else [load_image(file_bytes)]
        )
        num_pages = len(page_images)
        logger.info(f"Ingesting document: dao_id={dao_id}, doc_id={doc_id}, pages={num_pages}")

        # Run the OCR model in raw_json mode.
        document = parse_document_from_images(
            images=page_images,
            output_mode="raw_json",
            doc_id=doc_id,
            doc_type=doc_type
        )
        document_json = document.model_dump(mode="json")

        # Build the RAG ingest payload; user_id is only included when present.
        payload = {
            "dao_id": dao_id,
            "doc_id": doc_id,
            "parsed_json": document_json,
        }
        if user_id:
            payload["user_id"] = user_id

        ingest_url = f"{settings.RAG_BASE_URL.rstrip('/')}/ingest"
        logger.info(f"Sending to RAG Service: {ingest_url}")

        try:
            async with httpx.AsyncClient(timeout=settings.RAG_TIMEOUT) as client:
                response = await client.post(ingest_url, json=payload)
                response.raise_for_status()
                ingest_result = response.json()

            logger.info(f"RAG ingest successful: {ingest_result.get('doc_count', 0)} documents indexed")
        except httpx.HTTPError as e:
            # Surface upstream RAG failures as a 502 rather than a generic 500.
            logger.error(f"RAG ingest failed: {e}")
            raise HTTPException(
                status_code=502,
                detail=f"RAG Service ingest failed: {str(e)}"
            )

        return OcrIngestResponse(
            dao_id=dao_id,
            doc_id=doc_id,
            pages_processed=num_pages,
            rag_ingested=True,
            raw_json=document_json
        )

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched (including the 502 above).
        raise
    except Exception as e:
        logger.error(f"Error in ocr_ingest: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Ingest failed: {str(e)}")
|
||||
|
||||
|
||||
@@ -47,6 +47,10 @@ class Settings(BaseSettings):
|
||||
    # Router service endpoint used for QA-pair generation callbacks.
    ROUTER_BASE_URL: str = os.getenv("ROUTER_BASE_URL", "http://router:9102")
    ROUTER_TIMEOUT: int = int(os.getenv("ROUTER_TIMEOUT", "60"))

    # RAG Service configuration (for ingest pipeline)
    RAG_BASE_URL: str = os.getenv("RAG_BASE_URL", "http://rag-service:9500")
    # Ingest of large documents can be slow, hence the longer 120 s default.
    RAG_TIMEOUT: int = int(os.getenv("RAG_TIMEOUT", "120"))

    class Config:
        # Pydantic settings behaviour: read overrides from a local .env file
        # and match environment variable names case-sensitively.
        env_file = ".env"
        case_sensitive = True
|
||||
|
||||
@@ -141,3 +141,12 @@ class ChunksResponse(BaseModel):
|
||||
doc_id: str = Field(..., description="Document ID")
|
||||
dao_id: str = Field(..., description="DAO ID")
|
||||
|
||||
|
||||
class OcrIngestResponse(BaseModel):
    """Response from /ocr/ingest endpoint"""
    # DAO the document was ingested under.
    dao_id: str = Field(..., description="DAO identifier")
    # Identifier assigned to the document (client-supplied or derived from filename).
    doc_id: str = Field(..., description="Document identifier")
    # Number of pages/images that were parsed by the OCR model.
    pages_processed: int = Field(..., description="Number of pages processed")
    # True when the parsed JSON was successfully posted to the RAG Service.
    rag_ingested: bool = Field(..., description="Whether document was ingested into RAG")
    # Full parsed-document payload, identical to what was sent to RAG.
    # NOTE(review): requires Dict and Any from typing — confirm they are
    # imported at the top of schemas.py (not visible in this diff).
    raw_json: Dict[str, Any] = Field(..., description="Parsed document JSON")
|
||||
|
||||
|
||||
59
services/parser-service/app/utils/file_converter.py
Normal file
59
services/parser-service/app/utils/file_converter.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Helper functions for file conversion (PDF/image → PNG bytes)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Tuple, Optional
|
||||
from io import BytesIO
|
||||
|
||||
from PIL import Image
|
||||
from app.runtime.preprocessing import convert_pdf_to_images, load_image, detect_file_type
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _image_to_png_bytes(image) -> bytes:
    """Encode a PIL image as PNG bytes, converting to RGB first."""
    buf = BytesIO()
    image.convert("RGB").save(buf, format="PNG")
    return buf.getvalue()


async def pdf_or_image_to_png_bytes(
    filename: Optional[str],
    file_bytes: bytes
) -> Tuple[bytes, int]:
    """
    Convert PDF or image file to PNG bytes

    For PDFs, only the first page is encoded to PNG; the returned page
    count reflects the whole document so callers can decide whether to
    process additional pages via convert_pdf_to_images.

    Args:
        filename: Original filename (for type detection)
        file_bytes: File content as bytes

    Returns:
        Tuple of (PNG bytes, number of pages)

    Raises:
        ValueError: If file type is not supported or conversion fails
    """
    doc_type = detect_file_type(file_bytes, filename)

    if doc_type == "pdf":
        images = convert_pdf_to_images(file_bytes)
        if not images:
            raise ValueError("PDF conversion produced no images")
        # First page only (see docstring); shared PNG encoding via helper
        # removes the duplicated BytesIO/save boilerplate of both branches.
        return _image_to_png_bytes(images[0]), len(images)

    # Single image upload: exactly one page.
    return _image_to_png_bytes(load_image(file_bytes)), 1
|
||||
|
||||
Reference in New Issue
Block a user