feat: add Ollama runtime support and RAG implementation plan

Ollama Runtime:
- Add ollama_client.py for Ollama API integration
- Support for dots-ocr model via Ollama
- Add OLLAMA_BASE_URL configuration
- Update inference.py to support Ollama runtime (RUNTIME_TYPE=ollama)
- Update endpoints to handle async Ollama calls
- Alternative to local transformers model

RAG Implementation Plan:
- Create TODO-RAG.md with detailed Haystack integration plan
- Document Store setup (pgvector)
- Embedding model selection
- Ingest pipeline (PARSER → RAG)
- Query pipeline (RAG → LLM)
- Integration with DAGI Router
- Bot commands (/upload_doc, /ask_doc)
- Testing strategy

Now supports three runtime modes:
1. Local transformers (RUNTIME_TYPE=local)
2. Ollama (RUNTIME_TYPE=ollama)
3. Dummy (USE_DUMMY_PARSER=true)
This commit is contained in:
Apple
2025-11-16 02:56:36 -08:00
parent d56ff3493d
commit 00f9102e50
6 changed files with 607 additions and 9 deletions

View File

@@ -16,11 +16,94 @@ from app.runtime.preprocessing import (
)
from app.runtime.postprocessing import build_parsed_document
from app.runtime.model_output_parser import parse_model_output_to_blocks
from app.runtime.ollama_client import (
call_ollama_vision, parse_ollama_response, OutputMode as OllamaOutputMode
)
from app.core.config import settings
logger = logging.getLogger(__name__)
async def parse_document_with_ollama(
    images: List[Image.Image],
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse a document by sending each page image to the Ollama API.

    Args:
        images: List of PIL Images, one per page.
        output_mode: Output format mode; "chunks" is requested as raw
            JSON and split into chunks downstream.
        doc_id: Document ID (defaults to "parsed-doc").
        doc_type: Document type ("pdf" or "image").

    Returns:
        ParsedDocument assembled from the per-page blocks. Pages that
        fail are logged and skipped rather than aborting the document.
    """
    import io

    # Map the public output mode onto the Ollama client's enum.
    # "chunks" has no direct Ollama equivalent: request raw JSON and
    # let later processing perform the chunking.
    mode_by_name = {
        "raw_json": OllamaOutputMode.raw_json,
        "markdown": OllamaOutputMode.markdown,
        "qa_pairs": OllamaOutputMode.qa_pairs,
        "chunks": OllamaOutputMode.raw_json,
    }
    ollama_mode = mode_by_name.get(output_mode, OllamaOutputMode.raw_json)

    pages_data = []
    total = len(images)
    for page_no, page_image in enumerate(images, start=1):
        try:
            # Serialize the page as PNG bytes for the vision endpoint.
            png_buffer = io.BytesIO()
            page_image.convert("RGB").save(png_buffer, format="PNG")

            ollama_data = await call_ollama_vision(png_buffer.getvalue(), ollama_mode)
            raw_text, parsed_json = parse_ollama_response(ollama_data, ollama_mode)
            logger.debug(f"Ollama output for page {page_no}: {raw_text[:100]}...")

            if isinstance(parsed_json, dict) and parsed_json:
                # Structured JSON came back — use its blocks directly.
                blocks = parsed_json.get("blocks", [])
                if not blocks:
                    # No structured blocks: wrap the raw text in a single
                    # full-page paragraph block as a fallback.
                    blocks = [{
                        "type": "paragraph",
                        "text": raw_text,
                        "bbox": {"x": 0, "y": 0, "width": page_image.width, "height": page_image.height},
                        "reading_order": 1
                    }]
            else:
                # Plain-text output (e.g. markdown): derive blocks from it.
                blocks = parse_model_output_to_blocks(raw_text, page_image.size, page_num=page_no)

            pages_data.append({
                "blocks": blocks,
                "width": page_image.width,
                "height": page_image.height
            })
            logger.info(f"Processed page {page_no}/{total} via Ollama")
        except Exception as e:
            # Best-effort: skip the failing page and keep parsing the rest.
            logger.error(f"Error processing page {page_no} with Ollama: {e}", exc_info=True)
            continue

    return build_parsed_document(
        pages_data=pages_data,
        doc_id=doc_id or "parsed-doc",
        doc_type=doc_type,
        metadata={"model": settings.PARSER_MODEL_NAME, "runtime": "ollama"}
    )
def parse_document_from_images(
images: List[Image.Image],
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
@@ -44,7 +127,12 @@ def parse_document_from_images(
logger.info("Using dummy parser (USE_DUMMY_PARSER=true)")
return dummy_parse_document_from_images(images, doc_id, doc_type)
# Try to get model
# Check if using Ollama runtime
if settings.RUNTIME_TYPE == "ollama":
logger.info("Using Ollama runtime")
return await parse_document_with_ollama(images, output_mode, doc_id, doc_type)
# Try to get local model
model = get_model()
if model is None:

View File

@@ -0,0 +1,127 @@
"""
Ollama client for dots.ocr model
Alternative runtime using Ollama API
"""
import base64
import json
import logging
from typing import Dict, Any, Optional
from enum import Enum
import httpx
from app.core.config import settings
logger = logging.getLogger(__name__)
class OutputMode(str, Enum):
    # Output formats the Ollama parsing prompt can request. Subclassing
    # str makes members compare equal to their plain string values.
    raw_json = "raw_json"    # structured JSON: blocks/tables/bboxes
    markdown = "markdown"    # GitHub-flavored Markdown rendering
    qa_pairs = "qa_pairs"    # JSON array of question/answer pairs
def build_prompt(mode: OutputMode) -> str:
    """Return the instruction prompt matching the requested output mode.

    Falls back to a minimal OCR instruction for any unrecognized mode.
    """
    prompts = {
        OutputMode.raw_json: (
            "You are a document OCR and layout parser. "
            "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
            "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
            "Respond with JSON only, no explanations."
        ),
        OutputMode.markdown: (
            "You are a document OCR and layout parser. "
            "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
            "Tables should be proper GitHub-flavored Markdown tables. "
            "Respond with Markdown as plain text."
        ),
        OutputMode.qa_pairs: (
            "You are a document OCR and knowledge extraction assistant. "
            "Read the document and output a JSON array of Q&A pairs covering the key information. "
            "Each item should be {\"question\": ..., \"answer\": ..., \"page\": ..., \"section\": ...}. "
            "Respond with JSON only, no explanations."
        ),
    }
    return prompts.get(mode, "You are a document OCR assistant. Extract text.")
async def call_ollama_vision(
    image_bytes: bytes,
    mode: OutputMode,
    model_name: Optional[str] = None,
    timeout: float = 120.0,
) -> Dict[str, Any]:
    """
    Call the Ollama /api/generate endpoint with a single page image.

    Args:
        image_bytes: PNG image bytes for one page.
        mode: Output mode used to select the prompt.
        model_name: Model name (defaults to settings.PARSER_MODEL_NAME).
        timeout: Request timeout in seconds. Defaults to 120 — vision OCR
            on large pages can be slow; callers may tune this per request.

    Returns:
        Decoded JSON response from Ollama; the generated text is in the
        "response" field.

    Raises:
        RuntimeError: If the HTTP call or response handling fails; the
            original exception is chained as __cause__.
    """
    model_name = model_name or settings.PARSER_MODEL_NAME

    # Ollama expects images as base64 strings in the "images" array.
    img_b64 = base64.b64encode(image_bytes).decode("ascii")
    prompt = build_prompt(mode)
    body = {
        "model": model_name,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,  # one complete response instead of a token stream
    }
    url = f"{settings.OLLAMA_BASE_URL.rstrip('/')}/api/generate"
    logger.info(f"Calling Ollama: {url}, model: {model_name}, mode: {mode}")
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(url, json=body)
            resp.raise_for_status()
            data = resp.json()
            logger.debug(f"Ollama response: {data.get('response', '')[:100]}...")
            return data
    except httpx.HTTPError as e:
        logger.error(f"Ollama HTTP error: {e}")
        raise RuntimeError(f"Ollama API error: {e}") from e
    except Exception as e:
        # Covers JSON decoding errors and any unexpected client failure.
        logger.error(f"Ollama error: {e}", exc_info=True)
        raise RuntimeError(f"Failed to call Ollama: {e}") from e
def parse_ollama_response(
    ollama_data: Dict[str, Any],
    mode: OutputMode
) -> tuple[str, Optional[Dict[str, Any]]]:
    """
    Extract the generated text from an Ollama response and, for JSON
    modes, attempt to decode it.

    Args:
        ollama_data: Decoded JSON body returned by the Ollama API.
        mode: Output mode the response was generated for.

    Returns:
        Tuple of (raw_text, parsed_json). parsed_json is None for
        markdown mode or when decoding fails.
        NOTE(review): for qa_pairs the model is asked for a JSON array,
        so the decoded value may be a list rather than a dict; callers
        should isinstance-check before treating it as a mapping.
    """
    raw_text = ollama_data.get("response", "").strip()
    parsed_json: Optional[Dict[str, Any]] = None

    # Only raw_json and qa_pairs are expected to contain JSON.
    if mode in (OutputMode.raw_json, OutputMode.qa_pairs):
        # Models frequently wrap JSON in Markdown code fences
        # (```json ... ```) despite the "JSON only" prompt; strip
        # them before decoding so valid payloads aren't rejected.
        candidate = raw_text
        if candidate.startswith("```"):
            candidate = candidate.split("\n", 1)[1] if "\n" in candidate else ""
            stripped = candidate.rstrip()
            if stripped.endswith("```"):
                candidate = stripped[:-3]
        try:
            parsed_json = json.loads(candidate)
        except (json.JSONDecodeError, ValueError):
            logger.warning(f"Failed to parse response as JSON for mode {mode}")
            parsed_json = None

    return raw_text, parsed_json