feat: add Ollama runtime support and RAG implementation plan
Ollama Runtime: - Add ollama_client.py for Ollama API integration - Support for dots-ocr model via Ollama - Add OLLAMA_BASE_URL configuration - Update inference.py to support Ollama runtime (RUNTIME_TYPE=ollama) - Update endpoints to handle async Ollama calls - Alternative to local transformers model RAG Implementation Plan: - Create TODO-RAG.md with detailed Haystack integration plan - Document Store setup (pgvector) - Embedding model selection - Ingest pipeline (PARSER → RAG) - Query pipeline (RAG → LLM) - Integration with DAGI Router - Bot commands (/upload_doc, /ask_doc) - Testing strategy Now supports three runtime modes: 1. Local transformers (RUNTIME_TYPE=local) 2. Ollama (RUNTIME_TYPE=ollama) 3. Dummy (USE_DUMMY_PARSER=true)
This commit is contained in:
@@ -90,12 +90,23 @@ async def parse_document_endpoint(
|
||||
# Parse document from images
|
||||
logger.info(f"Parsing document: {len(images)} page(s), mode: {output_mode}")
|
||||
|
||||
parsed_doc = parse_document_from_images(
|
||||
images=images,
|
||||
output_mode=output_mode,
|
||||
doc_id=doc_id or str(uuid.uuid4()),
|
||||
doc_type=doc_type
|
||||
)
|
||||
# Check if using Ollama (async) or local model (sync)
|
||||
from app.core.config import settings
|
||||
if settings.RUNTIME_TYPE == "ollama":
|
||||
from app.runtime.inference import parse_document_with_ollama
|
||||
parsed_doc = await parse_document_with_ollama(
|
||||
images=images,
|
||||
output_mode=output_mode,
|
||||
doc_id=doc_id or str(uuid.uuid4()),
|
||||
doc_type=doc_type
|
||||
)
|
||||
else:
|
||||
parsed_doc = parse_document_from_images(
|
||||
images=images,
|
||||
output_mode=output_mode,
|
||||
doc_id=doc_id or str(uuid.uuid4()),
|
||||
doc_type=doc_type
|
||||
)
|
||||
|
||||
# Build response based on output_mode
|
||||
response_data = {"metadata": {
|
||||
|
||||
@@ -37,9 +37,12 @@ class Settings(BaseSettings):
|
||||
ALLOW_DUMMY_FALLBACK: bool = os.getenv("ALLOW_DUMMY_FALLBACK", "true").lower() == "true"
|
||||
|
||||
# Runtime
|
||||
RUNTIME_TYPE: Literal["local", "remote"] = os.getenv("RUNTIME_TYPE", "local")
|
||||
RUNTIME_TYPE: Literal["local", "remote", "ollama"] = os.getenv("RUNTIME_TYPE", "local")
|
||||
RUNTIME_URL: str = os.getenv("RUNTIME_URL", "http://parser-runtime:11435")
|
||||
|
||||
# Ollama configuration (if RUNTIME_TYPE=ollama)
|
||||
OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
case_sensitive = True
|
||||
|
||||
@@ -16,11 +16,94 @@ from app.runtime.preprocessing import (
|
||||
)
|
||||
from app.runtime.postprocessing import build_parsed_document
|
||||
from app.runtime.model_output_parser import parse_model_output_to_blocks
|
||||
from app.runtime.ollama_client import (
|
||||
call_ollama_vision, parse_ollama_response, OutputMode as OllamaOutputMode
|
||||
)
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def parse_document_with_ollama(
    images: List[Image.Image],
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """Parse a document page by page through the Ollama API.

    Each page is PNG-encoded, sent to Ollama, and the reply is turned into
    layout blocks. Pages that fail are logged and skipped rather than
    aborting the whole document.

    Args:
        images: One PIL image per page.
        output_mode: Desired output format.
        doc_id: Document identifier; a fallback id is used when omitted.
        doc_type: Source document type.

    Returns:
        ParsedDocument assembled from the successfully processed pages.
    """
    import io

    # "chunks" has no native Ollama mode: request raw_json and chunk later.
    mode_by_name = {
        "raw_json": OllamaOutputMode.raw_json,
        "markdown": OllamaOutputMode.markdown,
        "qa_pairs": OllamaOutputMode.qa_pairs,
        "chunks": OllamaOutputMode.raw_json,
    }
    ollama_mode = mode_by_name.get(output_mode, OllamaOutputMode.raw_json)

    collected_pages = []
    for idx, image in enumerate(images, start=1):
        try:
            # PNG-encode the page for the vision API.
            buffer = io.BytesIO()
            image.convert("RGB").save(buffer, format="PNG")

            ollama_data = await call_ollama_vision(buffer.getvalue(), ollama_mode)
            raw_text, parsed_json = parse_ollama_response(ollama_data, ollama_mode)
            logger.debug(f"Ollama output for page {idx}: {raw_text[:100]}...")

            if parsed_json and isinstance(parsed_json, dict):
                blocks = parsed_json.get("blocks", [])
                if not blocks:
                    # Structured reply carried no blocks: wrap the raw text
                    # in a single full-page paragraph block.
                    blocks = [{
                        "type": "paragraph",
                        "text": raw_text,
                        "bbox": {"x": 0, "y": 0, "width": image.width, "height": image.height},
                        "reading_order": 1
                    }]
            else:
                # Plain-text reply: delegate block extraction to the shared parser.
                blocks = parse_model_output_to_blocks(raw_text, image.size, page_num=idx)

            collected_pages.append({
                "blocks": blocks,
                "width": image.width,
                "height": image.height
            })
            logger.info(f"Processed page {idx}/{len(images)} via Ollama")

        except Exception as e:
            # Best-effort per page: keep going so one bad page does not
            # discard the rest of the document.
            logger.error(f"Error processing page {idx} with Ollama: {e}", exc_info=True)
            continue

    return build_parsed_document(
        pages_data=collected_pages,
        doc_id=doc_id or "parsed-doc",
        doc_type=doc_type,
        metadata={"model": settings.PARSER_MODEL_NAME, "runtime": "ollama"}
    )
|
||||
|
||||
|
||||
def parse_document_from_images(
|
||||
images: List[Image.Image],
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
|
||||
@@ -44,7 +127,12 @@ def parse_document_from_images(
|
||||
logger.info("Using dummy parser (USE_DUMMY_PARSER=true)")
|
||||
return dummy_parse_document_from_images(images, doc_id, doc_type)
|
||||
|
||||
# Try to get model
|
||||
# Check if using Ollama runtime
|
||||
if settings.RUNTIME_TYPE == "ollama":
|
||||
logger.info("Using Ollama runtime")
|
||||
return await parse_document_with_ollama(images, output_mode, doc_id, doc_type)
|
||||
|
||||
# Try to get local model
|
||||
model = get_model()
|
||||
|
||||
if model is None:
|
||||
|
||||
services/parser-service/app/runtime/ollama_client.py — new file, 127 lines
@@ -0,0 +1,127 @@
|
||||
"""
|
||||
Ollama client for dots.ocr model
|
||||
Alternative runtime using Ollama API
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
from enum import Enum
|
||||
|
||||
import httpx
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OutputMode(str, Enum):
    """Output formats the Ollama runtime can be asked for."""
    raw_json = "raw_json"    # structured layout JSON
    markdown = "markdown"    # GitHub-flavored Markdown rendering
    qa_pairs = "qa_pairs"    # JSON array of question/answer pairs


# Instruction prompt per output mode; built once at import time.
_PROMPTS: dict = {
    OutputMode.raw_json: (
        "You are a document OCR and layout parser. "
        "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
        "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
        "Respond with JSON only, no explanations."
    ),
    OutputMode.markdown: (
        "You are a document OCR and layout parser. "
        "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
        "Tables should be proper GitHub-flavored Markdown tables. "
        "Respond with Markdown as plain text."
    ),
    OutputMode.qa_pairs: (
        "You are a document OCR and knowledge extraction assistant. "
        "Read the document and output a JSON array of Q&A pairs covering the key information. "
        "Each item should be {\"question\": ..., \"answer\": ..., \"page\": ..., \"section\": ...}. "
        "Respond with JSON only, no explanations."
    ),
}


def build_prompt(mode: OutputMode) -> str:
    """Return the instruction prompt for *mode*.

    Falls back to a generic OCR prompt for any unrecognized mode value.
    """
    return _PROMPTS.get(mode, "You are a document OCR assistant. Extract text.")
|
||||
|
||||
|
||||
async def call_ollama_vision(
    image_bytes: bytes,
    mode: OutputMode,
    model_name: Optional[str] = None
) -> Dict[str, Any]:
    """Send one image to the Ollama /api/generate endpoint and return its JSON reply.

    Args:
        image_bytes: PNG-encoded image bytes.
        mode: Output mode used to select the prompt.
        model_name: Model to invoke; defaults to settings.PARSER_MODEL_NAME.

    Returns:
        Decoded JSON response dictionary from Ollama.

    Raises:
        RuntimeError: On any HTTP error or unexpected failure.
    """
    if model_name is None:
        model_name = settings.PARSER_MODEL_NAME

    url = f"{settings.OLLAMA_BASE_URL.rstrip('/')}/api/generate"
    payload = {
        "model": model_name,
        "prompt": build_prompt(mode),
        "images": [base64.b64encode(image_bytes).decode("ascii")],
        "stream": False,  # request a single response object, not a token stream
    }

    logger.info(f"Calling Ollama: {url}, model: {model_name}, mode: {mode}")

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
            data = resp.json()
        logger.debug(f"Ollama response: {data.get('response', '')[:100]}...")
        return data
    except httpx.HTTPError as e:
        logger.error(f"Ollama HTTP error: {e}")
        raise RuntimeError(f"Ollama API error: {e}") from e
    except Exception as e:
        logger.error(f"Ollama error: {e}", exc_info=True)
        raise RuntimeError(f"Failed to call Ollama: {e}") from e
|
||||
|
||||
|
||||
def parse_ollama_response(
    ollama_data: Dict[str, Any],
    mode: OutputMode
) -> tuple[str, Optional[Any]]:
    """Extract the text payload from an Ollama reply and, for JSON modes, decode it.

    Args:
        ollama_data: Decoded /api/generate response; the text lives under "response".
        mode: Output mode the request was made with.

    Returns:
        Tuple of (raw_text, parsed_json). parsed_json is the decoded JSON value
        (a dict for raw_json, typically a list for qa_pairs — hence the broad
        annotation) or None when the mode is not a JSON mode or decoding failed.
    """
    raw_text = ollama_data.get("response", "").strip()
    parsed_json: Optional[Any] = None

    if mode in (OutputMode.raw_json, OutputMode.qa_pairs):
        # Fix: models frequently wrap JSON in markdown code fences despite the
        # "JSON only" prompt, which previously made json.loads fail and silently
        # dropped the structured payload. Strip a leading ```/```json line and a
        # trailing ``` before decoding; raw_text is still returned unmodified.
        candidate = raw_text
        if candidate.startswith("```"):
            candidate = candidate.split("\n", 1)[1] if "\n" in candidate else ""
            stripped = candidate.rstrip()
            if stripped.endswith("```"):
                candidate = stripped[:-3]
        try:
            parsed_json = json.loads(candidate)
        except (json.JSONDecodeError, ValueError):
            logger.warning(f"Failed to parse response as JSON for mode {mode}")
            parsed_json = None

    return raw_text, parsed_json
|
||||
|
||||
@@ -23,5 +23,5 @@ python-dotenv>=1.0.1
|
||||
# Testing
|
||||
pytest>=7.4.0
|
||||
pytest-asyncio>=0.21.0
|
||||
httpx>=0.25.0 # For TestClient
|
||||
httpx>=0.25.0 # For TestClient and Ollama client
|
||||
|
||||
|
||||
Reference in New Issue
Block a user