Files
microdao-daarion/services/parser-service/app/runtime/ollama_client.py
Apple be22752590 feat: integrate dots.ocr native prompt modes and 2-stage qa_pairs pipeline
Prompt Modes Integration:
- Create local_runtime.py with DOTS_PROMPT_MAP
- Map OutputMode to native dots.ocr prompt modes (prompt_layout_all_en, prompt_ocr, etc.)
- Support dict_promptmode_to_prompt from dots.ocr with fallback prompts
- Add layout_only and region modes to OutputMode enum

2-Stage Q&A Pipeline:
- Create qa_builder.py for 2-stage qa_pairs generation
- Stage 1: PARSER (dots.ocr) → raw JSON via prompt_layout_all_en
- Stage 2: LLM (DAGI Router) → Q&A pairs via mode=qa_build
- Update endpoints.py to use 2-stage pipeline for qa_pairs mode
- Add ROUTER_BASE_URL and ROUTER_TIMEOUT to config

Updates:
- Update inference.py to use local_runtime with native prompts
- Update ollama_client.py to use same prompt map
- Add PROMPT_MODES.md documentation
2025-11-16 04:24:03 -08:00

152 lines
4.7 KiB
Python

"""
Ollama client for dots.ocr model
Alternative runtime using Ollama API
"""
import base64
import json
import logging
from typing import Dict, Any, Optional
from enum import Enum
import httpx
from app.core.config import settings
logger = logging.getLogger(__name__)
class OutputMode(str, Enum):
    """Output formats the parser service can produce for a document.

    Each member maps (in build_prompt) to a native dots.ocr prompt mode;
    str mixin keeps members JSON/query-string friendly.
    """
    raw_json = "raw_json"          # full layout + text as structured JSON
    markdown = "markdown"          # plain Markdown rendering of the document
    qa_pairs = "qa_pairs"          # stage 1 of the 2-stage Q&A pipeline (raw JSON first)
    chunks = "chunks"              # JSON layout, later split into chunks downstream
    layout_only = "layout_only"    # bounding boxes / block types, no text content
    region = "region"              # targeted parsing of a specific document region
def build_prompt(mode: OutputMode) -> str:
    """Return the instruction prompt sent to the model for *mode*.

    Mirrors the dots.ocr native prompt modes used by local_runtime
    (prompt_layout_all_en, prompt_ocr, prompt_layout_only_en,
    prompt_grounding_ocr). Unknown modes fall back to the full
    layout-plus-text prompt.
    """
    # Fallback prompt texts, one per dots.ocr prompt mode (same wording
    # as local_runtime).
    layout_all_text = (
        "You are a document OCR and layout parser. "
        "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
        "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
        "Respond with JSON only, no explanations."
    )
    ocr_markdown_text = (
        "You are a document OCR and layout parser. "
        "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
        "Tables should be proper GitHub-flavored Markdown tables. "
        "Respond with Markdown as plain text."
    )
    layout_only_text = (
        "You are a document layout parser. "
        "Extract only the layout structure (bounding boxes, block types, reading order) "
        "without the text content. "
        "Respond with JSON containing only layout information (bbox, type, reading_order)."
    )
    grounding_text = (
        "You are a document OCR assistant for targeted region parsing. "
        "Extract text and layout for the specified region of the document. "
        "Respond with JSON containing the parsed content for the region."
    )
    # Single direct mode -> prompt-text table (collapses the two-step
    # mode -> prompt_key -> text lookup; behavior is unchanged).
    prompts_by_mode = {
        OutputMode.raw_json: layout_all_text,
        OutputMode.markdown: ocr_markdown_text,
        OutputMode.qa_pairs: layout_all_text,  # Full JSON, then 2nd step LLM
        OutputMode.chunks: layout_all_text,
        OutputMode.layout_only: layout_only_text,
        OutputMode.region: grounding_text,
    }
    return prompts_by_mode.get(mode, layout_all_text)
async def call_ollama_vision(
    image_bytes: bytes,
    mode: OutputMode,
    model_name: Optional[str] = None,
    timeout: float = 120.0,
) -> Dict[str, Any]:
    """
    Call the Ollama vision API (/api/generate) with a single page image.

    Args:
        image_bytes: PNG image bytes
        mode: Output mode (selects the dots.ocr prompt via build_prompt)
        model_name: Model name (defaults to settings.PARSER_MODEL_NAME)
        timeout: Request timeout in seconds (previously hard-coded to 120)

    Returns:
        Ollama response dictionary (raw JSON body of the API response)

    Raises:
        RuntimeError: on any HTTP or unexpected failure; the original
            exception is chained as __cause__.
    """
    model_name = model_name or settings.PARSER_MODEL_NAME
    # Ollama expects images as base64 strings in the "images" array.
    img_b64 = base64.b64encode(image_bytes).decode("ascii")
    prompt = build_prompt(mode)
    body = {
        "model": model_name,
        "prompt": prompt,
        "images": [img_b64],
        # Non-streaming: one complete JSON response per request.
        "stream": False,
    }
    url = f"{settings.OLLAMA_BASE_URL.rstrip('/')}/api/generate"
    # Lazy %-style args avoid formatting cost when the level is disabled.
    logger.info("Calling Ollama: %s, model: %s, mode: %s", url, model_name, mode)
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(url, json=body)
            resp.raise_for_status()
            data = resp.json()
            logger.debug("Ollama response: %s...", data.get("response", "")[:100])
            return data
    except httpx.HTTPError as e:
        logger.error(f"Ollama HTTP error: {e}")
        raise RuntimeError(f"Ollama API error: {e}") from e
    except Exception as e:
        logger.error(f"Ollama error: {e}", exc_info=True)
        raise RuntimeError(f"Failed to call Ollama: {e}") from e
def parse_ollama_response(
    ollama_data: Dict[str, Any],
    mode: OutputMode
) -> tuple[str, Optional[Dict[str, Any]]]:
    """
    Extract the model output from an Ollama API response.

    Args:
        ollama_data: Response from Ollama API
        mode: Output mode

    Returns:
        Tuple of (raw_text, parsed_json). parsed_json is populated only
        for JSON-producing modes (raw_json, qa_pairs) and is None when
        the text is not valid JSON.
    """
    text = ollama_data.get("response", "").strip()
    structured: Optional[Dict[str, Any]] = None
    # Only these modes are expected to return machine-readable JSON.
    wants_json = mode in (OutputMode.raw_json, OutputMode.qa_pairs)
    if wants_json:
        try:
            structured = json.loads(text)
        except (json.JSONDecodeError, ValueError):
            # Leave structured as None; caller falls back to raw text.
            logger.warning(f"Failed to parse response as JSON for mode {mode}")
    return text, structured