# Prompt Modes Integration:
# - Create local_runtime.py with DOTS_PROMPT_MAP
# - Map OutputMode to native dots.ocr prompt modes (prompt_layout_all_en, prompt_ocr, etc.)
# - Support dict_promptmode_to_prompt from dots.ocr with fallback prompts
# - Add layout_only and region modes to OutputMode enum
#
# 2-Stage Q&A Pipeline:
# - Create qa_builder.py for 2-stage qa_pairs generation
# - Stage 1: PARSER (dots.ocr) -> raw JSON via prompt_layout_all_en
# - Stage 2: LLM (DAGI Router) -> Q&A pairs via mode=qa_build
# - Update endpoints.py to use 2-stage pipeline for qa_pairs mode
# - Add ROUTER_BASE_URL and ROUTER_TIMEOUT to config
#
# Updates:
# - Update inference.py to use local_runtime with native prompts
# - Update ollama_client.py to use same prompt map
# - Add PROMPT_MODES.md documentation
"""
|
|
Ollama client for dots.ocr model
|
|
Alternative runtime using Ollama API
|
|
"""
|
|
|
|
import base64
|
|
import json
|
|
import logging
|
|
from typing import Dict, Any, Optional
|
|
from enum import Enum
|
|
|
|
import httpx
|
|
|
|
from app.core.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class OutputMode(str, Enum):
    """Supported output modes, each mapped to a native dots.ocr prompt mode."""

    raw_json = "raw_json"        # full layout + text as JSON
    markdown = "markdown"        # plain OCR rendered as Markdown
    qa_pairs = "qa_pairs"        # stage 1 of the 2-stage Q&A pipeline (JSON first)
    chunks = "chunks"            # full JSON, chunked downstream
    layout_only = "layout_only"  # bounding boxes / block types, no text
    region = "region"            # grounded OCR for a specific region


# OutputMode -> dots.ocr native prompt-mode key (same mapping as local_runtime).
# Built once at import time instead of being rebuilt on every build_prompt() call.
_PROMPT_MODE_MAP: Dict[OutputMode, str] = {
    OutputMode.raw_json: "prompt_layout_all_en",
    OutputMode.markdown: "prompt_ocr",
    OutputMode.qa_pairs: "prompt_layout_all_en",  # Full JSON, then 2nd step LLM
    OutputMode.chunks: "prompt_layout_all_en",
    OutputMode.layout_only: "prompt_layout_only_en",
    OutputMode.region: "prompt_grounding_ocr",
}

# Fallback prompt text per dots.ocr prompt-mode key (same as local_runtime).
_FALLBACK_PROMPTS: Dict[str, str] = {
    "prompt_layout_all_en": (
        "You are a document OCR and layout parser. "
        "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
        "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
        "Respond with JSON only, no explanations."
    ),
    "prompt_ocr": (
        "You are a document OCR and layout parser. "
        "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
        "Tables should be proper GitHub-flavored Markdown tables. "
        "Respond with Markdown as plain text."
    ),
    "prompt_layout_only_en": (
        "You are a document layout parser. "
        "Extract only the layout structure (bounding boxes, block types, reading order) "
        "without the text content. "
        "Respond with JSON containing only layout information (bbox, type, reading_order)."
    ),
    "prompt_grounding_ocr": (
        "You are a document OCR assistant for targeted region parsing. "
        "Extract text and layout for the specified region of the document. "
        "Respond with JSON containing the parsed content for the region."
    ),
}


def build_prompt(mode: OutputMode) -> str:
    """
    Build prompt for Ollama based on output mode.

    Maps to dots.ocr native prompt modes and returns the corresponding
    fallback prompt text (same mapping and prompts as local_runtime).

    Args:
        mode: Desired output mode.

    Returns:
        Prompt string to send to the model. Unknown modes fall back to the
        full-layout prompt ("prompt_layout_all_en").
    """
    prompt_key = _PROMPT_MODE_MAP.get(mode, "prompt_layout_all_en")
    return _FALLBACK_PROMPTS.get(prompt_key, _FALLBACK_PROMPTS["prompt_layout_all_en"])
|
|
|
|
|
|
async def call_ollama_vision(
    image_bytes: bytes,
    mode: OutputMode,
    model_name: Optional[str] = None,
    *,
    timeout: float = 120.0,
) -> Dict[str, Any]:
    """
    Call Ollama vision API with image.

    Args:
        image_bytes: PNG image bytes
        mode: Output mode
        model_name: Model name (defaults to PARSER_MODEL_NAME)
        timeout: Request timeout in seconds (keyword-only; defaults to the
            previously hard-coded 120.0, so existing callers are unaffected)

    Returns:
        Ollama response dictionary

    Raises:
        RuntimeError: If the Ollama request fails for any reason.
    """
    model_name = model_name or settings.PARSER_MODEL_NAME

    # Ollama's /api/generate endpoint expects images as base64 strings.
    img_b64 = base64.b64encode(image_bytes).decode("ascii")
    prompt = build_prompt(mode)

    body = {
        "model": model_name,
        "prompt": prompt,
        "images": [img_b64],
        "stream": False,  # single JSON response instead of a token stream
    }

    url = f"{settings.OLLAMA_BASE_URL.rstrip('/')}/api/generate"

    # Lazy %-style args: formatting is skipped when the level is disabled.
    logger.info("Calling Ollama: %s, model: %s, mode: %s", url, model_name, mode)

    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(url, json=body)
            resp.raise_for_status()
            data = resp.json()

        logger.debug("Ollama response: %s...", data.get("response", "")[:100])
        return data

    except httpx.HTTPError as e:
        logger.error("Ollama HTTP error: %s", e)
        raise RuntimeError(f"Ollama API error: {e}") from e
    except Exception as e:
        logger.error("Ollama error: %s", e, exc_info=True)
        raise RuntimeError(f"Failed to call Ollama: {e}") from e
|
|
|
|
|
|
def parse_ollama_response(
    ollama_data: Dict[str, Any],
    mode: OutputMode
) -> tuple[str, Optional[Dict[str, Any]]]:
    """
    Parse Ollama response.

    Args:
        ollama_data: Response from Ollama API
        mode: Output mode

    Returns:
        Tuple of (raw_text, parsed_json) — parsed_json is None for
        non-JSON modes and when JSON decoding fails.
    """
    raw_text = ollama_data.get("response", "").strip()

    # Only the JSON-producing modes get a decode attempt; everything else
    # (markdown, etc.) is returned as plain text with no parsed payload.
    if mode not in (OutputMode.raw_json, OutputMode.qa_pairs):
        return raw_text, None

    try:
        return raw_text, json.loads(raw_text)
    except (json.JSONDecodeError, ValueError):
        logger.warning(f"Failed to parse response as JSON for mode {mode}")
        return raw_text, None
|
|
|