- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API) - Node Worker: add NATS subjects for stt/tts/image (stubs ready) - Node Worker: remove SWAPPER_URL dependency from config - Router: vision calls go directly to Ollama /api/generate with images - Router: local LLM calls go directly to Ollama /api/generate - Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag - Router: /v1/models now uses NCS global capabilities pool - NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled) - Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits - docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on - docker-compose.node2-sofiia.yml: same changes for NODE2 Swapper service still runs but is NOT in the critical inference path. Source of truth for models is now NCS -> Ollama /api/tags. Made-with: Cursor
50 lines
1.4 KiB
Python
50 lines
1.4 KiB
Python
"""Ollama vision provider — direct Ollama API with images, no Swapper dependency."""
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import httpx
|
|
|
|
from config import OLLAMA_BASE_URL, DEFAULT_VISION
|
|
|
|
logger = logging.getLogger("provider.ollama_vision")
|
|
|
|
|
|
async def infer(
    images: Optional[List[str]] = None,
    prompt: str = "",
    model: str = "",
    system: str = "",
    max_tokens: int = 1024,
    temperature: float = 0.2,
    timeout_s: float = 60.0,
) -> Dict[str, Any]:
    """Run a vision inference directly against Ollama's /api/generate endpoint.

    Args:
        images: Base64-encoded images; any ``data:...;base64,`` URL prefix
            is stripped before sending.
        prompt: User prompt; defaults to "Describe this image." when empty.
        model: Ollama model name; falls back to DEFAULT_VISION when empty.
        system: Optional system prompt, forwarded as Ollama's "system" field.
        max_tokens: Generation cap, passed as Ollama's "num_predict" option.
        temperature: Sampling temperature.
        timeout_s: Overall HTTP timeout in seconds.

    Returns:
        Dict with "text", "model", "provider" and "eval_count" keys.

    Raises:
        httpx.HTTPStatusError: If Ollama responds with a non-2xx status.
    """
    chosen_model = model or DEFAULT_VISION

    request_body: Dict[str, Any] = {
        "model": chosen_model,
        "prompt": prompt or "Describe this image.",
        "stream": False,
        "options": {"num_predict": max_tokens, "temperature": temperature},
    }

    if images:
        # Ollama takes raw base64, so drop a leading data-URL header if present.
        request_body["images"] = [
            img.split(",", 1)[1] if img.startswith("data:") and "," in img else img
            for img in images
        ]
    if system:
        request_body["system"] = system

    async with httpx.AsyncClient(timeout=timeout_s) as client:
        response = await client.post(
            f"{OLLAMA_BASE_URL}/api/generate", json=request_body
        )
        response.raise_for_status()
        reply = response.json()

    return {
        "text": reply.get("response", ""),
        "model": chosen_model,
        "provider": "ollama_vision",
        "eval_count": reply.get("eval_count", 0),
    }