New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
768 lines
32 KiB
Python
768 lines
32 KiB
Python
"""Sofiia Smart Auto-Router — Cursor-style model selection for Sofiia agent.
|
||
|
||
Classifies incoming prompt by task type and selects the best available model,
|
||
balancing capability, speed, cost, and provider availability.
|
||
|
||
Full model catalog includes:
|
||
- Cloud: Anthropic Claude, xAI Grok, DeepSeek, Mistral AI, GLM-5 (Z.AI)
|
||
- Local Ollama (NODA2/MacBook): qwen3.5:35b-a3b, qwen3:14b, glm-4.7-flash:32k,
|
||
deepseek-r1:70b, deepseek-coder:33b, gemma3, mistral-nemo:12b,
|
||
starcoder2:3b, phi3, llava:13b
|
||
|
||
Task taxonomy (inspired by Cursor Auto mode):
|
||
code_gen, code_review, code_debug, code_refactor,
|
||
architecture, devops, security, analysis, quick_answer, creative, reasoning,
|
||
math_code, vision, chatbot
|
||
"""
|
||
from __future__ import annotations

# Standard library only — the router has no hard third-party dependency
# (httpx is imported lazily and optionally inside refresh_ollama_models_async).
import logging
import os
import re
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)
# ── Task taxonomy ──────────────────────────────────────────────────────────────
|
||
# Each pattern group uses multi-word or context-aware patterns to reduce false
|
||
# positives. Single common words (system, design, check, list, graph, tree) are
|
||
# avoided unless paired with a qualifier.
|
||
|
||
TASK_PATTERNS: List[Tuple[str, List[str], float]] = [
|
||
# (task_type, patterns, base_weight) — weight scales final score
|
||
("code_gen", [
|
||
r"\bнапиши\s+(функці|код|клас|скрипт|модуль|endpoint|api)",
|
||
r"\bреалізуй\b", r"\bcreate\s+(function|class|module|endpoint|api|component)",
|
||
r"\bimplement\b", r"\bgenerate\s+code\b", r"\bзгенеруй\s+код\b",
|
||
r"\bфункці[юя]\s+для\b", r"\bклас\s+для\b", r"\bнапиши\s+код\b",
|
||
r"\bwrite\s+a?\s*(function|class|module|script|endpoint)\b",
|
||
r"\bcontroller\b", r"\bendpoint\s+(для|for)\b",
|
||
], 1.0),
|
||
("code_debug", [
|
||
r"\bвиправ\b", r"\bбаг\b", r"\bпомилк[аи]\b", r"\btraceback\b",
|
||
r"\bexception\b", r"\bfailed\b", r"\bcrash(es|ed)?\b", r"\bне\s+працю",
|
||
r"\bдебаг\b", r"\bdebug\b", r"\bfix\s+(the\s+)?(bug|error|issue|crash)\b",
|
||
r"\bsyntax\s*error\b", r"\btype\s*error\b", r"\battribute\s*error\b",
|
||
r"\bruntime\s*error\b", r"\bvalue\s*error\b",
|
||
], 1.0),
|
||
("code_review", [
|
||
r"\breview\s+(the\s+)?(code|pr|pull\s+request|diff)\b",
|
||
r"\bаудит\s+(код|сервіс|систем)\b", r"\baudit\s+(code|service)\b",
|
||
r"\bперевір\w*\s+(код|якість)\b", r"\bcode\s+quality\b",
|
||
r"\bcode\s+review\b", r"\brev'ю\b",
|
||
], 1.0),
|
||
("code_refactor", [
|
||
r"\bрефактор\b", r"\brefactor\b",
|
||
r"\bоптимізу[йї]\s+(код|функці|клас)\b", r"\boptimize\s+(the\s+)?(code|function|class)\b",
|
||
r"\bclean\s+up\s+(the\s+)?code\b", r"\bpolish\s+(the\s+)?code\b",
|
||
r"\bspeed\s+up\b", r"\bimprove\s+(the\s+)?code\b",
|
||
], 1.0),
|
||
("architecture", [
|
||
r"\bархітектур\w+\b", r"\barchitecture\b",
|
||
r"\bспроєктуй\b", r"\bsystem\s+design\b",
|
||
r"\bmicroservice\s+(architect|design|pattern)\b",
|
||
r"\bdatabase\s+design\b", r"\bapi\s+design\b",
|
||
r"\bscalab(le|ility)\b", r"\bscaling\s+strateg\b",
|
||
r"\bdesign\s+pattern\b", r"\bsystem\s+structure\b",
|
||
], 1.0),
|
||
("devops", [
|
||
r"\bdeploy\b", r"\bdocker\s*(file|compose|-compose|ize)?\b",
|
||
r"\bkubernetes\b", r"\bk8s\b", r"\bci[\s/]cd\b",
|
||
r"\bpipeline\b", r"\bnginx\b", r"\bcaddy\b",
|
||
r"\bнода\d?\b", r"\bnoda\d?\b", r"\bcontainer\s+(start|stop|restart|build|image)\b",
|
||
r"\bдеплой\b", r"\bssh\s+(to|into|root|connect)\b",
|
||
r"\bhelm\b", r"\bterraform\b", r"\binfrastructure\b",
|
||
r"\bdocker\s+compose\s+up\b",
|
||
], 1.0),
|
||
("security", [
|
||
r"\bvulnerability\b", r"\bCVE-\d+\b", r"\bsecurity\s+(audit|review|issue|scan)\b",
|
||
r"\bauth(entication|orization)\b", r"\bencrypt(ion)?\b",
|
||
r"\bRBAC\b", r"\bpermission\s+(model|system)\b",
|
||
r"\bбезпек\w+\b", r"\bpentest\b", r"\b(sql|xss|csrf)\s*injection\b",
|
||
r"\bthreat\s+model\b",
|
||
], 1.0),
|
||
("reasoning", [
|
||
r"\bчому\s+\w+\b", r"\bwhy\s+(does|is|do|did|should|would)\b",
|
||
r"\bpros\s+and\s+cons\b", r"\btrade[\s-]?off\b",
|
||
r"\bпорівняй\b", r"\bcompare\s+\w+\s+(vs|and|with|to)\b",
|
||
r"\bяк\s+краще\b", r"\bперевага\b", r"\bнедолік\b",
|
||
r"\bdecision\s+(between|about)\b",
|
||
r"\bversus\b", r"\b\w+\s+vs\s+\w+\b",
|
||
], 1.0),
|
||
("analysis", [
|
||
r"\bпроаналізуй\b", r"\bаналіз\s+\w+\b",
|
||
r"\banalyze\s+\w+\b", r"\binvestigate\b",
|
||
r"\bexplain\s+(how|why|what)\b", r"\bsummariz(e|ation)\b",
|
||
r"\bдослідж\b", r"\bпоясни\s+(як|чому|що)\b",
|
||
r"\bhow\s+does\s+\w+\s+work\b",
|
||
], 1.0),
|
||
("creative", [
|
||
r"\bнапиши\s+(текст|стат|пост|лист|опис)\b",
|
||
r"\bwrite\s+a\s+(blog|article|post|email|description|letter)\b",
|
||
r"\bdraft\s+(a\s+)?(doc|email|message|proposal)\b",
|
||
r"\breadme\b", r"\bchangelog\b", r"\bdocumentation\b",
|
||
], 1.0),
|
||
("quick_answer", [
|
||
r"\bщо\s+таке\b", r"\bwhat\s+is\s+(a|an|the)?\b",
|
||
r"\bhow\s+to\s+\w+\b", r"\bdefinition\s+of\b",
|
||
r"\bшвидко\b", r"\bсинтаксис\s+\w+\b",
|
||
r"\bgive\s+me\s+an?\s+example\b", r"\bexample\s+of\b",
|
||
], 0.9),
|
||
("vision", [
|
||
r"\bзображен\w+\b", r"\bфото\b", r"\bimage\s+(analysis|recognition|detect)\b",
|
||
r"\bскріншот\b", r"\bscreenshot\b",
|
||
r"\bвізуальн\w+\s+аналіз\b", r"\bвідео\s+(аналіз|розпізна)\b",
|
||
], 1.0),
|
||
("math_code", [
|
||
r"\bалгоритм\s+\w+\b", r"\balgorithm\s+(for|to)\b",
|
||
r"\bсортуван\w+\b", r"\bsort(ing)?\s+algorithm\b",
|
||
r"\bdynamic\s+programming\b", r"\bgraph\s+(algorithm|traversal|search)\b",
|
||
r"\bmatrix\s+(mult|inver|decomp)\b",
|
||
r"\bcalculate\s+\w+\b", r"\bcompute\s+\w+\b",
|
||
r"\bformula\s+(for|to)\b", r"\bДейкстр\b", r"\bDijkstra\b",
|
||
], 1.0),
|
||
# Chatbot / conversational — greetings, small talk, acknowledgements
|
||
("chatbot", [
|
||
r"^(привіт|вітаю|добрий|доброго|hi|hello|hey)\b",
|
||
r"^(дякую|спасибі|thank|thanks)\b",
|
||
r"^(ок|добре|зрозумів|зрозуміло|so?|ok|yes|no|ні|так)\s*[,!.]?\s*$",
|
||
r"\bяк\s+(справи|діла|ся маєш)\b", r"\bhow\s+are\s+you\b",
|
||
], 0.8),
|
||
]
|
||
|
||
# Pre-compile patterns once for performance
|
||
_COMPILED_PATTERNS: Optional[List[Tuple[str, List[re.Pattern], float]]] = None
|
||
|
||
|
||
def _get_compiled_patterns() -> List[Tuple[str, List[re.Pattern], float]]:
|
||
global _COMPILED_PATTERNS
|
||
if _COMPILED_PATTERNS is None:
|
||
_COMPILED_PATTERNS = [
|
||
(task_type, [re.compile(p, re.IGNORECASE) for p in patterns], weight)
|
||
for task_type, patterns, weight in TASK_PATTERNS
|
||
]
|
||
return _COMPILED_PATTERNS
|
||
|
||
|
||
# ── Model catalog ──────────────────────────────────────────────────────────────

@dataclass
class ModelSpec:
    """Static description of one routable model (cloud API or local Ollama)."""

    profile_name: str  # unique key referenced by TASK_MODEL_PRIORITY lists
    provider: str  # provider key, e.g. "anthropic", "grok", "ollama"
    model_id: str  # provider-side model identifier
    api_key_env: str = ""  # env var holding the API key (cloud models only)
    strengths: List[str] = field(default_factory=list)  # task types it is good at
    cost_tier: int = 1  # 0=free(local), 1=cheap, 2=mid, 3=expensive
    speed_tier: int = 1  # 1=fast, 2=medium, 3=slow
    context_k: int = 8  # context window in thousands
    local: bool = False  # True when served by local Ollama
    max_tokens: int = 4096
    vram_gb: float = 0.0  # approximate VRAM footprint for local models
    description: str = ""

    @property
    def available(self) -> bool:
        """Whether the model can be called right now.

        Local models are checked against the cached Ollama tag list; cloud
        models only need their API key env var to be set to a non-blank value.
        """
        if self.local:
            return _is_ollama_model_available(self.model_id)
        api_key = os.getenv(self.api_key_env, "")
        return bool(api_key.strip())

    @property
    def has_credits(self) -> bool:
        """Whether the provider's budget gate still allows calls."""
        return ProviderBudget.is_available(self.provider)
# ── Ollama model availability cache ───────────────────────────────────────────
|
||
|
||
_ollama_available_models: Optional[List[str]] = None
|
||
_ollama_cache_ts: float = 0.0
|
||
_OLLAMA_CACHE_TTL = 60.0
|
||
|
||
|
||
def _is_ollama_model_available(model_id: str) -> bool:
|
||
global _ollama_available_models, _ollama_cache_ts
|
||
now = time.time()
|
||
if _ollama_available_models is None or (now - _ollama_cache_ts) > _OLLAMA_CACHE_TTL:
|
||
_refresh_ollama_models_sync()
|
||
if _ollama_available_models is None:
|
||
return False
|
||
model_lower = model_id.lower()
|
||
model_base = model_lower.split(":")[0]
|
||
for m in _ollama_available_models:
|
||
ml = m.lower()
|
||
if ml == model_lower or ml.split(":")[0] == model_base:
|
||
return True
|
||
return False
|
||
|
||
|
||
def _refresh_ollama_models_sync() -> None:
|
||
global _ollama_available_models, _ollama_cache_ts
|
||
import urllib.request
|
||
import json as _json
|
||
ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
|
||
try:
|
||
with urllib.request.urlopen(f"{ollama_url}/api/tags", timeout=2) as resp:
|
||
data = _json.loads(resp.read())
|
||
_ollama_available_models = [m["name"] for m in data.get("models", [])]
|
||
_ollama_cache_ts = time.time()
|
||
except Exception:
|
||
_ollama_available_models = []
|
||
_ollama_cache_ts = time.time()
|
||
|
||
|
||
async def refresh_ollama_models_async() -> List[str]:
|
||
global _ollama_available_models, _ollama_cache_ts
|
||
try:
|
||
import httpx
|
||
ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
|
||
async with httpx.AsyncClient(timeout=2.0) as client:
|
||
resp = await client.get(f"{ollama_url}/api/tags")
|
||
data = resp.json()
|
||
_ollama_available_models = [m["name"] for m in data.get("models", [])]
|
||
_ollama_cache_ts = time.time()
|
||
return _ollama_available_models
|
||
except Exception:
|
||
_ollama_available_models = _ollama_available_models or []
|
||
return _ollama_available_models
|
||
|
||
|
||
# ── Full model catalog ─────────────────────────────────────────────────────────
# One ModelSpec per routable model. Order here has no routing meaning;
# routing order comes from TASK_MODEL_PRIORITY.

SOFIIA_MODEL_CATALOG: List[ModelSpec] = [

    # Anthropic Claude ────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_claude_sonnet",
        provider="anthropic",
        model_id="claude-sonnet-4-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "architecture", "security", "reasoning"],
        cost_tier=2,
        speed_tier=2,
        context_k=200,
        max_tokens=8192,
        description="Claude Sonnet 4.5 — найкращий для коду та архітектури",
    ),
    ModelSpec(
        profile_name="cloud_claude_haiku",
        provider="anthropic",
        model_id="claude-haiku-3-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["quick_answer", "code_review", "creative", "analysis", "chatbot"],
        cost_tier=1,
        speed_tier=1,
        context_k=200,
        max_tokens=4096,
        description="Claude Haiku 3.5 — швидкий та дешевий",
    ),

    # xAI Grok ────────────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_grok",
        provider="grok",
        model_id="grok-4-1-fast-reasoning",
        api_key_env="GROK_API_KEY",
        strengths=["reasoning", "architecture", "analysis", "code_gen"],
        cost_tier=2,
        speed_tier=1,
        context_k=2000,
        max_tokens=8192,
        description="Grok 4.1 Fast — 2M контекст, кращий для reasoning",
    ),

    # DeepSeek API ────────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_deepseek",
        provider="deepseek",
        model_id="deepseek-chat",
        api_key_env="DEEPSEEK_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "devops", "quick_answer"],
        cost_tier=1,
        speed_tier=2,
        context_k=64,
        max_tokens=4096,
        description="DeepSeek Chat — дешевий і добре знає код/devops",
    ),

    # GLM-5 / Z.AI (API) ──────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_glm5",
        provider="glm",
        model_id="glm-4-plus",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "analysis", "code_gen", "chatbot"],
        cost_tier=1,
        speed_tier=1,
        context_k=128,
        max_tokens=4096,
        description="GLM-4 Plus (Z.AI) — швидкий, дешевий, гарно знає українську/CJK",
    ),
    ModelSpec(
        profile_name="cloud_glm5_flash",
        provider="glm",
        model_id="glm-4-flash",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "chatbot"],
        cost_tier=0,
        speed_tier=1,
        context_k=128,
        max_tokens=2048,
        description="GLM-4 Flash (Z.AI) — безкоштовний, найшвидший",
    ),

    # Mistral AI (API) ────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_mistral",
        provider="mistral",
        model_id="mistral-large-latest",
        api_key_env="MISTRAL_API_KEY",
        strengths=["analysis", "creative", "reasoning", "architecture"],
        cost_tier=2,
        speed_tier=2,
        context_k=128,
        max_tokens=4096,
        description="Mistral Large — добрий для аналізу та creative",
    ),

    # Local: qwen3.5:35b-a3b (flagship) ───────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen35_35b",
        provider="ollama",
        model_id="qwen3.5:35b-a3b",
        strengths=["code_gen", "code_debug", "code_refactor", "reasoning", "architecture",
                   "analysis", "devops", "security", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=4096,
        local=True,
        vram_gb=24.0,
        description="Qwen3.5 35B MoE (NODA2) — флагман локально, якість ≈ cloud",
    ),

    # Local: qwen3:14b ────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen3_14b",
        provider="ollama",
        model_id="qwen3:14b",
        strengths=["code_gen", "code_debug", "quick_answer", "devops", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=2048,
        local=True,
        vram_gb=10.0,
        description="Qwen3 14B (NODA2) — швидкий локальний загальний",
    ),

    # Local: glm-4.7-flash:32k ────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_glm47_32k",
        provider="ollama",
        model_id="glm-4.7-flash:32k",
        strengths=["quick_answer", "creative", "analysis", "code_review", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=2048,
        local=True,
        vram_gb=20.0,
        description="GLM-4.7 Flash 32K (NODA2) — локальний GLM, великий контекст",
    ),

    # Local: deepseek-r1:70b ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_r1_70b",
        provider="ollama",
        model_id="deepseek-r1:70b",
        strengths=["reasoning", "math_code", "architecture", "analysis"],
        cost_tier=0,
        speed_tier=3,
        context_k=64,
        max_tokens=4096,
        local=True,
        vram_gb=48.0,
        description="DeepSeek-R1 70B (NODA2) — локальний reasoning як o1",
    ),

    # Local: deepseek-coder:33b ───────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_coder_33b",
        provider="ollama",
        model_id="deepseek-coder:33b",
        strengths=["code_gen", "code_debug", "code_refactor", "math_code"],
        cost_tier=0,
        speed_tier=2,
        context_k=16,
        max_tokens=2048,
        local=True,
        vram_gb=20.0,
        description="DeepSeek Coder 33B (NODA2) — спеціаліст по коду",
    ),

    # Local: gemma3:latest ────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gemma3",
        provider="ollama",
        model_id="gemma3:latest",
        strengths=["quick_answer", "analysis", "creative", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=8,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="Gemma3 (NODA2) — Google's ефективна модель",
    ),

    # Local: mistral-nemo:12b ─────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_mistral_nemo",
        provider="ollama",
        model_id="mistral-nemo:12b",
        strengths=["creative", "quick_answer", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=128,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="Mistral Nemo 12B (NODA2) — 128K контекст локально",
    ),

    # Local: starcoder2:3b ────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_starcoder2",
        provider="ollama",
        model_id="starcoder2:3b",
        strengths=["code_gen", "code_review"],
        cost_tier=0,
        speed_tier=1,
        context_k=16,
        max_tokens=2048,
        local=True,
        vram_gb=2.0,
        description="StarCoder2 3B (NODA2) — мікро-модель для code completion",
    ),

    # Local: phi3:latest ──────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_phi3",
        provider="ollama",
        model_id="phi3:latest",
        strengths=["quick_answer", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=1,
        context_k=128,
        max_tokens=2048,
        local=True,
        vram_gb=4.0,
        description="Phi-3 (NODA2) — Microsoft мала ефективна модель",
    ),

    # Local: llava:13b (vision) ───────────────────────────────────────────────
    ModelSpec(
        profile_name="local_llava_13b",
        provider="ollama",
        model_id="llava:13b",
        strengths=["vision"],
        cost_tier=0,
        speed_tier=2,
        context_k=4,
        max_tokens=2048,
        local=True,
        vram_gb=10.0,
        description="LLaVA 13B (NODA2) — vision модель для зображень",
    ),

    # Local: gpt-oss:latest ───────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gpt_oss",
        provider="ollama",
        model_id="gpt-oss:latest",
        strengths=["code_gen", "quick_answer"],
        cost_tier=0,
        speed_tier=2,
        context_k=8,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="GPT-OSS (NODA2) — відкрита OSS GPT-like модель",
    ),
]
# ── Task → preferred model matrix ─────────────────────────────────────────────
# Principle: local-first for tasks where local quality is sufficient.
# Cloud only when the task genuinely needs it (complex code, deep reasoning,
# very long context, security audits).
#
# qwen3.5:35b-a3b is the flagship local — MoE with cloud-level quality —
# and is preferred over cloud APIs for most routine tasks.

TASK_MODEL_PRIORITY: Dict[str, List[str]] = {
    "code_gen": [
        "local_qwen35_35b",
        "cloud_claude_sonnet",
        "local_deepseek_coder_33b",
        "cloud_deepseek",
        "local_qwen3_14b",
        "cloud_grok",
    ],
    "code_debug": [
        "local_qwen35_35b",
        "local_deepseek_coder_33b",
        "cloud_claude_sonnet",
        "cloud_deepseek",
        "local_qwen3_14b",
    ],
    "code_review": [
        "local_qwen35_35b",
        "cloud_claude_haiku",
        "local_deepseek_coder_33b",
        "cloud_claude_sonnet",
        "cloud_deepseek",
    ],
    "code_refactor": [
        "local_qwen35_35b",
        "local_deepseek_coder_33b",
        "cloud_claude_sonnet",
        "cloud_deepseek",
        "local_qwen3_14b",
    ],
    "math_code": [
        "local_deepseek_r1_70b",
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_claude_sonnet",
        "local_deepseek_coder_33b",
    ],
    "architecture": [
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_claude_sonnet",
        "local_deepseek_r1_70b",
        "cloud_mistral",
    ],
    "devops": [
        "local_qwen35_35b",
        "local_qwen3_14b",
        "cloud_deepseek",
        "cloud_claude_sonnet",
        "local_glm47_32k",
    ],
    # Security reviews go cloud-first: quality matters most here.
    "security": [
        "cloud_claude_sonnet",
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_mistral",
    ],
    "reasoning": [
        "local_deepseek_r1_70b",
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_claude_sonnet",
        "cloud_mistral",
    ],
    "analysis": [
        "local_qwen35_35b",
        "local_glm47_32k",
        "cloud_grok",
        "cloud_claude_haiku",
        "local_mistral_nemo",
        "cloud_mistral",
    ],
    "creative": [
        "local_qwen35_35b",
        "local_mistral_nemo",
        "cloud_claude_haiku",
        "local_glm47_32k",
        "cloud_mistral",
    ],
    "quick_answer": [
        "local_qwen3_14b",
        "local_qwen35_35b",
        "local_phi3",
        "local_gemma3",
        "cloud_deepseek",
        "cloud_glm5_flash",
    ],
    "chatbot": [
        "local_qwen3_14b",
        "local_qwen35_35b",
        "local_gemma3",
        "local_phi3",
        "local_mistral_nemo",
    ],
    "vision": [
        "local_llava_13b",
    ],
    # Default ordering when classification found no signal.
    "unknown": [
        "local_qwen35_35b",
        "local_qwen3_14b",
        "cloud_claude_sonnet",
        "cloud_grok",
        "cloud_deepseek",
    ],
}
# ── Budget integration ─────────────────────────────────────────────────────────

class ProviderBudget:
    """In-memory budget gate: marks providers exhausted until TTL expires."""

    # provider name -> timestamp when it was marked exhausted
    _exhausted: Dict[str, float] = {}
    # seconds a provider stays blocked after being marked
    _exhausted_ttl: int = 3600

    @classmethod
    def mark_exhausted(cls, provider: str) -> None:
        """Record that *provider* just ran out of budget."""
        cls._exhausted[provider] = time.time()
        logger.warning("💸 Provider %s marked as budget-exhausted", provider)

    @classmethod
    def is_available(cls, provider: str) -> bool:
        """True unless *provider* was marked exhausted within the TTL window."""
        marked_at = cls._exhausted.get(provider)
        if marked_at is None:
            return True
        if time.time() - marked_at > cls._exhausted_ttl:
            # Mark has expired — drop it and allow the provider again.
            cls._exhausted.pop(provider, None)
            return True
        return False

    @classmethod
    def reset(cls, provider: str) -> None:
        """Manually clear the exhausted mark for *provider*."""
        cls._exhausted.pop(provider, None)
# ── Task classification ────────────────────────────────────────────────────────

@dataclass
class ClassificationResult:
    """Outcome of classifying one prompt against the task taxonomy."""

    task_type: str  # winning task type (or "unknown" / "chatbot" fallbacks)
    confidence: float  # 0..1 confidence in the winner
    all_scores: Dict[str, float]  # top raw scores, highest first
    ambiguous: bool = False  # True when the runner-up scored close to the winner
    runner_up: Optional[str] = None  # second-best task type, when ambiguous
def classify_task(prompt: str, context_len: int = 0) -> Tuple[str, float]:
    """Classify prompt into a task type. Returns (task_type, confidence).

    Thin convenience wrapper over classify_task_detailed for callers that
    do not care about ambiguity or per-task scores.
    """
    detailed = classify_task_detailed(prompt, context_len)
    return detailed.task_type, detailed.confidence
def classify_task_detailed(prompt: str, context_len: int = 0) -> ClassificationResult:
    """Detailed classification with ambiguity detection and all scores.

    Scores every task type by the fraction of its patterns that match,
    scaled by the task's base weight. Empty prompts classify as "chatbot";
    prompts matching nothing classify as "unknown".
    """
    if not prompt or not prompt.strip():
        return ClassificationResult("chatbot", 0.5, {}, ambiguous=False)

    text = prompt.strip()
    scores: Dict[str, float] = {}
    for task_name, regexes, weight in _get_compiled_patterns():
        matched = sum(1 for rx in regexes if rx.search(text))
        if matched:
            scores[task_name] = (matched / len(regexes)) * weight

    if not scores:
        return ClassificationResult("unknown", 0.3, {}, ambiguous=False)

    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best_task, best_score = ranked[0]
    confidence = min(best_score * 10, 1.0)

    # Penalize confidence for very short prompts (fewer signals).
    word_count = len(text.split())
    if word_count <= 3:
        confidence *= 0.6
    elif word_count <= 8:
        confidence *= 0.85

    # Detect ambiguity: second place within 30% of the best score.
    ambiguous = False
    runner_up = None
    if len(ranked) >= 2:
        second_score = ranked[1][1]
        if second_score > 0 and second_score / best_score > 0.7:
            ambiguous = True
            runner_up = ranked[1][0]

    # Long conversations get a confidence floor (influences downstream
    # scoring, not the classification itself).
    if context_len > 50:
        confidence = max(confidence, 0.5)

    return ClassificationResult(
        task_type=best_task,
        confidence=round(confidence, 3),
        all_scores={k: round(v, 4) for k, v in ranked[:5]},
        ambiguous=ambiguous,
        runner_up=runner_up,
    )
def _prompt_complexity(prompt: str) -> str:
|
||
"""Estimate prompt complexity: simple | medium | complex"""
|
||
words = len(prompt.split())
|
||
lines = prompt.count("\n")
|
||
code_blocks = prompt.count("```")
|
||
if words < 20 and lines < 3 and code_blocks == 0:
|
||
return "simple"
|
||
if words > 200 or code_blocks >= 2 or lines > 20:
|
||
return "complex"
|
||
return "medium"
|
||
|
||
|
||
# ── Main selection function ────────────────────────────────────────────────────

@dataclass
class AutoRouteResult:
    """Result of auto model selection, including the reasoning trail."""

    profile_name: str  # chosen catalog profile key
    model_id: str  # provider-side model identifier
    provider: str  # provider key (anthropic / grok / ollama / ...)
    task_type: str  # classified task type
    confidence: float  # classification confidence, 0..1
    complexity: str  # simple | medium | complex
    reason: str  # human-readable selection trail
    fallback_used: bool = False  # True when the top picks were unavailable
    all_candidates: List[str] = field(default_factory=list)  # ranked top-5 profiles
    ambiguous: bool = False  # classification ambiguity flag
    runner_up: Optional[str] = None  # runner-up task type, if ambiguous
    all_scores: Dict[str, float] = field(default_factory=dict)  # top raw scores
def select_model_auto(
    prompt: str,
    force_fast: bool = False,
    force_capable: bool = False,
    prefer_local: bool = False,
    prefer_cheap: bool = False,
    budget_aware: bool = True,
    context_messages_len: int = 0,
) -> AutoRouteResult:
    """Cursor-style auto model selection for Sofiia.

    Pipeline:
      1. Classify task type from the prompt (with ambiguity detection).
      2. Estimate complexity (simple/medium/complex).
      3. Apply modifiers (force_fast, force_capable, prefer_local, prefer_cheap).
      4. Score candidates from the task's priority list, factoring in
         availability, budget, speed, and cost (lower score wins).
      5. For long conversations, prefer large-context models.

    Args:
        prompt: The user prompt to route.
        force_fast: Reroute to quick_answer unless the task is code/math-heavy,
            and penalize slow models.
        force_capable: Nudge scoring toward large-context models.
        prefer_local: Restrict to local candidates when any exist, and
            penalize cloud models.
        prefer_cheap: Bias scoring toward cheaper cost tiers.
        budget_aware: Penalize providers marked budget-exhausted.
        context_messages_len: Number of prior messages in the conversation.

    Returns:
        AutoRouteResult describing the chosen profile and the reasoning trail.
    """
    classification = classify_task_detailed(prompt, context_messages_len)
    task_type = classification.task_type
    confidence = classification.confidence
    complexity = _prompt_complexity(prompt)

    # Modifier overrides: reroute to quick_answer when speed/cost is requested
    # and the task does not genuinely need a heavyweight model.
    effective_task = task_type
    if force_fast and task_type not in ("code_gen", "code_debug", "math_code"):
        effective_task = "quick_answer"
    if (prefer_cheap or complexity == "simple") and task_type in ("quick_answer", "creative", "chatbot"):
        effective_task = "quick_answer"

    priority_list = TASK_MODEL_PRIORITY.get(effective_task, TASK_MODEL_PRIORITY["unknown"])
    catalog_map = {m.profile_name: m for m in SOFIIA_MODEL_CATALOG}

    # Candidates restricted to profiles that actually exist in the catalog.
    candidates = [p for p in priority_list if p in catalog_map]
    if prefer_local:
        local_cands = [p for p in candidates if catalog_map[p].local]
        if local_cands:
            candidates = local_cands

    def _score(profile_name: str) -> float:
        """Penalty score for one candidate — lower wins."""
        spec = catalog_map[profile_name]
        score = 0.0

        # Soft hard-penalties: unavailable models / exhausted budgets sink to
        # the bottom but remain selectable when nothing better exists.
        if not spec.available:
            score += 1000
        if budget_aware and not spec.has_credits:
            score += 500

        # Priority-list position is the strongest signal.
        try:
            pos = priority_list.index(profile_name)
            score += pos * 20
        except ValueError:
            score += 200

        if prefer_local and not spec.local:
            score += 200
        if force_fast:
            score += spec.speed_tier * 15
        if prefer_cheap or prefer_local:
            score -= spec.cost_tier * 20
        else:
            score += spec.cost_tier * 2

        if force_capable:
            score -= spec.context_k / 100

        # Complex prompts need a reasonably large context window.
        if complexity == "complex" and spec.context_k < 32:
            score += 40

        # Long-conversation adjustment: bonus for large-context models,
        # penalty for small-context ones in very long threads.
        if context_messages_len > 30 and spec.context_k >= 128:
            score -= 15
        elif context_messages_len > 50 and spec.context_k < 32:
            score += 25

        return score

    # `candidates` is already filtered to catalog_map keys above, so the
    # previous redundant membership re-check here was removed.
    scored = sorted(candidates, key=_score)

    if not scored:
        # Priority list produced nothing — fall back to known local profiles.
        for fallback in ["local_qwen35_35b", "local_qwen3_14b", "local_phi3"]:
            if fallback in catalog_map:
                scored = [fallback]
                break

    best = scored[0] if scored else "local_qwen3_14b"
    spec = catalog_map.get(best)
    fallback_used = best not in priority_list[:2]

    reasons: List[str] = [f"task={task_type} ({confidence:.0%})", f"complexity={complexity}"]
    if classification.ambiguous:
        reasons.append(f"ambiguous (runner_up={classification.runner_up})")
    if force_fast:
        reasons.append("force_fast")
    if prefer_local:
        reasons.append("prefer_local")
    if prefer_cheap:
        reasons.append("prefer_cheap")
    if force_capable:
        reasons.append("force_capable")
    if context_messages_len > 30:
        reasons.append(f"long_conversation({context_messages_len})")
    if fallback_used:
        reasons.append("fallback (top unavailable)")

    return AutoRouteResult(
        profile_name=best,
        model_id=spec.model_id if spec else best,
        provider=spec.provider if spec else "unknown",
        task_type=task_type,
        confidence=confidence,
        complexity=complexity,
        reason=" | ".join(reasons),
        fallback_used=fallback_used,
        all_candidates=scored[:5],
        ambiguous=classification.ambiguous,
        runner_up=classification.runner_up,
        all_scores=classification.all_scores,
    )
def explain_selection(result: AutoRouteResult) -> str:
    """Human-readable explanation of model selection (for debug/UI)."""
    parts = [
        f"Auto-selected **{result.model_id}** ({result.provider})",
        f"Task: `{result.task_type}` | Complexity: `{result.complexity}` | "
        f"Confidence: {result.confidence:.0%}",
        f"Reason: {result.reason}",
    ]
    if result.ambiguous:
        parts.append(f"Ambiguous: runner-up was `{result.runner_up}`")
    if result.all_scores:
        # Show at most the three highest-scoring task types.
        shown = list(result.all_scores.items())[:3]
        parts.append("Scores: " + ", ".join(f"{k}={v:.3f}" for k, v in shown))
    return "\n".join(parts)
def get_full_catalog() -> List[Dict[str, Any]]:
    """Return full model catalog with availability status for dashboard."""

    def _as_row(spec: ModelSpec) -> Dict[str, Any]:
        # Flatten one spec plus its live availability/budget flags.
        return {
            "profile_name": spec.profile_name,
            "provider": spec.provider,
            "model_id": spec.model_id,
            "description": spec.description,
            "strengths": spec.strengths,
            "cost_tier": spec.cost_tier,
            "speed_tier": spec.speed_tier,
            "context_k": spec.context_k,
            "local": spec.local,
            "vram_gb": spec.vram_gb,
            "available": spec.available,
            "has_credits": spec.has_credits,
        }

    return [_as_row(spec) for spec in SOFIIA_MODEL_CATALOG]