"""Sofiia Smart Auto-Router — Cursor-style model selection for Sofiia agent. Classifies incoming prompt by task type and selects the best available model, balancing capability, speed, cost, and provider availability. Full model catalog includes: - Cloud: Anthropic Claude, xAI Grok, DeepSeek, Mistral AI, GLM-5 (Z.AI) - Local Ollama (NODA2/MacBook): qwen3.5:35b-a3b, qwen3:14b, glm-4.7-flash:32k, deepseek-r1:70b, deepseek-coder:33b, gemma3, mistral-nemo:12b, starcoder2:3b, phi3, llava:13b Task taxonomy (inspired by Cursor Auto mode): code_gen, code_review, code_debug, code_refactor, architecture, devops, security, analysis, quick_answer, creative, reasoning, math_code, vision, chatbot """ from __future__ import annotations import logging import os import re import time from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple logger = logging.getLogger(__name__) # ── Task taxonomy ────────────────────────────────────────────────────────────── # Each pattern group uses multi-word or context-aware patterns to reduce false # positives. Single common words (system, design, check, list, graph, tree) are # avoided unless paired with a qualifier. 
TASK_PATTERNS: List[Tuple[str, List[str], float]] = [
    # (task_type, patterns, base_weight) — weight scales final score.
    #
    # NOTE: truncated word stems must end in ``\w*\b`` rather than a bare
    # ``\b``; a bare boundary right after a stem can never match the inflected
    # word (e.g. ``strateg\b`` does not match "strategy").
    ("code_gen", [
        r"\bнапиши\s+(функці|код|клас|скрипт|модуль|endpoint|api)",
        r"\bреалізуй\b",
        r"\bcreate\s+(function|class|module|endpoint|api|component)",
        r"\bimplement\b",
        r"\bgenerate\s+code\b",
        r"\bзгенеруй\s+код\b",
        r"\bфункці[юя]\s+для\b",
        r"\bклас\s+для\b",
        r"\bнапиши\s+код\b",
        r"\bwrite\s+a?\s*(function|class|module|script|endpoint)\b",
        r"\bcontroller\b",
        r"\bendpoint\s+(для|for)\b",
    ], 1.0),
    ("code_debug", [
        r"\bвиправ\b",
        r"\bбаг\b",
        r"\bпомилк[аи]\b",
        r"\btraceback\b",
        r"\bexception\b",
        r"\bfailed\b",
        r"\bcrash(es|ed)?\b",
        r"\bне\s+працю",
        r"\bдебаг\b",
        r"\bdebug\b",
        r"\bfix\s+(the\s+)?(bug|error|issue|crash)\b",
        r"\bsyntax\s*error\b",
        r"\btype\s*error\b",
        r"\battribute\s*error\b",
        r"\bruntime\s*error\b",
        r"\bvalue\s*error\b",
    ], 1.0),
    ("code_review", [
        r"\breview\s+(the\s+)?(code|pr|pull\s+request|diff)\b",
        r"\bаудит\s+(код|сервіс|систем)\w*\b",
        r"\baudit\s+(code|service)\b",
        r"\bперевір\w*\s+(код|якість)\b",
        r"\bcode\s+quality\b",
        r"\bcode\s+review\b",
        r"\brev'ю\b",
    ], 1.0),
    ("code_refactor", [
        r"\bрефактор\w*\b",
        r"\brefactor\b",
        r"\bоптимізу[йї]\s+(код|функці|клас)\w*\b",
        r"\boptimize\s+(the\s+)?(code|function|class)\b",
        r"\bclean\s+up\s+(the\s+)?code\b",
        r"\bpolish\s+(the\s+)?code\b",
        r"\bspeed\s+up\b",
        r"\bimprove\s+(the\s+)?code\b",
    ], 1.0),
    ("architecture", [
        r"\bархітектур\w+\b",
        r"\barchitecture\b",
        r"\bспроєктуй\b",
        r"\bsystem\s+design\b",
        r"\bmicroservice\s+(architect\w*|design|pattern)\b",
        r"\bdatabase\s+design\b",
        r"\bapi\s+design\b",
        r"\bscalab(le|ility)\b",
        r"\bscaling\s+strateg\w*\b",
        r"\bdesign\s+pattern\b",
        r"\bsystem\s+structure\b",
    ], 1.0),
    ("devops", [
        r"\bdeploy\b",
        r"\bdocker\s*(file|compose|-compose|ize)?\b",
        r"\bkubernetes\b",
        r"\bk8s\b",
        r"\bci[\s/]cd\b",
        r"\bpipeline\b",
        r"\bnginx\b",
        r"\bcaddy\b",
        r"\bнода\d?\b",
        r"\bnoda\d?\b",
        r"\bcontainer\s+(start|stop|restart|build|image)\b",
        r"\bдеплой\b",
        r"\bssh\s+(to|into|root|connect)\b",
        r"\bhelm\b",
        r"\bterraform\b",
        r"\binfrastructure\b",
        r"\bdocker\s+compose\s+up\b",
    ], 1.0),
    ("security", [
        r"\bvulnerability\b",
        r"\bCVE-\d+\b",
        r"\bsecurity\s+(audit|review|issue|scan)\b",
        r"\bauth(entication|orization)\b",
        r"\bencrypt(ion)?\b",
        r"\bRBAC\b",
        r"\bpermission\s+(model|system)\b",
        r"\bбезпек\w+\b",
        r"\bpentest\b",
        r"\b(sql|xss|csrf)\s*injection\b",
        r"\bthreat\s+model\b",
    ], 1.0),
    ("reasoning", [
        r"\bчому\s+\w+\b",
        r"\bwhy\s+(does|is|do|did|should|would)\b",
        r"\bpros\s+and\s+cons\b",
        r"\btrade[\s-]?off\b",
        r"\bпорівняй\b",
        r"\bcompare\s+\w+\s+(vs|and|with|to)\b",
        r"\bяк\s+краще\b",
        r"\bперевага\b",
        r"\bнедолік\b",
        r"\bdecision\s+(between|about)\b",
        r"\bversus\b",
        r"\b\w+\s+vs\s+\w+\b",
    ], 1.0),
    ("analysis", [
        r"\bпроаналізуй\b",
        r"\bаналіз\s+\w+\b",
        r"\banalyze\s+\w+\b",
        r"\binvestigate\b",
        r"\bexplain\s+(how|why|what)\b",
        r"\bsummariz(e|ation)\b",
        r"\bдослідж\w*\b",
        r"\bпоясни\s+(як|чому|що)\b",
        r"\bhow\s+does\s+\w+\s+work\b",
    ], 1.0),
    ("creative", [
        r"\bнапиши\s+(текст|стат|пост|лист|опис)\w*\b",
        r"\bwrite\s+a\s+(blog|article|post|email|description|letter)\b",
        r"\bdraft\s+(a\s+)?(doc|email|message|proposal)\b",
        r"\breadme\b",
        r"\bchangelog\b",
        r"\bdocumentation\b",
    ], 1.0),
    ("quick_answer", [
        r"\bщо\s+таке\b",
        r"\bwhat\s+is\s+(a|an|the)?\b",
        r"\bhow\s+to\s+\w+\b",
        r"\bdefinition\s+of\b",
        r"\bшвидко\b",
        r"\bсинтаксис\s+\w+\b",
        r"\bgive\s+me\s+an?\s+example\b",
        r"\bexample\s+of\b",
    ], 0.9),
    ("vision", [
        r"\bзображен\w+\b",
        r"\bфото\b",
        r"\bimage\s+(analysis|recognition|detect\w*)\b",
        r"\bскріншот\b",
        r"\bscreenshot\b",
        r"\bвізуальн\w+\s+аналіз\b",
        r"\bвідео\s+(аналіз|розпізна)\w*\b",
    ], 1.0),
    ("math_code", [
        r"\bалгоритм\s+\w+\b",
        r"\balgorithm\s+(for|to)\b",
        r"\bсортуван\w+\b",
        r"\bsort(ing)?\s+algorithm\b",
        r"\bdynamic\s+programming\b",
        r"\bgraph\s+(algorithm|traversal|search)\b",
        r"\bmatrix\s+(mult|inver|decomp)\w*\b",
        r"\bcalculate\s+\w+\b",
        r"\bcompute\s+\w+\b",
        r"\bformula\s+(for|to)\b",
        r"\bДейкстр\w*\b",
        r"\bDijkstra\b",
    ], 1.0),
    # Chatbot / conversational — greetings, small talk, acknowledgements
    ("chatbot", [
        r"^(привіт|вітаю|добрий|доброго|hi|hello|hey)\b",
        r"^(дякую|спасибі|thank|thanks)\b",
        r"^(ок|добре|зрозумів|зрозуміло|so?|ok|yes|no|ні|так)\s*[,!.]?\s*$",
        r"\bяк\s+(справи|діла|ся маєш)\b",
        r"\bhow\s+are\s+you\b",
    ], 0.8),
]

# Pre-compile patterns once for performance
_COMPILED_PATTERNS: Optional[List[Tuple[str, List[re.Pattern], float]]] = None


def _get_compiled_patterns() -> List[Tuple[str, List[re.Pattern], float]]:
    """Return TASK_PATTERNS with every regex compiled (case-insensitive).

    The compiled table is built on first use and memoised in the module-level
    ``_COMPILED_PATTERNS`` so later calls are a plain lookup.
    """
    global _COMPILED_PATTERNS
    if _COMPILED_PATTERNS is None:
        _COMPILED_PATTERNS = [
            (task_type, [re.compile(p, re.IGNORECASE) for p in patterns], weight)
            for task_type, patterns, weight in TASK_PATTERNS
        ]
    return _COMPILED_PATTERNS


# ── Model catalog ──────────────────────────────────────────────────────────────

@dataclass
class ModelSpec:
    """Static description of one routable model plus live availability checks."""

    profile_name: str
    provider: str
    model_id: str
    api_key_env: str = ""
    strengths: List[str] = field(default_factory=list)
    cost_tier: int = 1    # 0=free(local), 1=cheap, 2=mid, 3=expensive
    speed_tier: int = 1   # 1=fast, 2=medium, 3=slow
    context_k: int = 8    # context window in thousands
    local: bool = False
    max_tokens: int = 4096
    vram_gb: float = 0.0
    description: str = ""

    @property
    def available(self) -> bool:
        """True when the model can be called right now.

        Local models are looked up in the (cached) Ollama tag list; cloud
        models only require a non-blank value in ``api_key_env``.
        """
        if self.local:
            return _is_ollama_model_available(self.model_id)
        return bool(os.getenv(self.api_key_env, "").strip())

    @property
    def has_credits(self) -> bool:
        """True unless the provider is currently marked budget-exhausted."""
        return ProviderBudget.is_available(self.provider)


# ── Ollama model availability cache ───────────────────────────────────────────

_ollama_available_models: Optional[List[str]] = None
_ollama_cache_ts: float = 0.0
_OLLAMA_CACHE_TTL = 60.0


def _is_ollama_model_available(model_id: str) -> bool:
    """Check whether ``model_id`` appears in the cached Ollama model list.

    The cache is refreshed synchronously when it is empty or older than
    ``_OLLAMA_CACHE_TTL`` seconds. Comparison is case-insensitive.

    NOTE(review): a match on the base name alone (the part before ``:``) also
    counts, so a different tag of the same model is reported as available.
    """
    global _ollama_available_models, _ollama_cache_ts
    now = time.time()
    if _ollama_available_models is None or (now - _ollama_cache_ts) > _OLLAMA_CACHE_TTL:
        _refresh_ollama_models_sync()
    if _ollama_available_models is None:
        return False
    model_lower = model_id.lower()
    model_base = model_lower.split(":")[0]
    for m in _ollama_available_models:
        ml = m.lower()
        if ml == model_lower or ml.split(":")[0] == model_base:
            return True
    return False


def _refresh_ollama_models_sync() -> None:
    """Reload the Ollama model list from ``$OLLAMA_URL/api/tags`` (blocking).

    Best-effort: on any failure the cache is set to an empty list and the
    timestamp is still updated, so the next retry happens only after the TTL.
    The failure is logged at debug level instead of being dropped silently.
    """
    global _ollama_available_models, _ollama_cache_ts
    import urllib.request
    import json as _json

    ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
    try:
        with urllib.request.urlopen(f"{ollama_url}/api/tags", timeout=2) as resp:
            data = _json.loads(resp.read())
        _ollama_available_models = [m["name"] for m in data.get("models", [])]
        _ollama_cache_ts = time.time()
    except Exception as exc:  # best-effort boundary: never break routing
        logger.debug("Ollama model list refresh failed (%s): %s", ollama_url, exc)
        _ollama_available_models = []
        _ollama_cache_ts = time.time()


async def refresh_ollama_models_async() -> List[str]:
    """Reload the Ollama model list without blocking the event loop.

    Uses ``httpx`` (imported lazily). On failure the previous cache content is
    kept (or an empty list if there was none) and the timestamp is left
    untouched, so the synchronous path may retry on its own schedule.

    Returns:
        The model names currently held in the cache.
    """
    global _ollama_available_models, _ollama_cache_ts
    try:
        import httpx

        ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
        async with httpx.AsyncClient(timeout=2.0) as client:
            resp = await client.get(f"{ollama_url}/api/tags")
            data = resp.json()
        _ollama_available_models = [m["name"] for m in data.get("models", [])]
        _ollama_cache_ts = time.time()
        return _ollama_available_models
    except Exception as exc:  # best-effort boundary: never break routing
        logger.debug("Async Ollama model list refresh failed: %s", exc)
        _ollama_available_models = _ollama_available_models or []
        return _ollama_available_models


# ── Full model catalog ─────────────────────────────────────────────────────────

SOFIIA_MODEL_CATALOG: List[ModelSpec] = [
    # ── Anthropic Claude ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_claude_sonnet",
        provider="anthropic",
        model_id="claude-sonnet-4-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "architecture", "security", "reasoning"],
        cost_tier=2,
        speed_tier=2,
        context_k=200,
        max_tokens=8192,
        description="Claude Sonnet 4.5 — найкращий для коду та архітектури",
    ),
    ModelSpec(
        profile_name="cloud_claude_haiku",
        provider="anthropic",
        model_id="claude-haiku-3-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["quick_answer", "code_review", "creative", "analysis", "chatbot"],
        cost_tier=1,
        speed_tier=1,
        context_k=200,
        max_tokens=4096,
        description="Claude Haiku 3.5 — швидкий та дешевий",
    ),
    # ── xAI Grok ─────────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_grok",
        provider="grok",
        model_id="grok-4-1-fast-reasoning",
        api_key_env="GROK_API_KEY",
        strengths=["reasoning", "architecture", "analysis", "code_gen"],
        cost_tier=2,
        speed_tier=1,
        context_k=2000,
        max_tokens=8192,
        description="Grok 4.1 Fast — 2M контекст, кращий для reasoning",
    ),
    # ── DeepSeek API ─────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_deepseek",
        provider="deepseek",
        model_id="deepseek-chat",
        api_key_env="DEEPSEEK_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "devops", "quick_answer"],
        cost_tier=1,
        speed_tier=2,
        context_k=64,
        max_tokens=4096,
        description="DeepSeek Chat — дешевий і добре знає код/devops",
    ),
    # ── GLM-5 / Z.AI (API) ───────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_glm5",
        provider="glm",
        model_id="glm-4-plus",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "analysis", "code_gen", "chatbot"],
        cost_tier=1,
        speed_tier=1,
        context_k=128,
        max_tokens=4096,
        description="GLM-4 Plus (Z.AI) — швидкий, дешевий, гарно знає українську/CJK",
    ),
    ModelSpec(
        profile_name="cloud_glm5_flash",
        provider="glm",
        model_id="glm-4-flash",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "chatbot"],
        cost_tier=0,
        speed_tier=1,
        context_k=128,
        max_tokens=2048,
        description="GLM-4 Flash (Z.AI) — безкоштовний, найшвидший",
    ),
    # ── Mistral AI (API) ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_mistral",
        provider="mistral",
        model_id="mistral-large-latest",
        api_key_env="MISTRAL_API_KEY",
        strengths=["analysis", "creative", "reasoning", "architecture"],
        cost_tier=2,
        speed_tier=2,
        context_k=128,
        max_tokens=4096,
        description="Mistral Large — добрий для аналізу та creative",
    ),
    # ── Local: qwen3.5:35b-a3b (FLAGSHIP) ────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen35_35b",
        provider="ollama",
        model_id="qwen3.5:35b-a3b",
        strengths=["code_gen", "code_debug", "code_refactor", "reasoning", "architecture",
                   "analysis", "devops", "security", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=4096,
        local=True,
        vram_gb=24.0,
        description="Qwen3.5 35B MoE (NODA2) — флагман локально, якість ≈ cloud",
    ),
    # ── Local: qwen3:14b ─────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen3_14b",
        provider="ollama",
        model_id="qwen3:14b",
        strengths=["code_gen", "code_debug", "quick_answer", "devops", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=2048,
        local=True,
        vram_gb=10.0,
        description="Qwen3 14B (NODA2) — швидкий локальний загальний",
    ),
    # ── Local: glm-4.7-flash:32k ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_glm47_32k",
        provider="ollama",
        model_id="glm-4.7-flash:32k",
        strengths=["quick_answer", "creative", "analysis", "code_review", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=2048,
        local=True,
        vram_gb=20.0,
        description="GLM-4.7 Flash 32K (NODA2) — локальний GLM, великий контекст",
    ),
    # ── Local: deepseek-r1:70b ────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_r1_70b",
        provider="ollama",
        model_id="deepseek-r1:70b",
        strengths=["reasoning", "math_code", "architecture", "analysis"],
        cost_tier=0,
        speed_tier=3,
        context_k=64,
        max_tokens=4096,
        local=True,
        vram_gb=48.0,
        description="DeepSeek-R1 70B (NODA2) — локальний reasoning як o1",
    ),
    # ── Local: deepseek-coder:33b ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_coder_33b",
        provider="ollama",
        model_id="deepseek-coder:33b",
        strengths=["code_gen", "code_debug", "code_refactor", "math_code"],
        cost_tier=0,
        speed_tier=2,
        context_k=16,
        max_tokens=2048,
        local=True,
        vram_gb=20.0,
        description="DeepSeek Coder 33B (NODA2) — спеціаліст по коду",
    ),
    # ── Local: gemma3:latest ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gemma3",
        provider="ollama",
        model_id="gemma3:latest",
        strengths=["quick_answer", "analysis", "creative", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=8,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="Gemma3 (NODA2) — Google's ефективна модель",
    ),
    # ── Local: mistral-nemo:12b ───────────────────────────────────────────────
    ModelSpec(
        profile_name="local_mistral_nemo",
        provider="ollama",
        model_id="mistral-nemo:12b",
        strengths=["creative", "quick_answer", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=128,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="Mistral Nemo 12B (NODA2) — 128K контекст локально",
    ),
    # ── Local: starcoder2:3b ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_starcoder2",
        provider="ollama",
        model_id="starcoder2:3b",
        strengths=["code_gen", "code_review"],
        cost_tier=0,
        speed_tier=1,
        context_k=16,
        max_tokens=2048,
        local=True,
        vram_gb=2.0,
        description="StarCoder2 3B (NODA2) — мікро-модель для code completion",
    ),
    # ── Local: phi3:latest ────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_phi3",
        provider="ollama",
        model_id="phi3:latest",
        strengths=["quick_answer", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=1,
        context_k=128,
        max_tokens=2048,
        local=True,
        vram_gb=4.0,
        description="Phi-3 (NODA2) — Microsoft мала ефективна модель",
    ),
    # ── Local: llava:13b (vision) ─────────────────────────────────────────────
    ModelSpec(
        profile_name="local_llava_13b",
        provider="ollama",
        model_id="llava:13b",
        strengths=["vision"],
        cost_tier=0,
        speed_tier=2,
        context_k=4,
        max_tokens=2048,
        local=True,
        vram_gb=10.0,
        description="LLaVA 13B (NODA2) — vision модель для зображень",
    ),
    # ── Local: gpt-oss:latest ─────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gpt_oss",
        provider="ollama",
        model_id="gpt-oss:latest",
        strengths=["code_gen", "quick_answer"],
        cost_tier=0,
        speed_tier=2,
        context_k=8,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="GPT-OSS (NODA2) — відкрита OSS GPT-like модель",
    ),
]

# ── Task → preferred model matrix ─────────────────────────────────────────────

TASK_MODEL_PRIORITY: Dict[str, List[str]] = {
    # Principle: local-first for tasks where local quality is sufficient.
    # Cloud only when the task genuinely needs it (complex code, deep reasoning,
    # very long context, security audits).
    #
    # qwen3.5:35b-a3b is the flagship local — MoE with cloud-level quality.
    # It should be preferred over cloud APIs for most routine tasks.
    "code_gen": [
        "local_qwen35_35b", "cloud_claude_sonnet", "local_deepseek_coder_33b",
        "cloud_deepseek", "local_qwen3_14b", "cloud_grok",
    ],
    "code_debug": [
        "local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
        "cloud_deepseek", "local_qwen3_14b",
    ],
    "code_review": [
        "local_qwen35_35b", "cloud_claude_haiku", "local_deepseek_coder_33b",
        "cloud_claude_sonnet", "cloud_deepseek",
    ],
    "code_refactor": [
        "local_qwen35_35b", "local_deepseek_coder_33b", "cloud_claude_sonnet",
        "cloud_deepseek", "local_qwen3_14b",
    ],
    "math_code": [
        "local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
        "cloud_claude_sonnet", "local_deepseek_coder_33b",
    ],
    "architecture": [
        "local_qwen35_35b", "cloud_grok", "cloud_claude_sonnet",
        "local_deepseek_r1_70b", "cloud_mistral",
    ],
    "devops": [
        "local_qwen35_35b", "local_qwen3_14b", "cloud_deepseek",
        "cloud_claude_sonnet", "local_glm47_32k",
    ],
    "security": [
        "cloud_claude_sonnet", "local_qwen35_35b", "cloud_grok", "cloud_mistral",
    ],
    "reasoning": [
        "local_deepseek_r1_70b", "local_qwen35_35b", "cloud_grok",
        "cloud_claude_sonnet", "cloud_mistral",
    ],
    "analysis": [
        "local_qwen35_35b", "local_glm47_32k", "cloud_grok",
        "cloud_claude_haiku", "local_mistral_nemo", "cloud_mistral",
    ],
    "creative": [
        "local_qwen35_35b", "local_mistral_nemo", "cloud_claude_haiku",
        "local_glm47_32k", "cloud_mistral",
    ],
    "quick_answer": [
        "local_qwen3_14b", "local_qwen35_35b", "local_phi3",
        "local_gemma3", "cloud_deepseek", "cloud_glm5_flash",
    ],
    "chatbot": [
        "local_qwen3_14b", "local_qwen35_35b", "local_gemma3",
        "local_phi3", "local_mistral_nemo",
    ],
    "vision": [
        "local_llava_13b",
    ],
    "unknown": [
        "local_qwen35_35b", "local_qwen3_14b", "cloud_claude_sonnet",
        "cloud_grok", "cloud_deepseek",
    ],
}


# ── Budget integration ─────────────────────────────────────────────────────────

class ProviderBudget:
    """In-memory budget gate: marks providers exhausted until TTL expires."""

    # Class-level state shared by all callers: provider -> time it was marked.
    _exhausted: Dict[str, float] = {}
    _exhausted_ttl: int = 3600

    @classmethod
    def mark_exhausted(cls, provider: str) -> None:
        """Record that ``provider`` ran out of budget as of now."""
        cls._exhausted[provider] = time.time()
        logger.warning("💸 Provider %s marked as budget-exhausted", provider)

    @classmethod
    def is_available(cls, provider: str) -> bool:
        """Return True if ``provider`` is not marked, or its mark has expired.

        An expired mark is removed as a side effect.
        """
        ts = cls._exhausted.get(provider)
        if ts is None:
            return True
        if time.time() - ts > cls._exhausted_ttl:
            cls._exhausted.pop(provider, None)
            return True
        return False

    @classmethod
    def reset(cls, provider: str) -> None:
        """Clear the exhausted mark for ``provider`` (no-op if not marked)."""
        cls._exhausted.pop(provider, None)


# ── Task classification ────────────────────────────────────────────────────────

@dataclass
class ClassificationResult:
    """Outcome of prompt classification, including the runner-up if close."""

    task_type: str
    confidence: float
    all_scores: Dict[str, float]
    ambiguous: bool = False
    runner_up: Optional[str] = None


def classify_task(prompt: str, context_len: int = 0) -> Tuple[str, float]:
    """Classify prompt into a task type. Returns (task_type, confidence)."""
    result = classify_task_detailed(prompt, context_len)
    return result.task_type, result.confidence


def classify_task_detailed(prompt: str, context_len: int = 0) -> ClassificationResult:
    """Detailed classification with ambiguity detection and all scores.

    Each task group scores ``hits / number_of_patterns * weight``. The best
    group wins; confidence is that score times 10, capped at 1.0 and reduced
    for short prompts.

    Args:
        prompt: User text to classify.
        context_len: Number of prior conversation messages; above 50 the
            confidence is floored at 0.5.

    Returns:
        ``chatbot`` with confidence 0.5 for blank input, ``unknown`` with
        confidence 0.3 when no pattern matches, otherwise the best group
        together with the top five scores.
    """
    if not prompt or not prompt.strip():
        return ClassificationResult("chatbot", 0.5, {}, ambiguous=False)

    text = prompt.strip()
    compiled = _get_compiled_patterns()
    scores: Dict[str, float] = {}
    for task_type, patterns, weight in compiled:
        hits = sum(1 for p in patterns if p.search(text))
        if hits > 0:
            raw = hits / len(patterns)
            scores[task_type] = raw * weight

    if not scores:
        return ClassificationResult("unknown", 0.3, {}, ambiguous=False)

    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    best_task, best_score = sorted_scores[0]
    confidence = min(best_score * 10, 1.0)

    # Penalize confidence for very short prompts (fewer signals)
    word_count = len(text.split())
    if word_count <= 3:
        confidence *= 0.6
    elif word_count <= 8:
        confidence *= 0.85

    # Detect ambiguity: second-place is within 30% of the best
    ambiguous = False
    runner_up = None
    if len(sorted_scores) >= 2:
        _, second_score = sorted_scores[1]
        if second_score > 0 and second_score / best_score > 0.7:
            ambiguous = True
            runner_up = sorted_scores[1][0]

    # For long conversations, slight preference for context-heavy models
    # (influences scoring, not classification)
    if context_len > 50:
        confidence = max(confidence, 0.5)

    return ClassificationResult(
        task_type=best_task,
        confidence=round(confidence, 3),
        all_scores={k: round(v, 4) for k, v in sorted_scores[:5]},
        ambiguous=ambiguous,
        runner_up=runner_up,
    )


def _prompt_complexity(prompt: str) -> str:
    """Estimate prompt complexity: simple | medium | complex"""
    words = len(prompt.split())
    lines = prompt.count("\n")
    code_blocks = prompt.count("```")
    if words < 20 and lines < 3 and code_blocks == 0:
        return "simple"
    if words > 200 or code_blocks >= 2 or lines > 20:
        return "complex"
    return "medium"


# ── Main selection function ────────────────────────────────────────────────────

@dataclass
class AutoRouteResult:
    """Chosen model plus the classification details that led to it."""

    profile_name: str
    model_id: str
    provider: str
    task_type: str
    confidence: float
    complexity: str
    reason: str
    fallback_used: bool = False
    all_candidates: List[str] = field(default_factory=list)
    ambiguous: bool = False
    runner_up: Optional[str] = None
    all_scores: Dict[str, float] = field(default_factory=dict)


def select_model_auto(
    prompt: str,
    force_fast: bool = False,
    force_capable: bool = False,
    prefer_local: bool = False,
    prefer_cheap: bool = False,
    budget_aware: bool = True,
    context_messages_len: int = 0,
) -> AutoRouteResult:
    """
    Cursor-style auto model selection for Sofiia.

    Logic:
    1. Classify task type from prompt (with ambiguity detection)
    2. Estimate complexity (simple/medium/complex)
    3. Apply modifiers (force_fast, force_capable, prefer_local, prefer_cheap)
    4. Score candidates from priority list factoring availability, budget, speed, cost
    5. For long conversations, prefer large-context models

    Args:
        prompt: User text to route.
        force_fast: Penalise slow models; non-coding tasks use the
            ``quick_answer`` priority list.
        force_capable: Give a small bonus proportional to context window.
        prefer_local: Restrict to local candidates when any exist.
        prefer_cheap: Weight the cost tier heavily against expensive models.
        budget_aware: Penalise providers marked exhausted in ProviderBudget.
        context_messages_len: Number of prior conversation messages.

    Returns:
        AutoRouteResult for the lowest-scoring (best) candidate. Unavailable
        models are penalised, not excluded, so the result may still name a
        model that is not currently reachable.
    """
    classification = classify_task_detailed(prompt, context_messages_len)
    task_type = classification.task_type
    confidence = classification.confidence
    complexity = _prompt_complexity(prompt)

    effective_task = task_type
    # Modifier overrides (parentheses fix for operator precedence)
    if force_fast and task_type not in ("code_gen", "code_debug", "math_code"):
        effective_task = "quick_answer"
    if (prefer_cheap or complexity == "simple") and task_type in ("quick_answer", "creative", "chatbot"):
        effective_task = "quick_answer"

    priority_list = TASK_MODEL_PRIORITY.get(effective_task, TASK_MODEL_PRIORITY["unknown"])
    catalog_map = {m.profile_name: m for m in SOFIIA_MODEL_CATALOG}
    candidates = [p for p in priority_list if p in catalog_map]

    if prefer_local:
        local_cands = [p for p in candidates if catalog_map[p].local]
        if local_cands:
            candidates = local_cands

    def _score(profile_name: str) -> float:
        """Penalty score for one candidate — lower is better."""
        spec = catalog_map[profile_name]
        score = 0.0
        if not spec.available:
            score += 1000
        if budget_aware and not spec.has_credits:
            score += 500
        # Priority-list position is the strongest signal
        try:
            pos = priority_list.index(profile_name)
            score += pos * 20
        except ValueError:
            score += 200
        if prefer_local and not spec.local:
            score += 200
        if force_fast:
            score += spec.speed_tier * 15
        # Cost is a penalty (lower score wins), so it must be added. The
        # previous code subtracted it under prefer_cheap/prefer_local, which
        # ranked expensive models ahead of free ones.
        if prefer_cheap or prefer_local:
            score += spec.cost_tier * 20
        else:
            score += spec.cost_tier * 2
        if force_capable:
            score -= spec.context_k / 100
        if complexity == "complex" and spec.context_k < 32:
            score += 40
        # Long conversation bonus for large-context models
        if context_messages_len > 30 and spec.context_k >= 128:
            score -= 15
        elif context_messages_len > 50 and spec.context_k < 32:
            score += 25
        return score

    scored = sorted([c for c in candidates if c in catalog_map], key=_score)

    if not scored:
        # Priority list had no catalog entries: fall back to a known local.
        for fallback in ["local_qwen35_35b", "local_qwen3_14b", "local_phi3"]:
            if fallback in catalog_map:
                scored = [fallback]
                break

    best = scored[0] if scored else "local_qwen3_14b"
    spec = catalog_map.get(best)
    fallback_used = best not in priority_list[:2]

    reasons: List[str] = [f"task={task_type} ({confidence:.0%})", f"complexity={complexity}"]
    if classification.ambiguous:
        reasons.append(f"ambiguous (runner_up={classification.runner_up})")
    if force_fast:
        reasons.append("force_fast")
    if prefer_local:
        reasons.append("prefer_local")
    if prefer_cheap:
        reasons.append("prefer_cheap")
    if force_capable:
        reasons.append("force_capable")
    if context_messages_len > 30:
        reasons.append(f"long_conversation({context_messages_len})")
    if fallback_used:
        reasons.append("fallback (top unavailable)")

    return AutoRouteResult(
        profile_name=best,
        model_id=spec.model_id if spec else best,
        provider=spec.provider if spec else "unknown",
        task_type=task_type,
        confidence=confidence,
        complexity=complexity,
        reason=" | ".join(reasons),
        fallback_used=fallback_used,
        all_candidates=scored[:5],
        ambiguous=classification.ambiguous,
        runner_up=classification.runner_up,
        all_scores=classification.all_scores,
    )


def explain_selection(result: AutoRouteResult) -> str:
    """Human-readable explanation of model selection (for debug/UI)."""
    lines = [
        f"Auto-selected **{result.model_id}** ({result.provider})",
        f"Task: `{result.task_type}` | Complexity: `{result.complexity}` | "
        f"Confidence: {result.confidence:.0%}",
        f"Reason: {result.reason}",
    ]
    if result.ambiguous:
        lines.append(f"Ambiguous: runner-up was `{result.runner_up}`")
    if result.all_scores:
        top3 = list(result.all_scores.items())[:3]
        lines.append("Scores: " + ", ".join(f"{k}={v:.3f}" for k, v in top3))
    return "\n".join(lines)


def get_full_catalog() -> List[Dict[str, Any]]:
    """Return full model catalog with availability status for dashboard."""
    return [
        {
            "profile_name": m.profile_name,
            "provider": m.provider,
            "model_id": m.model_id,
            "description": m.description,
            "strengths": m.strengths,
            "cost_tier": m.cost_tier,
            "speed_tier": m.speed_tier,
            "context_k": m.context_k,
            "local": m.local,
            "vram_gb": m.vram_gb,
            "available": m.available,
            "has_credits": m.has_credits,
        }
        for m in SOFIIA_MODEL_CATALOG
    ]