New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
768 lines
32 KiB
Python
768 lines
32 KiB
Python
"""Sofiia Smart Auto-Router — Cursor-style model selection for Sofiia agent.
|
||
|
||
Classifies incoming prompt by task type and selects the best available model,
|
||
balancing capability, speed, cost, and provider availability.
|
||
|
||
Full model catalog includes:
|
||
- Cloud: Anthropic Claude, xAI Grok, DeepSeek, Mistral AI, GLM-5 (Z.AI)
|
||
- Local Ollama (NODA2/MacBook): qwen3.5:35b-a3b, qwen3:14b, glm-4.7-flash:32k,
|
||
deepseek-r1:70b, deepseek-coder:33b, gemma3, mistral-nemo:12b,
|
||
starcoder2:3b, phi3, llava:13b
|
||
|
||
Task taxonomy (inspired by Cursor Auto mode):
|
||
code_gen, code_review, code_debug, code_refactor,
|
||
architecture, devops, security, analysis, quick_answer, creative, reasoning,
|
||
math_code, vision, chatbot
|
||
"""
|
||
from __future__ import annotations

# Standard library only — the router has no hard third-party dependency
# (httpx is imported lazily and optionally inside refresh_ollama_models_async).
import logging
import os
import re
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)
# ── Task taxonomy ──────────────────────────────────────────────────────────────
|
||
# Each pattern group uses multi-word or context-aware patterns to reduce false
|
||
# positives. Single common words (system, design, check, list, graph, tree) are
|
||
# avoided unless paired with a qualifier.
|
||
|
||
TASK_PATTERNS: List[Tuple[str, List[str], float]] = [
|
||
# (task_type, patterns, base_weight) — weight scales final score
|
||
("code_gen", [
|
||
r"\bнапиши\s+(функці|код|клас|скрипт|модуль|endpoint|api)",
|
||
r"\bреалізуй\b", r"\bcreate\s+(function|class|module|endpoint|api|component)",
|
||
r"\bimplement\b", r"\bgenerate\s+code\b", r"\bзгенеруй\s+код\b",
|
||
r"\bфункці[юя]\s+для\b", r"\bклас\s+для\b", r"\bнапиши\s+код\b",
|
||
r"\bwrite\s+a?\s*(function|class|module|script|endpoint)\b",
|
||
r"\bcontroller\b", r"\bendpoint\s+(для|for)\b",
|
||
], 1.0),
|
||
("code_debug", [
|
||
r"\bвиправ\b", r"\bбаг\b", r"\bпомилк[аи]\b", r"\btraceback\b",
|
||
r"\bexception\b", r"\bfailed\b", r"\bcrash(es|ed)?\b", r"\bне\s+працю",
|
||
r"\bдебаг\b", r"\bdebug\b", r"\bfix\s+(the\s+)?(bug|error|issue|crash)\b",
|
||
r"\bsyntax\s*error\b", r"\btype\s*error\b", r"\battribute\s*error\b",
|
||
r"\bruntime\s*error\b", r"\bvalue\s*error\b",
|
||
], 1.0),
|
||
("code_review", [
|
||
r"\breview\s+(the\s+)?(code|pr|pull\s+request|diff)\b",
|
||
r"\bаудит\s+(код|сервіс|систем)\b", r"\baudit\s+(code|service)\b",
|
||
r"\bперевір\w*\s+(код|якість)\b", r"\bcode\s+quality\b",
|
||
r"\bcode\s+review\b", r"\brev'ю\b",
|
||
], 1.0),
|
||
("code_refactor", [
|
||
r"\bрефактор\b", r"\brefactor\b",
|
||
r"\bоптимізу[йї]\s+(код|функці|клас)\b", r"\boptimize\s+(the\s+)?(code|function|class)\b",
|
||
r"\bclean\s+up\s+(the\s+)?code\b", r"\bpolish\s+(the\s+)?code\b",
|
||
r"\bspeed\s+up\b", r"\bimprove\s+(the\s+)?code\b",
|
||
], 1.0),
|
||
("architecture", [
|
||
r"\bархітектур\w+\b", r"\barchitecture\b",
|
||
r"\bспроєктуй\b", r"\bsystem\s+design\b",
|
||
r"\bmicroservice\s+(architect|design|pattern)\b",
|
||
r"\bdatabase\s+design\b", r"\bapi\s+design\b",
|
||
r"\bscalab(le|ility)\b", r"\bscaling\s+strateg\b",
|
||
r"\bdesign\s+pattern\b", r"\bsystem\s+structure\b",
|
||
], 1.0),
|
||
("devops", [
|
||
r"\bdeploy\b", r"\bdocker\s*(file|compose|-compose|ize)?\b",
|
||
r"\bkubernetes\b", r"\bk8s\b", r"\bci[\s/]cd\b",
|
||
r"\bpipeline\b", r"\bnginx\b", r"\bcaddy\b",
|
||
r"\bнода\d?\b", r"\bnoda\d?\b", r"\bcontainer\s+(start|stop|restart|build|image)\b",
|
||
r"\bдеплой\b", r"\bssh\s+(to|into|root|connect)\b",
|
||
r"\bhelm\b", r"\bterraform\b", r"\binfrastructure\b",
|
||
r"\bdocker\s+compose\s+up\b",
|
||
], 1.0),
|
||
("security", [
|
||
r"\bvulnerability\b", r"\bCVE-\d+\b", r"\bsecurity\s+(audit|review|issue|scan)\b",
|
||
r"\bauth(entication|orization)\b", r"\bencrypt(ion)?\b",
|
||
r"\bRBAC\b", r"\bpermission\s+(model|system)\b",
|
||
r"\bбезпек\w+\b", r"\bpentest\b", r"\b(sql|xss|csrf)\s*injection\b",
|
||
r"\bthreat\s+model\b",
|
||
], 1.0),
|
||
("reasoning", [
|
||
r"\bчому\s+\w+\b", r"\bwhy\s+(does|is|do|did|should|would)\b",
|
||
r"\bpros\s+and\s+cons\b", r"\btrade[\s-]?off\b",
|
||
r"\bпорівняй\b", r"\bcompare\s+\w+\s+(vs|and|with|to)\b",
|
||
r"\bяк\s+краще\b", r"\bперевага\b", r"\bнедолік\b",
|
||
r"\bdecision\s+(between|about)\b",
|
||
r"\bversus\b", r"\b\w+\s+vs\s+\w+\b",
|
||
], 1.0),
|
||
("analysis", [
|
||
r"\bпроаналізуй\b", r"\bаналіз\s+\w+\b",
|
||
r"\banalyze\s+\w+\b", r"\binvestigate\b",
|
||
r"\bexplain\s+(how|why|what)\b", r"\bsummariz(e|ation)\b",
|
||
r"\bдослідж\b", r"\bпоясни\s+(як|чому|що)\b",
|
||
r"\bhow\s+does\s+\w+\s+work\b",
|
||
], 1.0),
|
||
("creative", [
|
||
r"\bнапиши\s+(текст|стат|пост|лист|опис)\b",
|
||
r"\bwrite\s+a\s+(blog|article|post|email|description|letter)\b",
|
||
r"\bdraft\s+(a\s+)?(doc|email|message|proposal)\b",
|
||
r"\breadme\b", r"\bchangelog\b", r"\bdocumentation\b",
|
||
], 1.0),
|
||
("quick_answer", [
|
||
r"\bщо\s+таке\b", r"\bwhat\s+is\s+(a|an|the)?\b",
|
||
r"\bhow\s+to\s+\w+\b", r"\bdefinition\s+of\b",
|
||
r"\bшвидко\b", r"\bсинтаксис\s+\w+\b",
|
||
r"\bgive\s+me\s+an?\s+example\b", r"\bexample\s+of\b",
|
||
], 0.9),
|
||
("vision", [
|
||
r"\bзображен\w+\b", r"\bфото\b", r"\bimage\s+(analysis|recognition|detect)\b",
|
||
r"\bскріншот\b", r"\bscreenshot\b",
|
||
r"\bвізуальн\w+\s+аналіз\b", r"\bвідео\s+(аналіз|розпізна)\b",
|
||
], 1.0),
|
||
("math_code", [
|
||
r"\bалгоритм\s+\w+\b", r"\balgorithm\s+(for|to)\b",
|
||
r"\bсортуван\w+\b", r"\bsort(ing)?\s+algorithm\b",
|
||
r"\bdynamic\s+programming\b", r"\bgraph\s+(algorithm|traversal|search)\b",
|
||
r"\bmatrix\s+(mult|inver|decomp)\b",
|
||
r"\bcalculate\s+\w+\b", r"\bcompute\s+\w+\b",
|
||
r"\bformula\s+(for|to)\b", r"\bДейкстр\b", r"\bDijkstra\b",
|
||
], 1.0),
|
||
# Chatbot / conversational — greetings, small talk, acknowledgements
|
||
("chatbot", [
|
||
r"^(привіт|вітаю|добрий|доброго|hi|hello|hey)\b",
|
||
r"^(дякую|спасибі|thank|thanks)\b",
|
||
r"^(ок|добре|зрозумів|зрозуміло|so?|ok|yes|no|ні|так)\s*[,!.]?\s*$",
|
||
r"\bяк\s+(справи|діла|ся маєш)\b", r"\bhow\s+are\s+you\b",
|
||
], 0.8),
|
||
]
|
||
|
||
# Pre-compile patterns once for performance
|
||
_COMPILED_PATTERNS: Optional[List[Tuple[str, List[re.Pattern], float]]] = None
|
||
|
||
|
||
def _get_compiled_patterns() -> List[Tuple[str, List[re.Pattern], float]]:
|
||
global _COMPILED_PATTERNS
|
||
if _COMPILED_PATTERNS is None:
|
||
_COMPILED_PATTERNS = [
|
||
(task_type, [re.compile(p, re.IGNORECASE) for p in patterns], weight)
|
||
for task_type, patterns, weight in TASK_PATTERNS
|
||
]
|
||
return _COMPILED_PATTERNS
|
||
|
||
|
||
# ── Model catalog ──────────────────────────────────────────────────────────────

@dataclass
class ModelSpec:
    """Static description of one routable model (cloud API or local Ollama)."""

    profile_name: str  # unique key referenced by TASK_MODEL_PRIORITY lists
    provider: str  # provider key, e.g. "anthropic", "grok", "ollama"
    model_id: str  # provider-side model identifier
    api_key_env: str = ""  # env var holding the API key (cloud models only)
    strengths: List[str] = field(default_factory=list)  # task types it is good at
    cost_tier: int = 1  # 0=free(local), 1=cheap, 2=mid, 3=expensive
    speed_tier: int = 1  # 1=fast, 2=medium, 3=slow
    context_k: int = 8  # context window in thousands
    local: bool = False  # True when served by local Ollama
    max_tokens: int = 4096
    vram_gb: float = 0.0  # approximate VRAM footprint for local models
    description: str = ""

    @property
    def available(self) -> bool:
        """Whether the model can be called right now.

        Local models are checked against the cached Ollama tag list; cloud
        models only need their API key env var to be set to a non-blank value.
        """
        if self.local:
            return _is_ollama_model_available(self.model_id)
        api_key = os.getenv(self.api_key_env, "")
        return bool(api_key.strip())

    @property
    def has_credits(self) -> bool:
        """Whether the provider's budget gate still allows calls."""
        return ProviderBudget.is_available(self.provider)
# ── Ollama model availability cache ───────────────────────────────────────────
|
||
|
||
_ollama_available_models: Optional[List[str]] = None
|
||
_ollama_cache_ts: float = 0.0
|
||
_OLLAMA_CACHE_TTL = 60.0
|
||
|
||
|
||
def _is_ollama_model_available(model_id: str) -> bool:
|
||
global _ollama_available_models, _ollama_cache_ts
|
||
now = time.time()
|
||
if _ollama_available_models is None or (now - _ollama_cache_ts) > _OLLAMA_CACHE_TTL:
|
||
_refresh_ollama_models_sync()
|
||
if _ollama_available_models is None:
|
||
return False
|
||
model_lower = model_id.lower()
|
||
model_base = model_lower.split(":")[0]
|
||
for m in _ollama_available_models:
|
||
ml = m.lower()
|
||
if ml == model_lower or ml.split(":")[0] == model_base:
|
||
return True
|
||
return False
|
||
|
||
|
||
def _refresh_ollama_models_sync() -> None:
|
||
global _ollama_available_models, _ollama_cache_ts
|
||
import urllib.request
|
||
import json as _json
|
||
ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
|
||
try:
|
||
with urllib.request.urlopen(f"{ollama_url}/api/tags", timeout=2) as resp:
|
||
data = _json.loads(resp.read())
|
||
_ollama_available_models = [m["name"] for m in data.get("models", [])]
|
||
_ollama_cache_ts = time.time()
|
||
except Exception:
|
||
_ollama_available_models = []
|
||
_ollama_cache_ts = time.time()
|
||
|
||
|
||
async def refresh_ollama_models_async() -> List[str]:
|
||
global _ollama_available_models, _ollama_cache_ts
|
||
try:
|
||
import httpx
|
||
ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
|
||
async with httpx.AsyncClient(timeout=2.0) as client:
|
||
resp = await client.get(f"{ollama_url}/api/tags")
|
||
data = resp.json()
|
||
_ollama_available_models = [m["name"] for m in data.get("models", [])]
|
||
_ollama_cache_ts = time.time()
|
||
return _ollama_available_models
|
||
except Exception:
|
||
_ollama_available_models = _ollama_available_models or []
|
||
return _ollama_available_models
|
||
|
||
|
||
# ── Full model catalog ─────────────────────────────────────────────────────────
# One ModelSpec per routable model. Order here has no routing meaning;
# routing order comes from TASK_MODEL_PRIORITY.

SOFIIA_MODEL_CATALOG: List[ModelSpec] = [

    # Anthropic Claude ────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_claude_sonnet",
        provider="anthropic",
        model_id="claude-sonnet-4-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "architecture", "security", "reasoning"],
        cost_tier=2,
        speed_tier=2,
        context_k=200,
        max_tokens=8192,
        description="Claude Sonnet 4.5 — найкращий для коду та архітектури",
    ),
    ModelSpec(
        profile_name="cloud_claude_haiku",
        provider="anthropic",
        model_id="claude-haiku-3-5",
        api_key_env="ANTHROPIC_API_KEY",
        strengths=["quick_answer", "code_review", "creative", "analysis", "chatbot"],
        cost_tier=1,
        speed_tier=1,
        context_k=200,
        max_tokens=4096,
        description="Claude Haiku 3.5 — швидкий та дешевий",
    ),

    # xAI Grok ────────────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_grok",
        provider="grok",
        model_id="grok-4-1-fast-reasoning",
        api_key_env="GROK_API_KEY",
        strengths=["reasoning", "architecture", "analysis", "code_gen"],
        cost_tier=2,
        speed_tier=1,
        context_k=2000,
        max_tokens=8192,
        description="Grok 4.1 Fast — 2M контекст, кращий для reasoning",
    ),

    # DeepSeek API ────────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_deepseek",
        provider="deepseek",
        model_id="deepseek-chat",
        api_key_env="DEEPSEEK_API_KEY",
        strengths=["code_gen", "code_debug", "code_refactor", "devops", "quick_answer"],
        cost_tier=1,
        speed_tier=2,
        context_k=64,
        max_tokens=4096,
        description="DeepSeek Chat — дешевий і добре знає код/devops",
    ),

    # GLM-5 / Z.AI (API) ──────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_glm5",
        provider="glm",
        model_id="glm-4-plus",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "analysis", "code_gen", "chatbot"],
        cost_tier=1,
        speed_tier=1,
        context_k=128,
        max_tokens=4096,
        description="GLM-4 Plus (Z.AI) — швидкий, дешевий, гарно знає українську/CJK",
    ),
    ModelSpec(
        profile_name="cloud_glm5_flash",
        provider="glm",
        model_id="glm-4-flash",
        api_key_env="GLM5_API_KEY",
        strengths=["quick_answer", "creative", "chatbot"],
        cost_tier=0,
        speed_tier=1,
        context_k=128,
        max_tokens=2048,
        description="GLM-4 Flash (Z.AI) — безкоштовний, найшвидший",
    ),

    # Mistral AI (API) ────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="cloud_mistral",
        provider="mistral",
        model_id="mistral-large-latest",
        api_key_env="MISTRAL_API_KEY",
        strengths=["analysis", "creative", "reasoning", "architecture"],
        cost_tier=2,
        speed_tier=2,
        context_k=128,
        max_tokens=4096,
        description="Mistral Large — добрий для аналізу та creative",
    ),

    # Local: qwen3.5:35b-a3b (flagship) ───────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen35_35b",
        provider="ollama",
        model_id="qwen3.5:35b-a3b",
        strengths=["code_gen", "code_debug", "code_refactor", "reasoning", "architecture",
                   "analysis", "devops", "security", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=4096,
        local=True,
        vram_gb=24.0,
        description="Qwen3.5 35B MoE (NODA2) — флагман локально, якість ≈ cloud",
    ),

    # Local: qwen3:14b ────────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_qwen3_14b",
        provider="ollama",
        model_id="qwen3:14b",
        strengths=["code_gen", "code_debug", "quick_answer", "devops", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=2048,
        local=True,
        vram_gb=10.0,
        description="Qwen3 14B (NODA2) — швидкий локальний загальний",
    ),

    # Local: glm-4.7-flash:32k ────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_glm47_32k",
        provider="ollama",
        model_id="glm-4.7-flash:32k",
        strengths=["quick_answer", "creative", "analysis", "code_review", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=32,
        max_tokens=2048,
        local=True,
        vram_gb=20.0,
        description="GLM-4.7 Flash 32K (NODA2) — локальний GLM, великий контекст",
    ),

    # Local: deepseek-r1:70b ──────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_r1_70b",
        provider="ollama",
        model_id="deepseek-r1:70b",
        strengths=["reasoning", "math_code", "architecture", "analysis"],
        cost_tier=0,
        speed_tier=3,
        context_k=64,
        max_tokens=4096,
        local=True,
        vram_gb=48.0,
        description="DeepSeek-R1 70B (NODA2) — локальний reasoning як o1",
    ),

    # Local: deepseek-coder:33b ───────────────────────────────────────────────
    ModelSpec(
        profile_name="local_deepseek_coder_33b",
        provider="ollama",
        model_id="deepseek-coder:33b",
        strengths=["code_gen", "code_debug", "code_refactor", "math_code"],
        cost_tier=0,
        speed_tier=2,
        context_k=16,
        max_tokens=2048,
        local=True,
        vram_gb=20.0,
        description="DeepSeek Coder 33B (NODA2) — спеціаліст по коду",
    ),

    # Local: gemma3:latest ────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gemma3",
        provider="ollama",
        model_id="gemma3:latest",
        strengths=["quick_answer", "analysis", "creative", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=8,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="Gemma3 (NODA2) — Google's ефективна модель",
    ),

    # Local: mistral-nemo:12b ─────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_mistral_nemo",
        provider="ollama",
        model_id="mistral-nemo:12b",
        strengths=["creative", "quick_answer", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=2,
        context_k=128,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="Mistral Nemo 12B (NODA2) — 128K контекст локально",
    ),

    # Local: starcoder2:3b ────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_starcoder2",
        provider="ollama",
        model_id="starcoder2:3b",
        strengths=["code_gen", "code_review"],
        cost_tier=0,
        speed_tier=1,
        context_k=16,
        max_tokens=2048,
        local=True,
        vram_gb=2.0,
        description="StarCoder2 3B (NODA2) — мікро-модель для code completion",
    ),

    # Local: phi3:latest ──────────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_phi3",
        provider="ollama",
        model_id="phi3:latest",
        strengths=["quick_answer", "analysis", "chatbot"],
        cost_tier=0,
        speed_tier=1,
        context_k=128,
        max_tokens=2048,
        local=True,
        vram_gb=4.0,
        description="Phi-3 (NODA2) — Microsoft мала ефективна модель",
    ),

    # Local: llava:13b (vision) ───────────────────────────────────────────────
    ModelSpec(
        profile_name="local_llava_13b",
        provider="ollama",
        model_id="llava:13b",
        strengths=["vision"],
        cost_tier=0,
        speed_tier=2,
        context_k=4,
        max_tokens=2048,
        local=True,
        vram_gb=10.0,
        description="LLaVA 13B (NODA2) — vision модель для зображень",
    ),

    # Local: gpt-oss:latest ───────────────────────────────────────────────────
    ModelSpec(
        profile_name="local_gpt_oss",
        provider="ollama",
        model_id="gpt-oss:latest",
        strengths=["code_gen", "quick_answer"],
        cost_tier=0,
        speed_tier=2,
        context_k=8,
        max_tokens=2048,
        local=True,
        vram_gb=8.0,
        description="GPT-OSS (NODA2) — відкрита OSS GPT-like модель",
    ),
]
# ── Task → preferred model matrix ─────────────────────────────────────────────
# Principle: local-first for tasks where local quality is sufficient.
# Cloud only when the task genuinely needs it (complex code, deep reasoning,
# very long context, security audits).
#
# qwen3.5:35b-a3b is the flagship local — MoE with cloud-level quality —
# and is preferred over cloud APIs for most routine tasks.

TASK_MODEL_PRIORITY: Dict[str, List[str]] = {
    "code_gen": [
        "local_qwen35_35b",
        "cloud_claude_sonnet",
        "local_deepseek_coder_33b",
        "cloud_deepseek",
        "local_qwen3_14b",
        "cloud_grok",
    ],
    "code_debug": [
        "local_qwen35_35b",
        "local_deepseek_coder_33b",
        "cloud_claude_sonnet",
        "cloud_deepseek",
        "local_qwen3_14b",
    ],
    "code_review": [
        "local_qwen35_35b",
        "cloud_claude_haiku",
        "local_deepseek_coder_33b",
        "cloud_claude_sonnet",
        "cloud_deepseek",
    ],
    "code_refactor": [
        "local_qwen35_35b",
        "local_deepseek_coder_33b",
        "cloud_claude_sonnet",
        "cloud_deepseek",
        "local_qwen3_14b",
    ],
    "math_code": [
        "local_deepseek_r1_70b",
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_claude_sonnet",
        "local_deepseek_coder_33b",
    ],
    "architecture": [
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_claude_sonnet",
        "local_deepseek_r1_70b",
        "cloud_mistral",
    ],
    "devops": [
        "local_qwen35_35b",
        "local_qwen3_14b",
        "cloud_deepseek",
        "cloud_claude_sonnet",
        "local_glm47_32k",
    ],
    # Security reviews go cloud-first: quality matters most here.
    "security": [
        "cloud_claude_sonnet",
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_mistral",
    ],
    "reasoning": [
        "local_deepseek_r1_70b",
        "local_qwen35_35b",
        "cloud_grok",
        "cloud_claude_sonnet",
        "cloud_mistral",
    ],
    "analysis": [
        "local_qwen35_35b",
        "local_glm47_32k",
        "cloud_grok",
        "cloud_claude_haiku",
        "local_mistral_nemo",
        "cloud_mistral",
    ],
    "creative": [
        "local_qwen35_35b",
        "local_mistral_nemo",
        "cloud_claude_haiku",
        "local_glm47_32k",
        "cloud_mistral",
    ],
    "quick_answer": [
        "local_qwen3_14b",
        "local_qwen35_35b",
        "local_phi3",
        "local_gemma3",
        "cloud_deepseek",
        "cloud_glm5_flash",
    ],
    "chatbot": [
        "local_qwen3_14b",
        "local_qwen35_35b",
        "local_gemma3",
        "local_phi3",
        "local_mistral_nemo",
    ],
    "vision": [
        "local_llava_13b",
    ],
    # Default ordering when classification found no signal.
    "unknown": [
        "local_qwen35_35b",
        "local_qwen3_14b",
        "cloud_claude_sonnet",
        "cloud_grok",
        "cloud_deepseek",
    ],
}
# ── Budget integration ─────────────────────────────────────────────────────────

class ProviderBudget:
    """In-memory budget gate: marks providers exhausted until TTL expires."""

    # provider name -> timestamp when it was marked exhausted
    _exhausted: Dict[str, float] = {}
    # seconds a provider stays blocked after being marked
    _exhausted_ttl: int = 3600

    @classmethod
    def mark_exhausted(cls, provider: str) -> None:
        """Record that *provider* just ran out of budget."""
        cls._exhausted[provider] = time.time()
        logger.warning("💸 Provider %s marked as budget-exhausted", provider)

    @classmethod
    def is_available(cls, provider: str) -> bool:
        """True unless *provider* was marked exhausted within the TTL window."""
        marked_at = cls._exhausted.get(provider)
        if marked_at is None:
            return True
        if time.time() - marked_at > cls._exhausted_ttl:
            # Mark has expired — drop it and allow the provider again.
            cls._exhausted.pop(provider, None)
            return True
        return False

    @classmethod
    def reset(cls, provider: str) -> None:
        """Manually clear the exhausted mark for *provider*."""
        cls._exhausted.pop(provider, None)
# ── Task classification ────────────────────────────────────────────────────────

@dataclass
class ClassificationResult:
    """Outcome of classifying one prompt against the task taxonomy."""

    task_type: str  # winning task type (or "unknown" / "chatbot" fallbacks)
    confidence: float  # 0..1 confidence in the winner
    all_scores: Dict[str, float]  # top raw scores, highest first
    ambiguous: bool = False  # True when the runner-up scored close to the winner
    runner_up: Optional[str] = None  # second-best task type, when ambiguous
def classify_task(prompt: str, context_len: int = 0) -> Tuple[str, float]:
    """Classify prompt into a task type. Returns (task_type, confidence).

    Thin convenience wrapper over classify_task_detailed for callers that
    do not care about ambiguity or per-task scores.
    """
    detailed = classify_task_detailed(prompt, context_len)
    return detailed.task_type, detailed.confidence
def classify_task_detailed(prompt: str, context_len: int = 0) -> ClassificationResult:
    """Detailed classification with ambiguity detection and all scores.

    Scores every task type by the fraction of its patterns that match,
    scaled by the task's base weight. Empty prompts classify as "chatbot";
    prompts matching nothing classify as "unknown".
    """
    if not prompt or not prompt.strip():
        return ClassificationResult("chatbot", 0.5, {}, ambiguous=False)

    text = prompt.strip()
    scores: Dict[str, float] = {}
    for task_name, regexes, weight in _get_compiled_patterns():
        matched = sum(1 for rx in regexes if rx.search(text))
        if matched:
            scores[task_name] = (matched / len(regexes)) * weight

    if not scores:
        return ClassificationResult("unknown", 0.3, {}, ambiguous=False)

    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best_task, best_score = ranked[0]
    confidence = min(best_score * 10, 1.0)

    # Penalize confidence for very short prompts (fewer signals).
    word_count = len(text.split())
    if word_count <= 3:
        confidence *= 0.6
    elif word_count <= 8:
        confidence *= 0.85

    # Detect ambiguity: second place within 30% of the best score.
    ambiguous = False
    runner_up = None
    if len(ranked) >= 2:
        second_score = ranked[1][1]
        if second_score > 0 and second_score / best_score > 0.7:
            ambiguous = True
            runner_up = ranked[1][0]

    # Long conversations get a confidence floor (influences downstream
    # scoring, not the classification itself).
    if context_len > 50:
        confidence = max(confidence, 0.5)

    return ClassificationResult(
        task_type=best_task,
        confidence=round(confidence, 3),
        all_scores={k: round(v, 4) for k, v in ranked[:5]},
        ambiguous=ambiguous,
        runner_up=runner_up,
    )
def _prompt_complexity(prompt: str) -> str:
|
||
"""Estimate prompt complexity: simple | medium | complex"""
|
||
words = len(prompt.split())
|
||
lines = prompt.count("\n")
|
||
code_blocks = prompt.count("```")
|
||
if words < 20 and lines < 3 and code_blocks == 0:
|
||
return "simple"
|
||
if words > 200 or code_blocks >= 2 or lines > 20:
|
||
return "complex"
|
||
return "medium"
|
||
|
||
|
||
# ── Main selection function ────────────────────────────────────────────────────

@dataclass
class AutoRouteResult:
    """Result of auto model selection, including the reasoning trail."""

    profile_name: str  # chosen catalog profile key
    model_id: str  # provider-side model identifier
    provider: str  # provider key (anthropic / grok / ollama / ...)
    task_type: str  # classified task type
    confidence: float  # classification confidence, 0..1
    complexity: str  # simple | medium | complex
    reason: str  # human-readable selection trail
    fallback_used: bool = False  # True when the top picks were unavailable
    all_candidates: List[str] = field(default_factory=list)  # ranked top-5 profiles
    ambiguous: bool = False  # classification ambiguity flag
    runner_up: Optional[str] = None  # runner-up task type, if ambiguous
    all_scores: Dict[str, float] = field(default_factory=dict)  # top raw scores
def select_model_auto(
    prompt: str,
    force_fast: bool = False,
    force_capable: bool = False,
    prefer_local: bool = False,
    prefer_cheap: bool = False,
    budget_aware: bool = True,
    context_messages_len: int = 0,
) -> AutoRouteResult:
    """Cursor-style auto model selection for Sofiia.

    Pipeline:
      1. Classify task type from the prompt (with ambiguity detection).
      2. Estimate complexity (simple/medium/complex).
      3. Apply modifiers (force_fast, force_capable, prefer_local, prefer_cheap).
      4. Score candidates from the task's priority list, factoring in
         availability, budget, speed, and cost (lower score wins).
      5. For long conversations, prefer large-context models.

    Args:
        prompt: The user prompt to route.
        force_fast: Reroute to quick_answer unless the task is code/math-heavy,
            and penalize slow models.
        force_capable: Nudge scoring toward large-context models.
        prefer_local: Restrict to local candidates when any exist, and
            penalize cloud models.
        prefer_cheap: Bias scoring toward cheaper cost tiers.
        budget_aware: Penalize providers marked budget-exhausted.
        context_messages_len: Number of prior messages in the conversation.

    Returns:
        AutoRouteResult describing the chosen profile and the reasoning trail.
    """
    classification = classify_task_detailed(prompt, context_messages_len)
    task_type = classification.task_type
    confidence = classification.confidence
    complexity = _prompt_complexity(prompt)

    # Modifier overrides: reroute to quick_answer when speed/cost is requested
    # and the task does not genuinely need a heavyweight model.
    effective_task = task_type
    if force_fast and task_type not in ("code_gen", "code_debug", "math_code"):
        effective_task = "quick_answer"
    if (prefer_cheap or complexity == "simple") and task_type in ("quick_answer", "creative", "chatbot"):
        effective_task = "quick_answer"

    priority_list = TASK_MODEL_PRIORITY.get(effective_task, TASK_MODEL_PRIORITY["unknown"])
    catalog_map = {m.profile_name: m for m in SOFIIA_MODEL_CATALOG}

    # Candidates restricted to profiles that actually exist in the catalog.
    candidates = [p for p in priority_list if p in catalog_map]
    if prefer_local:
        local_cands = [p for p in candidates if catalog_map[p].local]
        if local_cands:
            candidates = local_cands

    def _score(profile_name: str) -> float:
        """Penalty score for one candidate — lower wins."""
        spec = catalog_map[profile_name]
        score = 0.0

        # Soft hard-penalties: unavailable models / exhausted budgets sink to
        # the bottom but remain selectable when nothing better exists.
        if not spec.available:
            score += 1000
        if budget_aware and not spec.has_credits:
            score += 500

        # Priority-list position is the strongest signal.
        try:
            pos = priority_list.index(profile_name)
            score += pos * 20
        except ValueError:
            score += 200

        if prefer_local and not spec.local:
            score += 200
        if force_fast:
            score += spec.speed_tier * 15
        if prefer_cheap or prefer_local:
            score -= spec.cost_tier * 20
        else:
            score += spec.cost_tier * 2

        if force_capable:
            score -= spec.context_k / 100

        # Complex prompts need a reasonably large context window.
        if complexity == "complex" and spec.context_k < 32:
            score += 40

        # Long-conversation adjustment: bonus for large-context models,
        # penalty for small-context ones in very long threads.
        if context_messages_len > 30 and spec.context_k >= 128:
            score -= 15
        elif context_messages_len > 50 and spec.context_k < 32:
            score += 25

        return score

    # `candidates` is already filtered to catalog_map keys above, so the
    # previous redundant membership re-check here was removed.
    scored = sorted(candidates, key=_score)

    if not scored:
        # Priority list produced nothing — fall back to known local profiles.
        for fallback in ["local_qwen35_35b", "local_qwen3_14b", "local_phi3"]:
            if fallback in catalog_map:
                scored = [fallback]
                break

    best = scored[0] if scored else "local_qwen3_14b"
    spec = catalog_map.get(best)
    fallback_used = best not in priority_list[:2]

    reasons: List[str] = [f"task={task_type} ({confidence:.0%})", f"complexity={complexity}"]
    if classification.ambiguous:
        reasons.append(f"ambiguous (runner_up={classification.runner_up})")
    if force_fast:
        reasons.append("force_fast")
    if prefer_local:
        reasons.append("prefer_local")
    if prefer_cheap:
        reasons.append("prefer_cheap")
    if force_capable:
        reasons.append("force_capable")
    if context_messages_len > 30:
        reasons.append(f"long_conversation({context_messages_len})")
    if fallback_used:
        reasons.append("fallback (top unavailable)")

    return AutoRouteResult(
        profile_name=best,
        model_id=spec.model_id if spec else best,
        provider=spec.provider if spec else "unknown",
        task_type=task_type,
        confidence=confidence,
        complexity=complexity,
        reason=" | ".join(reasons),
        fallback_used=fallback_used,
        all_candidates=scored[:5],
        ambiguous=classification.ambiguous,
        runner_up=classification.runner_up,
        all_scores=classification.all_scores,
    )
def explain_selection(result: AutoRouteResult) -> str:
    """Human-readable explanation of model selection (for debug/UI)."""
    parts = [
        f"Auto-selected **{result.model_id}** ({result.provider})",
        f"Task: `{result.task_type}` | Complexity: `{result.complexity}` | "
        f"Confidence: {result.confidence:.0%}",
        f"Reason: {result.reason}",
    ]
    if result.ambiguous:
        parts.append(f"Ambiguous: runner-up was `{result.runner_up}`")
    if result.all_scores:
        # Show at most the three highest-scoring task types.
        shown = list(result.all_scores.items())[:3]
        parts.append("Scores: " + ", ".join(f"{k}={v:.3f}" for k, v in shown))
    return "\n".join(parts)
def get_full_catalog() -> List[Dict[str, Any]]:
    """Return full model catalog with availability status for dashboard."""

    def _as_row(spec: ModelSpec) -> Dict[str, Any]:
        # Flatten one spec plus its live availability/budget flags.
        return {
            "profile_name": spec.profile_name,
            "provider": spec.provider,
            "model_id": spec.model_id,
            "description": spec.description,
            "strengths": spec.strengths,
            "cost_tier": spec.cost_tier,
            "speed_tier": spec.speed_tier,
            "context_k": spec.context_k,
            "local": spec.local,
            "vram_gb": spec.vram_gb,
            "available": spec.available,
            "has_credits": spec.has_credits,
        }

    return [_as_row(spec) for spec in SOFIIA_MODEL_CATALOG]