P2: Global multi-node model selection + NCS on NODA1

Architecture for 150+ nodes:
- global_capabilities_client.py: NATS scatter-gather discovery using the
  wildcard subject node.*.capabilities.get — zero static node lists.
  New nodes auto-register by deploying NCS and subscribing to NATS.
  Dead nodes expire from the cache automatically after 3x TTL.

Multi-node model_select.py:
- ModelSelection now includes node, local, via_nats fields
- select_best_model prefers local candidates, then remote
- Prefer-list resolution: local first, remote second
- All logged per request: node, runtime, model, local/remote

NODA1 compose:
- Added node-capabilities service (NCS) to docker-compose.node1.yml
- NATS subscription: node.noda1.capabilities.get
- Router env: NODE_CAPABILITIES_URL + ENABLE_GLOBAL_CAPS_NATS=true

NODA2 compose:
- Router env: ENABLE_GLOBAL_CAPS_NATS=true

Router main.py:
- Startup: initializes global_capabilities_client (NATS connect + first
  discovery). Falls back to local-only capabilities_client if unavailable.
- /infer: uses get_global_capabilities() for cross-node model pool
- Offload support: send_offload_request(node_id, type, payload) via NATS

Verified on NODA2:
- Global caps: 1 node, 14 models (NODA1 not yet deployed)
- Sofiia: cloud_grok → grok-4-1-fast-reasoning (OK)
- Helion: NCS → qwen3:14b local (OK)
- When NODA1 deploys NCS, its models appear automatically via NATS discovery

Made-with: Cursor
2026-02-27 02:26:12 -08:00
parent 89c3f2ac66
commit a92c424845
5 changed files with 575 additions and 62 deletions
--- a/services/router/model_select.py
+++ b/services/router/model_select.py
@@ -1,8 +1,10 @@
-"""NCS-first model selection for DAGI Router.
+"""NCS-first model selection for DAGI Router — multi-node aware.

 Resolves an agent's LLM profile into a concrete model+provider using live
-capabilities from the Node Capabilities Service (NCS).  Falls back to static
-router-config.yml when NCS is unavailable.
+capabilities from Node Capabilities Services across all nodes.
+Falls back to static router-config.yml when NCS is unavailable.
+
+Scaling: works with 1 node or 150+. No static node lists.
 """
 import logging
 import time
@@ -31,7 +33,10 @@ class ModelSelection:
    model_type: str       # llm | vision | code | …
    base_url: str = ""
    provider: str = ""    # cloud provider name if applicable
+    node: str = ""        # which node owns this model
+    local: bool = True    # is it on the current node?
    via_ncs: bool = False
+    via_nats: bool = False
    fallback_reason: str = ""
    caps_age_s: float = 0.0

@@ -44,13 +49,11 @@ def resolve_effective_profile(
    router_cfg: Dict[str, Any],
    request_model: Optional[str] = None,
 ) -> str:
-    """Determine the effective LLM profile name for a request."""
    if request_model:
        llm_profiles = router_cfg.get("llm_profiles", {})
        for pname, pcfg in llm_profiles.items():
            if pcfg.get("model") == request_model:
                return pname
-
    return agent_cfg.get("default_llm", "local_default_coder")


@@ -59,11 +62,6 @@ def profile_requirements(
    agent_cfg: Dict[str, Any],
    router_cfg: Dict[str, Any],
 ) -> ProfileRequirements:
-    """Build selection requirements from a profile definition.
-
-    If the profile has `selection_policy` in config, use it directly.
-    Otherwise, infer from the legacy `provider`/`model` fields.
-    """
    llm_profiles = router_cfg.get("llm_profiles", {})
    selection_policies = router_cfg.get("selection_policies", {})
    profile_cfg = llm_profiles.get(profile_name, {})
@@ -107,22 +105,23 @@ def profile_requirements(
    )


-# ── NCS-based selection ───────────────────────────────────────────────────────
+# ── Multi-node model selection ────────────────────────────────────────────────

 def select_best_model(
    reqs: ProfileRequirements,
    capabilities: Dict[str, Any],
 ) -> Optional[ModelSelection]:
-    """Choose the best served model from NCS capabilities.
+    """Choose the best served model from global (multi-node) capabilities.

-    Returns None if no suitable model found (caller should try static fallback).
+    Selection order:
+    1. Prefer list matches (local first, then remote)
+    2. Best candidate by size (local first, then remote)
+    3. None → caller should try static fallback
    """
    served = capabilities.get("served_models", [])
    if not served:
        return None

-    caps_age = time.time() - capabilities.get("_fetch_ts", time.time())
-
    search_types = [reqs.required_type]
    if reqs.required_type == "code":
        search_types.append("llm")
@@ -133,24 +132,30 @@ def select_best_model(
    if not candidates:
        return None

+    local_candidates = [m for m in candidates if m.get("local", False)]
+    remote_candidates = [m for m in candidates if not m.get("local", False)]
+
    prefer = reqs.prefer if reqs.prefer else []

    for pref in prefer:
        if pref == "*":
            break
-        for m in candidates:
+        for m in local_candidates:
            if pref == m.get("name") or pref in m.get("name", ""):
-                return _make_selection(m, capabilities, caps_age, reqs)
+                return _make_selection(m, capabilities)
+        for m in remote_candidates:
+            if pref == m.get("name") or pref in m.get("name", ""):
+                return _make_selection(m, capabilities)

-    if candidates:
-        best = _pick_best_candidate(candidates)
-        return _make_selection(best, capabilities, caps_age, reqs)
+    if local_candidates:
+        return _make_selection(_pick_best(local_candidates), capabilities)
+    if remote_candidates:
+        return _make_selection(_pick_best(remote_candidates), capabilities)

    return None


-def _pick_best_candidate(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
-    """Prefer running models, then largest by size_gb."""
+def _pick_best(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
    running = [m for m in candidates if m.get("running")]
    pool = running if running else candidates
    return max(pool, key=lambda m: m.get("size_gb", 0))
@@ -159,15 +164,11 @@ def _pick_best_candidate(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
 def _make_selection(
    model: Dict[str, Any],
    capabilities: Dict[str, Any],
-    caps_age: float,
-    reqs: ProfileRequirements,
 ) -> ModelSelection:
    runtime = model.get("runtime", "ollama")
+    is_local = model.get("local", False)
+    node = model.get("node", capabilities.get("local_node", ""))
    base_url = model.get("base_url", "")
-    if not base_url:
-        runtimes = capabilities.get("runtimes", {})
-        rt = runtimes.get(runtime, {})
-        base_url = rt.get("base_url", "")

    return ModelSelection(
        runtime=runtime,
@@ -175,18 +176,20 @@ def _make_selection(
        model_type=model.get("type", "llm"),
        base_url=base_url,
        provider="ollama" if runtime in ("ollama", "llama_server") else runtime,
+        node=node,
+        local=is_local,
        via_ncs=True,
-        caps_age_s=round(caps_age, 1),
+        via_nats=not is_local,
+        caps_age_s=model.get("node_age_s", 0.0),
    )


-# ── Static fallback (from router-config profiles) ────────────────────────────
+# ── Static fallback ──────────────────────────────────────────────────────────

 def static_fallback(
    profile_name: str,
    router_cfg: Dict[str, Any],
 ) -> Optional[ModelSelection]:
-    """Build a ModelSelection from the static llm_profiles config."""
    llm_profiles = router_cfg.get("llm_profiles", {})
    cfg = llm_profiles.get(profile_name, {})
    if not cfg:
@@ -200,6 +203,8 @@ def static_fallback(
        model_type="cloud_llm" if provider in CLOUD_PROVIDERS else "llm",
        base_url=cfg.get("base_url", ""),
        provider=provider,
+        node="local",
+        local=True,
        via_ncs=False,
        fallback_reason="NCS unavailable or no match; using static config",
    )
@@ -214,10 +219,7 @@ async def select_model_for_agent(
    capabilities: Optional[Dict[str, Any]],
    request_model: Optional[str] = None,
 ) -> ModelSelection:
-    """Full selection pipeline: resolve profile → NCS → static fallback.
-
-    This is the single entry point the router calls for each request.
-    """
+    """Full selection pipeline: resolve profile → NCS (multi-node) → static → hard default."""
    profile = resolve_effective_profile(
        agent_id, agent_cfg, router_cfg, request_model,
    )
@@ -238,36 +240,36 @@ async def select_model_for_agent(
        sel = select_best_model(reqs, capabilities)
        if sel:
            logger.info(
-                f"[select] agent={agent_id} profile={profile} → NCS "
-                f"runtime={sel.runtime} model={sel.name} caps_age={sel.caps_age_s}s"
+                f"[select] agent={agent_id} profile={profile} → "
+                f"{'NCS' if sel.local else 'REMOTE'} "
+                f"node={sel.node} runtime={sel.runtime} "
+                f"model={sel.name} caps_age={sel.caps_age_s}s"
            )
            return sel
        logger.warning(
-            f"[select] agent={agent_id} profile={profile} → NCS had no match "
-            f"for type={reqs.required_type}; trying static"
+            f"[select] agent={agent_id} profile={profile} → no match "
+            f"for type={reqs.required_type} across {capabilities.get('node_count', 0)} node(s)"
        )

    static = static_fallback(profile, router_cfg)
    if static:
        logger.info(
            f"[select] agent={agent_id} profile={profile} → static "
-            f"provider={static.provider} model={static.name} "
-            f"reason={static.fallback_reason}"
+            f"provider={static.provider} model={static.name}"
        )
        return static

    if reqs.fallback_profile and reqs.fallback_profile != profile:
        logger.warning(
            f"[select] agent={agent_id} profile={profile} not found → "
-            f"trying fallback_profile={reqs.fallback_profile}"
+            f"fallback_profile={reqs.fallback_profile}"
        )
        return await select_model_for_agent(
            agent_id, agent_cfg, router_cfg, capabilities,
        )

    logger.error(
-        f"[select] agent={agent_id} profile={profile} → ALL selection "
-        f"methods failed. Using hard default qwen3:14b"
+        f"[select] agent={agent_id} ALL methods failed → hard default"
    )
    return ModelSelection(
        runtime="ollama",
@@ -275,6 +277,8 @@ async def select_model_for_agent(
        model_type="llm",
        base_url="http://host.docker.internal:11434",
        provider="ollama",
+        node="local",
+        local=True,
        via_ncs=False,
        fallback_reason="all methods failed; hard default",
    )