P3.1: GPU/Queue-aware routing — NCS metrics + scoring-based model selection
NCS (services/node-capabilities/metrics.py): - NodeLoad: inflight_jobs, queue_depth, concurrency_limit, estimated_wait_ms, cpu_load_1m, mem_pressure (macOS + Linux), rtt_ms_to_hub - RuntimeLoad: per-runtime healthy, p50_ms, p95_ms from rolling 50-sample window - POST /capabilities/report_latency for node-worker → NCS reporting - NCS fetches worker metrics via NODE_WORKER_URL Node Worker: - GET /metrics endpoint (inflight, concurrency, latency buffers) - Latency tracking per job type (llm/vision) with rolling buffer - Fire-and-forget latency reporting to NCS after each successful job Router (model_select v3): - score_candidate(): wait + model_latency + cross_node_penalty + prefer_bonus - LOCAL_THRESHOLD_MS=250: prefer local if within threshold of remote - ModelSelection.score field for observability - Structured [score] logs with chosen node, model, and score breakdown Tests: 19 new (12 scoring + 7 NCS metrics), 36 total pass Docs: ops/runbook_p3_1.md, ops/CHANGELOG_FABRIC.md No breaking changes to JobRequest/JobResponse or capabilities schema. Made-with: Cursor
This commit is contained in:
@@ -26,6 +26,9 @@ class ProfileRequirements:
|
||||
constraints: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
# Tie-break margin for local vs remote candidates: a local model is chosen
# whenever its score is within this many milliseconds of the best remote score.
LOCAL_THRESHOLD_MS = 250
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelSelection:
|
||||
runtime: str # ollama | swapper | llama_server | cloud
|
||||
@@ -39,6 +42,7 @@ class ModelSelection:
|
||||
via_nats: bool = False
|
||||
fallback_reason: str = ""
|
||||
caps_age_s: float = 0.0
|
||||
score: int = 0 # lower = faster
|
||||
|
||||
|
||||
# ── Profile resolution ────────────────────────────────────────────────────────
|
||||
@@ -105,6 +109,56 @@ def profile_requirements(
|
||||
)
|
||||
|
||||
|
||||
# ── Scoring ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def score_candidate(
    model: Dict[str, Any],
    capabilities: Dict[str, Any],
    prefer: List[str],
    rtt_hint_ms: int = 60,
) -> int:
    """Score one served-model candidate; lower score = better candidate.

    Formula: wait + model_latency + cross_node_penalty + prefer_bonus

    Args:
        model: served-model entry; reads keys "local", "node", "name",
            "runtime", "model_p50_ms" (schema per the capabilities
            snapshot — confirm against NCS payloads).
        capabilities: global capabilities snapshot; reads "node_load"
            (local node), "nodes" (remote nodes keyed by arbitrary ids),
            and "runtime_load" (per-runtime latency stats).
        prefer: ordered model-name preference list. A "*" entry stops
            matching; earlier matches earn a larger (more negative) bonus.
        rtt_hint_ms: fallback RTT estimate when a remote node does not
            report "rtt_ms_to_hub".

    Returns:
        Estimated-cost score in milliseconds-ish units; lower = faster.
    """
    is_local = model.get("local", False)
    node_id = model.get("node", "")

    # Load stats: the local node's load sits at the snapshot top level;
    # for a remote candidate, find the owning node's entry by node_id.
    node_load = capabilities.get("node_load", {})
    if not is_local:
        for ndata in capabilities.get("nodes", {}).values():
            if ndata.get("node_id") == node_id:
                node_load = ndata.get("node_load", {})
                break

    # `or 0` (not a .get default) also guards against an explicit None
    # in the payload, matching the other lookups below.
    wait = node_load.get("estimated_wait_ms") or 0

    # Model latency: per-model p50 → per-runtime p50 → pessimistic default.
    model_lat = model.get("model_p50_ms") or 0
    if not model_lat:
        rt = model.get("runtime", "ollama")
        for rl in capabilities.get("runtime_load", []):
            if rl.get("runtime") == rt:
                model_lat = rl.get("p50_ms") or 0
                break
    if not model_lat:
        model_lat = 1500  # no latency data at all: assume a slow/cold model

    # Cross-node penalty: charge two RTTs (request + response) for remote.
    rtt = 0 if is_local else (node_load.get("rtt_ms_to_hub") or rtt_hint_ms or 60)
    cross_penalty = 0 if is_local else (rtt * 2)

    # Prefer bonus: earlier prefer entries earn bigger bonuses; "*" means
    # "anything from here on". Clamped to <= 0 so an index past 10 in a
    # long prefer list never flips the bonus into a penalty.
    prefer_bonus = 0
    name = model.get("name", "")
    for i, pref in enumerate(prefer):
        if pref == "*":
            break
        if pref == name or pref in name:
            prefer_bonus = min(0, -(1000 - i * 100))
            break

    return wait + model_lat + cross_penalty + prefer_bonus
|
||||
|
||||
|
||||
# ── Multi-node model selection ────────────────────────────────────────────────
|
||||
|
||||
def select_best_model(
|
||||
@@ -114,10 +168,8 @@ def select_best_model(
|
||||
) -> Optional[ModelSelection]:
|
||||
"""Choose the best served model from global (multi-node) capabilities.
|
||||
|
||||
Selection order:
|
||||
1. Prefer list matches (local first, then remote)
|
||||
2. Best candidate by size (local first, then remote)
|
||||
3. None → caller should try static fallback
|
||||
Uses scoring: wait + model_latency + cross_node_rtt + prefer_bonus.
|
||||
If best local score <= best remote score + LOCAL_THRESHOLD_MS, prefer local.
|
||||
|
||||
exclude_nodes: set of node_ids to skip (e.g. circuit-broken nodes).
|
||||
"""
|
||||
@@ -140,35 +192,34 @@ def select_best_model(
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
local_candidates = [m for m in candidates if m.get("local", False)]
|
||||
remote_candidates = [m for m in candidates if not m.get("local", False)]
|
||||
|
||||
prefer = reqs.prefer if reqs.prefer else []
|
||||
scored = [(score_candidate(m, capabilities, prefer), m) for m in candidates]
|
||||
scored.sort(key=lambda x: x[0])
|
||||
|
||||
for pref in prefer:
|
||||
if pref == "*":
|
||||
break
|
||||
for m in local_candidates:
|
||||
if pref == m.get("name") or pref in m.get("name", ""):
|
||||
return _make_selection(m, capabilities)
|
||||
for m in remote_candidates:
|
||||
if pref == m.get("name") or pref in m.get("name", ""):
|
||||
return _make_selection(m, capabilities)
|
||||
local_scored = [(s, m) for s, m in scored if m.get("local", False)]
|
||||
remote_scored = [(s, m) for s, m in scored if not m.get("local", False)]
|
||||
|
||||
if local_candidates:
|
||||
return _make_selection(_pick_best(local_candidates), capabilities)
|
||||
if remote_candidates:
|
||||
return _make_selection(_pick_best(remote_candidates), capabilities)
|
||||
best_local = local_scored[0] if local_scored else None
|
||||
best_remote = remote_scored[0] if remote_scored else None
|
||||
|
||||
if best_local and best_remote:
|
||||
if best_local[0] <= best_remote[0] + LOCAL_THRESHOLD_MS:
|
||||
sel = _make_selection(best_local[1], capabilities)
|
||||
sel.score = best_local[0]
|
||||
return sel
|
||||
sel = _make_selection(best_remote[1], capabilities)
|
||||
sel.score = best_remote[0]
|
||||
return sel
|
||||
|
||||
winner = (best_local or best_remote)
|
||||
if winner:
|
||||
sel = _make_selection(winner[1], capabilities)
|
||||
sel.score = winner[0]
|
||||
return sel
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _pick_best(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
|
||||
running = [m for m in candidates if m.get("running")]
|
||||
pool = running if running else candidates
|
||||
return max(pool, key=lambda m: m.get("size_gb", 0))
|
||||
|
||||
|
||||
def _make_selection(
|
||||
model: Dict[str, Any],
|
||||
capabilities: Dict[str, Any],
|
||||
@@ -269,10 +320,9 @@ async def select_model_for_agent(
|
||||
)
|
||||
if sel:
|
||||
logger.info(
|
||||
f"[select] agent={agent_id} profile={profile} → "
|
||||
f"{'LOCAL' if sel.local else 'REMOTE'} "
|
||||
f"node={sel.node} runtime={sel.runtime} "
|
||||
f"model={sel.name} caps_age={sel.caps_age_s}s"
|
||||
f"[score] agent={agent_id} type={reqs.required_type} "
|
||||
f"chosen={'LOCAL' if sel.local else 'REMOTE'}:{sel.node}/{sel.name} "
|
||||
f"score={sel.score} caps_age={sel.caps_age_s}s"
|
||||
f"{' (force_local)' if force_local else ''}"
|
||||
f"{' (excluded: ' + ','.join(excl) + ')' if excl else ''}"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user