P3.1: GPU/Queue-aware routing — NCS metrics + scoring-based model selection

NCS (services/node-capabilities/metrics.py):
- NodeLoad: inflight_jobs, queue_depth, concurrency_limit, estimated_wait_ms,
  cpu_load_1m, mem_pressure (macOS + Linux), rtt_ms_to_hub
- RuntimeLoad: per-runtime healthy, p50_ms, p95_ms from rolling 50-sample window
- POST /capabilities/report_latency for node-worker → NCS reporting (sketch below)
- NCS fetches worker metrics via NODE_WORKER_URL
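
A minimal sketch of reporting a sample by hand (assumes the NCS on its
default port 8099; payload fields match report_latency_endpoint in the
diff below):

    import httpx

    # Report a 1320 ms LLM completion on the ollama runtime to the NCS.
    httpx.post(
        "http://127.0.0.1:8099/capabilities/report_latency",
        json={"runtime": "ollama", "type": "llm", "latency_ms": 1320},
        timeout=1,
    )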

Node Worker:
- GET /metrics endpoint (inflight, concurrency, latency buffers; sample below)
- Latency tracking per job type (llm/vision) with rolling buffer
- Fire-and-forget latency reporting to NCS after each successful job
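
A sketch of what the NCS sees when it polls the worker (8109 is the
default NODE_WORKER_URL port; values illustrative):

    import httpx

    m = httpx.get("http://127.0.0.1:8109/metrics", timeout=2).json()
    # → {"inflight_jobs": 2, "concurrency_limit": 4, "queue_depth": 0,
    #    "last_latencies_llm": [1180, 1342], "last_latencies_vision": []}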

Router (model_select v3):
- score_candidate(): wait + model_latency + cross_node_penalty + prefer_bonus
- LOCAL_THRESHOLD_MS=250: prefer local if within threshold of remote (worked example below)
- ModelSelection.score field for observability
- Structured [score] logs with chosen node, model, and score breakdown
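
Worked example of the score (all numbers illustrative):

    # local:  wait=400ms, model_p50=1200ms, rtt=0    → 400 + 1200 + 0   = 1600
    # remote: wait=0ms,   model_p50=900ms,  rtt=60ms → 0 + 900 + 60 * 2 = 1020
    # 1600 > 1020 + LOCAL_THRESHOLD_MS (250), so the remote node wins;
    # had the local score been <= 1270, local would have been preferred.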

Tests: 19 new (12 scoring + 7 NCS metrics), 36 total pass
Docs: ops/runbook_p3_1.md, ops/CHANGELOG_FABRIC.md

No breaking changes to JobRequest/JobResponse or capabilities schema.

Made-with: Cursor
Apple · 2026-02-27 02:55:44 -08:00
parent c4b94a327d
commit a605b8c43e
11 changed files with 706 additions and 40 deletions


@@ -2,6 +2,6 @@ FROM python:3.11-slim
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-COPY main.py .
+COPY . .
 EXPOSE 8099
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8099"]


@@ -4,10 +4,14 @@ import time
 import logging
 from typing import Any, Dict, List, Optional
-from fastapi import FastAPI
+from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 import httpx
+from metrics import (
+    build_node_load, build_runtime_load, record_latency,
+)
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("node-capabilities")
@@ -195,20 +199,24 @@ async def _build_capabilities() -> Dict[str, Any]:
     disk = _collect_disk_inventory()
     served = _build_served_models(ollama, swapper, llama)
+    runtimes = {"ollama": ollama, "swapper": swapper}
+    if llama:
+        runtimes["llama_server"] = llama
+    node_load = await build_node_load()
+    runtime_load = await build_runtime_load(runtimes)
     result = {
         "node_id": NODE_ID,
         "updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
-        "runtimes": {
-            "ollama": ollama,
-            "swapper": swapper,
-        },
+        "runtimes": runtimes,
         "served_models": served,
         "served_count": len(served),
+        "node_load": node_load,
+        "runtime_load": runtime_load,
         "inventory_only": disk,
         "inventory_count": len(disk),
     }
-    if llama:
-        result["runtimes"]["llama_server"] = llama
     _cache = result
     _cache_ts = time.time()
@@ -240,6 +248,17 @@ async def capabilities_refresh():
     return JSONResponse(content={"refreshed": True, "served_count": data["served_count"]})
 
+
+@app.post("/capabilities/report_latency")
+async def report_latency_endpoint(request: Request):
+    data = await request.json()
+    runtime = data.get("runtime", "ollama")
+    req_type = data.get("type", "llm")
+    latency_ms = data.get("latency_ms", 0)
+    if latency_ms > 0:
+        record_latency(runtime, req_type, latency_ms)
+    return {"ok": True}
+
+
 # ── NATS request/reply (optional) ─────────────────────────────────────────────
 ENABLE_NATS = os.getenv("ENABLE_NATS_CAPS", "false").lower() in ("true", "1", "yes")


@@ -0,0 +1,164 @@
+"""Runtime health and load metrics for NCS capabilities payload."""
+import logging
+import os
+import platform
+import subprocess
+import time
+from collections import deque
+from typing import Any, Dict, List, Optional
+
+import httpx
+
+logger = logging.getLogger("ncs-metrics")
+
+NODE_WORKER_URL = os.getenv("NODE_WORKER_URL", "http://127.0.0.1:8109")
+
+_latency_buffer: Dict[str, deque] = {}  # key: "runtime:type" → deque of (latency_ms, ts)
+LATENCY_BUFFER_SIZE = 50
+
+
+def record_latency(runtime: str, req_type: str, latency_ms: int):
+    key = f"{runtime}:{req_type}"
+    buf = _latency_buffer.setdefault(key, deque(maxlen=LATENCY_BUFFER_SIZE))
+    buf.append((latency_ms, time.time()))
+
+
+def _percentile(values: List[int], p: float) -> int:
+    if not values:
+        return 0
+    s = sorted(values)
+    idx = int(len(s) * p / 100)
+    return s[min(idx, len(s) - 1)]
+
+
+def get_latency_stats(runtime: str, req_type: str) -> Dict[str, Optional[int]]:
+    key = f"{runtime}:{req_type}"
+    buf = _latency_buffer.get(key)
+    if not buf or len(buf) == 0:
+        return {"p50_ms": None, "p95_ms": None, "samples": 0}
+    cutoff = time.time() - 600
+    recent = [lat for lat, ts in buf if ts > cutoff]
+    if not recent:
+        return {"p50_ms": None, "p95_ms": None, "samples": 0}
+    return {
+        "p50_ms": _percentile(recent, 50),
+        "p95_ms": _percentile(recent, 95),
+        "samples": len(recent),
+    }
+
+
+async def fetch_worker_metrics() -> Dict[str, Any]:
+    """Fetch inflight/concurrency from local node-worker /metrics."""
+    defaults = {"inflight_jobs": 0, "concurrency_limit": 1, "queue_depth": 0,
+                "last_latencies_llm": [], "last_latencies_vision": []}
+    try:
+        async with httpx.AsyncClient(timeout=2) as c:
+            r = await c.get(f"{NODE_WORKER_URL}/metrics")
+            if r.status_code == 200:
+                return r.json()
+    except Exception as e:
+        logger.debug(f"Node-worker metrics unavailable: {e}")
+    return defaults
+
+
+def get_cpu_load() -> Optional[float]:
+    try:
+        return round(os.getloadavg()[0], 2)
+    except (OSError, AttributeError):
+        return None
+
+
+def get_mem_pressure() -> Optional[str]:
+    """macOS: use memory_pressure -Q or vm_stat. Linux: /proc/meminfo."""
+    if platform.system() == "Darwin":
+        try:
+            out = subprocess.check_output(
+                ["memory_pressure", "-Q"], timeout=2, stderr=subprocess.DEVNULL
+            ).decode()
+            for line in out.splitlines():
+                ll = line.lower()
+                if "system-wide" in ll and "level" in ll:
+                    if "critical" in ll:
+                        return "critical"
+                    if "warn" in ll:
+                        return "high"
+                    if "normal" in ll:
+                        return "low"
+            return "low"
+        except Exception:
+            try:
+                out = subprocess.check_output(
+                    ["vm_stat"], timeout=2, stderr=subprocess.DEVNULL
+                ).decode()
+                return "low"
+            except Exception:
+                return None
+    elif platform.system() == "Linux":
+        try:
+            with open("/proc/meminfo") as f:
+                info = {}
+                for line in f:
+                    parts = line.split(":")
+                    if len(parts) == 2:
+                        info[parts[0].strip()] = int(parts[1].strip().split()[0])
+            total = info.get("MemTotal", 1)
+            avail = info.get("MemAvailable", total)
+            ratio = avail / total
+            if ratio < 0.05:
+                return "critical"
+            elif ratio < 0.15:
+                return "high"
+            elif ratio < 0.30:
+                return "medium"
+            return "low"
+        except Exception:
+            return None
+    return None
+
+
+async def build_node_load(worker_metrics: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Build NodeLoad object for capabilities payload."""
+    wm = worker_metrics or await fetch_worker_metrics()
+    inflight = wm.get("inflight_jobs", 0)
+    concurrency = wm.get("concurrency_limit", 1)
+    queue_depth = wm.get("queue_depth", 0)
+    llm_stats = get_latency_stats("ollama", "llm")
+    p50 = llm_stats["p50_ms"] or 1500
+    if inflight < concurrency:
+        estimated_wait = 0
+    else:
+        estimated_wait = (inflight - concurrency + 1) * p50
+    return {
+        "ts": int(time.time() * 1000),
+        "inflight_jobs": inflight,
+        "queue_depth": queue_depth,
+        "concurrency_limit": concurrency,
+        "estimated_wait_ms": estimated_wait,
+        "cpu_load_1m": get_cpu_load(),
+        "mem_pressure": get_mem_pressure(),
+        "rtt_ms_to_hub": None,
+    }
+
+
+async def build_runtime_load(runtimes: Dict[str, Any]) -> List[Dict[str, Any]]:
+    """Build RuntimeLoad list from collected runtimes."""
+    result = []
+    for rt_name, rt_data in runtimes.items():
+        status = rt_data.get("status", "unknown")
+        healthy = status == "ok"
+        llm_stats = get_latency_stats(rt_name, "llm")
+        vis_stats = get_latency_stats(rt_name, "vision")
+        best_stats = vis_stats if vis_stats["samples"] > llm_stats["samples"] else llm_stats
+        result.append({
+            "runtime": rt_name,
+            "healthy": healthy,
+            "last_check_ms": int(time.time() * 1000),
+            "p50_ms": best_stats["p50_ms"],
+            "p95_ms": best_stats["p95_ms"],
+        })
+    return result
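
A quick sanity check of the rolling window (hypothetical REPL session,
using only the helpers defined in metrics.py above):

    from metrics import record_latency, get_latency_stats

    for ms in (900, 1100, 1500, 2400):
        record_latency("ollama", "llm", ms)
    get_latency_stats("ollama", "llm")
    # → {"p50_ms": 1500, "p95_ms": 2400, "samples": 4}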


@@ -26,6 +26,11 @@ async def healthz():
     }
 
+@app.get("/metrics")
+async def metrics():
+    return worker.get_metrics()
+
+
 @app.on_event("startup")
 async def startup():
     global _nats_client

@@ -2,6 +2,7 @@
 import asyncio
 import json
 import logging
+import os
 import time
 from typing import Any, Dict
@@ -15,6 +16,10 @@ logger = logging.getLogger("node-worker")
 _idem = IdempotencyStore()
 _semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)
 _nats_client = None
+_inflight_count: int = 0
+_latencies_llm: list = []
+_latencies_vision: list = []
+_LATENCY_BUFFER = 50
 
 async def start(nats_client):
@@ -88,12 +93,25 @@ async def _handle_request(msg):
             await _reply(msg, resp)
             return
-        async with _semaphore:
-            resp = await _execute(job, remaining)
+        global _inflight_count
+        _inflight_count += 1
+        try:
+            async with _semaphore:
+                resp = await _execute(job, remaining)
+        finally:
+            _inflight_count -= 1
         _idem.put(idem_key, resp)
         _idem.complete_inflight(idem_key, resp)
         resp.latency_ms = int((time.time() - t0) * 1000)
+        if resp.status == "ok" and resp.latency_ms > 0:
+            buf = _latencies_llm if job.required_type in ("llm", "code") else _latencies_vision
+            buf.append(resp.latency_ms)
+            if len(buf) > _LATENCY_BUFFER:
+                del buf[:len(buf) - _LATENCY_BUFFER]
+            _report_latency_async(job.required_type, resp.provider or "ollama", resp.latency_ms)
         await _reply(msg, resp)
     except Exception as e:
@@ -179,6 +197,37 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
     )
 
+
+def get_metrics() -> Dict[str, Any]:
+    return {
+        "inflight_jobs": _inflight_count,
+        "concurrency_limit": config.MAX_CONCURRENCY,
+        "queue_depth": 0,
+        "last_latencies_llm": list(_latencies_llm[-_LATENCY_BUFFER:]),
+        "last_latencies_vision": list(_latencies_vision[-_LATENCY_BUFFER:]),
+    }
+
+
+def _report_latency_async(req_type: str, runtime: str, latency_ms: int):
+    """Fire-and-forget latency report to local NCS."""
+    import httpx as _httpx
+
+    ncs_url = os.getenv("NCS_REPORT_URL", "http://node-capabilities:8099")
+
+    async def _do():
+        try:
+            async with _httpx.AsyncClient(timeout=1) as c:
+                await c.post(f"{ncs_url}/capabilities/report_latency", json={
+                    "runtime": runtime, "type": req_type, "latency_ms": latency_ms,
+                })
+        except Exception:
+            pass
+
+    try:
+        asyncio.get_event_loop().create_task(_do())
+    except RuntimeError:
+        pass
+
+
 async def _reply(msg, resp: JobResponse):
     if msg.reply:
         await _nats_client.publish(msg.reply, resp.model_dump_json().encode())


@@ -26,6 +26,9 @@ class ProfileRequirements:
     constraints: Dict[str, Any] = field(default_factory=dict)
 
+LOCAL_THRESHOLD_MS = 250
+
+
 @dataclass
 class ModelSelection:
     runtime: str  # ollama | swapper | llama_server | cloud
@@ -39,6 +42,7 @@ class ModelSelection:
     via_nats: bool = False
     fallback_reason: str = ""
     caps_age_s: float = 0.0
+    score: int = 0  # lower = faster
 
 # ── Profile resolution ────────────────────────────────────────────────────────
@@ -105,6 +109,56 @@ def profile_requirements(
     )
 
+# ── Scoring ───────────────────────────────────────────────────────────────────
+def score_candidate(
+    model: Dict[str, Any],
+    capabilities: Dict[str, Any],
+    prefer: List[str],
+    rtt_hint_ms: int = 60,
+) -> int:
+    """Lower score = better candidate.
+
+    Formula: wait + model_latency + cross_node_penalty + prefer_bonus
+    """
+    is_local = model.get("local", False)
+    node_id = model.get("node", "")
+    node_load = capabilities.get("node_load", {})
+    if not is_local:
+        for ndata in capabilities.get("nodes", {}).values():
+            if ndata.get("node_id") == node_id:
+                node_load = ndata.get("node_load", {})
+                break
+    wait = node_load.get("estimated_wait_ms", 0)
+    model_lat = model.get("model_p50_ms") or 0
+    if not model_lat:
+        runtime_loads = capabilities.get("runtime_load", [])
+        rt = model.get("runtime", "ollama")
+        for rl in runtime_loads:
+            if rl.get("runtime") == rt:
+                model_lat = rl.get("p50_ms") or 0
+                break
+    if not model_lat:
+        model_lat = 1500
+    rtt = 0 if is_local else (node_load.get("rtt_ms_to_hub") or rtt_hint_ms or 60)
+    cross_penalty = 0 if is_local else (rtt * 2)
+    prefer_bonus = 0
+    name = model.get("name", "")
+    for i, pref in enumerate(prefer):
+        if pref == "*":
+            break
+        if pref == name or pref in name:
+            prefer_bonus = -(1000 - i * 100)
+            break
+    return wait + model_lat + cross_penalty + prefer_bonus
+
+
 # ── Multi-node model selection ────────────────────────────────────────────────
 def select_best_model(
@@ -114,10 +168,8 @@ def select_best_model(
 ) -> Optional[ModelSelection]:
     """Choose the best served model from global (multi-node) capabilities.
 
-    Selection order:
-    1. Prefer list matches (local first, then remote)
-    2. Best candidate by size (local first, then remote)
-    3. None → caller should try static fallback
+    Uses scoring: wait + model_latency + cross_node_rtt + prefer_bonus.
+    If best local score <= best remote score + LOCAL_THRESHOLD_MS, prefer local.
 
     exclude_nodes: set of node_ids to skip (e.g. circuit-broken nodes).
     """
@@ -140,35 +192,34 @@ def select_best_model(
     if not candidates:
         return None
 
-    local_candidates = [m for m in candidates if m.get("local", False)]
-    remote_candidates = [m for m in candidates if not m.get("local", False)]
     prefer = reqs.prefer if reqs.prefer else []
+    scored = [(score_candidate(m, capabilities, prefer), m) for m in candidates]
+    scored.sort(key=lambda x: x[0])
 
-    for pref in prefer:
-        if pref == "*":
-            break
-        for m in local_candidates:
-            if pref == m.get("name") or pref in m.get("name", ""):
-                return _make_selection(m, capabilities)
-        for m in remote_candidates:
-            if pref == m.get("name") or pref in m.get("name", ""):
-                return _make_selection(m, capabilities)
+    local_scored = [(s, m) for s, m in scored if m.get("local", False)]
+    remote_scored = [(s, m) for s, m in scored if not m.get("local", False)]
 
-    if local_candidates:
-        return _make_selection(_pick_best(local_candidates), capabilities)
-    if remote_candidates:
-        return _make_selection(_pick_best(remote_candidates), capabilities)
+    best_local = local_scored[0] if local_scored else None
+    best_remote = remote_scored[0] if remote_scored else None
+    if best_local and best_remote:
+        if best_local[0] <= best_remote[0] + LOCAL_THRESHOLD_MS:
+            sel = _make_selection(best_local[1], capabilities)
+            sel.score = best_local[0]
+            return sel
+        sel = _make_selection(best_remote[1], capabilities)
+        sel.score = best_remote[0]
+        return sel
+    winner = (best_local or best_remote)
+    if winner:
+        sel = _make_selection(winner[1], capabilities)
+        sel.score = winner[0]
+        return sel
     return None
 
-def _pick_best(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
-    running = [m for m in candidates if m.get("running")]
-    pool = running if running else candidates
-    return max(pool, key=lambda m: m.get("size_gb", 0))
 
 def _make_selection(
     model: Dict[str, Any],
     capabilities: Dict[str, Any],
@@ -269,10 +320,9 @@ async def select_model_for_agent(
     )
     if sel:
        logger.info(
-            f"[select] agent={agent_id} profile={profile} "
-            f"{'LOCAL' if sel.local else 'REMOTE'} "
-            f"node={sel.node} runtime={sel.runtime} "
-            f"model={sel.name} caps_age={sel.caps_age_s}s"
+            f"[score] agent={agent_id} type={reqs.required_type} "
+            f"chosen={'LOCAL' if sel.local else 'REMOTE'}:{sel.node}/{sel.name} "
+            f"score={sel.score} caps_age={sel.caps_age_s}s"
             f"{' (force_local)' if force_local else ''}"
            f"{' (excluded: ' + ','.join(excl) + ')' if excl else ''}"
        )
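
With the f-string above, a selection log line renders roughly as follows
(node and model names illustrative):

    [score] agent=coder type=llm chosen=LOCAL:mac-studio/qwen2.5-coder:32b score=1240 caps_age=3.2s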