P3.1: GPU/Queue-aware routing — NCS metrics + scoring-based model selection

NCS (services/node-capabilities/metrics.py):
- NodeLoad: inflight_jobs, queue_depth, concurrency_limit, estimated_wait_ms,
  cpu_load_1m, mem_pressure (macOS + Linux), rtt_ms_to_hub
- RuntimeLoad: per-runtime healthy, p50_ms, p95_ms from rolling 50-sample window
- POST /capabilities/report_latency for node-worker → NCS reporting
- NCS fetches worker metrics via NODE_WORKER_URL

Node Worker:
- GET /metrics endpoint (inflight, concurrency, latency buffers)
- Latency tracking per job type (llm/vision) with rolling buffer
- Fire-and-forget latency reporting to NCS after each successful job

Router (model_select v3):
- score_candidate(): wait + model_latency + cross_node_penalty + prefer_bonus
- LOCAL_THRESHOLD_MS=250: prefer local if within threshold of remote
- ModelSelection.score field for observability
- Structured [score] logs with chosen node, model, and score breakdown

Tests: 19 new (12 scoring + 7 NCS metrics), 36 total pass
Docs: ops/runbook_p3_1.md, ops/CHANGELOG_FABRIC.md

No breaking changes to JobRequest/JobResponse or capabilities schema.

Made-with: Cursor
This commit is contained in:
Apple
2026-02-27 02:55:44 -08:00
parent c4b94a327d
commit a605b8c43e
11 changed files with 706 additions and 40 deletions

View File

@@ -2,6 +2,6 @@ FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY main.py .
COPY . .
EXPOSE 8099
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8099"]

View File

@@ -4,10 +4,14 @@ import time
import logging
from typing import Any, Dict, List, Optional
from fastapi import FastAPI
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import httpx
from metrics import (
build_node_load, build_runtime_load, record_latency,
)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("node-capabilities")
@@ -195,20 +199,24 @@ async def _build_capabilities() -> Dict[str, Any]:
disk = _collect_disk_inventory()
served = _build_served_models(ollama, swapper, llama)
runtimes = {"ollama": ollama, "swapper": swapper}
if llama:
runtimes["llama_server"] = llama
node_load = await build_node_load()
runtime_load = await build_runtime_load(runtimes)
result = {
"node_id": NODE_ID,
"updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
"runtimes": {
"ollama": ollama,
"swapper": swapper,
},
"runtimes": runtimes,
"served_models": served,
"served_count": len(served),
"node_load": node_load,
"runtime_load": runtime_load,
"inventory_only": disk,
"inventory_count": len(disk),
}
if llama:
result["runtimes"]["llama_server"] = llama
_cache = result
_cache_ts = time.time()
@@ -240,6 +248,17 @@ async def capabilities_refresh():
return JSONResponse(content={"refreshed": True, "served_count": data["served_count"]})
@app.post("/capabilities/report_latency")
async def report_latency_endpoint(request: Request):
    """Ingest one latency sample reported by the node-worker.

    Expected body: {"runtime": str, "type": "llm"|"vision", "latency_ms": number}.
    Missing fields fall back to "ollama"/"llm"/0; non-positive or non-numeric
    latencies are ignored rather than recorded.

    Returns {"ok": True} on success, or a 400 JSON error for a malformed body
    (previously a malformed body raised and surfaced as a 500).
    """
    try:
        data = await request.json()
    except Exception:
        return JSONResponse(status_code=400, content={"ok": False, "error": "invalid JSON body"})
    if not isinstance(data, dict):
        # e.g. a bare list/number posted as JSON — nothing to record.
        return JSONResponse(status_code=400, content={"ok": False, "error": "expected JSON object"})
    runtime = data.get("runtime", "ollama")
    req_type = data.get("type", "llm")
    latency_ms = data.get("latency_ms", 0)
    # Guard the comparison: a string latency would raise TypeError here.
    if isinstance(latency_ms, (int, float)) and latency_ms > 0:
        record_latency(runtime, req_type, latency_ms)
    return {"ok": True}
# ── NATS request/reply (optional) ─────────────────────────────────────────────
ENABLE_NATS = os.getenv("ENABLE_NATS_CAPS", "false").lower() in ("true", "1", "yes")

View File

@@ -0,0 +1,164 @@
"""Runtime health and load metrics for NCS capabilities payload."""
import logging
import os
import platform
import subprocess
import time
from collections import deque
from typing import Any, Dict, List, Optional
import httpx
logger = logging.getLogger("ncs-metrics")
NODE_WORKER_URL = os.getenv("NODE_WORKER_URL", "http://127.0.0.1:8109")
_latency_buffer: Dict[str, deque] = {} # key: "runtime:type" → deque of (latency_ms, ts)
LATENCY_BUFFER_SIZE = 50
def record_latency(runtime: str, req_type: str, latency_ms: int):
key = f"{runtime}:{req_type}"
buf = _latency_buffer.setdefault(key, deque(maxlen=LATENCY_BUFFER_SIZE))
buf.append((latency_ms, time.time()))
def _percentile(values: List[int], p: float) -> int:
if not values:
return 0
s = sorted(values)
idx = int(len(s) * p / 100)
return s[min(idx, len(s) - 1)]
def get_latency_stats(runtime: str, req_type: str) -> Dict[str, Optional[int]]:
    """p50/p95 latency (ms) over samples from the last 10 minutes.

    Returns {"p50_ms": None, "p95_ms": None, "samples": 0} when there are no
    fresh samples for this (runtime, type) pair.
    """
    bucket = _latency_buffer.get(f"{runtime}:{req_type}")
    if not bucket:
        return {"p50_ms": None, "p95_ms": None, "samples": 0}
    cutoff = time.time() - 600  # 10-minute freshness window
    fresh = [lat for lat, ts in bucket if ts > cutoff]
    if not fresh:
        return {"p50_ms": None, "p95_ms": None, "samples": 0}
    return {
        "p50_ms": _percentile(fresh, 50),
        "p95_ms": _percentile(fresh, 95),
        "samples": len(fresh),
    }
async def fetch_worker_metrics() -> Dict[str, Any]:
    """Fetch inflight/concurrency from local node-worker /metrics.

    Best-effort: any transport error or non-200 response yields zeroed
    defaults so the capabilities payload can still be built.
    """
    fallback = {
        "inflight_jobs": 0,
        "concurrency_limit": 1,
        "queue_depth": 0,
        "last_latencies_llm": [],
        "last_latencies_vision": [],
    }
    try:
        async with httpx.AsyncClient(timeout=2) as client:
            resp = await client.get(f"{NODE_WORKER_URL}/metrics")
            if resp.status_code == 200:
                return resp.json()
    except Exception as e:
        logger.debug(f"Node-worker metrics unavailable: {e}")
    return fallback
def get_cpu_load() -> Optional[float]:
    """1-minute load average rounded to 2 decimals; None where unsupported."""
    try:
        one_minute, _, _ = os.getloadavg()
    except (OSError, AttributeError):
        # getloadavg is absent (e.g. Windows) or can fail at the OS level.
        return None
    return round(one_minute, 2)
def get_mem_pressure() -> Optional[str]:
    """Coarse memory-pressure level for this host.

    Returns one of "low" / "medium" / "high" / "critical", or None when the
    level cannot be determined (unsupported platform or probe failure).

    macOS: parses the system-wide level line from `memory_pressure -Q`
    (normal→"low", warn→"high", critical→"critical"); if that tool is
    unavailable, falls back to probing `vm_stat` purely as a liveness check
    and reports "low" when it runs. Linux: derives the level from the
    MemAvailable/MemTotal ratio in /proc/meminfo.
    """
    system = platform.system()
    if system == "Darwin":
        try:
            out = subprocess.check_output(
                ["memory_pressure", "-Q"], timeout=2, stderr=subprocess.DEVNULL
            ).decode()
            for line in out.splitlines():
                ll = line.lower()
                if "system-wide" in ll and "level" in ll:
                    if "critical" in ll:
                        return "critical"
                    if "warn" in ll:
                        return "high"
                    if "normal" in ll:
                        return "low"
            # Tool ran but no level line matched — assume no pressure.
            return "low"
        except Exception:
            # Fallback: the original captured vm_stat output without parsing
            # it, so only "did vm_stat run" matters — discard the output.
            try:
                subprocess.check_output(
                    ["vm_stat"], timeout=2, stderr=subprocess.DEVNULL
                )
                return "low"
            except Exception:
                return None
    if system == "Linux":
        try:
            info: Dict[str, int] = {}
            with open("/proc/meminfo") as f:
                for line in f:
                    parts = line.split(":")
                    if len(parts) == 2:
                        # Values look like "16384256 kB" — keep the number.
                        info[parts[0].strip()] = int(parts[1].strip().split()[0])
            total = info.get("MemTotal", 1)
            avail = info.get("MemAvailable", total)
            ratio = avail / total
            if ratio < 0.05:
                return "critical"
            if ratio < 0.15:
                return "high"
            if ratio < 0.30:
                return "medium"
            return "low"
        except Exception:
            return None
    # Unsupported platform (e.g. Windows).
    return None
async def build_node_load(worker_metrics: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Build NodeLoad object for capabilities payload.

    When *worker_metrics* is falsy it is fetched from the local node-worker.
    estimated_wait_ms is a rough queue estimate: each job beyond the
    concurrency limit costs one observed ollama/llm p50 (1500 ms default
    when no samples exist).
    """
    metrics = worker_metrics or await fetch_worker_metrics()
    inflight = metrics.get("inflight_jobs", 0)
    limit = metrics.get("concurrency_limit", 1)
    p50 = get_latency_stats("ollama", "llm")["p50_ms"] or 1500
    wait_ms = 0 if inflight < limit else (inflight - limit + 1) * p50
    return {
        "ts": int(time.time() * 1000),
        "inflight_jobs": inflight,
        "queue_depth": metrics.get("queue_depth", 0),
        "concurrency_limit": limit,
        "estimated_wait_ms": wait_ms,
        "cpu_load_1m": get_cpu_load(),
        "mem_pressure": get_mem_pressure(),
        "rtt_ms_to_hub": None,
    }
async def build_runtime_load(runtimes: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build RuntimeLoad list from collected runtimes."""
    loads: List[Dict[str, Any]] = []
    for name, data in runtimes.items():
        llm = get_latency_stats(name, "llm")
        vision = get_latency_stats(name, "vision")
        # Report stats for whichever job type has more recent samples.
        stats = vision if vision["samples"] > llm["samples"] else llm
        loads.append({
            "runtime": name,
            "healthy": data.get("status", "unknown") == "ok",
            "last_check_ms": int(time.time() * 1000),
            "p50_ms": stats["p50_ms"],
            "p95_ms": stats["p95_ms"],
        })
    return loads