P3.1: GPU/Queue-aware routing — NCS metrics + scoring-based model selection

NCS (services/node-capabilities/metrics.py):
- NodeLoad: inflight_jobs, queue_depth, concurrency_limit, estimated_wait_ms,
  cpu_load_1m, mem_pressure (macOS + Linux), rtt_ms_to_hub
- RuntimeLoad: per-runtime healthy, p50_ms, p95_ms from rolling 50-sample window
- POST /capabilities/report_latency for node-worker → NCS reporting
- NCS fetches worker metrics via NODE_WORKER_URL

Node Worker:
- GET /metrics endpoint (inflight, concurrency, latency buffers)
- Latency tracking per job type (llm/vision) with rolling buffer
- Fire-and-forget latency reporting to NCS after each successful job

Router (model_select v3):
- score_candidate(): wait + model_latency + cross_node_penalty + prefer_bonus
- LOCAL_THRESHOLD_MS=250: prefer the local candidate when its score is within 250 ms of the remote candidate's
- ModelSelection.score field for observability
- Structured [score] logs with chosen node, model, and score breakdown

Tests: 19 new (12 scoring + 7 NCS metrics); 36 total, all passing
Docs: ops/runbook_p3_1.md, ops/CHANGELOG_FABRIC.md

No breaking changes to JobRequest/JobResponse or capabilities schema.

Made-with: Cursor
This commit is contained in:
Apple
2026-02-27 02:55:44 -08:00
parent c4b94a327d
commit a605b8c43e
11 changed files with 706 additions and 40 deletions

View File

@@ -26,6 +26,11 @@ async def healthz():
}
@app.get("/metrics")
async def metrics():
    """Serve the worker's current metrics snapshot over HTTP.

    The payload is whatever ``worker.get_metrics()`` returns; this route adds
    no processing of its own.
    """
    snapshot = worker.get_metrics()
    return snapshot
@app.on_event("startup")
async def startup():
global _nats_client

View File

@@ -2,6 +2,7 @@
import asyncio
import json
import logging
import os
import time
from typing import Any, Dict
@@ -15,6 +16,10 @@ logger = logging.getLogger("node-worker")
# Store of job responses keyed by idempotency key (see the `put` /
# `complete_inflight` calls in `_handle_request`).
_idem = IdempotencyStore()
# Caps how many jobs may run `_execute` at the same time.
_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)
# NATS connection used to publish replies; None until assigned elsewhere
# (presumably by `start()` — confirm against the full file).
_nats_client = None
# Number of jobs that have entered the execution section of `_handle_request`
# and not yet left it. It is incremented *before* the semaphore is acquired,
# so it counts jobs waiting for a slot as well as jobs actually running.
_inflight_count: int = 0
# Recent end-to-end latencies (ms) of successful jobs whose required_type is
# "llm" or "code".
_latencies_llm: list = []
# Recent end-to-end latencies (ms) of successful jobs of every other type.
_latencies_vision: list = []
# Maximum number of samples kept in each latency list; older entries are
# dropped once a list grows past this size.
_LATENCY_BUFFER = 50
async def start(nats_client):
@@ -88,12 +93,25 @@ async def _handle_request(msg):
await _reply(msg, resp)
return
async with _semaphore:
resp = await _execute(job, remaining)
global _inflight_count
_inflight_count += 1
try:
async with _semaphore:
resp = await _execute(job, remaining)
finally:
_inflight_count -= 1
_idem.put(idem_key, resp)
_idem.complete_inflight(idem_key, resp)
resp.latency_ms = int((time.time() - t0) * 1000)
if resp.status == "ok" and resp.latency_ms > 0:
buf = _latencies_llm if job.required_type in ("llm", "code") else _latencies_vision
buf.append(resp.latency_ms)
if len(buf) > _LATENCY_BUFFER:
del buf[:len(buf) - _LATENCY_BUFFER]
_report_latency_async(job.required_type, resp.provider or "ollama", resp.latency_ms)
await _reply(msg, resp)
except Exception as e:
@@ -179,6 +197,37 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
)
def get_metrics() -> Dict[str, Any]:
    """Return a point-in-time snapshot of this worker's load and latency.

    Returns:
        A dict with the keys:

        * ``inflight_jobs`` - current value of the module-level in-flight
          counter.
        * ``concurrency_limit`` - ``config.MAX_CONCURRENCY``.
        * ``queue_depth`` - always ``0``; the worker does not track a
          separate queue length.
        * ``last_latencies_llm`` / ``last_latencies_vision`` - copies of the
          most recent latency samples (at most ``_LATENCY_BUFFER`` each).
    """
    # Slicing already yields fresh lists, so callers can never mutate the
    # module-level buffers through the returned snapshot.
    recent_llm = _latencies_llm[-_LATENCY_BUFFER:]
    recent_vision = _latencies_vision[-_LATENCY_BUFFER:]

    snapshot: Dict[str, Any] = {}
    snapshot["inflight_jobs"] = _inflight_count
    snapshot["concurrency_limit"] = config.MAX_CONCURRENCY
    snapshot["queue_depth"] = 0
    snapshot["last_latencies_llm"] = recent_llm
    snapshot["last_latencies_vision"] = recent_vision
    return snapshot
def _report_latency_async(req_type: str, runtime: str, latency_ms: int):
"""Fire-and-forget latency report to local NCS."""
import httpx as _httpx
ncs_url = os.getenv("NCS_REPORT_URL", "http://node-capabilities:8099")
async def _do():
try:
async with _httpx.AsyncClient(timeout=1) as c:
await c.post(f"{ncs_url}/capabilities/report_latency", json={
"runtime": runtime, "type": req_type, "latency_ms": latency_ms,
})
except Exception:
pass
try:
asyncio.get_event_loop().create_task(_do())
except RuntimeError:
pass
async def _reply(msg, resp: JobResponse):
    """Publish ``resp`` as JSON bytes to the reply subject of ``msg``.

    Nothing is published when ``msg.reply`` is empty or unset.
    """
    reply_subject = msg.reply
    if reply_subject:
        publish = _nats_client.publish
        payload = resp.model_dump_json().encode()
        await publish(reply_subject, payload)