P3.1: GPU/Queue-aware routing — NCS metrics + scoring-based model selection
NCS (services/node-capabilities/metrics.py):
- NodeLoad: inflight_jobs, queue_depth, concurrency_limit, estimated_wait_ms, cpu_load_1m, mem_pressure (macOS + Linux), rtt_ms_to_hub
- RuntimeLoad: per-runtime healthy, p50_ms, p95_ms from rolling 50-sample window
- POST /capabilities/report_latency for node-worker → NCS reporting
- NCS fetches worker metrics via NODE_WORKER_URL

Node Worker:
- GET /metrics endpoint (inflight, concurrency, latency buffers)
- Latency tracking per job type (llm/vision) with rolling buffer
- Fire-and-forget latency reporting to NCS after each successful job

Router (model_select v3):
- score_candidate(): wait + model_latency + cross_node_penalty + prefer_bonus
- LOCAL_THRESHOLD_MS=250: prefer local if within threshold of remote
- ModelSelection.score field for observability
- Structured [score] logs with chosen node, model, and score breakdown

Tests: 19 new (12 scoring + 7 NCS metrics), 36 total pass
Docs: ops/runbook_p3_1.md, ops/CHANGELOG_FABRIC.md
No breaking changes to JobRequest/JobResponse or capabilities schema.

Made-with: Cursor
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Dict
|
||||
|
||||
@@ -15,6 +16,10 @@ logger = logging.getLogger("node-worker")
|
||||
_idem = IdempotencyStore()
|
||||
_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)
|
||||
_nats_client = None
|
||||
_inflight_count: int = 0
|
||||
_latencies_llm: list = []
|
||||
_latencies_vision: list = []
|
||||
_LATENCY_BUFFER = 50
|
||||
|
||||
|
||||
async def start(nats_client):
|
||||
@@ -88,12 +93,25 @@ async def _handle_request(msg):
|
||||
await _reply(msg, resp)
|
||||
return
|
||||
|
||||
async with _semaphore:
|
||||
resp = await _execute(job, remaining)
|
||||
global _inflight_count
|
||||
_inflight_count += 1
|
||||
try:
|
||||
async with _semaphore:
|
||||
resp = await _execute(job, remaining)
|
||||
finally:
|
||||
_inflight_count -= 1
|
||||
|
||||
_idem.put(idem_key, resp)
|
||||
_idem.complete_inflight(idem_key, resp)
|
||||
resp.latency_ms = int((time.time() - t0) * 1000)
|
||||
|
||||
if resp.status == "ok" and resp.latency_ms > 0:
|
||||
buf = _latencies_llm if job.required_type in ("llm", "code") else _latencies_vision
|
||||
buf.append(resp.latency_ms)
|
||||
if len(buf) > _LATENCY_BUFFER:
|
||||
del buf[:len(buf) - _LATENCY_BUFFER]
|
||||
_report_latency_async(job.required_type, resp.provider or "ollama", resp.latency_ms)
|
||||
|
||||
await _reply(msg, resp)
|
||||
|
||||
except Exception as e:
|
||||
@@ -179,6 +197,37 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
|
||||
)
|
||||
|
||||
|
||||
def get_metrics() -> Dict[str, Any]:
    """Snapshot this worker's current load for the /metrics endpoint.

    Returns:
        Dict with inflight job count, the configured concurrency limit,
        queue depth, and copies of the rolling llm/vision latency buffers
        (capped at the last ``_LATENCY_BUFFER`` samples).
    """
    snapshot: Dict[str, Any] = {
        "inflight_jobs": _inflight_count,
        "concurrency_limit": config.MAX_CONCURRENCY,
        # No explicit queue here: jobs block on the semaphore, so depth is 0.
        "queue_depth": 0,
    }
    # Slicing already yields fresh lists, so callers can't mutate the buffers.
    snapshot["last_latencies_llm"] = _latencies_llm[-_LATENCY_BUFFER:]
    snapshot["last_latencies_vision"] = _latencies_vision[-_LATENCY_BUFFER:]
    return snapshot
|
||||
|
||||
|
||||
# Strong references to in-flight report tasks: the event loop only keeps a
# weak reference to tasks, so without this a fire-and-forget task can be
# garbage-collected before it completes (per asyncio documentation).
_report_tasks: set = set()


def _report_latency_async(req_type: str, runtime: str, latency_ms: int) -> None:
    """Fire-and-forget latency report to the local NCS.

    Best-effort by design: any HTTP error, NCS outage, or absence of a
    running event loop is silently ignored so that metrics reporting can
    never delay or fail job handling.

    Args:
        req_type: Job type bucket being reported (e.g. "llm", "vision").
        runtime: Runtime/provider that served the job (e.g. "ollama").
        latency_ms: End-to-end job latency in milliseconds.
    """
    import httpx as _httpx  # local import: keep httpx off the module import path

    ncs_url = os.getenv("NCS_REPORT_URL", "http://node-capabilities:8099")

    async def _do() -> None:
        try:
            # Short timeout: this is telemetry, not part of the job's critical path.
            async with _httpx.AsyncClient(timeout=1) as client:
                await client.post(
                    f"{ncs_url}/capabilities/report_latency",
                    json={
                        "runtime": runtime,
                        "type": req_type,
                        "latency_ms": latency_ms,
                    },
                )
        except Exception:
            # Deliberate swallow: NCS being unreachable must not affect jobs.
            pass

    try:
        # get_running_loop() rather than the deprecated get_event_loop():
        # the latter could create a fresh, never-running loop on which the
        # task would silently never execute. get_running_loop() raises
        # RuntimeError when no loop is active, which we treat as "skip".
        task = asyncio.get_running_loop().create_task(_do())
        _report_tasks.add(task)
        task.add_done_callback(_report_tasks.discard)
    except RuntimeError:
        # Called outside an event loop (e.g. sync test code): drop the report.
        pass
|
||||
|
||||
|
||||
async def _reply(msg, resp: JobResponse):
    """Publish *resp* on the message's reply subject, if one was provided."""
    subject = msg.reply
    if not subject:
        # Fire-and-forget request: the sender did not ask for a response.
        return
    payload = resp.model_dump_json().encode()
    await _nats_client.publish(subject, payload)
|
||||
|
||||
Reference in New Issue
Block a user