P3.2+P3.3+P3.4: NODA1 node-worker + NATS auth config + Prometheus counters

P3.2 — Multi-node deployment:
- Added node-worker service to docker-compose.node1.yml (NODE_ID=noda1)
- NCS NODA1 now has NODE_WORKER_URL for metrics collection
- Fixed NODE_ID consistency: router NODA1 uses 'noda1'
- NODA2 node-worker/NCS gets NCS_REPORT_URL for latency reporting

P3.3 — NATS accounts/auth (opt-in config):
- config/nats-server.conf with 3 accounts: SYS, FABRIC, APP
- Per-user topic permissions (router, ncs, node_worker)
- Leafnode listener :7422 with auth
- Not yet activated (requires credential provisioning)

P3.4 — Prometheus counters:
- Router /fabric_metrics: caps_refresh, caps_stale, model_select, offload_total, breaker_state, score_ms histogram
- Node Worker /prom_metrics: jobs_total, inflight gauge, latency_ms histogram
- NCS /prom_metrics: runtime_health, runtime_p50/p95, node_wait_ms
- All bound to 127.0.0.1 (not externally exposed)

Made-with: Cursor
2026-02-27 03:03:18 -08:00
parent a605b8c43e
commit ed7ad49d3a
13 changed files with 408 additions and 1 deletions
--- a/services/node-capabilities/main.py
+++ b/services/node-capabilities/main.py
@@ -11,6 +11,7 @@ import httpx
 from metrics import (
    build_node_load, build_runtime_load, record_latency,
 )
+import prom_metrics

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("node-capabilities")
@@ -231,6 +232,7 @@ async def healthz():
@app.get("/capabilities")
 async def capabilities():
    data = await _build_capabilities()
+    prom_metrics.update_from_caps(data)  # mirror this snapshot into the Prometheus gauges
    return JSONResponse(content=data)


@@ -248,6 +250,15 @@ async def capabilities_refresh():
    return JSONResponse(content={"refreshed": True, "served_count": data["served_count"]})


+@app.get("/prom_metrics")
+async def prom_metrics_endpoint():
+    """Serve the NCS gauges as Prometheus text, or HTTP 503 if prometheus_client is missing."""
+    from fastapi.responses import Response  # function-scope import keeps this handler self-contained
+    if (payload := prom_metrics.get_metrics_text()) is None:  # None: metrics are disabled
+        return JSONResponse(status_code=503, content={"error": "prometheus_client not installed"})
+    return Response(content=payload, media_type="text/plain; charset=utf-8")
+
+
@app.post("/capabilities/report_latency")
 async def report_latency_endpoint(request: Request):
    data = await request.json()
--- a/services/node-capabilities/prom_metrics.py
+++ b/services/node-capabilities/prom_metrics.py
@@ -0,0 +1,50 @@
+"""Prometheus metrics for NCS."""
+import logging
+
+logger = logging.getLogger("ncs_prom")
+
+# prometheus_client is optional; PROM_AVAILABLE records whether the import succeeded.
+try:
+    from prometheus_client import Gauge, CollectorRegistry, generate_latest
+except ImportError:
+    PROM_AVAILABLE = False
+    REGISTRY = None
+    logger.warning("prometheus_client not installed; NCS Prometheus metrics disabled")
+else:
+    PROM_AVAILABLE = True
+    # Dedicated registry holding only the gauges defined below.
+    REGISTRY = CollectorRegistry()
+    # Per-runtime gauges, labelled by runtime name.
+    runtime_health = Gauge(
+        "ncs_runtime_health", "Runtime health (1=ok, 0=down)", ["runtime"], registry=REGISTRY,
+    )
+    runtime_p50 = Gauge(
+        "ncs_runtime_p50_ms", "Runtime p50 latency", ["runtime"], registry=REGISTRY,
+    )
+    runtime_p95 = Gauge(
+        "ncs_runtime_p95_ms", "Runtime p95 latency", ["runtime"], registry=REGISTRY,
+    )
+    # Node-wide gauge (no labels).
+    node_wait = Gauge(
+        "ncs_node_wait_ms", "Estimated wait for node", registry=REGISTRY,
+    )
+
+
+def update_from_caps(caps: dict) -> None:
+    """Mirror a capabilities snapshot into the NCS gauges (no-op without prometheus_client)."""
+    if not PROM_AVAILABLE:
+        return
+    # `or` fallbacks: explicit nulls would otherwise raise (Gauge.set needs a number).
+    node_wait.set((caps.get("node_load") or {}).get("estimated_wait_ms") or 0)
+    for load in caps.get("runtime_load") or []:
+        name = load.get("runtime", "?")
+        runtime_health.labels(runtime=name).set(1 if load.get("healthy") else 0)
+        for gauge, key in ((runtime_p50, "p50_ms"), (runtime_p95, "p95_ms")):
+            if load.get(key) is not None:  # unknown percentile: leave the gauge untouched
+                gauge.labels(runtime=name).set(load[key])
+
+
+def get_metrics_text():
+    """Return the NCS registry rendered by generate_latest, or None if metrics are disabled."""
+    enabled = PROM_AVAILABLE and REGISTRY  # both the flag and a registry object are required
+    return generate_latest(REGISTRY) if enabled else None
--- a/services/node-capabilities/requirements.txt
+++ b/services/node-capabilities/requirements.txt
@@ -2,3 +2,4 @@ fastapi>=0.110.0
 uvicorn>=0.29.0
 httpx>=0.27.0
 nats-py>=2.7.0
+prometheus-client>=0.20.0