P3.2+P3.3+P3.4: NODA1 node-worker + NATS auth config + Prometheus counters
P3.2 — Multi-node deployment:
- Added node-worker service to docker-compose.node1.yml (NODE_ID=noda1)
- NCS NODA1 now has NODE_WORKER_URL for metrics collection
- Fixed NODE_ID consistency: router NODA1 uses 'noda1'
- NODA2 node-worker/NCS gets NCS_REPORT_URL for latency reporting

P3.3 — NATS accounts/auth (opt-in config):
- config/nats-server.conf with 3 accounts: SYS, FABRIC, APP
- Per-user topic permissions (router, ncs, node_worker)
- Leafnode listener :7422 with auth
- Not yet activated (requires credential provisioning)

P3.4 — Prometheus counters:
- Router /fabric_metrics: caps_refresh, caps_stale, model_select, offload_total, breaker_state, score_ms histogram
- Node Worker /prom_metrics: jobs_total, inflight gauge, latency_ms histogram
- NCS /prom_metrics: runtime_health, runtime_p50/p95, node_wait_ms
- All bound to 127.0.0.1 (not externally exposed)

Made-with: Cursor
This commit is contained in:
@@ -11,6 +11,7 @@ import httpx
|
||||
from metrics import (
|
||||
build_node_load, build_runtime_load, record_latency,
|
||||
)
|
||||
import prom_metrics
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger("node-capabilities")
|
||||
@@ -231,6 +232,7 @@ async def healthz():
|
||||
@app.get("/capabilities")
async def capabilities():
    """Serve the capabilities document as JSON.

    The payload returned by ``_build_capabilities()`` is also passed to
    ``prom_metrics.update_from_caps`` so the Prometheus gauges reflect what
    this endpoint reports.
    """
    payload = await _build_capabilities()
    # Mirror the payload into the gauges before responding.
    prom_metrics.update_from_caps(payload)
    return JSONResponse(content=payload)
|
||||
|
||||
|
||||
@@ -248,6 +250,15 @@ async def capabilities_refresh():
|
||||
return JSONResponse(content={"refreshed": True, "served_count": data["served_count"]})
|
||||
|
||||
|
||||
@app.get("/prom_metrics")
async def prom_metrics_endpoint():
    """Expose the NCS gauges in Prometheus text format.

    Returns the text produced by ``prom_metrics.get_metrics_text()`` as
    ``text/plain``. When that helper returns ``None`` (metrics support is
    unavailable) the endpoint answers with HTTP 503 and a JSON error body,
    so a scraper records a failed scrape instead of receiving a 200
    response it cannot parse as metrics.
    """
    # Kept as a local import: only this endpoint needs the plain Response.
    from fastapi.responses import Response

    body = prom_metrics.get_metrics_text()
    # Compare against None explicitly: an empty payload is still a valid
    # scrape and must not be reported as "library missing".
    if body is None:
        return JSONResponse(
            status_code=503,
            content={"error": "prometheus_client not installed"},
        )
    return Response(content=body, media_type="text/plain; charset=utf-8")
|
||||
|
||||
|
||||
@app.post("/capabilities/report_latency")
|
||||
async def report_latency_endpoint(request: Request):
|
||||
data = await request.json()
|
||||
|
||||
50
services/node-capabilities/prom_metrics.py
Normal file
50
services/node-capabilities/prom_metrics.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""Prometheus metrics for NCS."""
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("ncs_prom")
|
||||
|
||||
# prometheus_client is optional: if it cannot be imported the module still
# loads, with PROM_AVAILABLE False and no registry.
try:
    from prometheus_client import Gauge, CollectorRegistry, generate_latest
except ImportError:
    PROM_AVAILABLE = False
    REGISTRY = None
else:
    PROM_AVAILABLE = True
    # Dedicated registry holding only the gauges declared below.
    REGISTRY = CollectorRegistry()

    # Per-runtime gauges, keyed by the "runtime" label.
    runtime_health = Gauge(
        "ncs_runtime_health",
        "Runtime health (1=ok, 0=down)",
        labelnames=["runtime"],
        registry=REGISTRY,
    )
    runtime_p50 = Gauge(
        "ncs_runtime_p50_ms",
        "Runtime p50 latency",
        labelnames=["runtime"],
        registry=REGISTRY,
    )
    runtime_p95 = Gauge(
        "ncs_runtime_p95_ms",
        "Runtime p95 latency",
        labelnames=["runtime"],
        registry=REGISTRY,
    )

    # Node-wide gauge, no labels.
    node_wait = Gauge(
        "ncs_node_wait_ms",
        "Estimated wait for node",
        registry=REGISTRY,
    )
|
||||
|
||||
|
||||
def update_from_caps(caps: dict) -> None:
    """Copy load figures from a capabilities payload into the gauges.

    Reads ``caps["node_load"]["estimated_wait_ms"]`` into ``node_wait`` and,
    for every entry of ``caps["runtime_load"]``, sets the per-runtime health
    gauge (1 when ``healthy`` is truthy, else 0) plus the p50/p95 latency
    gauges when those values are present.

    Missing keys and explicit ``None`` values are both tolerated, so a
    sparse payload cannot raise from here and take down the caller.
    Does nothing when ``PROM_AVAILABLE`` is false.

    Args:
        caps: Capabilities payload; only the keys named above are read.
    """
    if not PROM_AVAILABLE:
        return

    # ``or`` rather than a .get() default: the default only covers an absent
    # key, while a present-but-None value would otherwise reach Gauge.set().
    node_load = caps.get("node_load") or {}
    node_wait.set(node_load.get("estimated_wait_ms") or 0)

    for load in caps.get("runtime_load") or []:
        # Fall back to "?" for a missing, None or empty runtime name.
        runtime = load.get("runtime") or "?"
        runtime_health.labels(runtime=runtime).set(1 if load.get("healthy") else 0)

        p50 = load.get("p50_ms")
        if p50 is not None:
            runtime_p50.labels(runtime=runtime).set(p50)

        p95 = load.get("p95_ms")
        if p95 is not None:
            runtime_p95.labels(runtime=runtime).set(p95)
|
||||
|
||||
|
||||
def get_metrics_text():
    """Render the NCS registry in Prometheus text exposition format.

    Returns the output of ``generate_latest(REGISTRY)``, or ``None`` when
    ``PROM_AVAILABLE`` is false or ``REGISTRY`` is falsy.
    """
    if not (PROM_AVAILABLE and REGISTRY):
        return None
    return generate_latest(REGISTRY)
|
||||
@@ -2,3 +2,4 @@ fastapi>=0.110.0
|
||||
uvicorn>=0.29.0
|
||||
httpx>=0.27.0
|
||||
nats-py>=2.7.0
|
||||
prometheus-client>=0.20.0
|
||||
|
||||
Reference in New Issue
Block a user