P3.2+P3.3+P3.4: NODA1 node-worker + NATS auth config + Prometheus counters
P3.2 — Multi-node deployment:
- Added node-worker service to docker-compose.node1.yml (NODE_ID=noda1)
- NCS NODA1 now has NODE_WORKER_URL for metrics collection
- Fixed NODE_ID consistency: router NODA1 uses 'noda1'
- NODA2 node-worker/NCS gets NCS_REPORT_URL for latency reporting

P3.3 — NATS accounts/auth (opt-in config):
- config/nats-server.conf with 3 accounts: SYS, FABRIC, APP
- Per-user topic permissions (router, ncs, node_worker)
- Leafnode listener :7422 with auth
- Not yet activated (requires credential provisioning)

P3.4 — Prometheus counters:
- Router /fabric_metrics: caps_refresh, caps_stale, model_select, offload_total, breaker_state, score_ms histogram
- Node Worker /prom_metrics: jobs_total, inflight gauge, latency_ms histogram
- NCS /prom_metrics: runtime_health, runtime_p50/p95, node_wait_ms
- All bound to 127.0.0.1 (not externally exposed)

Made-with: Cursor
This commit is contained in:
82
services/router/fabric_metrics.py
Normal file
82
services/router/fabric_metrics.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Prometheus metrics for Fabric routing layer.
|
||||
|
||||
Exposed via /fabric_metrics (separate from main /metrics to avoid conflicts).
|
||||
Falls back to no-op counters if prometheus_client is not installed.
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger("fabric_metrics")
|
||||
|
||||
# Register every Fabric metric on a dedicated registry so this module's output
# stays separate from whatever the process exposes on its main /metrics route.
# If prometheus_client cannot be imported, the module degrades to a disabled
# state (PROM_AVAILABLE = False) and the helper functions below become no-ops.
try:
    from prometheus_client import (
        Counter,
        Gauge,
        Histogram,
        CollectorRegistry,
        generate_latest,
    )

    PROM_AVAILABLE = True
    REGISTRY = CollectorRegistry()

    # Capability refresh attempts, partitioned by outcome.
    caps_refresh = Counter(
        name="fabric_caps_refresh_total",
        documentation="Capabilities refresh attempts",
        labelnames=["status"],
        registry=REGISTRY,
    )

    # Times a stale capabilities snapshot was used.
    caps_stale = Counter(
        name="fabric_caps_stale_total",
        documentation="Stale capabilities used",
        registry=REGISTRY,
    )

    # Model selection decisions by chosen node, runtime and request type.
    model_select = Counter(
        name="fabric_model_select_total",
        documentation="Model selection decisions",
        labelnames=["chosen_node", "chosen_runtime", "type"],
        registry=REGISTRY,
    )

    # Offload attempts by result status, target node and request type.
    offload_total = Counter(
        name="fabric_offload_total",
        documentation="Offload attempts",
        labelnames=["status", "node", "type"],
        registry=REGISTRY,
    )

    # Per node/type circuit breaker flag; 1 means the breaker is open.
    breaker_state = Gauge(
        name="fabric_breaker_state",
        documentation="Circuit breaker state (1=open)",
        labelnames=["node", "type"],
        registry=REGISTRY,
    )

    # Distribution of the score attached to each model selection.
    score_hist = Histogram(
        name="fabric_score_ms",
        documentation="Model selection score distribution",
        buckets=[100, 250, 500, 1000, 2000, 5000, 10000],
        registry=REGISTRY,
    )
except ImportError:
    # Optional dependency missing: leave the metric objects undefined and let
    # the PROM_AVAILABLE flag short-circuit every helper.
    REGISTRY = None
    PROM_AVAILABLE = False
    logger.info("prometheus_client not installed, fabric metrics disabled")
|
||||
|
||||
|
||||
def inc_caps_refresh(status: str):
    """Count one capabilities refresh attempt under the given ``status`` label.

    Does nothing when prometheus_client is not available.
    """
    if not PROM_AVAILABLE:
        return
    labelled = caps_refresh.labels(status=status)
    labelled.inc()
|
||||
|
||||
|
||||
def inc_caps_stale():
    """Count one use of stale capabilities.

    Does nothing when prometheus_client is not available.
    """
    if not PROM_AVAILABLE:
        return
    caps_stale.inc()
|
||||
|
||||
|
||||
def inc_model_select(node: str, runtime: str, req_type: str):
    """Count one model selection decision.

    Args:
        node: Value for the ``chosen_node`` label.
        runtime: Value for the ``chosen_runtime`` label.
        req_type: Value for the ``type`` label.

    Does nothing when prometheus_client is not available.
    """
    if not PROM_AVAILABLE:
        return
    labelled = model_select.labels(
        chosen_node=node,
        chosen_runtime=runtime,
        type=req_type,
    )
    labelled.inc()
|
||||
|
||||
|
||||
def inc_offload(status: str, node: str, req_type: str):
    """Count one offload attempt.

    Args:
        status: Value for the ``status`` label.
        node: Value for the ``node`` label.
        req_type: Value for the ``type`` label.

    Does nothing when prometheus_client is not available.
    """
    if not PROM_AVAILABLE:
        return
    labelled = offload_total.labels(status=status, node=node, type=req_type)
    labelled.inc()
|
||||
|
||||
|
||||
def set_breaker(node: str, req_type: str, is_open: bool):
    """Publish the circuit breaker state for a node/type pair.

    The gauge is set to 1 when ``is_open`` is truthy and to 0 otherwise.
    Does nothing when prometheus_client is not available.
    """
    if not PROM_AVAILABLE:
        return
    gauge_value = 1 if is_open else 0
    breaker_state.labels(node=node, type=req_type).set(gauge_value)
|
||||
|
||||
|
||||
def observe_score(score: int):
    """Record one model selection score in the ``fabric_score_ms`` histogram.

    Does nothing when prometheus_client is not available.
    """
    if not PROM_AVAILABLE:
        return
    score_hist.observe(score)
|
||||
|
||||
|
||||
def get_metrics_text() -> Optional[bytes]:
    """Render the Fabric registry in Prometheus exposition format.

    Returns:
        The output of ``generate_latest`` for the module registry, or ``None``
        when prometheus_client is unavailable or no registry was created.
    """
    if not (PROM_AVAILABLE and REGISTRY):
        return None
    return generate_latest(REGISTRY)
|
||||
@@ -52,6 +52,7 @@ try:
|
||||
import global_capabilities_client
|
||||
from model_select import select_model_for_agent, ModelSelection, CLOUD_PROVIDERS as NCS_CLOUD_PROVIDERS
|
||||
import offload_client
|
||||
import fabric_metrics as fm
|
||||
NCS_AVAILABLE = True
|
||||
except ImportError:
|
||||
NCS_AVAILABLE = False
|
||||
@@ -940,6 +941,17 @@ async def healthz():
|
||||
return await health()
|
||||
|
||||
|
||||
@app.get("/fabric_metrics")
async def fabric_metrics_endpoint():
    """Prometheus metrics for Fabric routing layer."""
    # `fm` is only bound when the guarded NCS import block succeeded, so it is
    # consulted strictly behind the NCS_AVAILABLE flag.
    payload = fm.get_metrics_text() if NCS_AVAILABLE else None
    if not payload:
        # Nothing to expose: the modules are missing or the metrics text is empty.
        return {"error": "fabric metrics not available"}

    # Imported lazily so the dependency is touched only when there is a body to send.
    from starlette.responses import Response

    return Response(content=payload, media_type="text/plain; charset=utf-8")
|
||||
|
||||
|
||||
@app.get("/monitor/status")
|
||||
async def monitor_status(request: Request = None):
|
||||
"""
|
||||
@@ -1747,6 +1759,8 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
timeout_ms=infer_timeout,
|
||||
)
|
||||
if offload_resp and offload_resp.get("status") == "ok":
|
||||
if NCS_AVAILABLE:
|
||||
fm.inc_offload("ok", ncs_selection.node, job_payload["required_type"])
|
||||
result_text = offload_resp.get("result", {}).get("text", "")
|
||||
return InferResponse(
|
||||
response=result_text,
|
||||
@@ -1756,6 +1770,8 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
)
|
||||
else:
|
||||
offload_status = offload_resp.get("status", "none") if offload_resp else "no_reply"
|
||||
if NCS_AVAILABLE:
|
||||
fm.inc_offload(offload_status, ncs_selection.node, job_payload["required_type"])
|
||||
logger.warning(
|
||||
f"[fallback] offload to {ncs_selection.node} failed ({offload_status}) "
|
||||
f"→ re-selecting with exclude={ncs_selection.node}, force_local"
|
||||
|
||||
@@ -326,6 +326,12 @@ async def select_model_for_agent(
|
||||
f"{' (force_local)' if force_local else ''}"
|
||||
f"{' (excluded: ' + ','.join(excl) + ')' if excl else ''}"
|
||||
)
|
||||
try:
|
||||
from fabric_metrics import inc_model_select, observe_score
|
||||
inc_model_select(sel.node, sel.runtime, reqs.required_type)
|
||||
observe_score(sel.score)
|
||||
except ImportError:
|
||||
pass
|
||||
return sel
|
||||
logger.warning(
|
||||
f"[select] agent={agent_id} profile={profile} → no match "
|
||||
|
||||
Reference in New Issue
Block a user