Files
microdao-daarion/services/router/fabric_metrics.py
Apple ed7ad49d3a P3.2+P3.3+P3.4: NODA1 node-worker + NATS auth config + Prometheus counters
P3.2 — Multi-node deployment:
- Added node-worker service to docker-compose.node1.yml (NODE_ID=noda1)
- NCS NODA1 now has NODE_WORKER_URL for metrics collection
- Fixed NODE_ID consistency: router NODA1 uses 'noda1'
- NODA2 node-worker/NCS gets NCS_REPORT_URL for latency reporting

P3.3 — NATS accounts/auth (opt-in config):
- config/nats-server.conf with 3 accounts: SYS, FABRIC, APP
- Per-user topic permissions (router, ncs, node_worker)
- Leafnode listener :7422 with auth
- Not yet activated (requires credential provisioning)

P3.4 — Prometheus counters:
- Router /fabric_metrics: caps_refresh, caps_stale, model_select,
  offload_total, breaker_state, score_ms histogram
- Node Worker /prom_metrics: jobs_total, inflight gauge, latency_ms histogram
- NCS /prom_metrics: runtime_health, runtime_p50/p95, node_wait_ms
- All bound to 127.0.0.1 (not externally exposed)

Made-with: Cursor
2026-02-27 03:03:18 -08:00

83 lines
2.4 KiB
Python

"""Prometheus metrics for Fabric routing layer.
Exposed via /fabric_metrics (separate from main /metrics to avoid conflicts).
If prometheus_client is not installed, no metrics are created and the helper functions below become no-ops.
"""
import logging
import time
from typing import Optional
logger = logging.getLogger("fabric_metrics")

# Only the import is guarded: an ImportError here is the signal that the
# optional prometheus_client dependency is unavailable. Metric construction
# lives in the ``else`` branch so that an error raised while building the
# metrics is never mistaken for a missing package.
try:
    from prometheus_client import (
        CollectorRegistry,
        Counter,
        Gauge,
        Histogram,
        generate_latest,
    )
except ImportError:
    # The helper functions in this module check PROM_AVAILABLE before touching
    # any metric object, so the metric names are intentionally left undefined.
    PROM_AVAILABLE = False
    REGISTRY = None
    logger.info("prometheus_client not installed, fabric metrics disabled")
else:
    PROM_AVAILABLE = True

    # Dedicated registry: every metric below is registered here rather than
    # in the library's default registry.
    REGISTRY = CollectorRegistry()

    # Capabilities refresh attempts, partitioned by a "status" label.
    caps_refresh = Counter(
        "fabric_caps_refresh_total",
        "Capabilities refresh attempts",
        ["status"],
        registry=REGISTRY,
    )

    # Unlabelled counter of stale-capabilities usages.
    caps_stale = Counter(
        "fabric_caps_stale_total",
        "Stale capabilities used",
        registry=REGISTRY,
    )

    # Model selection decisions by chosen node, chosen runtime and type.
    model_select = Counter(
        "fabric_model_select_total",
        "Model selection decisions",
        ["chosen_node", "chosen_runtime", "type"],
        registry=REGISTRY,
    )

    # Offload attempts by status, node and type.
    offload_total = Counter(
        "fabric_offload_total",
        "Offload attempts",
        ["status", "node", "type"],
        registry=REGISTRY,
    )

    # Circuit breaker state per (node, type); 1 means open.
    breaker_state = Gauge(
        "fabric_breaker_state",
        "Circuit breaker state (1=open)",
        ["node", "type"],
        registry=REGISTRY,
    )

    # Histogram of values passed to observe_score(), with explicit buckets.
    score_hist = Histogram(
        "fabric_score_ms",
        "Model selection score distribution",
        buckets=[100, 250, 500, 1000, 2000, 5000, 10000],
        registry=REGISTRY,
    )
def inc_caps_refresh(status: str):
    """Increment the ``fabric_caps_refresh_total`` counter for ``status``.

    Args:
        status: Value for the counter's ``status`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    refresh_counter = caps_refresh.labels(status=status)
    refresh_counter.inc()
def inc_caps_stale():
    """Increment the unlabelled ``fabric_caps_stale_total`` counter by one.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    caps_stale.inc()
def inc_model_select(node: str, runtime: str, req_type: str):
    """Increment the ``fabric_model_select_total`` counter for one decision.

    Args:
        node: Value for the ``chosen_node`` label.
        runtime: Value for the ``chosen_runtime`` label.
        req_type: Value for the ``type`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    label_values = {
        "chosen_node": node,
        "chosen_runtime": runtime,
        "type": req_type,
    }
    model_select.labels(**label_values).inc()
def inc_offload(status: str, node: str, req_type: str):
    """Increment the ``fabric_offload_total`` counter for one offload attempt.

    Args:
        status: Value for the ``status`` label.
        node: Value for the ``node`` label.
        req_type: Value for the ``type`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    label_values = {"status": status, "node": node, "type": req_type}
    offload_total.labels(**label_values).inc()
def set_breaker(node: str, req_type: str, is_open: bool):
    """Set the ``fabric_breaker_state`` gauge for a (node, type) pair.

    Args:
        node: Value for the ``node`` label.
        req_type: Value for the ``type`` label.
        is_open: Truthy sets the gauge to 1, falsy sets it to 0.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    gauge_value = int(bool(is_open))
    breaker_gauge = breaker_state.labels(node=node, type=req_type)
    breaker_gauge.set(gauge_value)
def observe_score(score: int):
    """Record one observation of ``score`` in the ``fabric_score_ms`` histogram.

    Args:
        score: Value passed unchanged to the histogram's ``observe``.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    score_hist.observe(score)
def get_metrics_text() -> Optional[bytes]:
    """Return the output of ``generate_latest`` for the fabric registry.

    Returns:
        The serialized metrics, or ``None`` when ``PROM_AVAILABLE`` is false
        or ``REGISTRY`` is falsy.
    """
    if not (PROM_AVAILABLE and REGISTRY):
        return None
    return generate_latest(REGISTRY)