Files
microdao-daarion/services/router/fabric_metrics.py
Apple e9dedffa48 feat(production): sync all modified production files to git
Includes updates across gateway, router, node-worker, memory-service,
aurora-service, swapper, sofiia-console UI and node2 infrastructure:

- gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service
- services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval,
  offload_client, prompt_builder
- services/node-worker: worker.py, main.py, config.py, fabric_metrics
- services/memory-service: Dockerfile, database.py, main.py, requirements
- services/aurora-service: main.py (+399), kling.py, quality_report.py
- services/swapper-service: main.py, swapper_config_node2.yaml
- services/sofiia-console: static/index.html (console UI update)
- config: agent_registry, crewai_agents/teams, router_agents
- ops/fabric_preflight.sh: updated preflight checks
- router-config.yml, docker-compose.node2.yml: infra updates
- docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated

Made-with: Cursor
2026-03-03 07:13:29 -08:00

128 lines
3.9 KiB
Python

"""Prometheus metrics for Fabric routing layer.
Exposed via /fabric_metrics (separate from main /metrics to avoid conflicts).
Falls back to no-op helper functions if prometheus_client is not installed.
"""
import logging
import time
from typing import Optional
logger = logging.getLogger("fabric_metrics")

# prometheus_client is an optional dependency. Only the import lives inside
# the ``try`` so that an error raised while constructing the metrics below is
# never mistaken for "library not installed" and silently swallowed.
try:
    from prometheus_client import (
        CollectorRegistry,
        Counter,
        Gauge,
        Histogram,
        generate_latest,
    )
except ImportError:
    # Fallback: the helper functions in this module check PROM_AVAILABLE and
    # do nothing when it is False.
    PROM_AVAILABLE = False
    REGISTRY = None
    logger.info("prometheus_client not installed, fabric metrics disabled")
else:
    PROM_AVAILABLE = True

    # Dedicated registry: these metrics are exposed separately from the main
    # /metrics endpoint to avoid conflicts.
    REGISTRY = CollectorRegistry()

    caps_refresh = Counter(
        "fabric_caps_refresh_total",
        "Capabilities refresh attempts",
        ["status"],
        registry=REGISTRY,
    )
    caps_stale = Counter(
        "fabric_caps_stale_total",
        "Stale capabilities used",
        registry=REGISTRY,
    )
    model_select = Counter(
        "fabric_model_select_total",
        "Model selection decisions",
        ["chosen_node", "chosen_runtime", "type"],
        registry=REGISTRY,
    )
    offload_total = Counter(
        "fabric_offload_total",
        "Offload attempts",
        ["status", "node", "type"],
        registry=REGISTRY,
    )
    breaker_state = Gauge(
        "fabric_breaker_state",
        "Circuit breaker state (1=open)",
        ["node", "type"],
        registry=REGISTRY,
    )
    score_hist = Histogram(
        "fabric_score_ms",
        "Model selection score distribution",
        buckets=[100, 250, 500, 1000, 2000, 5000, 10000],
        registry=REGISTRY,
    )

    # ── Voice HA metrics ──────────────────────────────────────────────────
    # cap label: "voice_tts" | "voice_llm" | "voice_stt"
    voice_cap_requests = Counter(
        "fabric_voice_capability_requests_total",
        "Voice HA capability routing requests",
        ["cap", "status"],
        registry=REGISTRY,
    )
    voice_offload_total = Counter(
        "fabric_voice_offload_total",
        "Voice HA offload attempts (node selected + NATS sent)",
        ["cap", "node", "status"],
        registry=REGISTRY,
    )
    voice_breaker_state = Gauge(
        "fabric_voice_breaker_state",
        "Voice HA circuit breaker per node+cap (1=open)",
        ["cap", "node"],
        registry=REGISTRY,
    )
    voice_score_hist = Histogram(
        "fabric_voice_score_ms",
        "Voice HA node scoring distribution",
        ["cap"],
        buckets=[0, 50, 100, 200, 400, 800, 1600, 3200],
        registry=REGISTRY,
    )
def inc_caps_refresh(status: str) -> None:
    """Count one capabilities-refresh attempt under the ``status`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    refresh_counter = caps_refresh.labels(status=status)
    refresh_counter.inc()
def inc_caps_stale() -> None:
    """Count one use of stale capabilities.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    caps_stale.inc()
def inc_model_select(node: str, runtime: str, req_type: str) -> None:
    """Count one model-selection decision.

    Args:
        node: Value for the ``chosen_node`` label.
        runtime: Value for the ``chosen_runtime`` label.
        req_type: Value for the ``type`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    selection_counter = model_select.labels(
        chosen_node=node,
        chosen_runtime=runtime,
        type=req_type,
    )
    selection_counter.inc()
def inc_offload(status: str, node: str, req_type: str) -> None:
    """Count one offload attempt.

    Args:
        status: Value for the ``status`` label.
        node: Value for the ``node`` label.
        req_type: Value for the ``type`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    offload_counter = offload_total.labels(
        status=status,
        node=node,
        type=req_type,
    )
    offload_counter.inc()
def set_breaker(node: str, req_type: str, is_open: bool) -> None:
    """Publish the circuit-breaker state for a ``node``/``type`` pair.

    The gauge is set to 1 when ``is_open`` is truthy and to 0 otherwise.
    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    gauge = breaker_state.labels(node=node, type=req_type)
    if is_open:
        gauge.set(1)
    else:
        gauge.set(0)
def observe_score(score: int) -> None:
    """Record one model-selection score in the ``fabric_score_ms`` histogram.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    score_hist.observe(score)
def inc_voice_cap_request(cap: str, status: str) -> None:
    """Count one Voice HA capability routing request.

    Args:
        cap: Value for the ``cap`` label.
        status: Value for the ``status`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    request_counter = voice_cap_requests.labels(cap=cap, status=status)
    request_counter.inc()
def inc_voice_offload(cap: str, node: str, status: str) -> None:
    """Count one Voice HA offload attempt.

    Args:
        cap: Value for the ``cap`` label.
        node: Value for the ``node`` label.
        status: Value for the ``status`` label.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    offload_counter = voice_offload_total.labels(
        cap=cap,
        node=node,
        status=status,
    )
    offload_counter.inc()
def set_voice_breaker(cap: str, node: str, is_open: bool) -> None:
    """Publish the Voice HA circuit-breaker state for a ``cap``/``node`` pair.

    The gauge is set to 1 when ``is_open`` is truthy and to 0 otherwise.
    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    gauge = voice_breaker_state.labels(cap=cap, node=node)
    if is_open:
        gauge.set(1)
    else:
        gauge.set(0)
def observe_voice_score(cap: str, score: float) -> None:
    """Record one Voice HA node score in the per-``cap`` histogram.

    Does nothing when ``PROM_AVAILABLE`` is false.
    """
    if not PROM_AVAILABLE:
        return
    cap_histogram = voice_score_hist.labels(cap=cap)
    cap_histogram.observe(score)
def get_metrics_text() -> Optional[bytes]:
    """Render the fabric registry with ``generate_latest``.

    Returns:
        The output of ``generate_latest(REGISTRY)``, or ``None`` when
        ``PROM_AVAILABLE`` is false or ``REGISTRY`` is falsy.
    """
    if not PROM_AVAILABLE or not REGISTRY:
        return None
    return generate_latest(REGISTRY)