# microdao-daarion/services/node-worker/fabric_metrics.py

"""Prometheus metrics for Node Worker."""
import logging

# Logger used by this module (named explicitly rather than via __name__).
logger = logging.getLogger("worker_metrics")

# prometheus_client is treated as an optional dependency. When the import
# succeeds, PROM_AVAILABLE is True and a registry plus all metric objects are
# created below. When it fails, PROM_AVAILABLE is False, REGISTRY is None and
# none of the metric names are bound, so every use of them must be guarded by
# a PROM_AVAILABLE check.
try:
    from prometheus_client import Counter, Gauge, Histogram, CollectorRegistry, generate_latest
    PROM_AVAILABLE = True
    # Dedicated registry: every metric in this module is registered on it
    # (registry=REGISTRY) and it is what generate_latest() is called with.
    REGISTRY = CollectorRegistry()

    # Generic job metrics
    # Counter of processed jobs, labelled by job type and final status.
    jobs_total = Counter(
        "node_worker_jobs_total", "Jobs processed",
        ["type", "status"], registry=REGISTRY,
    )
    # Unlabelled gauge holding the current number of inflight jobs.
    inflight_gauge = Gauge(
        "node_worker_inflight", "Currently inflight jobs",
        registry=REGISTRY,
    )
    # Job latency histogram, labelled by job type and model; bucket bounds
    # are expressed in milliseconds (100 ms .. 30 s).
    latency_hist = Histogram(
        "node_worker_latency_ms", "Job latency in ms",
        ["type", "model"],
        buckets=[100, 250, 500, 1000, 2500, 5000, 15000, 30000],
        registry=REGISTRY,
    )

    # ── Voice HA metrics (separate labels from generic) ───────────────────────
    # cap label: "voice.tts" | "voice.llm" | "voice.stt"
    # Counter of processed voice jobs, labelled by capability and status.
    voice_jobs_total = Counter(
        "node_worker_voice_jobs_total",
        "Voice HA jobs processed (node.{id}.voice.*.request)",
        ["cap", "status"], registry=REGISTRY,
    )
    # Gauge of inflight voice jobs, one time series per capability.
    voice_inflight_gauge = Gauge(
        "node_worker_voice_inflight",
        "Voice HA inflight jobs per capability",
        ["cap"], registry=REGISTRY,
    )
    # Voice job latency histogram per capability; bucket bounds are in
    # milliseconds and finer-grained than the generic histogram
    # (100 ms .. 12 s).
    voice_latency_hist = Histogram(
        "node_worker_voice_latency_ms",
        "Voice HA job latency in ms",
        ["cap"],
        buckets=[100, 250, 500, 1000, 1500, 2000, 3000, 5000, 9000, 12000],
        registry=REGISTRY,
    )

except ImportError:
    # Metrics are disabled: only the flag and a None registry are defined on
    # this path; the metric objects above do not exist.
    PROM_AVAILABLE = False
    REGISTRY = None
    logger.info("prometheus_client not installed, worker metrics disabled")


def inc_job(req_type: str, status: str) -> None:
    """Increment the processed-jobs counter for a job type and status.

    Does nothing when Prometheus support is unavailable.

    Args:
        req_type: Value for the counter's ``type`` label.
        status: Value for the counter's ``status`` label.
    """
    if not PROM_AVAILABLE:
        return
    counter = jobs_total.labels(type=req_type, status=status)
    counter.inc()


def set_inflight(count: int) -> None:
    """Set the inflight-jobs gauge to ``count``.

    Does nothing when Prometheus support is unavailable.

    Args:
        count: Value to store in the gauge.
    """
    if not PROM_AVAILABLE:
        return
    inflight_gauge.set(count)


def observe_latency(req_type: str, model: str, latency_ms: int) -> None:
    """Record one job latency observation in the generic latency histogram.

    Does nothing when Prometheus support is unavailable.

    Args:
        req_type: Value for the histogram's ``type`` label.
        model: Value for the histogram's ``model`` label.
        latency_ms: Observed latency, in milliseconds.
    """
    if not PROM_AVAILABLE:
        return
    series = latency_hist.labels(type=req_type, model=model)
    series.observe(latency_ms)


def inc_voice_job(cap: str, status: str) -> None:
    """Increment the voice-jobs counter for a capability and status.

    Does nothing when Prometheus support is unavailable.

    Args:
        cap: Value for the counter's ``cap`` label.
        status: Value for the counter's ``status`` label.
    """
    if not PROM_AVAILABLE:
        return
    counter = voice_jobs_total.labels(cap=cap, status=status)
    counter.inc()


def set_voice_inflight(cap: str, count: int) -> None:
    """Set the voice inflight gauge for one capability to ``count``.

    Does nothing when Prometheus support is unavailable.

    Args:
        cap: Value for the gauge's ``cap`` label.
        count: Value to store in the gauge.
    """
    if not PROM_AVAILABLE:
        return
    gauge = voice_inflight_gauge.labels(cap=cap)
    gauge.set(count)


def observe_voice_latency(cap: str, latency_ms: int) -> None:
    """Record one latency observation in the voice latency histogram.

    Does nothing when Prometheus support is unavailable.

    Args:
        cap: Value for the histogram's ``cap`` label.
        latency_ms: Observed latency, in milliseconds.
    """
    if not PROM_AVAILABLE:
        return
    series = voice_latency_hist.labels(cap=cap)
    series.observe(latency_ms)


def get_metrics_text():
    """Render the module's registry with ``generate_latest``.

    Returns:
        The result of ``generate_latest(REGISTRY)``, or ``None`` when
        Prometheus support is unavailable or no registry was created.
    """
    if not PROM_AVAILABLE or not REGISTRY:
        return None
    return generate_latest(REGISTRY)