Includes updates across gateway, router, node-worker, memory-service, aurora-service, swapper, sofiia-console UI and node2 infrastructure: - gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service - services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval, offload_client, prompt_builder - services/node-worker: worker.py, main.py, config.py, fabric_metrics - services/memory-service: Dockerfile, database.py, main.py, requirements - services/aurora-service: main.py (+399), kling.py, quality_report.py - services/swapper-service: main.py, swapper_config_node2.yaml - services/sofiia-console: static/index.html (console UI update) - config: agent_registry, crewai_agents/teams, router_agents - ops/fabric_preflight.sh: updated preflight checks - router-config.yml, docker-compose.node2.yml: infra updates - docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated Made-with: Cursor
87 lines
2.5 KiB
Python
87 lines
2.5 KiB
Python
"""Prometheus metrics for Node Worker."""
|
|
import logging
|
|
|
|
logger = logging.getLogger("worker_metrics")

# prometheus_client is an optional dependency. If importing it fails, the
# module still loads: PROM_AVAILABLE is False, REGISTRY is None, and none of
# the metric objects defined in the try body exist.
try:
    from prometheus_client import (
        CollectorRegistry,
        Counter,
        Gauge,
        Histogram,
        generate_latest,
    )

    PROM_AVAILABLE = True
    # Every metric below is registered on this dedicated registry (passed
    # explicitly via ``registry=``).
    REGISTRY = CollectorRegistry()

    # -- Generic job metrics -------------------------------------------------
    jobs_total = Counter(
        "node_worker_jobs_total",
        "Jobs processed",
        labelnames=("type", "status"),
        registry=REGISTRY,
    )
    inflight_gauge = Gauge(
        "node_worker_inflight",
        "Currently inflight jobs",
        registry=REGISTRY,
    )
    # Bucket upper bounds are milliseconds, matching the metric name/help.
    latency_hist = Histogram(
        "node_worker_latency_ms",
        "Job latency in ms",
        labelnames=("type", "model"),
        buckets=(100, 250, 500, 1000, 2500, 5000, 15000, 30000),
        registry=REGISTRY,
    )

    # -- Voice HA metrics ----------------------------------------------------
    # Kept separate from the generic metrics because they use a different
    # label set. The ``cap`` label is one of:
    #   "voice.tts" | "voice.llm" | "voice.stt"
    voice_jobs_total = Counter(
        "node_worker_voice_jobs_total",
        "Voice HA jobs processed (node.{id}.voice.*.request)",
        labelnames=("cap", "status"),
        registry=REGISTRY,
    )
    voice_inflight_gauge = Gauge(
        "node_worker_voice_inflight",
        "Voice HA inflight jobs per capability",
        labelnames=("cap",),
        registry=REGISTRY,
    )
    # Bucket upper bounds are milliseconds, matching the metric name/help.
    voice_latency_hist = Histogram(
        "node_worker_voice_latency_ms",
        "Voice HA job latency in ms",
        labelnames=("cap",),
        buckets=(100, 250, 500, 1000, 1500, 2000, 3000, 5000, 9000, 12000),
        registry=REGISTRY,
    )

except ImportError:
    # Metrics are best-effort: record the absence and let the helper
    # functions turn into no-ops via PROM_AVAILABLE.
    PROM_AVAILABLE = False
    REGISTRY = None
    logger.info("prometheus_client not installed, worker metrics disabled")
|
|
|
|
|
|
def inc_job(req_type: str, status: str):
    """Increment the generic jobs counter for one (type, status) pair.

    Does nothing when PROM_AVAILABLE is false.

    Args:
        req_type: Value for the counter's ``type`` label.
        status: Value for the counter's ``status`` label.
    """
    if not PROM_AVAILABLE:
        return
    series = jobs_total.labels(type=req_type, status=status)
    series.inc()
|
|
|
|
|
|
def set_inflight(count: int):
    """Set the generic inflight-jobs gauge to ``count``.

    Does nothing when PROM_AVAILABLE is false.

    Args:
        count: Value written to the gauge (overwrites the previous value).
    """
    if not PROM_AVAILABLE:
        return
    inflight_gauge.set(count)
|
|
|
|
|
|
def observe_latency(req_type: str, model: str, latency_ms: int):
    """Record one job latency sample in the generic latency histogram.

    Does nothing when PROM_AVAILABLE is false.

    Args:
        req_type: Value for the histogram's ``type`` label.
        model: Value for the histogram's ``model`` label.
        latency_ms: Observed latency, passed to ``observe`` as-is.
    """
    if not PROM_AVAILABLE:
        return
    series = latency_hist.labels(type=req_type, model=model)
    series.observe(latency_ms)
|
|
|
|
|
|
def inc_voice_job(cap: str, status: str):
    """Increment the voice jobs counter for one (cap, status) pair.

    Does nothing when PROM_AVAILABLE is false.

    Args:
        cap: Value for the counter's ``cap`` label.
        status: Value for the counter's ``status`` label.
    """
    if not PROM_AVAILABLE:
        return
    series = voice_jobs_total.labels(cap=cap, status=status)
    series.inc()
|
|
|
|
|
|
def set_voice_inflight(cap: str, count: int):
    """Set the voice inflight gauge for one capability to ``count``.

    Does nothing when PROM_AVAILABLE is false.

    Args:
        cap: Value for the gauge's ``cap`` label.
        count: Value written to that labelled gauge.
    """
    if not PROM_AVAILABLE:
        return
    series = voice_inflight_gauge.labels(cap=cap)
    series.set(count)
|
|
|
|
|
|
def observe_voice_latency(cap: str, latency_ms: int):
    """Record one latency sample in the voice latency histogram.

    Does nothing when PROM_AVAILABLE is false.

    Args:
        cap: Value for the histogram's ``cap`` label.
        latency_ms: Observed latency, passed to ``observe`` as-is.
    """
    if not PROM_AVAILABLE:
        return
    series = voice_latency_hist.labels(cap=cap)
    series.observe(latency_ms)
|
|
|
|
|
|
def get_metrics_text():
    """Serialize every metric registered on REGISTRY.

    Returns:
        The result of ``generate_latest(REGISTRY)``, or ``None`` when
        PROM_AVAILABLE is false or REGISTRY is falsy (metrics disabled).
    """
    if not PROM_AVAILABLE or not REGISTRY:
        return None
    return generate_latest(REGISTRY)
|