""" Metrics Contract — Matrix Bridge DAGI Phase M7.1 Single source of truth for all Prometheus metric names and their label sets. Used by: - main.py (registers metrics against this contract) - tests/test_matrix_bridge_m71_metrics_contract.py (static validation) - ops/prometheus/alerts/matrix-bridge-dagi.rules.yml (PromQL expressions) - ops/grafana/dashboards/matrix-bridge-dagi.json (panel queries) Format: METRICS_CONTRACT: Dict[metric_name, MetricSpec] MetricSpec fields: kind : "counter" | "histogram" | "gauge" labels : list of label names (empty list = no labels) help : one-line description phase : originating milestone """ from __future__ import annotations from dataclasses import dataclass, field from typing import Dict, List @dataclass(frozen=True) class MetricSpec: kind: str # "counter" | "histogram" | "gauge" labels: List[str] # label names; empty = no labels help: str phase: str = "M1" # originating milestone for traceability # ── Contract ────────────────────────────────────────────────────────────────── METRICS_CONTRACT: Dict[str, MetricSpec] = { # ── Core message traffic ────────────────────────────────────────────────── "matrix_bridge_messages_received_total": MetricSpec( kind="counter", labels=["room_id", "agent_id"], help="Total Matrix messages received", phase="M1", ), "matrix_bridge_messages_replied_total": MetricSpec( kind="counter", labels=["room_id", "agent_id", "status"], help="Total agent replies sent to Matrix (status=ok|error)", phase="M1", ), "matrix_bridge_rate_limited_total": MetricSpec( kind="counter", labels=["room_id", "agent_id", "limit_type"], help="Messages dropped by rate limiter", phase="H1", ), "matrix_bridge_gateway_errors_total": MetricSpec( kind="counter", labels=["error_type"], help="Bridge errors by stage: sync_error, network_error, http_, matrix_send_error, unexpected", phase="M1", ), # ── Latency histograms ──────────────────────────────────────────────────── "matrix_bridge_invoke_duration_seconds": MetricSpec( kind="histogram", labels=["agent_id", "node_id"], help="Latency of DAGI Router infer call, per agent and node", phase="H3", ), "matrix_bridge_send_duration_seconds": MetricSpec( kind="histogram", labels=["agent_id"], help="Latency of Matrix send_text call", phase="H3", ), "matrix_bridge_queue_wait_seconds": MetricSpec( kind="histogram", labels=["agent_id"], help="Time between enqueue and worker start processing", phase="H3", ), # ── Queue ───────────────────────────────────────────────────────────────── "matrix_bridge_queue_size": MetricSpec( kind="gauge", labels=[], help="Current number of pending items in the work queue", phase="H2", ), "matrix_bridge_queue_dropped_total": MetricSpec( kind="counter", labels=["room_id", "agent_id"], help="Messages dropped because queue was full", phase="H2", ), # ── Rate limiter gauges ─────────────────────────────────────────────────── "matrix_bridge_rate_limiter_active_rooms": MetricSpec( kind="gauge", labels=[], help="Rooms with activity in the current rate-limit window", phase="H1", ), "matrix_bridge_rate_limiter_active_senders": MetricSpec( kind="gauge", labels=[], help="Senders with activity in the current rate-limit window", phase="H1", ), # ── Routing ─────────────────────────────────────────────────────────────── "matrix_bridge_routing_reasons_total": MetricSpec( kind="counter", labels=["agent_id", "reason"], help="Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)", phase="M2.2", ), "matrix_bridge_route_rejected_total": MetricSpec( kind="counter", labels=["room_id", "reason"], help="Messages rejected during routing (unknown agent, bad slash, no mapping, etc.)", phase="M2.2", ), "matrix_bridge_active_room_agent_locks": MetricSpec( kind="gauge", labels=[], help="Number of room-agent pairs currently holding a concurrency lock", phase="M2.2", ), # ── Control channel ─────────────────────────────────────────────────────── "matrix_bridge_control_commands_total": MetricSpec( kind="counter", labels=["sender", "verb", "subcommand"], help="Total control commands received from authorized operators", phase="M3.0", ), "matrix_bridge_control_rate_limited_total": MetricSpec( kind="counter", labels=["scope"], help="Total control commands rejected by rate limiter or cooldown", phase="M3.4", ), # ── Persistent deduplication ───────────────────────────────────────────── "matrix_bridge_dedupe_persistent_hits_total": MetricSpec( kind="counter", labels=["room_id"], help="Total events dropped by persistent (SQLite) deduplication", phase="M2.3", ), "matrix_bridge_dedupe_persistent_inserts_total": MetricSpec( kind="counter", labels=[], help="Total events marked as processed in persistent dedupe store", phase="M2.3", ), # ── Node-aware routing (M5.0) ───────────────────────────────────────────── "matrix_bridge_routed_total": MetricSpec( kind="counter", labels=["agent_id", "node_id", "source"], help="Total messages successfully routed, by agent, resolved node, and node source", phase="M5.0", ), "matrix_bridge_node_rejected_total": MetricSpec( kind="counter", labels=["node_id"], help="Total messages with rejected (non-allowlisted) node kwarg", phase="M5.0", ), # ── Bridge health (M7.1) ────────────────────────────────────────────────── "matrix_bridge_up": MetricSpec( kind="gauge", labels=["node_id"], help="1 if bridge started successfully; 0 on config error", phase="M7.1", ), # ── Soft-failover (M8.0) ───────────────────────────────────────────────── "matrix_bridge_failover_total": MetricSpec( kind="counter", labels=["from_node", "to_node", "reason"], help="Total successful soft-failovers by node transition and reason", phase="M8.0", ), "matrix_bridge_node_health_state": MetricSpec( kind="gauge", labels=["node_id"], help="Node health state gauge: 1=healthy 0.5=degraded 0=down", phase="M8.0", ), # ── Sticky routing anti-flap (M8.1) ────────────────────────────────────── "matrix_bridge_sticky_node_total": MetricSpec( kind="counter", labels=["node_id", "scope"], help="Total sticky routing entries set after failover, by preferred node and scope", phase="M8.1", ), "matrix_bridge_sticky_node_active": MetricSpec( kind="gauge", labels=[], help="Current count of active sticky routing entries", phase="M8.1", ), } # ── Alert metric references ──────────────────────────────────────────────────── # These are the metric base-names referenced in alert rules. # All must exist in METRICS_CONTRACT. ALERT_METRIC_REFS = frozenset({ "matrix_bridge_up", "matrix_bridge_gateway_errors_total", "matrix_bridge_messages_replied_total", "matrix_bridge_queue_dropped_total", "matrix_bridge_rate_limited_total", "matrix_bridge_control_rate_limited_total", "matrix_bridge_dedupe_persistent_hits_total", "matrix_bridge_invoke_duration_seconds", })