Files
microdao-daarion/services/matrix-bridge-dagi/app/metrics_contract.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

225 lines
9.1 KiB
Python

"""
Metrics Contract — Matrix Bridge DAGI
Phase M7.1
Single source of truth for all Prometheus metric names and their label sets.
Used by:
- main.py (registers metrics against this contract)
- tests/test_matrix_bridge_m71_metrics_contract.py (static validation)
- ops/prometheus/alerts/matrix-bridge-dagi.rules.yml (PromQL expressions)
- ops/grafana/dashboards/matrix-bridge-dagi.json (panel queries)
Format:
    METRICS_CONTRACT: Dict[metric_name, MetricSpec]

MetricSpec fields:
    kind   : "counter" | "histogram" | "gauge"
    labels : list of label names (empty list = no labels)
    help   : one-line description
    phase  : originating milestone
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List
@dataclass(frozen=True)
class MetricSpec:
    """Declarative description of one Prometheus metric in the contract.

    Instances are frozen (attributes cannot be reassigned) and hashable, so
    they can be placed in sets or used as dict keys.  ``labels`` is a list
    and is therefore left out of the generated ``__hash__``; it still takes
    part in ``==``, so two equal specs always produce the same hash.

    Fields:
        kind:   metric type, one of "counter", "histogram" or "gauge".
        labels: label names attached to the metric; empty list = no labels.
        help:   one-line description of the metric.
        phase:  originating milestone, kept for traceability.
    """

    kind: str  # "counter" | "histogram" | "gauge"
    # hash=False: a list is unhashable, so including it in the generated
    # __hash__ would make hash(spec) raise TypeError despite frozen=True.
    labels: List[str] = field(hash=False)  # label names; empty = no labels
    help: str
    phase: str = "M1"  # originating milestone for traceability
# ── Contract ──────────────────────────────────────────────────────────────────
# Maps every Prometheus metric name to the MetricSpec describing its kind,
# label names, help text and originating milestone.
METRICS_CONTRACT: Dict[str, MetricSpec] = {
    # ── Message traffic counters ──────────────────────────────────────────────
    "matrix_bridge_messages_received_total": MetricSpec(
        kind="counter", phase="M1", labels=["room_id", "agent_id"],
        help="Total Matrix messages received",
    ),
    "matrix_bridge_messages_replied_total": MetricSpec(
        kind="counter", phase="M1", labels=["room_id", "agent_id", "status"],
        help="Total agent replies sent to Matrix (status=ok|error)",
    ),
    "matrix_bridge_rate_limited_total": MetricSpec(
        kind="counter", phase="H1", labels=["room_id", "agent_id", "limit_type"],
        help="Messages dropped by rate limiter",
    ),
    "matrix_bridge_gateway_errors_total": MetricSpec(
        kind="counter", phase="M1", labels=["error_type"],
        help="Bridge errors by stage: sync_error, network_error, http_<status>, matrix_send_error, unexpected",
    ),

    # ── Duration histograms ───────────────────────────────────────────────────
    "matrix_bridge_invoke_duration_seconds": MetricSpec(
        kind="histogram", phase="H3", labels=["agent_id", "node_id"],
        help="Latency of DAGI Router infer call, per agent and node",
    ),
    "matrix_bridge_send_duration_seconds": MetricSpec(
        kind="histogram", phase="H3", labels=["agent_id"],
        help="Latency of Matrix send_text call",
    ),
    "matrix_bridge_queue_wait_seconds": MetricSpec(
        kind="histogram", phase="H3", labels=["agent_id"],
        help="Time between enqueue and worker start processing",
    ),

    # ── Work queue ────────────────────────────────────────────────────────────
    "matrix_bridge_queue_size": MetricSpec(
        kind="gauge", phase="H2", labels=[],
        help="Current number of pending items in the work queue",
    ),
    "matrix_bridge_queue_dropped_total": MetricSpec(
        kind="counter", phase="H2", labels=["room_id", "agent_id"],
        help="Messages dropped because queue was full",
    ),

    # ── Rate limiter window gauges ────────────────────────────────────────────
    "matrix_bridge_rate_limiter_active_rooms": MetricSpec(
        kind="gauge", phase="H1", labels=[],
        help="Rooms with activity in the current rate-limit window",
    ),
    "matrix_bridge_rate_limiter_active_senders": MetricSpec(
        kind="gauge", phase="H1", labels=[],
        help="Senders with activity in the current rate-limit window",
    ),

    # ── Routing decisions ─────────────────────────────────────────────────────
    "matrix_bridge_routing_reasons_total": MetricSpec(
        kind="counter", phase="M2.2", labels=["agent_id", "reason"],
        help="Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
    ),
    "matrix_bridge_route_rejected_total": MetricSpec(
        kind="counter", phase="M2.2", labels=["room_id", "reason"],
        help="Messages rejected during routing (unknown agent, bad slash, no mapping, etc.)",
    ),
    "matrix_bridge_active_room_agent_locks": MetricSpec(
        kind="gauge", phase="M2.2", labels=[],
        help="Number of room-agent pairs currently holding a concurrency lock",
    ),

    # ── Operator control channel ──────────────────────────────────────────────
    "matrix_bridge_control_commands_total": MetricSpec(
        kind="counter", phase="M3.0", labels=["sender", "verb", "subcommand"],
        help="Total control commands received from authorized operators",
    ),
    "matrix_bridge_control_rate_limited_total": MetricSpec(
        kind="counter", phase="M3.4", labels=["scope"],
        help="Total control commands rejected by rate limiter or cooldown",
    ),

    # ── Persistent (SQLite) deduplication ─────────────────────────────────────
    "matrix_bridge_dedupe_persistent_hits_total": MetricSpec(
        kind="counter", phase="M2.3", labels=["room_id"],
        help="Total events dropped by persistent (SQLite) deduplication",
    ),
    "matrix_bridge_dedupe_persistent_inserts_total": MetricSpec(
        kind="counter", phase="M2.3", labels=[],
        help="Total events marked as processed in persistent dedupe store",
    ),

    # ── M5.0: node-aware routing ──────────────────────────────────────────────
    "matrix_bridge_routed_total": MetricSpec(
        kind="counter", phase="M5.0", labels=["agent_id", "node_id", "source"],
        help="Total messages successfully routed, by agent, resolved node, and node source",
    ),
    "matrix_bridge_node_rejected_total": MetricSpec(
        kind="counter", phase="M5.0", labels=["node_id"],
        help="Total messages with rejected (non-allowlisted) node kwarg",
    ),

    # ── M7.1: bridge health ───────────────────────────────────────────────────
    "matrix_bridge_up": MetricSpec(
        kind="gauge", phase="M7.1", labels=["node_id"],
        help="1 if bridge started successfully; 0 on config error",
    ),

    # ── M8.0: soft-failover ───────────────────────────────────────────────────
    "matrix_bridge_failover_total": MetricSpec(
        kind="counter", phase="M8.0", labels=["from_node", "to_node", "reason"],
        help="Total successful soft-failovers by node transition and reason",
    ),
    "matrix_bridge_node_health_state": MetricSpec(
        kind="gauge", phase="M8.0", labels=["node_id"],
        help="Node health state gauge: 1=healthy 0.5=degraded 0=down",
    ),

    # ── M8.1: sticky routing anti-flap ────────────────────────────────────────
    "matrix_bridge_sticky_node_total": MetricSpec(
        kind="counter", phase="M8.1", labels=["node_id", "scope"],
        help="Total sticky routing entries set after failover, by preferred node and scope",
    ),
    "matrix_bridge_sticky_node_active": MetricSpec(
        kind="gauge", phase="M8.1", labels=[],
        help="Current count of active sticky routing entries",
    ),
}
# ── Alert metric references ────────────────────────────────────────────────────
# Base names of the metrics that the alert rules refer to.
# Every name in this set is required to be a key of METRICS_CONTRACT.
ALERT_METRIC_REFS = frozenset(
    (
        "matrix_bridge_control_rate_limited_total",
        "matrix_bridge_dedupe_persistent_hits_total",
        "matrix_bridge_gateway_errors_total",
        "matrix_bridge_invoke_duration_seconds",
        "matrix_bridge_messages_replied_total",
        "matrix_bridge_queue_dropped_total",
        "matrix_bridge_rate_limited_total",
        "matrix_bridge_up",
    )
)