feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
This commit is contained in:
224
services/matrix-bridge-dagi/app/metrics_contract.py
Normal file
224
services/matrix-bridge-dagi/app/metrics_contract.py
Normal file
@@ -0,0 +1,224 @@
|
||||
"""
Metrics Contract — Matrix Bridge DAGI
Phase M7.1

Single source of truth for all Prometheus metric names and their label sets.
Used by:
  - main.py  (registers metrics against this contract)
  - tests/test_matrix_bridge_m71_metrics_contract.py  (static validation)
  - ops/prometheus/alerts/matrix-bridge-dagi.rules.yml  (PromQL expressions)
  - ops/grafana/dashboards/matrix-bridge-dagi.json  (panel queries)

Format:
  METRICS_CONTRACT: Dict[metric_name, MetricSpec]

MetricSpec fields:
  kind   : "counter" | "histogram" | "gauge"
  labels : list of label names (empty list = no labels)
  help   : one-line description
  phase  : originating milestone
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MetricSpec:
    """Immutable description of a single Prometheus metric.

    Instances compare equal when all four fields are equal. ``labels`` is
    left out of the generated ``__hash__`` because it is a list (unhashable);
    hashing therefore uses ``kind``, ``help`` and ``phase`` only, which keeps
    the hash consistent with equality while making instances usable in sets
    and as dict keys.
    """

    # Metric type: "counter" | "histogram" | "gauge"
    kind: str
    # Label names; an empty list means the metric has no labels.
    # hash=False: a list cannot be hashed, so exclude it from __hash__
    # (it still participates in __eq__).
    labels: List[str] = field(hash=False)
    # One-line description of the metric.
    help: str
    # Originating milestone, kept for traceability.
    phase: str = "M1"
|
||||
|
||||
|
||||
# ── Contract ──────────────────────────────────────────────────────────────────
|
||||
|
||||
# Maps each Prometheus metric name to its MetricSpec (kind, label names,
# help text, originating milestone). Entries are grouped by subsystem.
METRICS_CONTRACT: Dict[str, MetricSpec] = {

    # ── Core message traffic ──────────────────────────────────────────────────
    "matrix_bridge_messages_received_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id"],
        help="Total Matrix messages received",
        phase="M1",
    ),
    "matrix_bridge_messages_replied_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id", "status"],
        help="Total agent replies sent to Matrix (status=ok|error)",
        phase="M1",
    ),
    "matrix_bridge_rate_limited_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id", "limit_type"],
        help="Messages dropped by rate limiter",
        phase="H1",
    ),
    "matrix_bridge_gateway_errors_total": MetricSpec(
        kind="counter",
        labels=["error_type"],
        help="Bridge errors by stage: sync_error, network_error, http_<status>, matrix_send_error, unexpected",
        phase="M1",
    ),

    # ── Latency histograms ────────────────────────────────────────────────────
    "matrix_bridge_invoke_duration_seconds": MetricSpec(
        kind="histogram",
        labels=["agent_id", "node_id"],
        help="Latency of DAGI Router infer call, per agent and node",
        phase="H3",
    ),
    "matrix_bridge_send_duration_seconds": MetricSpec(
        kind="histogram",
        labels=["agent_id"],
        help="Latency of Matrix send_text call",
        phase="H3",
    ),
    "matrix_bridge_queue_wait_seconds": MetricSpec(
        kind="histogram",
        labels=["agent_id"],
        help="Time between enqueue and worker start processing",
        phase="H3",
    ),

    # ── Queue ─────────────────────────────────────────────────────────────────
    "matrix_bridge_queue_size": MetricSpec(
        kind="gauge",
        labels=[],
        help="Current number of pending items in the work queue",
        phase="H2",
    ),
    "matrix_bridge_queue_dropped_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id"],
        help="Messages dropped because queue was full",
        phase="H2",
    ),

    # ── Rate limiter gauges ───────────────────────────────────────────────────
    "matrix_bridge_rate_limiter_active_rooms": MetricSpec(
        kind="gauge",
        labels=[],
        help="Rooms with activity in the current rate-limit window",
        phase="H1",
    ),
    "matrix_bridge_rate_limiter_active_senders": MetricSpec(
        kind="gauge",
        labels=[],
        help="Senders with activity in the current rate-limit window",
        phase="H1",
    ),

    # ── Routing ───────────────────────────────────────────────────────────────
    "matrix_bridge_routing_reasons_total": MetricSpec(
        kind="counter",
        labels=["agent_id", "reason"],
        help="Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
        phase="M2.2",
    ),
    "matrix_bridge_route_rejected_total": MetricSpec(
        kind="counter",
        labels=["room_id", "reason"],
        help="Messages rejected during routing (unknown agent, bad slash, no mapping, etc.)",
        phase="M2.2",
    ),
    "matrix_bridge_active_room_agent_locks": MetricSpec(
        kind="gauge",
        labels=[],
        help="Number of room-agent pairs currently holding a concurrency lock",
        phase="M2.2",
    ),

    # ── Control channel ───────────────────────────────────────────────────────
    "matrix_bridge_control_commands_total": MetricSpec(
        kind="counter",
        labels=["sender", "verb", "subcommand"],
        help="Total control commands received from authorized operators",
        phase="M3.0",
    ),
    "matrix_bridge_control_rate_limited_total": MetricSpec(
        kind="counter",
        labels=["scope"],
        help="Total control commands rejected by rate limiter or cooldown",
        phase="M3.4",
    ),

    # ── Persistent deduplication ─────────────────────────────────────────────
    "matrix_bridge_dedupe_persistent_hits_total": MetricSpec(
        kind="counter",
        labels=["room_id"],
        help="Total events dropped by persistent (SQLite) deduplication",
        phase="M2.3",
    ),
    "matrix_bridge_dedupe_persistent_inserts_total": MetricSpec(
        kind="counter",
        labels=[],
        help="Total events marked as processed in persistent dedupe store",
        phase="M2.3",
    ),

    # ── Node-aware routing (M5.0) ─────────────────────────────────────────────
    "matrix_bridge_routed_total": MetricSpec(
        kind="counter",
        labels=["agent_id", "node_id", "source"],
        help="Total messages successfully routed, by agent, resolved node, and node source",
        phase="M5.0",
    ),
    "matrix_bridge_node_rejected_total": MetricSpec(
        kind="counter",
        labels=["node_id"],
        help="Total messages with rejected (non-allowlisted) node kwarg",
        phase="M5.0",
    ),

    # ── Bridge health (M7.1) ──────────────────────────────────────────────────
    "matrix_bridge_up": MetricSpec(
        kind="gauge",
        labels=["node_id"],
        help="1 if bridge started successfully; 0 on config error",
        phase="M7.1",
    ),

    # ── Soft-failover (M8.0) ─────────────────────────────────────────────────
    "matrix_bridge_failover_total": MetricSpec(
        kind="counter",
        labels=["from_node", "to_node", "reason"],
        help="Total successful soft-failovers by node transition and reason",
        phase="M8.0",
    ),
    "matrix_bridge_node_health_state": MetricSpec(
        kind="gauge",
        labels=["node_id"],
        help="Node health state gauge: 1=healthy 0.5=degraded 0=down",
        phase="M8.0",
    ),

    # ── Sticky routing anti-flap (M8.1) ──────────────────────────────────────
    "matrix_bridge_sticky_node_total": MetricSpec(
        kind="counter",
        labels=["node_id", "scope"],
        help="Total sticky routing entries set after failover, by preferred node and scope",
        phase="M8.1",
    ),
    "matrix_bridge_sticky_node_active": MetricSpec(
        kind="gauge",
        labels=[],
        help="Current count of active sticky routing entries",
        phase="M8.1",
    ),
}
|
||||
|
||||
# ── Alert metric references ────────────────────────────────────────────────────
# These are the metric base-names referenced in alert rules.
# All must exist in METRICS_CONTRACT.
ALERT_METRIC_REFS = frozenset({
    "matrix_bridge_up",
    "matrix_bridge_gateway_errors_total",
    "matrix_bridge_messages_replied_total",
    "matrix_bridge_queue_dropped_total",
    "matrix_bridge_rate_limited_total",
    "matrix_bridge_control_rate_limited_total",
    "matrix_bridge_dedupe_persistent_hits_total",
    "matrix_bridge_invoke_duration_seconds",
})
|
||||
Reference in New Issue
Block a user