---
# Prometheus alert rules — Matrix Bridge DAGI
# Phase M7.1 (metrics contract hardening)
#
# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
#
# Usage:
#   promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
#   # The prom/prometheus image's entrypoint is the server binary, so promtool
#   # has to be selected explicitly with --entrypoint:
#   docker run --rm -v "$PWD":/w --entrypoint promtool prom/prometheus:latest \
#     check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
#
# Conventions used by every rule below:
#   - labels: severity / team / service are attached to the fired alert.
#   - annotations: summary, description and a runbook_url anchor per alert.
#   - Expressions that are not aggregated yield one alert per label set of
#     the underlying metric.

groups:
  - name: matrix_bridge_dagi
    interval: 30s
    rules:
      # ── A1: Bridge process down ─────────────────────────────────────────────
      # metric: matrix_bridge_up{node_id}  (Gauge, M7.1: labeled per node)
      #
      # `sum(matrix_bridge_up) == 0` alone yields no sample when the series does
      # not exist at all (process crashed / never started / scrape failing), so
      # the alert would stay silent in exactly that case. `absent()` covers the
      # "no series" case; the comparison covers "all nodes report 0".
      - alert: BridgeDown
        expr: absent(matrix_bridge_up) or (sum(matrix_bridge_up) == 0)
        for: 1m
        labels:
          severity: critical
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "Matrix Bridge DAGI is down"
          description: >
            `matrix_bridge_up` is absent or == 0 across all nodes — bridge
            process has not started or has crashed.
            No messages are being processed.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"

      # ── A2: Matrix sync errors spike ────────────────────────────────────────
      # metric: matrix_bridge_gateway_errors_total{error_type}  (Counter)
      # Fires when more than 3 sync_error increments are seen in a 5m window
      # and the condition holds for 2m.
      - alert: MatrixSyncErrors
        expr: >
          increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
        for: 2m
        labels:
          severity: warning
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "Matrix sync errors elevated"
          description: >
            More than 3 Matrix `/sync` errors (error_type=sync_error) in the
            last 5 minutes. May indicate Matrix homeserver problems or
            network issues.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"

      # ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
      # metric: matrix_bridge_messages_replied_total{status}  (Counter)
      # NOTE(review): the summary templates `$labels.node_id`, but the metric
      # line above lists only `status`. If the series carries no node_id label
      # the summary renders "node=" — confirm against metrics_contract.py.
      - alert: GatewayInvokeErrors
        expr: >
          increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
        for: 2m
        labels:
          severity: warning
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "Router invoke errors elevated (node={{ $labels.node_id }})"
          description: >
            More than 5 agent invocation errors (status=error) in the last
            5 minutes. Check Router/DeepSeek connectivity and logs.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"

      # ── A4: Queue drops ─────────────────────────────────────────────────────
      # metric: matrix_bridge_queue_dropped_total{room_id, agent_id}  (Counter)
      # Any positive drop rate sustained for 1m is alert-worthy.
      - alert: QueueDropsHigh
        expr: >
          rate(matrix_bridge_queue_dropped_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "Bridge queue is dropping messages"
          description: >
            `matrix_bridge_queue_dropped_total` is increasing — work queue is
            full and incoming messages are being dropped.
            Increase `BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"

      # ── A5: User-level rate limiting spike ──────────────────────────────────
      # metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type}  (Counter)
      # Threshold is a per-second rate (2/s) evaluated over a 5m window.
      - alert: RateLimitedSpike
        expr: >
          rate(matrix_bridge_rate_limited_total[5m]) > 2
        for: 3m
        labels:
          severity: warning
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "User rate limiting spike"
          description: >
            More than 2 messages/second are being rate-limited over 3 minutes.
            May indicate a flood attack, misbehaving client, or limits too low.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"

      # ── A6: Control channel rate limiting spike ──────────────────────────────
      # metric: matrix_bridge_control_rate_limited_total{scope}  (Counter)
      # Threshold is a per-second rate (0.5/s) evaluated over a 5m window.
      - alert: ControlRateLimitedSpike
        expr: >
          rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
        for: 3m
        labels:
          severity: warning
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "Control channel rate limiting elevated"
          description: >
            More than 0.5 control commands/second rejected by rate limiter
            over 3 minutes. May indicate operator tooling issues or abuse
            attempt.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"

      # ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
      # metric: matrix_bridge_dedupe_persistent_hits_total{room_id}  (Counter)
      # Uses a longer 10m rate window than the other rules, held for 5m.
      - alert: DedupeHitStorm
        expr: >
          rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
        for: 5m
        labels:
          severity: warning
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "Persistent deduplication hit rate elevated"
          description: >
            High rate of persistent dedupe hits — may indicate a Matrix resend
            storm or a client repeatedly retrying the same event_id.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"

      # ── A8: Invoke latency P95 high (per node) ───────────────────────────────
      # metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id}  (Histogram)
      # Buckets are summed by (node_id, le): agent_id is aggregated away and
      # `le` is kept because histogram_quantile() needs it. The result keeps
      # node_id, which the annotations template.
      - alert: InvokeLatencyP95High
        expr: >
          histogram_quantile(
            0.95,
            sum by (node_id, le) (
              rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
            )
          ) > 15
        for: 5m
        labels:
          severity: warning
          team: platform
          service: matrix-bridge-dagi
        annotations:
          summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
          description: >
            95th percentile invoke latency for node `{{ $labels.node_id }}`
            exceeds 15 seconds over the last 5 minutes. Check Router load,
            DeepSeek API, Ollama/Swapper queue.
          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"