Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
159 lines
7.8 KiB
YAML
159 lines
7.8 KiB
YAML
---
|
|
# Prometheus alert rules — Matrix Bridge DAGI
|
|
# Phase M7.1 (metrics contract hardening)
|
|
#
|
|
# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
|
|
# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
|
|
#
|
|
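# Usage:
#   promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
#
#   Or, without a local promtool install (the image's default entrypoint is
#   `prometheus`, so promtool must be selected explicitly):
#   docker run --rm -v "$PWD:/w" --entrypoint promtool prom/prometheus:latest \
#     check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml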
|
|
|
|
groups:
|
|
- name: matrix_bridge_dagi
|
|
interval: 30s
|
|
rules:
|
|
|
|
# ── A1: Bridge process down ─────────────────────────────────────────────
# metric: matrix_bridge_up{node_id}  (Gauge, M7.1: labeled per node)
#
# Fires when the summed gauge is 0 across all nodes OR when no
# matrix_bridge_up series exists at all. The `or vector(0)` fallback is
# required: with no series present (target never started, crashed and went
# stale, or was dropped from scrape config) `sum(...)` returns an empty
# result, and a bare `sum(...) == 0` comparison would never fire.
- alert: BridgeDown
  expr: (sum(matrix_bridge_up) or vector(0)) == 0
  for: 1m
  labels:
    severity: critical
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "Matrix Bridge DAGI is down"
    description: >
      `matrix_bridge_up` is 0 across all nodes, or the metric is absent
      entirely — bridge process has not started or has crashed. No messages
      are being processed.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"
|
|
|
|
# ── A2: Matrix sync errors spike ────────────────────────────────────────
# metric: matrix_bridge_gateway_errors_total{error_type}  (Counter)
#
# Warns when the counter filtered to error_type="sync_error" grows by more
# than 3 inside a 5m window and the condition holds for 2m.
- alert: MatrixSyncErrors
  expr: >
    increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
  for: 2m
  labels:
    severity: warning
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "Matrix sync errors elevated"
    description: >
      More than 3 Matrix `/sync` errors (error_type=sync_error) in the last 5
      minutes. May indicate Matrix homeserver problems or network issues.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"
|
|
|
|
# ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
# metric: matrix_bridge_messages_replied_total{status}  (Counter)
#
# Warns when replies with status="error" grow by more than 5 inside a 5m
# window and the condition holds for 2m.
# NOTE(review): the summary interpolates `$labels.node_id`, but the metric
# line above lists only `status` as a label. If the series carries no
# node_id label the summary renders as "(node=)" — confirm against
# metrics_contract.py.
- alert: GatewayInvokeErrors
  expr: >
    increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
  for: 2m
  labels:
    severity: warning
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "Router invoke errors elevated (node={{ $labels.node_id }})"
    description: >
      More than 5 agent invocation errors (status=error) in the last 5
      minutes. Check Router/DeepSeek connectivity and logs.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"
|
|
|
|
# ── A4: Queue drops ─────────────────────────────────────────────────────
# metric: matrix_bridge_queue_dropped_total{room_id, agent_id}  (Counter)
#
# Warns as soon as any series of the drop counter shows a non-zero 5m rate
# that persists for 1m. The expression has no aggregation, so each label
# combination is evaluated (and alerts) on its own.
# NOTE(review): the runbook anchor is `#a4-queuedrops`, while the other rules
# in this file use `#aN-<alertname lowercased>` — verify the anchor exists in
# the runbook.
- alert: QueueDropsHigh
  expr: >
    rate(matrix_bridge_queue_dropped_total[5m]) > 0
  for: 1m
  labels:
    severity: warning
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "Bridge queue is dropping messages"
    description: >
      `matrix_bridge_queue_dropped_total` is increasing — work queue is full
      and incoming messages are being dropped. Increase
      `BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"
|
|
|
|
# ── A5: User-level rate limiting spike ──────────────────────────────────
# metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type}  (Counter)
#
# Warns when the 5m rate of rate-limited messages stays above 2/s for 3m.
# The expression is not aggregated, so the threshold applies to each
# individual series rather than to the bridge-wide total.
- alert: RateLimitedSpike
  expr: >
    rate(matrix_bridge_rate_limited_total[5m]) > 2
  for: 3m
  labels:
    severity: warning
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "User rate limiting spike"
    description: >
      More than 2 messages/second are being rate-limited over 3 minutes. May
      indicate a flood attack, misbehaving client, or limits too low.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"
|
|
|
|
# ── A6: Control channel rate limiting spike ──────────────────────────────
# metric: matrix_bridge_control_rate_limited_total{scope}  (Counter)
#
# Warns when the 5m rate of rejected control commands stays above 0.5/s
# for 3m.
- alert: ControlRateLimitedSpike
  expr: >
    rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
  for: 3m
  labels:
    severity: warning
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "Control channel rate limiting elevated"
    description: >
      More than 0.5 control commands/second rejected by rate limiter over 3
      minutes. May indicate operator tooling issues or abuse attempt.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"
|
|
|
|
# ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
# metric: matrix_bridge_dedupe_persistent_hits_total{room_id}  (Counter)
#
# Warns when the 10m rate of persistent dedupe hits stays above 0.5/s for
# 5m. Uses a longer rate window and hold time than the other rate-based
# rules in this file.
- alert: DedupeHitStorm
  expr: >
    rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
  for: 5m
  labels:
    severity: warning
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "Persistent deduplication hit rate elevated"
    description: >
      High rate of persistent dedupe hits — may indicate a Matrix resend storm
      or a client repeatedly retrying the same event_id.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"
|
|
|
|
# ── A8: Invoke latency P95 high (per node) ───────────────────────────────
# metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id}  (Histogram)
#
# Computes the 95th percentile from the 5m bucket rates, aggregated by
# node_id (`le` is kept because histogram_quantile needs it; agent_id is
# summed away). Warns when that P95 stays above 15s for 5m, one alert per
# node.
- alert: InvokeLatencyP95High
  expr: >
    histogram_quantile(
      0.95,
      sum by (node_id, le) (
        rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
      )
    ) > 15
  for: 5m
  labels:
    severity: warning
    team: platform
    service: matrix-bridge-dagi
  annotations:
    summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
    description: >
      95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds
      15 seconds over the last 5 minutes. Check Router load, DeepSeek API,
      Ollama/Swapper queue.
    runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"
|