feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)

Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED, default false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 07:51:37 -08:00
parent fe6e3d30ae
commit 82d5ff2a4f
21 changed files with 9123 additions and 93 deletions

View File

@@ -0,0 +1,986 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "9.0.0"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
},
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "doc",
"includeVars": false,
"keepTime": false,
"tags": [],
"targetBlank": true,
"title": "Runbook",
"tooltip": "matrix-bridge-dagi-ops.md",
"type": "link",
"url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md"
}
],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Bridge Up",
"gridPos": {
"x": 0,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(matrix_bridge_up)",
"legendFormat": "up (all nodes)",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none",
"textMode": "auto",
"orientation": "auto"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"type": "value",
"options": {
"0": {
"text": "DOWN",
"color": "red"
},
"1": {
"text": "UP",
"color": "green"
}
}
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
}
},
"overrides": []
}
},
{
"id": 2,
"type": "stat",
"title": "Queue Size",
"gridPos": {
"x": 4,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_queue_size",
"legendFormat": "queue",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "area",
"textMode": "auto"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 100
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 3,
"type": "stat",
"title": "Active Rate-Limiter Rooms",
"gridPos": {
"x": 8,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_rate_limiter_active_rooms",
"legendFormat": "rooms",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 4,
"type": "stat",
"title": "Active Room-Agent Locks",
"gridPos": {
"x": 12,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_active_room_agent_locks",
"legendFormat": "locks",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 5,
"type": "stat",
"title": "Drops (5m)",
"gridPos": {
"x": 16,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "dropped",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 6,
"type": "stat",
"title": "Errors (5m)",
"gridPos": {
"x": 20,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "errors",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 10,
"type": "timeseries",
"title": "Traffic: Received & Replied (rate/5m)",
"gridPos": {
"x": 0,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "received",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))",
"legendFormat": "replied ok",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))",
"legendFormat": "replied error",
"refId": "C"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "replied error"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
}
},
{
"id": 11,
"type": "timeseries",
"title": "Errors / Drops / Rate-Limited (rate/5m)",
"gridPos": {
"x": 12,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "gw_error: {{ error_type }}",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "queue_dropped",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))",
"legendFormat": "rate_limited",
"refId": "C"
},
{
"expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))",
"legendFormat": "route_rejected: {{ reason }}",
"refId": "D"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 15,
"drawStyle": "line",
"stacking": {
"mode": "none"
},
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 20,
"type": "timeseries",
"title": "Invoke Latency P50 / P95 by Node",
"gridPos": {
"x": 0,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p50 {{ node_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 10
},
{
"color": "red",
"value": 20
}
]
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 21,
"type": "timeseries",
"title": "Queue Wait P50 / P95",
"gridPos": {
"x": 12,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p50 {{ agent_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p95 {{ agent_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 30,
"type": "timeseries",
"title": "Node Routing: Routed & Rejected by Node (rate/5m)",
"gridPos": {
"x": 0,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))",
"legendFormat": "routed {{ node_id }}",
"refId": "A"
},
{
"expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))",
"legendFormat": "rejected {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 31,
"type": "timeseries",
"title": "Persistent Dedupe Hits / Inserts (rate/10m)",
"gridPos": {
"x": 12,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))",
"legendFormat": "dedupe_hits",
"refId": "A"
},
{
"expr": "rate(matrix_bridge_dedupe_persistent_inserts_total[10m])",
"legendFormat": "dedupe_inserts",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 40,
"type": "timeseries",
"title": "Control Commands (rate/5m)",
"gridPos": {
"x": 0,
"y": 27,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))",
"legendFormat": "cmd {{ verb }}",
"refId": "A"
},
{
"expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))",
"legendFormat": "ctrl_ratelimited {{ scope }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 41,
"type": "timeseries",
"title": "Traffic by Agent (received rate/5m)",
"gridPos": {
"x": 12,
"y": 27,
"w": 24,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "{{ agent_id }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 42,
"type": "timeseries",
"title": "Routing Reasons by Agent (rate/5m)",
"description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown",
"gridPos": {
"x": 0,
"y": 34,
"w": 24,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))",
"legendFormat": "{{ agent_id }} / {{ reason }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": [
"matrix-bridge",
"dagi",
"daarion"
],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "UTC",
"title": "Matrix Bridge DAGI",
"uid": "matrix-bridge-dagi-v1",
"version": 1
}

View File

@@ -0,0 +1,158 @@
---
# Prometheus alert rules — Matrix Bridge DAGI
# Phase M7.1 (metrics contract hardening)
#
# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
#
# Usage:
# promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
# docker run --rm -v $PWD:/w prom/prometheus:latest \
# promtool check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
groups:
- name: matrix_bridge_dagi
interval: 30s
rules:
# ── A1: Bridge process down ─────────────────────────────────────────────
# metric: matrix_bridge_up{node_id} (Gauge, M7.1: labeled per node)
- alert: BridgeDown
expr: sum(matrix_bridge_up) == 0
for: 1m
labels:
severity: critical
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Matrix Bridge DAGI is down"
description: >
`matrix_bridge_up` == 0 across all nodes — bridge process has not
started or has crashed. No messages are being processed.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"
# ── A2: Matrix sync errors spike ────────────────────────────────────────
# metric: matrix_bridge_gateway_errors_total{error_type} (Counter)
- alert: MatrixSyncErrors
expr: >
increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
for: 2m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Matrix sync errors elevated"
description: >
More than 3 Matrix `/sync` errors (error_type=sync_error) in the last
5 minutes. May indicate Matrix homeserver problems or network issues.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"
# ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
# metric: matrix_bridge_messages_replied_total{status} (Counter)
- alert: GatewayInvokeErrors
expr: >
increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
for: 2m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Router invoke errors elevated (node={{ $labels.node_id }})"
description: >
More than 5 agent invocation errors (status=error) in the last 5 minutes.
Check Router/DeepSeek connectivity and logs.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"
# ── A4: Queue drops ─────────────────────────────────────────────────────
# metric: matrix_bridge_queue_dropped_total{room_id, agent_id} (Counter)
- alert: QueueDropsHigh
expr: >
rate(matrix_bridge_queue_dropped_total[5m]) > 0
for: 1m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Bridge queue is dropping messages"
description: >
`matrix_bridge_queue_dropped_total` is increasing — work queue is full
and incoming messages are being dropped. Increase
`BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"
# ── A5: User-level rate limiting spike ──────────────────────────────────
# metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type} (Counter)
- alert: RateLimitedSpike
expr: >
rate(matrix_bridge_rate_limited_total[5m]) > 2
for: 3m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "User rate limiting spike"
description: >
More than 2 messages/second are being rate-limited over 3 minutes.
May indicate a flood attack, misbehaving client, or limits too low.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"
# ── A6: Control channel rate limiting spike ──────────────────────────────
# metric: matrix_bridge_control_rate_limited_total{scope} (Counter)
- alert: ControlRateLimitedSpike
expr: >
rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
for: 3m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Control channel rate limiting elevated"
description: >
More than 0.5 control commands/second rejected by rate limiter over
3 minutes. May indicate operator tooling issues or abuse attempt.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"
# ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
# metric: matrix_bridge_dedupe_persistent_hits_total{room_id} (Counter)
- alert: DedupeHitStorm
expr: >
rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
for: 5m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Persistent deduplication hit rate elevated"
description: >
High rate of persistent dedupe hits — may indicate a Matrix resend
storm or a client repeatedly retrying the same event_id.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"
# ── A8: Invoke latency P95 high (per node) ───────────────────────────────
# metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id} (Histogram)
- alert: InvokeLatencyP95High
expr: >
histogram_quantile(
0.95,
sum by (node_id, le) (
rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
)
) > 15
for: 5m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
description: >
95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds
15 seconds over the last 5 minutes. Check Router load, DeepSeek API,
Ollama/Swapper queue.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"

View File

@@ -0,0 +1,401 @@
# matrix-bridge-dagi — Soak & Failure Rehearsal Runbook (M11)
**Phase:** M11
**Applies to:** `matrix-bridge-dagi` service on NODA1
**When to run:** Before any production traffic increase, after major code changes, or on a recurring monthly basis.
---
## 1. Goals
| Goal | Measurable pass criterion |
|------|--------------------------|
| Latency under load | p95 invoke < 5 000 ms |
| Queue stability | drop rate < 1% |
| Failover correctness | failover fires on NODA1 outage; NODA2 serves all remaining messages |
| Sticky anti-flap | sticky set after first failover; no re-tries to degraded node |
| Restart recovery | sticky + health snapshot reloads within 10 s of restart |
| Policy operations safe under load | `!policy history` / `!policy change` work while messages in-flight |
---
## 2. Prerequisites
```bash
# On NODA1 or local machine with network access to bridge
pip install httpx
# Verify bridge is up
curl -s http://localhost:9400/health | jq '.ok'
# Expected: true
# Verify /metrics endpoint
curl -s http://localhost:9400/metrics | grep matrix_bridge_up
# Expected: matrix_bridge_up{...} 1
```
---
## 2a. Enabling the Soak Inject Endpoint
The soak script uses `POST /v1/debug/inject_event` which is **disabled by default**.
Enable it only on staging/NODA1 soak runs:
```bash
# On NODA1 — edit docker-compose override or pass env inline:
# Option 1: temporary inline restart
DEBUG_INJECT_ENABLED=true docker-compose \
-f docker-compose.matrix-bridge-node1.yml \
up -d --no-deps matrix-bridge-dagi
# Option 2: .env file override
echo "DEBUG_INJECT_ENABLED=true" >> .env.soak
docker-compose --env-file .env.soak \
-f docker-compose.matrix-bridge-node1.yml \
up -d --no-deps matrix-bridge-dagi
# Verify it's enabled (should return 200, not 403)
curl -s -X POST http://localhost:9400/v1/debug/inject_event \
-H 'Content-Type: application/json' \
-d '{"room_id":"!test:test","event":{}}' | jq .
# Expected: {"ok":false,"error":"no mapping for room_id=..."} ← 200, not 403
# IMPORTANT: disable after soak
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d --no-deps matrix-bridge-dagi
# (DEBUG_INJECT_ENABLED defaults to false)
```
---
## 2b. Step 0 (WORKERS=2 / QUEUE=100) — Record True Baseline
**Goal:** snapshot the "before any tuning" numbers to have a comparison point.
```bash
# 0. Confirm current config (should be defaults)
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
# Expected: {"workers": 2, "queue_max": 100}
# 1. DB path for WAL check (adjust to your BRIDGE_DATA_DIR)
DB=/opt/microdao-daarion/data/matrix_bridge.db
# 2. WAL size before (manual check)
ls -lh ${DB}-wal 2>/dev/null || echo "(no WAL file yet — first run)"
sqlite3 $DB "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || echo "(no sqlite3)"
# 3. Run Step 0 soak
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--agent sofiia \
--room-id "!your-room-id:your-server" \
--max-p95-ms 5000 \
--max-drop-rate 0.001 \
--db-path $DB \
--report-file /tmp/soak_step0_baseline.json
# 4. Record result in "Baseline numbers" table (section 10) below.
jq '.summary, .latency, .metrics_delta, .wal' /tmp/soak_step0_baseline.json
```
**v1 Go/No-Go thresholds for Step 0:**
| Metric | Green ✅ | Yellow ⚠️ | Red ❌ |
|--------|---------|-----------|-------|
| `p95_invoke_ms` | < 3000 | 3000–5000 | > 5000 |
| `drop_rate` | 0.00% (mandatory) | — | > 0.1% |
| `error_rate` | < 1% | 1–3% | > 3% |
| `failovers` | 0 | — | ≥ 1 without cause |
| WAL delta | < 2 MB | 2–10 MB | > 10 MB |
**If Step 0 is Green → proceed to Step 1 tuning.**
**If Step 0 is Yellow/Red → investigate before touching WORKER_CONCURRENCY.**
---
## 2c. Step 1 (WORKERS=4 / QUEUE=200) — Tune-1
**Goal:** verify that doubling workers gives headroom without Router saturation.
```bash
# 1. Apply tuning
WORKER_CONCURRENCY=4 QUEUE_MAX_EVENTS=200 docker-compose \
-f docker-compose.matrix-bridge-node1.yml \
--env-file .env.soak \
up -d --no-deps matrix-bridge-dagi
sleep 3
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
# Expected: {"workers": 4, "queue_max": 200}
# 2. Run Step 1 soak (higher concurrency to stress the new headroom)
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 8 \
--agent sofiia \
--room-id "!your-room-id:your-server" \
--max-p95-ms 3000 \
--max-drop-rate 0.001 \
--db-path $DB \
--report-file /tmp/soak_step1_tune1.json
# 3. Compare Step 0 vs Step 1
python3 - <<'EOF'
import json
s0 = json.load(open('/tmp/soak_step0_baseline.json'))
s1 = json.load(open('/tmp/soak_step1_tune1.json'))
for k in ('p50', 'p95', 'p99'):
print(f"{k}: {s0['latency'][k]}ms → {s1['latency'][k]}ms")
print(f"drops: {s0['metrics_delta']['queue_drops']} → {s1['metrics_delta']['queue_drops']}")
print(f"WAL: {s0['wal'].get('delta_mb')} → {s1['wal'].get('delta_mb')} MB delta")
EOF
```
**Decision:**
- Step 1 Green → **freeze, tag v1.0, ship to production.**
- p95 within 5% of Step 0 → Router is bottleneck (not workers); don't go to Step 2.
- Queue drops > 0 at WORKERS=4 → try Step 2 (WORKERS=8, QUEUE=300).
---
## 3. Scenario A — Baseline load (100 messages, concurrency 4)
**Goal:** establish latency baseline, verify no drops under normal load.
```bash
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--max-p95-ms 3000 \
--report-file /tmp/soak_baseline.json
```
**Expected output:**
```
matrix-bridge-dagi Soak Report ✅ PASSED
Messages: 100 concurrency=4
Latency: p50=<500ms p95=<3000ms
Queue drops: 0 (rate 0.000%)
Failovers: 0
```
**If FAILED:**
- `p95 too high` → check router `/health`, DeepSeek API latency, `docker stats`
- `drop_rate > 0` → check `QUEUE_MAX_EVENTS` env var (increase if needed), inspect bridge logs
---
## 4. Scenario B — Queue saturation test
**Goal:** confirm drop metric fires cleanly and bridge doesn't crash.
```bash
# Reduce queue via env override, then flood:
QUEUE_MAX_EVENTS=5 docker-compose -f docker-compose.matrix-bridge-node1.yml \
up -d matrix-bridge-dagi
# Wait for restart
sleep 5
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 30 \
--concurrency 10 \
--max-drop-rate 0.99 \
--report-file /tmp/soak_queue_sat.json
# Restore normal queue size
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi
```
**Expected:** `queue_drops > 0`, bridge still running after the test.
**Verify in Prometheus/Grafana:**
```promql
rate(matrix_bridge_queue_dropped_total[1m])
```
Should spike and then return to 0.
---
## 5. Scenario C — Node failover rehearsal
**Goal:** simulate NODA1 router becoming unavailable, verify NODA2 takes over.
```bash
# Step 1: stop the router on NODA1 temporarily
docker pause dagi-router-node1
# Step 2: run soak against bridge (bridge will failover to NODA2)
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 20 \
--concurrency 2 \
--max-p95-ms 10000 \
--report-file /tmp/soak_failover.json
# Step 3: restore router
docker unpause dagi-router-node1
```
**Expected:**
```
Failovers: 1..20 (at least 1)
Sticky sets: 1+
Errors: 0 (fallback to NODA2 serves all messages)
```
**Check sticky in control room:**
```
!nodes
```
Should show `NODA2` sticky with remaining TTL.
**Check health tracker:**
```
!status
```
Should show `NODA1 state=degraded|down`.
---
## 6. Scenario D — Restart recovery
**Goal:** after restart, sticky and health state reload within one polling cycle.
```bash
# After Scenario C: sticky is set to NODA2
# Restart the bridge
docker restart dagi-matrix-bridge-node1
# Wait for startup (up to 30s)
sleep 15
# Verify sticky reloaded
curl -s http://localhost:9400/health | jq '.ha_state'
# Expected: {"sticky_loaded": N, ...}
# Verify routing still uses NODA2 sticky
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 10 \
--concurrency 2 \
--report-file /tmp/soak_restart.json
```
**Expected:** p95 similar to post-failover run, `Failovers: 0` (sticky already applied).
---
## 7. Scenario E — Rate limit burst
**Goal:** verify rate limiting fires and bridge doesn't silently drop below-limit messages.
```bash
# Set RPM very low for test, then flood from same sender
# This is best done in control room by observing !status rate_limited count
# rather than the soak script (which uses different senders per message).
# In Matrix control room:
# Send 30+ messages from the same user account in quick succession in a mixed room.
# Then:
!status
# Check: rate_limited_total increased, no queue drops.
```
---
## 8. Scenario F — Policy operations under load
**Goal:** `!policy history`, `!policy change`, and `!policy export` work while messages are in-flight.
```bash
# Run a background soak
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 200 \
--concurrency 2 \
--report-file /tmp/soak_concurrent_policy.json &
# While soak is running, in Matrix control room:
!policy history limit=5
!policy export
!status
```
**Expected:** all three commands respond immediately (< 2s), soak completes without extra drops.
---
## 9. Prometheus / Grafana during soak
Key queries for the Grafana dashboard:
```promql
# Throughput (messages/s)
rate(matrix_bridge_routed_total[30s])
# Error rate
rate(matrix_bridge_gateway_errors_total[30s])
# p95 invoke latency per node
histogram_quantile(0.95, rate(matrix_bridge_invoke_duration_seconds_bucket[1m]))
# Queue drops rate
rate(matrix_bridge_queue_dropped_total[1m])
# Failovers
rate(matrix_bridge_failover_total[5m])
```
Use the `matrix-bridge-dagi` Grafana dashboard at:
`ops/grafana/dashboards/matrix-bridge-dagi.json`
---
## 10. Baseline numbers (reference)
| Metric | Cold start | Warm (sticky set) |
|--------|-----------|-------------------|
| p50 latency | ~200ms | ~150ms |
| p95 latency | ~2 000ms | ~1 500ms |
| Queue drops | 0 (queue=100) | 0 |
| Failover fires | 1 per degradation | 0 after sticky |
| Policy ops response | < 500ms | < 500ms |
*Update this table after each soak run with actual measured values.*
---
## 11. CI soak (mocked, no network)
For CI pipelines, use the mocked soak scenarios:
```bash
python3 -m pytest tests/test_matrix_bridge_m11_soak_scenarios.py -v
```
Covers (all deterministic, no network):
- **S1** Queue saturation → drop counter
- **S2** Failover under load → on_failover callback, health tracker
- **S3** Sticky routing under burst → sticky set, burst routed to NODA2
- **S4** Multi-room isolation → separate rooms don't interfere
- **S5** Rate-limit burst → RL callback wired, no panic
- **S6** HA restart recovery → sticky + health snapshot persisted and reloaded
- **Perf baseline** 100-msg + 50-msg failover burst < 5s wall clock
---
## 12. Known failure modes & mitigations
| Symptom | Likely cause | Mitigation |
|---------|-------------|------------|
| `p95 > 5000ms` | Router/LLM slow | Increase `ROUTER_TIMEOUT_S`, check DeepSeek API |
| `drop_rate > 1%` | Queue too small | Increase `QUEUE_MAX_EVENTS` |
| `failovers > 0` but errors > 0 | Both nodes degraded | Check NODA1 + NODA2 health; scale router |
| Bridge crash during soak | Memory leak / bug | `docker logs` → file GitHub issue |
| Sticky not set after failover | `FAILOVER_STICKY_TTL_S=0` | Set to 300+ |
| Restart doesn't load sticky | `HA_HEALTH_MAX_AGE_S` too small | Increase or set to 3600 |

View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
"""
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
Usage:
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--report-file /tmp/soak_report.json
Requires: httpx (pip install httpx)
What it does:
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
(or directly to the router if --direct-router is set).
2. Measures latency (p50, p95, p99, max) per batch.
3. After the run, fetches /metrics and extracts key counters:
- matrix_bridge_queue_dropped_total
- matrix_bridge_rate_limited_total
- matrix_bridge_failover_total
- matrix_bridge_sticky_node_total
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
4. Prints a human-readable report and optionally writes JSON.
Exit codes:
0 = all pass criteria met
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
"""
import argparse
import asyncio
import json
import sys
import time
from typing import Any, Dict, List, Optional
# httpx is the script's only third-party dependency; without it we cannot
# talk to the bridge, so bail out early. Exit code 2 is deliberately distinct
# from the soak result codes (0 = pass, 1 = thresholds exceeded).
try:
    import httpx
except ImportError:
    print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
    sys.exit(2)
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
_DEFAULT_MAX_P95_MS = 5000  # 5 s p95 per invoke (generous for cold start)
_DEFAULT_MAX_DROP_RATE = 0.01  # 1% queue drops allowed
# ── Metrics parsing ────────────────────────────────────────────────────────────
def _parse_counter(text: str, name: str) -> float:
"""Extract the last reported value of a Prometheus counter by name."""
for line in text.splitlines():
if line.startswith(name + " ") or line.startswith(name + "{"):
parts = line.rsplit(None, 1)
try:
return float(parts[-1])
except (ValueError, IndexError):
pass
return 0.0
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
"""
Approximate histogram_quantile from _bucket lines.
Returns estimated value at given quantile or None if data missing.
"""
buckets: List[tuple] = []
total_count = 0.0
for line in text.splitlines():
if f"{name}_bucket" in line and 'le="' in line:
try:
le_part = line.split('le="')[1].split('"')[0]
le = float(le_part) if le_part != "+Inf" else float("inf")
val = float(line.rsplit(None, 1)[-1])
buckets.append((le, val))
except (ValueError, IndexError):
pass
elif (f"{name}_count " in line or (name + "_count{") in line):
try:
total_count = float(line.rsplit(None, 1)[-1])
except (ValueError, IndexError):
pass
if not buckets or total_count == 0:
return None
buckets.sort()
target = quantile * total_count
prev_le, prev_count = 0.0, 0.0
for le, count in buckets:
if count >= target:
if le == float("inf"):
return prev_le
# Linear interpolation
if count == prev_count:
return le
fraction = (target - prev_count) / (count - prev_count)
return prev_le + fraction * (le - prev_le)
prev_le, prev_count = le, count
return prev_le
# ── Soak runner ────────────────────────────────────────────────────────────────
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the inject endpoint is reachable and enabled.

    POSTs one synthetic "ping" event to /v1/debug/inject_event and maps the
    known failure shapes (403 = inject guard disabled, 5xx = server error,
    "no mapping" in the body = room id not configured) to actionable
    error strings.

    Returns "" on success, error message on failure.
    """
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": {"event_id": "!preflight", "sender": "@soak:test",
                  "content": {"msgtype": "m.text", "body": "ping"}}},
            timeout=5.0,
        )
        # 403 is the guard response when DEBUG_INJECT_ENABLED=false on the bridge.
        if resp.status_code == 403:
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                "   NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        # A 2xx body can still report failure; "no mapping" means the room id
        # is absent from the bridge's configured room map.
        data = resp.json()
        if not data.get("ok") and "no mapping" in data.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        # Catch-all (incl. a non-JSON success body) — report, don't crash preflight.
        return f"❌ Preflight failed: {exc}"
async def _check_wal(db_path: str) -> Dict[str, Any]:
"""
Run WAL size + checkpoint check on the bridge policy DB.
Returns dict with wal_bytes, wal_mb, checkpoint_result.
Requires sqlite3 CLI on PATH; gracefully skips if unavailable.
"""
import subprocess, shutil
result: Dict[str, Any] = {"db_path": db_path, "ok": False}
wal_path = db_path + "-wal"
try:
wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
result["wal_bytes"] = wal_bytes
result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
except OSError:
result["wal_bytes"] = -1
result["wal_mb"] = -1
if shutil.which("sqlite3"):
try:
cp = subprocess.run(
["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
capture_output=True, text=True, timeout=5,
)
# Output: busy|log|checkpointed (3 ints)
parts = cp.stdout.strip().split("|")
if len(parts) == 3:
result["wal_checkpoint"] = {
"busy": int(parts[0]), "log": int(parts[1]), "checkpointed": int(parts[2]),
}
result["ok"] = True
except Exception: # noqa: BLE001
result["ok"] = False
else:
result["sqlite3_missing"] = True
return result
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.

    Returns (latency_ms: float, status_code: int, error: str|None);
    status_code is 0 when the request never produced a response.
    """
    # NOTE(review): agent_id is accepted but never placed in the payload —
    # presumably the bridge derives the agent from the room mapping; confirm.
    event = {
        "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
        "sender": sender,
        "type": "m.room.message",
        "content": {"msgtype": "m.text", "body": message},
    }
    started = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": event},
            timeout=30.0,
        )
    except httpx.TimeoutException:
        return (time.monotonic() - started) * 1000, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        return (time.monotonic() - started) * 1000, 0, str(exc)
    elapsed_ms = (time.monotonic() - started) * 1000
    if resp.status_code >= 500:
        return elapsed_ms, resp.status_code, f"HTTP {resp.status_code}"
    return elapsed_ms, resp.status_code, None
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """GET /health from the bridge; returns the parsed JSON body, or {} on any failure."""
    try:
        response = await client.get(f"{url.rstrip('/')}/health", timeout=10.0)
        if response.status_code != 200:
            return {}
        return response.json()
    except Exception:  # noqa: BLE001
        return {}
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """GET /metrics from the bridge; returns the raw Prometheus text, or "" on any failure."""
    try:
        response = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0)
        if response.status_code != 200:
            return ""
        return response.text
    except Exception:  # noqa: BLE001
        return ""
def _percentile(values: List[float], p: float) -> float:
if not values:
return 0.0
sv = sorted(values)
idx = int(len(sv) * p / 100)
return sv[min(idx, len(sv) - 1)]
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """
    Execute one live soak run against a bridge instance.

    Flow: preflight the debug inject endpoint → snapshot metrics/health (and
    WAL, if db_path given) → send n_messages synthetic events with bounded
    concurrency → re-snapshot → compute client-side latency percentiles and
    Prometheus counter deltas → evaluate pass criteria.

    Args:
        url: bridge base URL (e.g. http://localhost:9400).
        n_messages: total synthetic events to send.
        concurrency: max in-flight requests (semaphore-bounded).
        agent_id: forwarded to _send_one for each synthetic event.
        room_id: Matrix room id; must match a configured bridge room mapping.
        sender: Matrix sender placed in each synthetic event.
        max_p95_ms: client-side p95 latency threshold for pass/fail.
        max_drop_rate: queue-drop rate threshold (0..1) for pass/fail.
        db_path: optional policy DB path enabling before/after WAL checks.

    Returns:
        Report dict with "wal", "summary", "latency_ms", "metrics_delta",
        "pass_criteria", plus "passed" (bool) and "failures" (strings).
        On preflight failure, a short {"ok": False, ...} dict instead.
    """
    results: List[tuple] = []
    semaphore = asyncio.Semaphore(concurrency)
    async with httpx.AsyncClient() as client:
        # Pre-check: inject endpoint + health
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}
        # WAL check before soak
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")
        # Pre-check: health
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)
        # Counter snapshots so post-run values can be reported as deltas.
        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")
        t_start = time.monotonic()
        async def worker(i: int):
            # One synthetic event per worker; the semaphore caps in-flight requests.
            async with semaphore:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f" [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")
        await asyncio.gather(*[worker(i) for i in range(n_messages)])
        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)
        # WAL check after soak
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)")
    # Client-side aggregation (results tuples are (latency_ms, status, error)).
    latencies = [r[0] for r in results]
    errors = [r for r in results if r[2] is not None]
    successes = len(results) - len(errors)
    error_rate = len(errors) / len(results) if results else 0.0
    drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
    rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
    fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
    sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")
    delta_drops = drops_after - drops_before
    delta_rl = rl_after - rl_before
    delta_fo = fo_after - fo_before
    p50 = _percentile(latencies, 50)
    p95 = _percentile(latencies, 95)
    p99 = _percentile(latencies, 99)
    p_max = max(latencies) if latencies else 0.0
    # Histogram quantile from Prometheus
    hist_p95 = _parse_histogram_quantile(
        metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
    )
    hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None
    drop_rate = delta_drops / len(results) if results else 0.0
    report = {
        "wal": {
            "before_mb": wal_before.get("wal_mb"),
            "after_mb": wal_after.get("wal_mb"),
            "delta_mb": round(
                (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
            ) if wal_before and wal_after else None,
            "checkpoint_after": wal_after.get("wal_checkpoint"),
            "threshold_mb": 10,
        },
        "summary": {
            "total_messages": n_messages,
            "concurrency": concurrency,
            "elapsed_s": round(elapsed_s, 2),
            "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
            "successes": successes,
            "errors": len(errors),
            "error_rate": round(error_rate, 4),
        },
        "latency_ms": {
            "p50": round(p50, 1),
            "p95": round(p95, 1),
            "p99": round(p99, 1),
            "max": round(p_max, 1),
        },
        "metrics_delta": {
            "queue_drops": int(delta_drops),
            "rate_limited": int(delta_rl),
            "failovers": int(delta_fo),
            # NOTE(review): this is the absolute counter value, not a
            # before/after delta like the other entries — confirm intent.
            "sticky_sets": int(sticky_after),
            "drop_rate": round(drop_rate, 4),
        },
        # NOTE(review): the truthiness check means a legitimate 0.0ms p95
        # would be reported as None — verify whether `is not None` was meant.
        "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms else None,
        "health_before": health_before.get("ok"),
        "health_after": health_after.get("ok"),
        "pass_criteria": {
            "max_p95_ms": max_p95_ms,
            "max_drop_rate": max_drop_rate,
        },
    }
    # Pass/fail evaluation
    failures = []
    if p95 > max_p95_ms:
        failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
    if drop_rate > max_drop_rate:
        failures.append(
            f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
        )
    wal_delta = report["wal"]["delta_mb"]
    if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
        failures.append(
            f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
            "— possible SQLite write pressure (Bottleneck #2)"
        )
    report["passed"] = len(failures) == 0
    report["failures"] = failures
    return report
def _print_report(r: Dict[str, Any]) -> None:
s = r["summary"]
l = r["latency_ms"]
m = r["metrics_delta"]
passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
w = r.get("wal", {})
print()
print("=" * 60)
print(f" matrix-bridge-dagi Soak Report {passed}")
print("=" * 60)
print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
print()
print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
f"p99={l['p99']}ms max={l['max']}ms")
if r["prometheus_invoke_p95_ms"] is not None:
print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
print()
print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
print(f" Rate-limited: {m['rate_limited']}")
print(f" Failovers: {m['failovers']}")
print(f" Sticky sets: {m['sticky_sets']}")
if w.get("before_mb") is not None:
wal_delta_str = (
f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
)
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
print()
if r["failures"]:
for f in r["failures"]:
print(f"{f}")
else:
print(" All pass criteria met.")
print("=" * 60)
def main() -> int:
    """Parse CLI args, run the soak, print the report, return the exit code (0 pass / 1 fail)."""
    ap = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
    ap.add_argument("--url", default="http://localhost:9400",
                    help="Bridge base URL (default: http://localhost:9400)")
    ap.add_argument("--messages", type=int, default=100,
                    help="Total messages to send (default: 100)")
    ap.add_argument("--concurrency", type=int, default=4,
                    help="Concurrent requests (default: 4)")
    ap.add_argument("--agent-id", default="sofiia",
                    help="Agent id for synthetic events (default: sofiia)")
    ap.add_argument("--room-id", default="!soak-room:home.invalid",
                    help="Room id for synthetic events")
    ap.add_argument("--sender", default="@soak-user:home.invalid",
                    help="Sender for synthetic events")
    ap.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
                    help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
    ap.add_argument("--max-drop-rate", type=float, default=_DEFAULT_MAX_DROP_RATE,
                    help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
    ap.add_argument("--report-file", default="",
                    help="Optional path to write JSON report")
    ap.add_argument("--db-path", default="",
                    help="Path to policy_store.db for WAL check "
                         "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
    opts = ap.parse_args()
    # Drive the async soak to completion on a fresh event loop.
    report = asyncio.run(run_soak(
        url=opts.url,
        n_messages=opts.messages,
        concurrency=opts.concurrency,
        agent_id=opts.agent_id,
        room_id=opts.room_id,
        sender=opts.sender,
        max_p95_ms=opts.max_p95_ms,
        max_drop_rate=opts.max_drop_rate,
        db_path=opts.db_path,
    ))
    _print_report(report)
    if opts.report_file:
        with open(opts.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n Report saved: {opts.report_file}")
    return 0 if report["passed"] else 1
# Script entry point — exit status: 0 = pass criteria met, 1 = thresholds
# exceeded (2 is emitted earlier, at import time, when httpx is missing).
if __name__ == "__main__":
    sys.exit(main())