feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
This commit is contained in:
986
ops/grafana/dashboards/matrix-bridge-dagi.json
Normal file
986
ops/grafana/dashboards/matrix-bridge-dagi.json
Normal file
@@ -0,0 +1,986 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__elements": {},
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "9.0.0"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "stat",
|
||||
"name": "Stat",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [
|
||||
{
|
||||
"asDropdown": false,
|
||||
"icon": "doc",
|
||||
"includeVars": false,
|
||||
"keepTime": false,
|
||||
"tags": [],
|
||||
"targetBlank": true,
|
||||
"title": "Runbook",
|
||||
"tooltip": "matrix-bridge-dagi-ops.md",
|
||||
"type": "link",
|
||||
"url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md"
|
||||
}
|
||||
],
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Bridge Up",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"w": 4,
|
||||
"h": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(matrix_bridge_up)",
|
||||
"legendFormat": "up (all nodes)",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
},
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"textMode": "auto",
|
||||
"orientation": "auto"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Queue Size",
|
||||
"gridPos": {
|
||||
"x": 4,
|
||||
"y": 0,
|
||||
"w": 4,
|
||||
"h": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "matrix_bridge_queue_size",
|
||||
"legendFormat": "queue",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
},
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"textMode": "auto"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 50
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 100
|
||||
}
|
||||
]
|
||||
},
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "Active Rate-Limiter Rooms",
|
||||
"gridPos": {
|
||||
"x": 8,
|
||||
"y": 0,
|
||||
"w": 4,
|
||||
"h": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "matrix_bridge_rate_limiter_active_rooms",
|
||||
"legendFormat": "rooms",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Active Room-Agent Locks",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 0,
|
||||
"w": 4,
|
||||
"h": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "matrix_bridge_active_room_agent_locks",
|
||||
"legendFormat": "locks",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
},
|
||||
"colorMode": "value",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Drops (5m)",
|
||||
"gridPos": {
|
||||
"x": 16,
|
||||
"y": 0,
|
||||
"w": 4,
|
||||
"h": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))",
|
||||
"legendFormat": "dropped",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
},
|
||||
"colorMode": "background",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "Errors (5m)",
|
||||
"gridPos": {
|
||||
"x": 20,
|
||||
"y": 0,
|
||||
"w": 4,
|
||||
"h": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))",
|
||||
"legendFormat": "errors",
|
||||
"refId": "A",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
},
|
||||
"colorMode": "background",
|
||||
"graphMode": "none"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 5
|
||||
}
|
||||
]
|
||||
},
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "timeseries",
|
||||
"title": "Traffic: Received & Replied (rate/5m)",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 4,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(matrix_bridge_messages_received_total[5m]))",
|
||||
"legendFormat": "received",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))",
|
||||
"legendFormat": "replied ok",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))",
|
||||
"legendFormat": "replied error",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "replied error"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "color",
|
||||
"value": {
|
||||
"mode": "fixed",
|
||||
"fixedColor": "red"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "timeseries",
|
||||
"title": "Errors / Drops / Rate-Limited (rate/5m)",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 4,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))",
|
||||
"legendFormat": "gw_error: {{ error_type }}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))",
|
||||
"legendFormat": "queue_dropped",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))",
|
||||
"legendFormat": "rate_limited",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))",
|
||||
"legendFormat": "route_rejected: {{ reason }}",
|
||||
"refId": "D"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 15,
|
||||
"drawStyle": "line",
|
||||
"stacking": {
|
||||
"mode": "none"
|
||||
},
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "timeseries",
|
||||
"title": "Invoke Latency P50 / P95 by Node",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 12,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
|
||||
"legendFormat": "p50 {{ node_id }}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
|
||||
"legendFormat": "p95 {{ node_id }}",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max",
|
||||
"last"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 10
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 20
|
||||
}
|
||||
]
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Queue Wait P50 / P95",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 12,
|
||||
"w": 12,
|
||||
"h": 8
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
|
||||
"legendFormat": "wait p50 {{ agent_id }}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
|
||||
"legendFormat": "wait p95 {{ agent_id }}",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 5,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "timeseries",
|
||||
"title": "Node Routing: Routed & Rejected by Node (rate/5m)",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 20,
|
||||
"w": 12,
|
||||
"h": 7
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))",
|
||||
"legendFormat": "routed {{ node_id }}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))",
|
||||
"legendFormat": "rejected {{ node_id }}",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "timeseries",
|
||||
"title": "Persistent Dedupe Hits / Inserts (rate/10m)",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 20,
|
||||
"w": 12,
|
||||
"h": 7
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))",
|
||||
"legendFormat": "dedupe_hits",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(matrix_bridge_dedupe_persistent_inserts_total[10m])",
|
||||
"legendFormat": "dedupe_inserts",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"type": "timeseries",
|
||||
"title": "Control Commands (rate/5m)",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 27,
|
||||
"w": 12,
|
||||
"h": 7
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))",
|
||||
"legendFormat": "cmd {{ verb }}",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))",
|
||||
"legendFormat": "ctrl_ratelimited {{ scope }}",
|
||||
"refId": "B"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "timeseries",
|
||||
"title": "Traffic by Agent (received rate/5m)",
|
||||
"gridPos": {
|
||||
"x": 12,
|
||||
"y": 27,
|
||||
"w": 24,
|
||||
"h": 7
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))",
|
||||
"legendFormat": "{{ agent_id }}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max",
|
||||
"last"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Routing Reasons by Agent (rate/5m)",
|
||||
"description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown",
|
||||
"gridPos": {
|
||||
"x": 0,
|
||||
"y": 34,
|
||||
"w": 24,
|
||||
"h": 7
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))",
|
||||
"legendFormat": "{{ agent_id }} / {{ reason }}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
},
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "bottom",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps",
|
||||
"custom": {
|
||||
"lineWidth": 2,
|
||||
"fillOpacity": 10,
|
||||
"drawStyle": "line",
|
||||
"spanNulls": false
|
||||
},
|
||||
"color": {
|
||||
"mode": "palette-classic"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
}
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"tags": [
|
||||
"matrix-bridge",
|
||||
"dagi",
|
||||
"daarion"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Datasource",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "UTC",
|
||||
"title": "Matrix Bridge DAGI",
|
||||
"uid": "matrix-bridge-dagi-v1",
|
||||
"version": 1
|
||||
}
|
||||
158
ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
Normal file
158
ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
Normal file
@@ -0,0 +1,158 @@
|
||||
---
|
||||
# Prometheus alert rules — Matrix Bridge DAGI
|
||||
# Phase M7.1 (metrics contract hardening)
|
||||
#
|
||||
# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
|
||||
# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
|
||||
#
|
||||
# Usage:
|
||||
# promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
|
||||
# docker run --rm -v $PWD:/w prom/prometheus:latest \
|
||||
# promtool check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
|
||||
|
||||
groups:
|
||||
- name: matrix_bridge_dagi
|
||||
interval: 30s
|
||||
rules:
|
||||
|
||||
# ── A1: Bridge process down ─────────────────────────────────────────────
|
||||
# metric: matrix_bridge_up{node_id} (Gauge, M7.1: labeled per node)
|
||||
- alert: BridgeDown
|
||||
expr: sum(matrix_bridge_up) == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "Matrix Bridge DAGI is down"
|
||||
description: >
|
||||
`matrix_bridge_up` == 0 across all nodes — bridge process has not
|
||||
started or has crashed. No messages are being processed.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"
|
||||
|
||||
# ── A2: Matrix sync errors spike ────────────────────────────────────────
|
||||
# metric: matrix_bridge_gateway_errors_total{error_type} (Counter)
|
||||
- alert: MatrixSyncErrors
|
||||
expr: >
|
||||
increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "Matrix sync errors elevated"
|
||||
description: >
|
||||
More than 3 Matrix `/sync` errors (error_type=sync_error) in the last
|
||||
5 minutes. May indicate Matrix homeserver problems or network issues.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"
|
||||
|
||||
# ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
|
||||
# metric: matrix_bridge_messages_replied_total{status} (Counter)
|
||||
- alert: GatewayInvokeErrors
|
||||
expr: >
|
||||
increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "Router invoke errors elevated (node={{ $labels.node_id }})"
|
||||
description: >
|
||||
More than 5 agent invocation errors (status=error) in the last 5 minutes.
|
||||
Check Router/DeepSeek connectivity and logs.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"
|
||||
|
||||
# ── A4: Queue drops ─────────────────────────────────────────────────────
|
||||
# metric: matrix_bridge_queue_dropped_total{room_id, agent_id} (Counter)
|
||||
- alert: QueueDropsHigh
|
||||
expr: >
|
||||
rate(matrix_bridge_queue_dropped_total[5m]) > 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "Bridge queue is dropping messages"
|
||||
description: >
|
||||
`matrix_bridge_queue_dropped_total` is increasing — work queue is full
|
||||
and incoming messages are being dropped. Increase
|
||||
`BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"
|
||||
|
||||
# ── A5: User-level rate limiting spike ──────────────────────────────────
|
||||
# metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type} (Counter)
|
||||
- alert: RateLimitedSpike
|
||||
expr: >
|
||||
rate(matrix_bridge_rate_limited_total[5m]) > 2
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "User rate limiting spike"
|
||||
description: >
|
||||
More than 2 messages/second are being rate-limited over 3 minutes.
|
||||
May indicate a flood attack, misbehaving client, or limits too low.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"
|
||||
|
||||
# ── A6: Control channel rate limiting spike ──────────────────────────────
|
||||
# metric: matrix_bridge_control_rate_limited_total{scope} (Counter)
|
||||
- alert: ControlRateLimitedSpike
|
||||
expr: >
|
||||
rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "Control channel rate limiting elevated"
|
||||
description: >
|
||||
More than 0.5 control commands/second rejected by rate limiter over
|
||||
3 minutes. May indicate operator tooling issues or abuse attempt.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"
|
||||
|
||||
# ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
|
||||
# metric: matrix_bridge_dedupe_persistent_hits_total{room_id} (Counter)
|
||||
- alert: DedupeHitStorm
|
||||
expr: >
|
||||
rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "Persistent deduplication hit rate elevated"
|
||||
description: >
|
||||
High rate of persistent dedupe hits — may indicate a Matrix resend
|
||||
storm or a client repeatedly retrying the same event_id.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"
|
||||
|
||||
# ── A8: Invoke latency P95 high (per node) ───────────────────────────────
|
||||
# metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id} (Histogram)
|
||||
- alert: InvokeLatencyP95High
|
||||
expr: >
|
||||
histogram_quantile(
|
||||
0.95,
|
||||
sum by (node_id, le) (
|
||||
rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
|
||||
)
|
||||
) > 15
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
team: platform
|
||||
service: matrix-bridge-dagi
|
||||
annotations:
|
||||
summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
|
||||
description: >
|
||||
95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds
|
||||
15 seconds over the last 5 minutes. Check Router load, DeepSeek API,
|
||||
Ollama/Swapper queue.
|
||||
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"
|
||||
401
ops/runbook-matrix-bridge-soak.md
Normal file
401
ops/runbook-matrix-bridge-soak.md
Normal file
@@ -0,0 +1,401 @@
|
||||
# matrix-bridge-dagi — Soak & Failure Rehearsal Runbook (M11)
|
||||
|
||||
**Phase:** M11
|
||||
**Applies to:** `matrix-bridge-dagi` service on NODA1
|
||||
**When to run:** Before any production traffic increase, after major code changes, or on a recurring monthly basis.
|
||||
|
||||
---
|
||||
|
||||
## 1. Goals
|
||||
|
||||
| Goal | Measurable pass criterion |
|
||||
|------|--------------------------|
|
||||
| Latency under load | p95 invoke < 5 000 ms |
|
||||
| Queue stability | drop rate < 1% |
|
||||
| Failover correctness | failover fires on NODA1 outage; NODA2 serves all remaining messages |
|
||||
| Sticky anti-flap | sticky set after first failover; no re-tries to degraded node |
|
||||
| Restart recovery | sticky + health snapshot reloads within 10 s of restart |
|
||||
| Policy operations safe under load | `!policy history` / `!policy change` work while messages in-flight |
|
||||
|
||||
---
|
||||
|
||||
## 2. Prerequisites
|
||||
|
||||
```bash
|
||||
# On NODA1 or local machine with network access to bridge
|
||||
pip install httpx
|
||||
|
||||
# Verify bridge is up
|
||||
curl -s http://localhost:9400/health | jq '.ok'
|
||||
# Expected: true
|
||||
|
||||
# Verify /metrics endpoint
|
||||
curl -s http://localhost:9400/metrics | grep matrix_bridge_up
|
||||
# Expected: matrix_bridge_up{...} 1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2a. Enabling the Soak Inject Endpoint
|
||||
|
||||
The soak script uses `POST /v1/debug/inject_event`, which is **disabled by default**. Every soak invocation must also pass `--room-id` matching a configured `BRIDGE_ROOM_MAP` entry — the script's default room id fails preflight with "no mapping".
|
||||
Enable it only on staging/NODA1 soak runs:
|
||||
|
||||
```bash
|
||||
# On NODA1 — edit docker-compose override or pass env inline:
|
||||
# Option 1: temporary inline restart
|
||||
DEBUG_INJECT_ENABLED=true docker-compose \
|
||||
-f docker-compose.matrix-bridge-node1.yml \
|
||||
up -d --no-deps matrix-bridge-dagi
|
||||
|
||||
# Option 2: .env file override
|
||||
echo "DEBUG_INJECT_ENABLED=true" >> .env.soak
|
||||
docker-compose --env-file .env.soak \
|
||||
-f docker-compose.matrix-bridge-node1.yml \
|
||||
up -d --no-deps matrix-bridge-dagi
|
||||
|
||||
# Verify it's enabled (should return 200, not 403)
|
||||
curl -s -X POST http://localhost:9400/v1/debug/inject_event \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"room_id":"!test:test","event":{}}' | jq .
|
||||
# Expected: {"ok":false,"error":"no mapping for room_id=..."} ← 200, not 403
|
||||
|
||||
# IMPORTANT: disable after soak
|
||||
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d --no-deps matrix-bridge-dagi
|
||||
# (DEBUG_INJECT_ENABLED defaults to false)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2b. Step 0 (WORKERS=2 / QUEUE=100) — Record True Baseline
|
||||
|
||||
**Goal:** snapshot the "before any tuning" numbers to have a comparison point.
|
||||
|
||||
```bash
|
||||
# 0. Confirm current config (should be defaults)
|
||||
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
|
||||
# Expected: {"workers": 2, "queue_max": 100}
|
||||
|
||||
# 1. DB path for WAL check (adjust to your BRIDGE_DATA_DIR)
|
||||
DB=/opt/microdao-daarion/data/matrix_bridge.db
|
||||
|
||||
# 2. WAL size before (manual check)
|
||||
ls -lh ${DB}-wal 2>/dev/null || echo "(no WAL file yet — first run)"
|
||||
sqlite3 $DB "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || echo "(no sqlite3)"
|
||||
|
||||
# 3. Run Step 0 soak
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 100 \
|
||||
--concurrency 4 \
|
||||
  --agent-id sofiia \
|
||||
--room-id "!your-room-id:your-server" \
|
||||
--max-p95-ms 5000 \
|
||||
--max-drop-rate 0.001 \
|
||||
--db-path $DB \
|
||||
--report-file /tmp/soak_step0_baseline.json
|
||||
|
||||
# 4. Record result in "Baseline numbers" table (section 10) below.
|
||||
jq '.summary, .latency_ms, .metrics_delta, .wal' /tmp/soak_step0_baseline.json
|
||||
```
|
||||
|
||||
**v1 Go/No-Go thresholds for Step 0:**
|
||||
|
||||
| Metric | Green ✅ | Yellow ⚠️ | Red ❌ |
|
||||
|--------|---------|-----------|-------|
|
||||
| `p95_invoke_ms` | < 3000 | 3000–5000 | > 5000 |
|
||||
| `drop_rate` | 0.00% (mandatory) | — | > 0.1% |
|
||||
| `error_rate` | < 1% | 1–3% | > 3% |
|
||||
| `failovers` | 0 | — | ≥ 1 without cause |
|
||||
| WAL delta | < 2 MB | 2–10 MB | > 10 MB |
|
||||
|
||||
**If Step 0 is Green → proceed to Step 1 tuning.**
|
||||
**If Step 0 is Yellow/Red → investigate before touching WORKER_CONCURRENCY.**
|
||||
|
||||
---
|
||||
|
||||
## 2c. Step 1 (WORKERS=4 / QUEUE=200) — Tune-1
|
||||
|
||||
**Goal:** verify that doubling workers gives headroom without Router saturation.
|
||||
|
||||
```bash
|
||||
# 1. Apply tuning
|
||||
WORKER_CONCURRENCY=4 QUEUE_MAX_EVENTS=200 docker-compose \
|
||||
-f docker-compose.matrix-bridge-node1.yml \
|
||||
--env-file .env.soak \
|
||||
up -d --no-deps matrix-bridge-dagi
|
||||
|
||||
sleep 3
|
||||
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
|
||||
# Expected: {"workers": 4, "queue_max": 200}
|
||||
|
||||
# 2. Run Step 1 soak (higher concurrency to stress the new headroom)
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 100 \
|
||||
--concurrency 8 \
|
||||
  --agent-id sofiia \
|
||||
--room-id "!your-room-id:your-server" \
|
||||
--max-p95-ms 3000 \
|
||||
--max-drop-rate 0.001 \
|
||||
--db-path $DB \
|
||||
--report-file /tmp/soak_step1_tune1.json
|
||||
|
||||
# 3. Compare Step 0 vs Step 1
|
||||
python3 - <<'EOF'
|
||||
import json
|
||||
s0 = json.load(open('/tmp/soak_step0_baseline.json'))
|
||||
s1 = json.load(open('/tmp/soak_step1_tune1.json'))
|
||||
for k in ('p50', 'p95', 'p99'):
|
||||
    print(f"{k}: {s0['latency_ms'][k]}ms → {s1['latency_ms'][k]}ms")
|
||||
print(f"drops: {s0['metrics_delta']['queue_drops']} → {s1['metrics_delta']['queue_drops']}")
|
||||
print(f"WAL: {s0['wal'].get('delta_mb')} → {s1['wal'].get('delta_mb')} MB delta")
|
||||
EOF
|
||||
```
|
||||
|
||||
**Decision:**
|
||||
- Step 1 Green → **freeze, tag v1.0, ship to production.**
|
||||
- p95 within 5% of Step 0 → Router is bottleneck (not workers); don't go to Step 2.
|
||||
- Queue drops > 0 at WORKERS=4 → try Step 2 (WORKERS=8, QUEUE=300).
|
||||
|
||||
---
|
||||
|
||||
## 3. Scenario A — Baseline load (100 messages, concurrency 4)
|
||||
|
||||
**Goal:** establish latency baseline, verify no drops under normal load.
|
||||
|
||||
```bash
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 100 \
|
||||
--concurrency 4 \
|
||||
--max-p95-ms 3000 \
|
||||
--report-file /tmp/soak_baseline.json
|
||||
```
|
||||
|
||||
**Expected output:**
|
||||
```
|
||||
matrix-bridge-dagi Soak Report ✅ PASSED
|
||||
Messages: 100 concurrency=4
|
||||
Latency (client-side): p50=<500ms p95=<3000ms
|
||||
Queue drops: 0 (rate 0.000%)
|
||||
Failovers: 0
|
||||
```
|
||||
|
||||
**If FAILED:**
|
||||
- `p95 too high` → check router `/health`, DeepSeek API latency, `docker stats`
|
||||
- `drop_rate > 0` → check `QUEUE_MAX_EVENTS` env var (increase if needed), inspect bridge logs
|
||||
|
||||
---
|
||||
|
||||
## 4. Scenario B — Queue saturation test
|
||||
|
||||
**Goal:** confirm drop metric fires cleanly and bridge doesn't crash.
|
||||
|
||||
```bash
|
||||
# Reduce queue via env override, then flood:
|
||||
QUEUE_MAX_EVENTS=5 docker-compose -f docker-compose.matrix-bridge-node1.yml \
|
||||
up -d matrix-bridge-dagi
|
||||
|
||||
# Wait for restart
|
||||
sleep 5
|
||||
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 30 \
|
||||
--concurrency 10 \
|
||||
--max-drop-rate 0.99 \
|
||||
--report-file /tmp/soak_queue_sat.json
|
||||
|
||||
# Restore normal queue size
|
||||
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi
|
||||
```
|
||||
|
||||
**Expected:** `queue_drops > 0`, bridge still running after the test.
|
||||
|
||||
**Verify in Prometheus/Grafana:**
|
||||
```promql
|
||||
rate(matrix_bridge_queue_dropped_total[1m])
|
||||
```
|
||||
Should spike and then return to 0.
|
||||
|
||||
---
|
||||
|
||||
## 5. Scenario C — Node failover rehearsal
|
||||
|
||||
**Goal:** simulate NODA1 router becoming unavailable, verify NODA2 takes over.
|
||||
|
||||
```bash
|
||||
# Step 1: stop the router on NODA1 temporarily
|
||||
docker pause dagi-router-node1
|
||||
|
||||
# Step 2: run soak against bridge (bridge will failover to NODA2)
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 20 \
|
||||
--concurrency 2 \
|
||||
--max-p95-ms 10000 \
|
||||
--report-file /tmp/soak_failover.json
|
||||
|
||||
# Step 3: restore router
|
||||
docker unpause dagi-router-node1
|
||||
```
|
||||
|
||||
**Expected:**
|
||||
```
|
||||
Failovers: 1..20 (at least 1)
|
||||
Sticky sets: 1+
|
||||
Errors: 0 (fallback to NODA2 serves all messages)
|
||||
```
|
||||
|
||||
**Check sticky in control room:**
|
||||
```
|
||||
!nodes
|
||||
```
|
||||
Should show `NODA2` sticky with remaining TTL.
|
||||
|
||||
**Check health tracker:**
|
||||
```
|
||||
!status
|
||||
```
|
||||
Should show `NODA1 state=degraded|down`.
|
||||
|
||||
---
|
||||
|
||||
## 6. Scenario D — Restart recovery
|
||||
|
||||
**Goal:** after restart, sticky and health state reload within one polling cycle.
|
||||
|
||||
```bash
|
||||
# After Scenario C: sticky is set to NODA2
|
||||
# Restart the bridge
|
||||
docker restart dagi-matrix-bridge-node1
|
||||
|
||||
# Wait for startup (up to 30s)
|
||||
sleep 15
|
||||
|
||||
# Verify sticky reloaded
|
||||
curl -s http://localhost:9400/health | jq '.ha_state'
|
||||
# Expected: {"sticky_loaded": N, ...}
|
||||
|
||||
# Verify routing still uses NODA2 sticky
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 10 \
|
||||
--concurrency 2 \
|
||||
--report-file /tmp/soak_restart.json
|
||||
```
|
||||
|
||||
**Expected:** p95 similar to post-failover run, `Failovers: 0` (sticky already applied).
|
||||
|
||||
---
|
||||
|
||||
## 7. Scenario E — Rate limit burst
|
||||
|
||||
**Goal:** verify rate limiting fires and bridge doesn't silently drop below-limit messages.
|
||||
|
||||
```bash
|
||||
# Set RPM very low for test, then flood from same sender
|
||||
# This is best done in control room by observing !status rate_limited count
|
||||
# rather than the soak script (which uses different senders per message).
|
||||
|
||||
# In Matrix control room:
|
||||
# Send 30+ messages from the same user account in quick succession in a mixed room.
|
||||
# Then:
|
||||
!status
|
||||
# Check: rate_limited_total increased, no queue drops.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. Scenario F — Policy operations under load
|
||||
|
||||
**Goal:** `!policy history`, `!policy change`, and `!policy export` work while messages are in-flight.
|
||||
|
||||
```bash
|
||||
# Run a background soak
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 200 \
|
||||
--concurrency 2 \
|
||||
--report-file /tmp/soak_concurrent_policy.json &
|
||||
|
||||
# While soak is running, in Matrix control room:
|
||||
!policy history limit=5
|
||||
!policy export
|
||||
!status
|
||||
```
|
||||
|
||||
**Expected:** all three commands respond immediately (< 2s), soak completes without extra drops.
|
||||
|
||||
---
|
||||
|
||||
## 9. Prometheus / Grafana during soak
|
||||
|
||||
Key queries for the Grafana dashboard:
|
||||
|
||||
```promql
|
||||
# Throughput (messages/s)
|
||||
rate(matrix_bridge_routed_total[30s])
|
||||
|
||||
# Error rate
|
||||
rate(matrix_bridge_errors_total[30s])
|
||||
|
||||
# p95 invoke latency per node
|
||||
histogram_quantile(0.95, rate(matrix_bridge_invoke_duration_seconds_bucket[1m]))
|
||||
|
||||
# Queue drops rate
|
||||
rate(matrix_bridge_queue_dropped_total[1m])
|
||||
|
||||
# Failovers
|
||||
rate(matrix_bridge_failover_total[5m])
|
||||
```
|
||||
|
||||
Use the `matrix-bridge-dagi` Grafana dashboard at:
|
||||
`ops/grafana/dashboards/matrix-bridge-dagi.json`
|
||||
|
||||
---
|
||||
|
||||
## 10. Baseline numbers (reference)
|
||||
|
||||
| Metric | Cold start | Warm (sticky set) |
|
||||
|--------|-----------|-------------------|
|
||||
| p50 latency | ~200ms | ~150ms |
|
||||
| p95 latency | ~2 000ms | ~1 500ms |
|
||||
| Queue drops | 0 (queue=100) | 0 |
|
||||
| Failover fires | 1 per degradation | 0 after sticky |
|
||||
| Policy ops response | < 500ms | < 500ms |
|
||||
|
||||
*Update this table after each soak run with actual measured values.*
|
||||
|
||||
---
|
||||
|
||||
## 11. CI soak (mocked, no network)
|
||||
|
||||
For CI pipelines, use the mocked soak scenarios:
|
||||
|
||||
```bash
|
||||
python3 -m pytest tests/test_matrix_bridge_m11_soak_scenarios.py -v
|
||||
```
|
||||
|
||||
Covers (all deterministic, no network):
|
||||
- **S1** Queue saturation → drop counter
|
||||
- **S2** Failover under load → on_failover callback, health tracker
|
||||
- **S3** Sticky routing under burst → sticky set, burst routed to NODA2
|
||||
- **S4** Multi-room isolation → separate rooms don't interfere
|
||||
- **S5** Rate-limit burst → RL callback wired, no panic
|
||||
- **S6** HA restart recovery → sticky + health snapshot persisted and reloaded
|
||||
- **Perf baseline** 100-msg + 50-msg failover burst < 5s wall clock
|
||||
|
||||
---
|
||||
|
||||
## 12. Known failure modes & mitigations
|
||||
|
||||
| Symptom | Likely cause | Mitigation |
|
||||
|---------|-------------|------------|
|
||||
| `p95 > 5000ms` | Router/LLM slow | Increase `ROUTER_TIMEOUT_S`, check DeepSeek API |
|
||||
| `drop_rate > 1%` | Queue too small | Increase `QUEUE_MAX_EVENTS` |
|
||||
| `failovers > 0` but errors > 0 | Both nodes degraded | Check NODA1 + NODA2 health; scale router |
|
||||
| Bridge crash during soak | Memory leak / bug | `docker logs` → file GitHub issue |
|
||||
| Sticky not set after failover | `FAILOVER_STICKY_TTL_S=0` | Set to 300+ |
|
||||
| Restart doesn't load sticky | `HA_HEALTH_MAX_AGE_S` too small | Increase or set to 3600 |
|
||||
476
ops/scripts/matrix_bridge_soak.py
Normal file
476
ops/scripts/matrix_bridge_soak.py
Normal file
@@ -0,0 +1,476 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
|
||||
|
||||
Usage:
|
||||
python3 ops/scripts/matrix_bridge_soak.py \
|
||||
--url http://localhost:9400 \
|
||||
--messages 100 \
|
||||
--concurrency 4 \
|
||||
--report-file /tmp/soak_report.json
|
||||
|
||||
Requires: httpx (pip install httpx)
|
||||
|
||||
What it does:
|
||||
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
|
||||
(or directly to the router if --direct-router is set).
|
||||
2. Measures latency (p50, p95, p99, max) per batch.
|
||||
3. After the run, fetches /metrics and extracts key counters:
|
||||
- matrix_bridge_queue_dropped_total
|
||||
- matrix_bridge_rate_limited_total
|
||||
- matrix_bridge_failover_total
|
||||
- matrix_bridge_sticky_node_total
|
||||
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
|
||||
4. Prints a human-readable report and optionally writes JSON.
|
||||
|
||||
Exit codes:
|
||||
0 = all pass criteria met
|
||||
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
try:
|
||||
import httpx
|
||||
except ImportError:
|
||||
print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
|
||||
sys.exit(2)
|
||||
|
||||
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
|
||||
_DEFAULT_MAX_P95_MS = 5000 # 5 s p95 per invoke (generous for cold start)
|
||||
_DEFAULT_MAX_DROP_RATE = 0.01 # 1% queue drops allowed
|
||||
|
||||
|
||||
# ── Metrics parsing ────────────────────────────────────────────────────────────
|
||||
def _parse_counter(text: str, name: str) -> float:
|
||||
"""Extract the last reported value of a Prometheus counter by name."""
|
||||
for line in text.splitlines():
|
||||
if line.startswith(name + " ") or line.startswith(name + "{"):
|
||||
parts = line.rsplit(None, 1)
|
||||
try:
|
||||
return float(parts[-1])
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
return 0.0
|
||||
|
||||
|
||||
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
|
||||
"""
|
||||
Approximate histogram_quantile from _bucket lines.
|
||||
Returns estimated value at given quantile or None if data missing.
|
||||
"""
|
||||
buckets: List[tuple] = []
|
||||
total_count = 0.0
|
||||
for line in text.splitlines():
|
||||
if f"{name}_bucket" in line and 'le="' in line:
|
||||
try:
|
||||
le_part = line.split('le="')[1].split('"')[0]
|
||||
le = float(le_part) if le_part != "+Inf" else float("inf")
|
||||
val = float(line.rsplit(None, 1)[-1])
|
||||
buckets.append((le, val))
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
elif (f"{name}_count " in line or (name + "_count{") in line):
|
||||
try:
|
||||
total_count = float(line.rsplit(None, 1)[-1])
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
|
||||
if not buckets or total_count == 0:
|
||||
return None
|
||||
|
||||
buckets.sort()
|
||||
target = quantile * total_count
|
||||
prev_le, prev_count = 0.0, 0.0
|
||||
for le, count in buckets:
|
||||
if count >= target:
|
||||
if le == float("inf"):
|
||||
return prev_le
|
||||
# Linear interpolation
|
||||
if count == prev_count:
|
||||
return le
|
||||
fraction = (target - prev_count) / (count - prev_count)
|
||||
return prev_le + fraction * (le - prev_le)
|
||||
prev_le, prev_count = le, count
|
||||
return prev_le
|
||||
|
||||
|
||||
# ── Soak runner ────────────────────────────────────────────────────────────────
|
||||
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the inject endpoint is reachable and enabled.

    Posts a throwaway "ping" event to POST /v1/debug/inject_event and maps
    the response onto an operator-friendly diagnosis:
      * HTTP 403                         -> DEBUG_INJECT_ENABLED is off
      * HTTP 5xx                         -> bridge-side failure
      * ok=false with "no mapping" error -> room_id is not configured
    Returns "" on success, error message on failure.
    """
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": {"event_id": "!preflight", "sender": "@soak:test",
                                                "content": {"msgtype": "m.text", "body": "ping"}}},
            timeout=5.0,
        )
        # 403 is the guard response when DEBUG_INJECT_ENABLED=false (runbook 2a).
        if resp.status_code == 403:
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                " NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        # Endpoint enabled but the room is not in the bridge's room map — a
        # soak against it would route nothing.
        data = resp.json()
        if not data.get("ok") and "no mapping" in data.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        return f"❌ Preflight failed: {exc}"
|
||||
|
||||
|
||||
async def _check_wal(db_path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Run WAL size + checkpoint check on the bridge policy DB.
|
||||
Returns dict with wal_bytes, wal_mb, checkpoint_result.
|
||||
Requires sqlite3 CLI on PATH; gracefully skips if unavailable.
|
||||
"""
|
||||
import subprocess, shutil
|
||||
result: Dict[str, Any] = {"db_path": db_path, "ok": False}
|
||||
|
||||
wal_path = db_path + "-wal"
|
||||
try:
|
||||
wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
|
||||
result["wal_bytes"] = wal_bytes
|
||||
result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
|
||||
except OSError:
|
||||
result["wal_bytes"] = -1
|
||||
result["wal_mb"] = -1
|
||||
|
||||
if shutil.which("sqlite3"):
|
||||
try:
|
||||
cp = subprocess.run(
|
||||
["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
# Output: busy|log|checkpointed (3 ints)
|
||||
parts = cp.stdout.strip().split("|")
|
||||
if len(parts) == 3:
|
||||
result["wal_checkpoint"] = {
|
||||
"busy": int(parts[0]), "log": int(parts[1]), "checkpointed": int(parts[2]),
|
||||
}
|
||||
result["ok"] = True
|
||||
except Exception: # noqa: BLE001
|
||||
result["ok"] = False
|
||||
else:
|
||||
result["sqlite3_missing"] = True
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.

    The event_id embeds the microsecond monotonic timestamp so successive
    sends get distinct ids.

    NOTE(review): `agent_id` is accepted but never included in the payload —
    presumably the bridge derives the agent from room_id; confirm and either
    wire it in or drop the parameter.

    Returns (latency_ms: float, status_code: int, error: str|None);
    status_code 0 means no HTTP response was received (timeout or
    transport-level failure).
    """
    payload = {
        "room_id": room_id,
        "event": {
            "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
            "sender": sender,
            "type": "m.room.message",
            "content": {"msgtype": "m.text", "body": message},
        },
    }
    t0 = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=payload,
            timeout=30.0,
        )
        latency_ms = (time.monotonic() - t0) * 1000
        # Only 5xx is treated as an error; 4xx responses are surfaced via
        # the status code but not counted as transport failures.
        if resp.status_code >= 500:
            return latency_ms, resp.status_code, f"HTTP {resp.status_code}"
        return latency_ms, resp.status_code, None
    except httpx.TimeoutException:
        latency_ms = (time.monotonic() - t0) * 1000
        return latency_ms, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        latency_ms = (time.monotonic() - t0) * 1000
        return latency_ms, 0, str(exc)
|
||||
|
||||
|
||||
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """Best-effort GET of the bridge /health endpoint; {} on any failure."""
    endpoint = f"{url.rstrip('/')}/health"
    try:
        resp = await client.get(endpoint, timeout=10.0)
        if resp.status_code != 200:
            return {}
        return resp.json()
    except Exception:  # noqa: BLE001
        return {}
|
||||
|
||||
|
||||
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """Best-effort fetch of the Prometheus exposition text; "" on any failure."""
    endpoint = f"{url.rstrip('/')}/metrics"
    try:
        resp = await client.get(endpoint, timeout=10.0)
        if resp.status_code != 200:
            return ""
        return resp.text
    except Exception:  # noqa: BLE001
        return ""
|
||||
|
||||
|
||||
def _percentile(values: List[float], p: float) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
sv = sorted(values)
|
||||
idx = int(len(sv) * p / 100)
|
||||
return sv[min(idx, len(sv) - 1)]
|
||||
|
||||
|
||||
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """
    Drive one soak run against the bridge and build the JSON-able report.

    Flow: preflight the debug inject endpoint → snapshot WAL/health/metrics
    → send n_messages synthetic events bounded by `concurrency` →
    re-snapshot → compute client-side latency percentiles and counter
    deltas → evaluate pass criteria (p95, drop rate, fixed WAL budget).

    Returns the report dict consumed by _print_report() / --report-file.
    NOTE: when preflight fails, the returned dict contains only
    ok/error/passed/failures (no summary/latency_ms/metrics_delta).
    """
    results: List[tuple] = []  # (latency_ms, status_code, error|None) per message
    semaphore = asyncio.Semaphore(concurrency)

    async with httpx.AsyncClient() as client:
        # Pre-check: inject endpoint + health
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}

        # WAL check before soak (only when --db-path was supplied)
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")

        # Pre-check: health + metrics baseline for delta computation
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)

        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")

        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")

        t_start = time.monotonic()

        async def worker(i: int):
            # Semaphore caps in-flight requests at `concurrency`.
            async with semaphore:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f"  [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")

        await asyncio.gather(*[worker(i) for i in range(n_messages)])

        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)

        # WAL check after soak
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)")

        latencies = [r[0] for r in results]
        errors = [r for r in results if r[2] is not None]
        successes = len(results) - len(errors)
        error_rate = len(errors) / len(results) if results else 0.0

        drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
        rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
        fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
        sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")

        delta_drops = drops_after - drops_before
        delta_rl = rl_after - rl_before
        delta_fo = fo_after - fo_before
        # NOTE(review): sticky_after is reported as an absolute counter value,
        # not a delta like the others — confirm that is intentional.

        p50 = _percentile(latencies, 50)
        p95 = _percentile(latencies, 95)
        p99 = _percentile(latencies, 99)
        p_max = max(latencies) if latencies else 0.0

        # Histogram quantile from Prometheus (server-side view of invoke time)
        hist_p95 = _parse_histogram_quantile(
            metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
        )
        hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None

        drop_rate = delta_drops / len(results) if results else 0.0

        report = {
            "wal": {
                "before_mb": wal_before.get("wal_mb"),
                "after_mb": wal_after.get("wal_mb"),
                "delta_mb": round(
                    (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
                ) if wal_before and wal_after else None,
                "checkpoint_after": wal_after.get("wal_checkpoint"),
                "threshold_mb": 10,
            },
            "summary": {
                "total_messages": n_messages,
                "concurrency": concurrency,
                "elapsed_s": round(elapsed_s, 2),
                "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
                "successes": successes,
                "errors": len(errors),
                "error_rate": round(error_rate, 4),
            },
            "latency_ms": {
                "p50": round(p50, 1),
                "p95": round(p95, 1),
                "p99": round(p99, 1),
                "max": round(p_max, 1),
            },
            "metrics_delta": {
                "queue_drops": int(delta_drops),
                "rate_limited": int(delta_rl),
                "failovers": int(delta_fo),
                "sticky_sets": int(sticky_after),
                "drop_rate": round(drop_rate, 4),
            },
            # NOTE(review): the falsy check below collapses a legitimate
            # 0.0 quantile to None — confirm this is acceptable.
            "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms else None,
            "health_before": health_before.get("ok"),
            "health_after": health_after.get("ok"),
            "pass_criteria": {
                "max_p95_ms": max_p95_ms,
                "max_drop_rate": max_drop_rate,
            },
        }

        # Pass/fail evaluation against CLI thresholds + the fixed WAL budget
        failures = []
        if p95 > max_p95_ms:
            failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
        if drop_rate > max_drop_rate:
            failures.append(
                f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
            )
        wal_delta = report["wal"]["delta_mb"]
        if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
            failures.append(
                f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
                "— possible SQLite write pressure (Bottleneck #2)"
            )

        report["passed"] = len(failures) == 0
        report["failures"] = failures
        return report
|
||||
|
||||
|
||||
def _print_report(r: Dict[str, Any]) -> None:
|
||||
s = r["summary"]
|
||||
l = r["latency_ms"]
|
||||
m = r["metrics_delta"]
|
||||
passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
|
||||
|
||||
w = r.get("wal", {})
|
||||
print()
|
||||
print("=" * 60)
|
||||
print(f" matrix-bridge-dagi Soak Report {passed}")
|
||||
print("=" * 60)
|
||||
print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
|
||||
print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
|
||||
print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
|
||||
print()
|
||||
print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
|
||||
f"p99={l['p99']}ms max={l['max']}ms")
|
||||
if r["prometheus_invoke_p95_ms"] is not None:
|
||||
print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
|
||||
print()
|
||||
print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
|
||||
print(f" Rate-limited: {m['rate_limited']}")
|
||||
print(f" Failovers: {m['failovers']}")
|
||||
print(f" Sticky sets: {m['sticky_sets']}")
|
||||
if w.get("before_mb") is not None:
|
||||
wal_delta_str = (
|
||||
f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
|
||||
)
|
||||
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
|
||||
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
|
||||
print()
|
||||
if r["failures"]:
|
||||
for f in r["failures"]:
|
||||
print(f" ❌ {f}")
|
||||
else:
|
||||
print(" All pass criteria met.")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point for the M11 soak test.

    Parses arguments, runs the async soak against the bridge, prints the
    report, optionally writes it to a JSON file, and maps the verdict to a
    process exit code.

    Returns:
        0 when all pass criteria were met, 1 otherwise (CI-friendly).
    """
    parser = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")

    # Flag definitions kept in one table so defaults and help text are easy
    # to scan; registered with argparse below.
    cli_options = (
        ("--url", dict(default="http://localhost:9400",
                       help="Bridge base URL (default: http://localhost:9400)")),
        ("--messages", dict(type=int, default=100,
                            help="Total messages to send (default: 100)")),
        ("--concurrency", dict(type=int, default=4,
                               help="Concurrent requests (default: 4)")),
        ("--agent-id", dict(default="sofiia",
                            help="Agent id for synthetic events (default: sofiia)")),
        ("--room-id", dict(default="!soak-room:home.invalid",
                           help="Room id for synthetic events")),
        ("--sender", dict(default="@soak-user:home.invalid",
                          help="Sender for synthetic events")),
        ("--max-p95-ms", dict(type=float, default=_DEFAULT_MAX_P95_MS,
                              help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")),
        ("--max-drop-rate", dict(type=float, default=_DEFAULT_MAX_DROP_RATE,
                                 help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")),
        ("--report-file", dict(default="",
                               help="Optional path to write JSON report")),
        ("--db-path", dict(default="",
                           help="Path to policy_store.db for WAL check "
                                "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")),
    )
    for flag, kwargs in cli_options:
        parser.add_argument(flag, **kwargs)
    args = parser.parse_args()

    report = asyncio.run(run_soak(
        url=args.url,
        n_messages=args.messages,
        concurrency=args.concurrency,
        agent_id=args.agent_id,
        room_id=args.room_id,
        sender=args.sender,
        max_p95_ms=args.max_p95_ms,
        max_drop_rate=args.max_drop_rate,
        db_path=args.db_path,
    ))
    _print_report(report)

    if args.report_file:
        # Persist the full machine-readable report for CI artifacts.
        with open(args.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n Report saved: {args.report_file}")

    return 0 if report["passed"] else 1
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: exit status 0 = soak passed, 1 = soak failed.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user