Files
microdao-daarion/ops/grafana/dashboards/matrix-bridge-dagi.json
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

986 lines
22 KiB
JSON

{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "9.0.0"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
},
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "doc",
"includeVars": false,
"keepTime": false,
"tags": [],
"targetBlank": true,
"title": "Runbook",
"tooltip": "matrix-bridge-dagi-ops.md",
"type": "link",
"url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md"
}
],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Bridge Up",
"gridPos": {
"x": 0,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(matrix_bridge_up)",
"legendFormat": "up (all nodes)",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none",
"textMode": "auto",
"orientation": "auto"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"type": "value",
"options": {
"0": {
"text": "DOWN",
"color": "red"
},
"1": {
"text": "UP",
"color": "green"
}
}
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
}
},
"overrides": []
}
},
{
"id": 2,
"type": "stat",
"title": "Queue Size",
"gridPos": {
"x": 4,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_queue_size",
"legendFormat": "queue",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "area",
"textMode": "auto"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 100
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 3,
"type": "stat",
"title": "Active Rate-Limiter Rooms",
"gridPos": {
"x": 8,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_rate_limiter_active_rooms",
"legendFormat": "rooms",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 4,
"type": "stat",
"title": "Active Room-Agent Locks",
"gridPos": {
"x": 12,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_active_room_agent_locks",
"legendFormat": "locks",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 5,
"type": "stat",
"title": "Drops (5m)",
"gridPos": {
"x": 16,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "dropped",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 6,
"type": "stat",
"title": "Errors (5m)",
"gridPos": {
"x": 20,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "errors",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 10,
"type": "timeseries",
"title": "Traffic: Received & Replied (rate/5m)",
"gridPos": {
"x": 0,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "received",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))",
"legendFormat": "replied ok",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))",
"legendFormat": "replied error",
"refId": "C"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "replied error"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
}
},
{
"id": 11,
"type": "timeseries",
"title": "Errors / Drops / Rate-Limited (rate/5m)",
"gridPos": {
"x": 12,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "gw_error: {{ error_type }}",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "queue_dropped",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))",
"legendFormat": "rate_limited",
"refId": "C"
},
{
"expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))",
"legendFormat": "route_rejected: {{ reason }}",
"refId": "D"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 15,
"drawStyle": "line",
"stacking": {
"mode": "none"
},
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 20,
"type": "timeseries",
"title": "Invoke Latency P50 / P95 by Node",
"gridPos": {
"x": 0,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p50 {{ node_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 10
},
{
"color": "red",
"value": 20
}
]
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 21,
"type": "timeseries",
"title": "Queue Wait P50 / P95",
"gridPos": {
"x": 12,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p50 {{ agent_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p95 {{ agent_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 30,
"type": "timeseries",
"title": "Node Routing: Routed & Rejected by Node (rate/5m)",
"gridPos": {
"x": 0,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))",
"legendFormat": "routed {{ node_id }}",
"refId": "A"
},
{
"expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))",
"legendFormat": "rejected {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 31,
"type": "timeseries",
"title": "Persistent Dedupe Hits / Inserts (rate/10m)",
"gridPos": {
"x": 12,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))",
"legendFormat": "dedupe_hits",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_dedupe_persistent_inserts_total[10m]))",
"legendFormat": "dedupe_inserts",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 40,
"type": "timeseries",
"title": "Control Commands (rate/5m)",
"gridPos": {
"x": 0,
"y": 27,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))",
"legendFormat": "cmd {{ verb }}",
"refId": "A"
},
{
"expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))",
"legendFormat": "ctrl_ratelimited {{ scope }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 41,
"type": "timeseries",
"title": "Traffic by Agent (received rate/5m)",
"gridPos": {
"x": 12,
"y": 27,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "{{ agent_id }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 42,
"type": "timeseries",
"title": "Routing Reasons by Agent (rate/5m)",
"description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown",
"gridPos": {
"x": 0,
"y": 34,
"w": 24,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))",
"legendFormat": "{{ agent_id }} / {{ reason }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": [
"matrix-bridge",
"dagi",
"daarion"
],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "UTC",
"title": "Matrix Bridge DAGI",
"uid": "matrix-bridge-dagi-v1",
"version": 1
}