feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)

Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED, default false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 07:51:37 -08:00
parent fe6e3d30ae
commit 82d5ff2a4f
21 changed files with 9123 additions and 93 deletions

View File

@@ -0,0 +1,986 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "9.0.0"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
},
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "doc",
"includeVars": false,
"keepTime": false,
"tags": [],
"targetBlank": true,
"title": "Runbook",
"tooltip": "matrix-bridge-dagi-ops.md",
"type": "link",
"url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md"
}
],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Bridge Up",
"gridPos": {
"x": 0,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(matrix_bridge_up)",
"legendFormat": "up (all nodes)",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none",
"textMode": "auto",
"orientation": "auto"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"type": "value",
"options": {
"0": {
"text": "DOWN",
"color": "red"
},
"1": {
"text": "UP",
"color": "green"
}
}
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
}
},
"overrides": []
}
},
{
"id": 2,
"type": "stat",
"title": "Queue Size",
"gridPos": {
"x": 4,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_queue_size",
"legendFormat": "queue",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "area",
"textMode": "auto"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 100
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 3,
"type": "stat",
"title": "Active Rate-Limiter Rooms",
"gridPos": {
"x": 8,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_rate_limiter_active_rooms",
"legendFormat": "rooms",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 4,
"type": "stat",
"title": "Active Room-Agent Locks",
"gridPos": {
"x": 12,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_active_room_agent_locks",
"legendFormat": "locks",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 5,
"type": "stat",
"title": "Drops (5m)",
"gridPos": {
"x": 16,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "dropped",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 6,
"type": "stat",
"title": "Errors (5m)",
"gridPos": {
"x": 20,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "errors",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 10,
"type": "timeseries",
"title": "Traffic: Received & Replied (rate/5m)",
"gridPos": {
"x": 0,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "received",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))",
"legendFormat": "replied ok",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))",
"legendFormat": "replied error",
"refId": "C"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "replied error"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
}
},
{
"id": 11,
"type": "timeseries",
"title": "Errors / Drops / Rate-Limited (rate/5m)",
"gridPos": {
"x": 12,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "gw_error: {{ error_type }}",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "queue_dropped",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))",
"legendFormat": "rate_limited",
"refId": "C"
},
{
"expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))",
"legendFormat": "route_rejected: {{ reason }}",
"refId": "D"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 15,
"drawStyle": "line",
"stacking": {
"mode": "none"
},
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 20,
"type": "timeseries",
"title": "Invoke Latency P50 / P95 by Node",
"gridPos": {
"x": 0,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p50 {{ node_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 10
},
{
"color": "red",
"value": 20
}
]
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 21,
"type": "timeseries",
"title": "Queue Wait P50 / P95",
"gridPos": {
"x": 12,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p50 {{ agent_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p95 {{ agent_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 30,
"type": "timeseries",
"title": "Node Routing: Routed & Rejected by Node (rate/5m)",
"gridPos": {
"x": 0,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))",
"legendFormat": "routed {{ node_id }}",
"refId": "A"
},
{
"expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))",
"legendFormat": "rejected {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 31,
"type": "timeseries",
"title": "Persistent Dedupe Hits / Inserts (rate/10m)",
"gridPos": {
"x": 12,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))",
"legendFormat": "dedupe_hits",
"refId": "A"
},
{
"expr": "rate(matrix_bridge_dedupe_persistent_inserts_total[10m])",
"legendFormat": "dedupe_inserts",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 40,
"type": "timeseries",
"title": "Control Commands (rate/5m)",
"gridPos": {
"x": 0,
"y": 27,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))",
"legendFormat": "cmd {{ verb }}",
"refId": "A"
},
{
"expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))",
"legendFormat": "ctrl_ratelimited {{ scope }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 41,
"type": "timeseries",
"title": "Traffic by Agent (received rate/5m)",
"gridPos": {
"x": 12,
"y": 27,
"w": 24,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "{{ agent_id }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 42,
"type": "timeseries",
"title": "Routing Reasons by Agent (rate/5m)",
"description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown",
"gridPos": {
"x": 0,
"y": 34,
"w": 24,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))",
"legendFormat": "{{ agent_id }} / {{ reason }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": [
"matrix-bridge",
"dagi",
"daarion"
],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "UTC",
"title": "Matrix Bridge DAGI",
"uid": "matrix-bridge-dagi-v1",
"version": 1
}

View File

@@ -0,0 +1,158 @@
---
# Prometheus alert rules — Matrix Bridge DAGI
# Phase M7.1 (metrics contract hardening)
#
# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
#
# Usage:
# promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
# docker run --rm -v $PWD:/w prom/prometheus:latest \
# promtool check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
groups:
- name: matrix_bridge_dagi
interval: 30s
rules:
# ── A1: Bridge process down ─────────────────────────────────────────────
# metric: matrix_bridge_up{node_id} (Gauge, M7.1: labeled per node)
- alert: BridgeDown
expr: sum(matrix_bridge_up) == 0
for: 1m
labels:
severity: critical
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Matrix Bridge DAGI is down"
description: >
`matrix_bridge_up` == 0 across all nodes — bridge process has not
started or has crashed. No messages are being processed.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"
# ── A2: Matrix sync errors spike ────────────────────────────────────────
# metric: matrix_bridge_gateway_errors_total{error_type} (Counter)
- alert: MatrixSyncErrors
expr: >
increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
for: 2m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Matrix sync errors elevated"
description: >
More than 3 Matrix `/sync` errors (error_type=sync_error) in the last
5 minutes. May indicate Matrix homeserver problems or network issues.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"
# ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
# metric: matrix_bridge_messages_replied_total{status} (Counter)
- alert: GatewayInvokeErrors
expr: >
increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
for: 2m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Router invoke errors elevated (node={{ $labels.node_id }})"
description: >
More than 5 agent invocation errors (status=error) in the last 5 minutes.
Check Router/DeepSeek connectivity and logs.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"
# ── A4: Queue drops ─────────────────────────────────────────────────────
# metric: matrix_bridge_queue_dropped_total{room_id, agent_id} (Counter)
- alert: QueueDropsHigh
expr: >
rate(matrix_bridge_queue_dropped_total[5m]) > 0
for: 1m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Bridge queue is dropping messages"
description: >
`matrix_bridge_queue_dropped_total` is increasing — work queue is full
and incoming messages are being dropped. Increase
`BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"
# ── A5: User-level rate limiting spike ──────────────────────────────────
# metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type} (Counter)
- alert: RateLimitedSpike
expr: >
rate(matrix_bridge_rate_limited_total[5m]) > 2
for: 3m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "User rate limiting spike"
description: >
More than 2 messages/second are being rate-limited over 3 minutes.
May indicate a flood attack, misbehaving client, or limits too low.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"
# ── A6: Control channel rate limiting spike ──────────────────────────────
# metric: matrix_bridge_control_rate_limited_total{scope} (Counter)
- alert: ControlRateLimitedSpike
expr: >
rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
for: 3m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Control channel rate limiting elevated"
description: >
More than 0.5 control commands/second rejected by rate limiter over
3 minutes. May indicate operator tooling issues or abuse attempt.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"
# ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
# metric: matrix_bridge_dedupe_persistent_hits_total{room_id} (Counter)
- alert: DedupeHitStorm
expr: >
rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
for: 5m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Persistent deduplication hit rate elevated"
description: >
High rate of persistent dedupe hits — may indicate a Matrix resend
storm or a client repeatedly retrying the same event_id.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"
# ── A8: Invoke latency P95 high (per node) ───────────────────────────────
# metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id} (Histogram)
- alert: InvokeLatencyP95High
expr: >
histogram_quantile(
0.95,
sum by (node_id, le) (
rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
)
) > 15
for: 5m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
description: >
95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds
15 seconds over the last 5 minutes. Check Router load, DeepSeek API,
Ollama/Swapper queue.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"

View File

@@ -0,0 +1,401 @@
# matrix-bridge-dagi — Soak & Failure Rehearsal Runbook (M11)
**Phase:** M11
**Applies to:** `matrix-bridge-dagi` service on NODA1
**When to run:** Before any production traffic increase, after major code changes, or on a recurring monthly basis.
---
## 1. Goals
| Goal | Measurable pass criterion |
|------|--------------------------|
| Latency under load | p95 invoke < 5 000 ms |
| Queue stability | drop rate < 1% |
| Failover correctness | failover fires on NODA1 outage; NODA2 serves all remaining messages |
| Sticky anti-flap | sticky set after first failover; no re-tries to degraded node |
| Restart recovery | sticky + health snapshot reloads within 10 s of restart |
| Policy operations safe under load | `!policy history` / `!policy change` work while messages in-flight |
---
## 2. Prerequisites
```bash
# On NODA1 or local machine with network access to bridge
pip install httpx
# Verify bridge is up
curl -s http://localhost:9400/health | jq '.ok'
# Expected: true
# Verify /metrics endpoint
curl -s http://localhost:9400/metrics | grep matrix_bridge_up
# Expected: matrix_bridge_up{...} 1
```
---
## 2a. Enabling the Soak Inject Endpoint
The soak script uses `POST /v1/debug/inject_event` which is **disabled by default**.
Enable it only on staging/NODA1 soak runs:
```bash
# On NODA1 — edit docker-compose override or pass env inline:
# Option 1: temporary inline restart
DEBUG_INJECT_ENABLED=true docker-compose \
-f docker-compose.matrix-bridge-node1.yml \
up -d --no-deps matrix-bridge-dagi
# Option 2: .env file override
echo "DEBUG_INJECT_ENABLED=true" >> .env.soak
docker-compose --env-file .env.soak \
-f docker-compose.matrix-bridge-node1.yml \
up -d --no-deps matrix-bridge-dagi
# Verify it's enabled (should return 200, not 403)
curl -s -X POST http://localhost:9400/v1/debug/inject_event \
-H 'Content-Type: application/json' \
-d '{"room_id":"!test:test","event":{}}' | jq .
# Expected: {"ok":false,"error":"no mapping for room_id=..."} ← 200, not 403
# IMPORTANT: disable after soak
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d --no-deps matrix-bridge-dagi
# (DEBUG_INJECT_ENABLED defaults to false)
```
---
## 2b. Step 0 (WORKERS=2 / QUEUE=100) — Record True Baseline
**Goal:** snapshot the "before any tuning" numbers to have a comparison point.
```bash
# 0. Confirm current config (should be defaults)
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
# Expected: {"workers": 2, "queue_max": 100}
# 1. DB path for WAL check (adjust to your BRIDGE_DATA_DIR)
DB=/opt/microdao-daarion/data/matrix_bridge.db
# 2. WAL size before (manual check)
ls -lh ${DB}-wal 2>/dev/null || echo "(no WAL file yet — first run)"
sqlite3 $DB "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || echo "(no sqlite3)"
# 3. Run Step 0 soak
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--agent sofiia \
--room-id "!your-room-id:your-server" \
--max-p95-ms 5000 \
--max-drop-rate 0.001 \
--db-path $DB \
--report-file /tmp/soak_step0_baseline.json
# 4. Record result in "Baseline numbers" table (section 10) below.
jq '.summary, .latency, .metrics_delta, .wal' /tmp/soak_step0_baseline.json
```
**v1 Go/No-Go thresholds for Step 0:**
| Metric | Green ✅ | Yellow ⚠️ | Red ❌ |
|--------|---------|-----------|-------|
| `p95_invoke_ms` | < 3000 | 3000–5000 | > 5000 |
| `drop_rate` | 0.00% (mandatory) | — | > 0.1% |
| `error_rate` | < 1% | 1–3% | > 3% |
| `failovers` | 0 | — | ≥ 1 without cause |
| WAL delta | < 2 MB | 2–10 MB | > 10 MB |
**If Step 0 is Green → proceed to Step 1 tuning.**
**If Step 0 is Yellow/Red → investigate before touching WORKER_CONCURRENCY.**
---
## 2c. Step 1 (WORKERS=4 / QUEUE=200) — Tune-1
**Goal:** verify that doubling workers gives headroom without Router saturation.
```bash
# 1. Apply tuning
WORKER_CONCURRENCY=4 QUEUE_MAX_EVENTS=200 docker-compose \
-f docker-compose.matrix-bridge-node1.yml \
--env-file .env.soak \
up -d --no-deps matrix-bridge-dagi
sleep 3
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
# Expected: {"workers": 4, "queue_max": 200}
# 2. Run Step 1 soak (higher concurrency to stress the new headroom)
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 8 \
--agent sofiia \
--room-id "!your-room-id:your-server" \
--max-p95-ms 3000 \
--max-drop-rate 0.001 \
--db-path $DB \
--report-file /tmp/soak_step1_tune1.json
# 3. Compare Step 0 vs Step 1
python3 - <<'EOF'
import json
s0 = json.load(open('/tmp/soak_step0_baseline.json'))
s1 = json.load(open('/tmp/soak_step1_tune1.json'))
for k in ('p50', 'p95', 'p99'):
print(f"{k}: {s0['latency'][k]}ms → {s1['latency'][k]}ms")
print(f"drops: {s0['metrics_delta']['queue_drops']} → {s1['metrics_delta']['queue_drops']}")
print(f"WAL: {s0['wal'].get('delta_mb')} → {s1['wal'].get('delta_mb')} MB delta")
EOF
```
**Decision:**
- Step 1 Green → **freeze, tag v1.0, ship to production.**
- p95 within 5% of Step 0 → Router is bottleneck (not workers); don't go to Step 2.
- Queue drops > 0 at WORKERS=4 → try Step 2 (WORKERS=8, QUEUE=300).
---
## 3. Scenario A — Baseline load (100 messages, concurrency 4)
**Goal:** establish latency baseline, verify no drops under normal load.
```bash
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--max-p95-ms 3000 \
--report-file /tmp/soak_baseline.json
```
**Expected output:**
```
matrix-bridge-dagi Soak Report ✅ PASSED
Messages: 100 concurrency=4
Latency: p50=<500ms p95=<3000ms
Queue drops: 0 (rate 0.000%)
Failovers: 0
```
**If FAILED:**
- `p95 too high` → check router `/health`, DeepSeek API latency, `docker stats`
- `drop_rate > 0` → check `QUEUE_MAX_EVENTS` env var (increase if needed), inspect bridge logs
---
## 4. Scenario B — Queue saturation test
**Goal:** confirm drop metric fires cleanly and bridge doesn't crash.
```bash
# Reduce queue via env override, then flood:
QUEUE_MAX_EVENTS=5 docker-compose -f docker-compose.matrix-bridge-node1.yml \
up -d matrix-bridge-dagi
# Wait for restart
sleep 5
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 30 \
--concurrency 10 \
--max-drop-rate 0.99 \
--report-file /tmp/soak_queue_sat.json
# Restore normal queue size
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi
```
**Expected:** `queue_drops > 0`, bridge still running after the test.
**Verify in Prometheus/Grafana:**
```promql
rate(matrix_bridge_queue_dropped_total[1m])
```
Should spike and then return to 0.
---
## 5. Scenario C — Node failover rehearsal
**Goal:** simulate NODA1 router becoming unavailable, verify NODA2 takes over.
```bash
# Step 1: stop the router on NODA1 temporarily
docker pause dagi-router-node1
# Step 2: run soak against bridge (bridge will failover to NODA2)
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 20 \
--concurrency 2 \
--max-p95-ms 10000 \
--report-file /tmp/soak_failover.json
# Step 3: restore router
docker unpause dagi-router-node1
```
**Expected:**
```
Failovers: 1..20 (at least 1)
Sticky sets: 1+
Errors: 0 (fallback to NODA2 serves all messages)
```
**Check sticky in control room:**
```
!nodes
```
Should show `NODA2` sticky with remaining TTL.
**Check health tracker:**
```
!status
```
Should show `NODA1 state=degraded|down`.
---
## 6. Scenario D — Restart recovery
**Goal:** after restart, sticky and health state reload within one polling cycle.
```bash
# After Scenario C: sticky is set to NODA2
# Restart the bridge
docker restart dagi-matrix-bridge-node1
# Wait for startup (up to 30s)
sleep 15
# Verify sticky reloaded
curl -s http://localhost:9400/health | jq '.ha_state'
# Expected: {"sticky_loaded": N, ...}
# Verify routing still uses NODA2 sticky
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 10 \
--concurrency 2 \
--report-file /tmp/soak_restart.json
```
**Expected:** p95 similar to post-failover run, `Failovers: 0` (sticky already applied).
---
## 7. Scenario E — Rate limit burst
**Goal:** verify rate limiting fires and bridge doesn't silently drop below-limit messages.
```bash
# Set RPM very low for test, then flood from same sender
# This is best done in control room by observing !status rate_limited count
# rather than the soak script (which uses different senders per message).
# In Matrix control room:
# Send 30+ messages from the same user account in quick succession in a mixed room.
# Then:
!status
# Check: rate_limited_total increased, no queue drops.
```
---
## 8. Scenario F — Policy operations under load
**Goal:** `!policy history`, `!policy change`, and `!policy export` work while messages are in-flight.
```bash
# Run a background soak
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 200 \
--concurrency 2 \
--report-file /tmp/soak_concurrent_policy.json &
# While soak is running, in Matrix control room:
!policy history limit=5
!policy export
!status
```
**Expected:** all three commands respond immediately (< 2s), soak completes without extra drops.
---
## 9. Prometheus / Grafana during soak
Key queries for the Grafana dashboard:
```promql
# Throughput (messages/s)
rate(matrix_bridge_routed_total[30s])
# Error rate
rate(matrix_bridge_gateway_errors_total[30s])
# p95 invoke latency per node
histogram_quantile(0.95, rate(matrix_bridge_invoke_duration_seconds_bucket[1m]))
# Queue drops rate
rate(matrix_bridge_queue_dropped_total[1m])
# Failovers
rate(matrix_bridge_failover_total[5m])
```
Use the `matrix-bridge-dagi` Grafana dashboard at:
`ops/grafana/dashboards/matrix-bridge-dagi.json`
---
## 10. Baseline numbers (reference)
| Metric | Cold start | Warm (sticky set) |
|--------|-----------|-------------------|
| p50 latency | ~200ms | ~150ms |
| p95 latency | ~2 000ms | ~1 500ms |
| Queue drops | 0 (queue=100) | 0 |
| Failover fires | 1 per degradation | 0 after sticky |
| Policy ops response | < 500ms | < 500ms |
*Update this table after each soak run with actual measured values.*
---
## 11. CI soak (mocked, no network)
For CI pipelines, use the mocked soak scenarios:
```bash
python3 -m pytest tests/test_matrix_bridge_m11_soak_scenarios.py -v
```
Covers (all deterministic, no network):
- **S1** Queue saturation → drop counter
- **S2** Failover under load → on_failover callback, health tracker
- **S3** Sticky routing under burst → sticky set, burst routed to NODA2
- **S4** Multi-room isolation → separate rooms don't interfere
- **S5** Rate-limit burst → RL callback wired, no panic
- **S6** HA restart recovery → sticky + health snapshot persisted and reloaded
- **Perf baseline** 100-msg + 50-msg failover burst < 5s wall clock
---
## 12. Known failure modes & mitigations
| Symptom | Likely cause | Mitigation |
|---------|-------------|------------|
| `p95 > 5000ms` | Router/LLM slow | Increase `ROUTER_TIMEOUT_S`, check DeepSeek API |
| `drop_rate > 1%` | Queue too small | Increase `QUEUE_MAX_EVENTS` |
| `failovers > 0` but errors > 0 | Both nodes degraded | Check NODA1 + NODA2 health; scale router |
| Bridge crash during soak | Memory leak / bug | `docker logs` → file GitHub issue |
| Sticky not set after failover | `FAILOVER_STICKY_TTL_S=0` | Set to 300+ |
| Restart doesn't load sticky | `HA_HEALTH_MAX_AGE_S` too small | Increase or set to 3600 |

View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
"""
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
Usage:
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--report-file /tmp/soak_report.json
Requires: httpx (pip install httpx)
What it does:
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
(or directly to the router if --direct-router is set).
2. Measures latency (p50, p95, p99, max) per batch.
3. After the run, fetches /metrics and extracts key counters:
- matrix_bridge_queue_dropped_total
- matrix_bridge_rate_limited_total
- matrix_bridge_failover_total
- matrix_bridge_sticky_node_total
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
4. Prints a human-readable report and optionally writes JSON.
Exit codes:
0 = all pass criteria met
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
"""
import argparse
import asyncio
import json
import sys
import time
from typing import Any, Dict, List, Optional
# httpx is the script's only third-party dependency; without it we cannot
# talk to the bridge, so bail out early. Exit code 2 is deliberately distinct
# from the soak result codes (0 = pass, 1 = thresholds exceeded).
try:
    import httpx
except ImportError:
    print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
    sys.exit(2)
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
_DEFAULT_MAX_P95_MS = 5000  # 5 s p95 per invoke (generous for cold start)
_DEFAULT_MAX_DROP_RATE = 0.01  # 1% queue drops allowed
# ── Metrics parsing ────────────────────────────────────────────────────────────
def _parse_counter(text: str, name: str) -> float:
"""Extract the last reported value of a Prometheus counter by name."""
for line in text.splitlines():
if line.startswith(name + " ") or line.startswith(name + "{"):
parts = line.rsplit(None, 1)
try:
return float(parts[-1])
except (ValueError, IndexError):
pass
return 0.0
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
"""
Approximate histogram_quantile from _bucket lines.
Returns estimated value at given quantile or None if data missing.
"""
buckets: List[tuple] = []
total_count = 0.0
for line in text.splitlines():
if f"{name}_bucket" in line and 'le="' in line:
try:
le_part = line.split('le="')[1].split('"')[0]
le = float(le_part) if le_part != "+Inf" else float("inf")
val = float(line.rsplit(None, 1)[-1])
buckets.append((le, val))
except (ValueError, IndexError):
pass
elif (f"{name}_count " in line or (name + "_count{") in line):
try:
total_count = float(line.rsplit(None, 1)[-1])
except (ValueError, IndexError):
pass
if not buckets or total_count == 0:
return None
buckets.sort()
target = quantile * total_count
prev_le, prev_count = 0.0, 0.0
for le, count in buckets:
if count >= target:
if le == float("inf"):
return prev_le
# Linear interpolation
if count == prev_count:
return le
fraction = (target - prev_count) / (count - prev_count)
return prev_le + fraction * (le - prev_le)
prev_le, prev_count = le, count
return prev_le
# ── Soak runner ────────────────────────────────────────────────────────────────
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the inject endpoint is reachable and enabled.

    POSTs one synthetic "ping" event to /v1/debug/inject_event and maps the
    known failure shapes (403 = inject guard disabled, 5xx = server error,
    "no mapping" in the body = room id not configured) to actionable
    error strings.

    Returns "" on success, error message on failure.
    """
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": {"event_id": "!preflight", "sender": "@soak:test",
                  "content": {"msgtype": "m.text", "body": "ping"}}},
            timeout=5.0,
        )
        # 403 is the guard response when DEBUG_INJECT_ENABLED=false on the bridge.
        if resp.status_code == 403:
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                "   NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        # A 2xx body can still report failure; "no mapping" means the room id
        # is absent from the bridge's configured room map.
        data = resp.json()
        if not data.get("ok") and "no mapping" in data.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        # Catch-all (incl. a non-JSON success body) — report, don't crash preflight.
        return f"❌ Preflight failed: {exc}"
async def _check_wal(db_path: str) -> Dict[str, Any]:
"""
Run WAL size + checkpoint check on the bridge policy DB.
Returns dict with wal_bytes, wal_mb, checkpoint_result.
Requires sqlite3 CLI on PATH; gracefully skips if unavailable.
"""
import subprocess, shutil
result: Dict[str, Any] = {"db_path": db_path, "ok": False}
wal_path = db_path + "-wal"
try:
wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
result["wal_bytes"] = wal_bytes
result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
except OSError:
result["wal_bytes"] = -1
result["wal_mb"] = -1
if shutil.which("sqlite3"):
try:
cp = subprocess.run(
["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
capture_output=True, text=True, timeout=5,
)
# Output: busy|log|checkpointed (3 ints)
parts = cp.stdout.strip().split("|")
if len(parts) == 3:
result["wal_checkpoint"] = {
"busy": int(parts[0]), "log": int(parts[1]), "checkpointed": int(parts[2]),
}
result["ok"] = True
except Exception: # noqa: BLE001
result["ok"] = False
else:
result["sqlite3_missing"] = True
return result
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.

    Returns (latency_ms: float, status_code: int, error: str|None);
    status_code is 0 when the request never produced a response.
    """
    # NOTE(review): agent_id is accepted but never placed in the payload —
    # presumably the bridge derives the agent from the room mapping; confirm.
    event = {
        "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
        "sender": sender,
        "type": "m.room.message",
        "content": {"msgtype": "m.text", "body": message},
    }
    started = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": event},
            timeout=30.0,
        )
    except httpx.TimeoutException:
        return (time.monotonic() - started) * 1000, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        return (time.monotonic() - started) * 1000, 0, str(exc)
    elapsed_ms = (time.monotonic() - started) * 1000
    if resp.status_code >= 500:
        return elapsed_ms, resp.status_code, f"HTTP {resp.status_code}"
    return elapsed_ms, resp.status_code, None
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """GET /health from the bridge; returns the parsed JSON body, or {} on any failure."""
    try:
        response = await client.get(f"{url.rstrip('/')}/health", timeout=10.0)
        if response.status_code != 200:
            return {}
        return response.json()
    except Exception:  # noqa: BLE001
        return {}
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """GET /metrics from the bridge; returns the raw Prometheus text, or "" on any failure."""
    try:
        response = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0)
        if response.status_code != 200:
            return ""
        return response.text
    except Exception:  # noqa: BLE001
        return ""
def _percentile(values: List[float], p: float) -> float:
if not values:
return 0.0
sv = sorted(values)
idx = int(len(sv) * p / 100)
return sv[min(idx, len(sv) - 1)]
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """
    Execute one live soak run against a bridge instance.

    Flow: preflight the debug inject endpoint → snapshot metrics/health (and
    WAL, if db_path given) → send n_messages synthetic events with bounded
    concurrency → re-snapshot → compute client-side latency percentiles and
    Prometheus counter deltas → evaluate pass criteria.

    Args:
        url: bridge base URL (e.g. http://localhost:9400).
        n_messages: total synthetic events to send.
        concurrency: max in-flight requests (semaphore-bounded).
        agent_id: forwarded to _send_one for each synthetic event.
        room_id: Matrix room id; must match a configured bridge room mapping.
        sender: Matrix sender placed in each synthetic event.
        max_p95_ms: client-side p95 latency threshold for pass/fail.
        max_drop_rate: queue-drop rate threshold (0..1) for pass/fail.
        db_path: optional policy DB path enabling before/after WAL checks.

    Returns:
        Report dict with "wal", "summary", "latency_ms", "metrics_delta",
        "pass_criteria", plus "passed" (bool) and "failures" (strings).
        On preflight failure, a short {"ok": False, ...} dict instead.
    """
    results: List[tuple] = []
    semaphore = asyncio.Semaphore(concurrency)
    async with httpx.AsyncClient() as client:
        # Pre-check: inject endpoint + health
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}
        # WAL check before soak
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")
        # Pre-check: health
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)
        # Counter snapshots so post-run values can be reported as deltas.
        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")
        t_start = time.monotonic()
        async def worker(i: int):
            # One synthetic event per worker; the semaphore caps in-flight requests.
            async with semaphore:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f" [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")
        await asyncio.gather(*[worker(i) for i in range(n_messages)])
        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)
        # WAL check after soak
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)")
    # Client-side aggregation (results tuples are (latency_ms, status, error)).
    latencies = [r[0] for r in results]
    errors = [r for r in results if r[2] is not None]
    successes = len(results) - len(errors)
    error_rate = len(errors) / len(results) if results else 0.0
    drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
    rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
    fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
    sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")
    delta_drops = drops_after - drops_before
    delta_rl = rl_after - rl_before
    delta_fo = fo_after - fo_before
    p50 = _percentile(latencies, 50)
    p95 = _percentile(latencies, 95)
    p99 = _percentile(latencies, 99)
    p_max = max(latencies) if latencies else 0.0
    # Histogram quantile from Prometheus
    hist_p95 = _parse_histogram_quantile(
        metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
    )
    hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None
    drop_rate = delta_drops / len(results) if results else 0.0
    report = {
        "wal": {
            "before_mb": wal_before.get("wal_mb"),
            "after_mb": wal_after.get("wal_mb"),
            "delta_mb": round(
                (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
            ) if wal_before and wal_after else None,
            "checkpoint_after": wal_after.get("wal_checkpoint"),
            "threshold_mb": 10,
        },
        "summary": {
            "total_messages": n_messages,
            "concurrency": concurrency,
            "elapsed_s": round(elapsed_s, 2),
            "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
            "successes": successes,
            "errors": len(errors),
            "error_rate": round(error_rate, 4),
        },
        "latency_ms": {
            "p50": round(p50, 1),
            "p95": round(p95, 1),
            "p99": round(p99, 1),
            "max": round(p_max, 1),
        },
        "metrics_delta": {
            "queue_drops": int(delta_drops),
            "rate_limited": int(delta_rl),
            "failovers": int(delta_fo),
            # NOTE(review): this is the absolute counter value, not a
            # before/after delta like the other entries — confirm intent.
            "sticky_sets": int(sticky_after),
            "drop_rate": round(drop_rate, 4),
        },
        # NOTE(review): the truthiness check means a legitimate 0.0ms p95
        # would be reported as None — verify whether `is not None` was meant.
        "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms else None,
        "health_before": health_before.get("ok"),
        "health_after": health_after.get("ok"),
        "pass_criteria": {
            "max_p95_ms": max_p95_ms,
            "max_drop_rate": max_drop_rate,
        },
    }
    # Pass/fail evaluation
    failures = []
    if p95 > max_p95_ms:
        failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
    if drop_rate > max_drop_rate:
        failures.append(
            f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
        )
    wal_delta = report["wal"]["delta_mb"]
    if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
        failures.append(
            f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
            "— possible SQLite write pressure (Bottleneck #2)"
        )
    report["passed"] = len(failures) == 0
    report["failures"] = failures
    return report
def _print_report(r: Dict[str, Any]) -> None:
s = r["summary"]
l = r["latency_ms"]
m = r["metrics_delta"]
passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
w = r.get("wal", {})
print()
print("=" * 60)
print(f" matrix-bridge-dagi Soak Report {passed}")
print("=" * 60)
print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
print()
print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
f"p99={l['p99']}ms max={l['max']}ms")
if r["prometheus_invoke_p95_ms"] is not None:
print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
print()
print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
print(f" Rate-limited: {m['rate_limited']}")
print(f" Failovers: {m['failovers']}")
print(f" Sticky sets: {m['sticky_sets']}")
if w.get("before_mb") is not None:
wal_delta_str = (
f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
)
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
print()
if r["failures"]:
for f in r["failures"]:
print(f"{f}")
else:
print(" All pass criteria met.")
print("=" * 60)
def main() -> int:
    """Parse CLI args, run the soak, print the report, return the exit code (0 pass / 1 fail)."""
    ap = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
    ap.add_argument("--url", default="http://localhost:9400",
                    help="Bridge base URL (default: http://localhost:9400)")
    ap.add_argument("--messages", type=int, default=100,
                    help="Total messages to send (default: 100)")
    ap.add_argument("--concurrency", type=int, default=4,
                    help="Concurrent requests (default: 4)")
    ap.add_argument("--agent-id", default="sofiia",
                    help="Agent id for synthetic events (default: sofiia)")
    ap.add_argument("--room-id", default="!soak-room:home.invalid",
                    help="Room id for synthetic events")
    ap.add_argument("--sender", default="@soak-user:home.invalid",
                    help="Sender for synthetic events")
    ap.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
                    help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
    ap.add_argument("--max-drop-rate", type=float, default=_DEFAULT_MAX_DROP_RATE,
                    help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
    ap.add_argument("--report-file", default="",
                    help="Optional path to write JSON report")
    ap.add_argument("--db-path", default="",
                    help="Path to policy_store.db for WAL check "
                         "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
    opts = ap.parse_args()
    # Drive the async soak to completion on a fresh event loop.
    report = asyncio.run(run_soak(
        url=opts.url,
        n_messages=opts.messages,
        concurrency=opts.concurrency,
        agent_id=opts.agent_id,
        room_id=opts.room_id,
        sender=opts.sender,
        max_p95_ms=opts.max_p95_ms,
        max_drop_rate=opts.max_drop_rate,
        db_path=opts.db_path,
    ))
    _print_report(report)
    if opts.report_file:
        with open(opts.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n Report saved: {opts.report_file}")
    return 0 if report["passed"] else 1
# Script entry point — exit status: 0 = pass criteria met, 1 = thresholds
# exceeded (2 is emitted earlier, at import time, when httpx is missing).
if __name__ == "__main__":
    sys.exit(main())