From 82d5ff2a4fb8a3d2986d9a6d6e77c36282a70760 Mon Sep 17 00:00:00 2001
From: Apple <apple@MacBook-Pro.local>
Date: Thu, 5 Mar 2026 07:51:37 -0800
Subject: [PATCH] =?UTF-8?q?feat(matrix-bridge-dagi):=20M4=E2=80=93M11=20+?=
 =?UTF-8?q?=20soak=20infrastructure=20(debug=20inject=20endpoint)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (gated by DEBUG_INJECT_ENABLED; default false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
---
 docker-compose.matrix-bridge-node1.yml        |   35 +
 .../dashboards/matrix-bridge-dagi.json        |  986 ++++++
 .../alerts/matrix-bridge-dagi.rules.yml       |  158 +
 ops/runbook-matrix-bridge-soak.md             |  401 +++
 ops/scripts/matrix_bridge_soak.py             |  476 +++
 services/matrix-bridge-dagi/app/config.py     |   79 +-
 .../matrix-bridge-dagi/app/confirm_store.py   |  167 +
 services/matrix-bridge-dagi/app/control.py    |  946 +++++-
 .../matrix-bridge-dagi/app/control_limiter.py |  138 +
 .../matrix-bridge-dagi/app/control_runner.py  |  296 ++
 services/matrix-bridge-dagi/app/discovery.py  |  210 ++
 .../matrix-bridge-dagi/app/event_store.py     |  213 ++
 services/matrix-bridge-dagi/app/ingress.py    | 2801 ++++++++++++++++-
 services/matrix-bridge-dagi/app/main.py       |  466 ++-
 .../app/metrics_contract.py                   |  224 ++
 .../matrix-bridge-dagi/app/mixed_routing.py   |   22 +
 .../matrix-bridge-dagi/app/node_health.py     |  262 ++
 .../matrix-bridge-dagi/app/node_policy.py     |  179 ++
 .../matrix-bridge-dagi/app/policy_store.py    | 1007 ++++++
 .../matrix-bridge-dagi/app/sticky_cache.py    |  149 +
 services/matrix-bridge-dagi/requirements.txt  |    1 +
 21 files changed, 9123 insertions(+), 93 deletions(-)
 create mode 100644 ops/grafana/dashboards/matrix-bridge-dagi.json
 create mode 100644 ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
 create mode 100644 ops/runbook-matrix-bridge-soak.md
 create mode 100644 ops/scripts/matrix_bridge_soak.py
 create mode 100644 services/matrix-bridge-dagi/app/confirm_store.py
 create mode 100644 services/matrix-bridge-dagi/app/control_limiter.py
 create mode 100644 services/matrix-bridge-dagi/app/control_runner.py
 create mode 100644 services/matrix-bridge-dagi/app/discovery.py
 create mode 100644 services/matrix-bridge-dagi/app/event_store.py
 create mode 100644 services/matrix-bridge-dagi/app/metrics_contract.py
 create mode 100644 services/matrix-bridge-dagi/app/node_health.py
 create mode 100644 services/matrix-bridge-dagi/app/node_policy.py
 create mode 100644 services/matrix-bridge-dagi/app/policy_store.py
 create mode 100644 services/matrix-bridge-dagi/app/sticky_cache.py

diff --git a/docker-compose.matrix-bridge-node1.yml b/docker-compose.matrix-bridge-node1.yml
index 4cd401f3..d4eb13f2 100644
--- a/docker-compose.matrix-bridge-node1.yml
+++ b/docker-compose.matrix-bridge-node1.yml
@@ -67,6 +67,41 @@ services:
       - BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-}
       # "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts)
       - CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore}
+      # ── M3.1: Runbook runner token ───────────────────────────────────────
+      # X-Control-Token for POST /api/runbooks/internal/runs (sofiia-console)
+      - SOFIIA_CONTROL_TOKEN=${SOFIIA_CONTROL_TOKEN:-}
+      # M3.4: Control channel safety — rate limiting + cooldown
+      - CONTROL_ROOM_RPM=${CONTROL_ROOM_RPM:-60}
+      - CONTROL_OPERATOR_RPM=${CONTROL_OPERATOR_RPM:-30}
+      - CONTROL_RUN_NEXT_RPM=${CONTROL_RUN_NEXT_RPM:-20}
+      - CONTROL_COOLDOWN_S=${CONTROL_COOLDOWN_S:-2.0}
+      # M2.3: Persistent event deduplication
+      - PERSISTENT_DEDUPE=${PERSISTENT_DEDUPE:-1}
+      - BRIDGE_DATA_DIR=${BRIDGE_DATA_DIR:-/app/data}
+      - PROCESSED_EVENTS_TTL_H=${PROCESSED_EVENTS_TTL_H:-48}
+      - PROCESSED_EVENTS_PRUNE_BATCH=${PROCESSED_EVENTS_PRUNE_BATCH:-5000}
+      - PROCESSED_EVENTS_PRUNE_INTERVAL_S=${PROCESSED_EVENTS_PRUNE_INTERVAL_S:-3600}
+      # M4.0: agent discovery
+      - DISCOVERY_RPM=${DISCOVERY_RPM:-20}
+      # M5.0: node-aware routing
+      - BRIDGE_ALLOWED_NODES=${BRIDGE_ALLOWED_NODES:-NODA1}
+      - BRIDGE_DEFAULT_NODE=${BRIDGE_DEFAULT_NODE:-NODA1}
+      - BRIDGE_ROOM_NODE_MAP=${BRIDGE_ROOM_NODE_MAP:-}
+      # M8.0: Node health + soft-failover thresholds
+      - NODE_FAIL_CONSEC=${NODE_FAIL_CONSEC:-3}
+      - NODE_LAT_EWMA_S=${NODE_LAT_EWMA_S:-12.0}
+      - NODE_EWMA_ALPHA=${NODE_EWMA_ALPHA:-0.3}
+      # M8.1: Sticky failover TTL (0 = disabled)
+      - FAILOVER_STICKY_TTL_S=${FAILOVER_STICKY_TTL_S:-300}
+      # M8.2: HA state persistence
+      - HA_HEALTH_SNAPSHOT_INTERVAL_S=${HA_HEALTH_SNAPSHOT_INTERVAL_S:-60}
+      - HA_HEALTH_MAX_AGE_S=${HA_HEALTH_MAX_AGE_S:-600}
+      # M9.0: Two-step confirmation TTL for dangerous commands (0 = disabled)
+      - CONFIRM_TTL_S=${CONFIRM_TTL_S:-120}
+      - POLICY_EXPORT_RETENTION_DAYS=${POLICY_EXPORT_RETENTION_DAYS:-30}
+      - POLICY_HISTORY_LIMIT=${POLICY_HISTORY_LIMIT:-100}
+      # M11 soak: NEVER set to true in production
+      - DEBUG_INJECT_ENABLED=${DEBUG_INJECT_ENABLED:-false}
 
       # ── M2.2: Mixed room guard rails ────────────────────────────────────
       # Fail-fast if any room defines more agents than this
diff --git a/ops/grafana/dashboards/matrix-bridge-dagi.json b/ops/grafana/dashboards/matrix-bridge-dagi.json
new file mode 100644
index 00000000..d406ff72
--- /dev/null
+++ b/ops/grafana/dashboards/matrix-bridge-dagi.json
@@ -0,0 +1,986 @@
+{
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "Prometheus",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__elements": {},
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "9.0.0"
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "stat",
+      "name": "Stat",
+      "version": ""
+    },
+    {
+      "type": "panel",
+      "id": "timeseries",
+      "name": "Time series",
+      "version": ""
+    },
+    {
+      "type": "panel",
+      "id": "gauge",
+      "name": "Gauge",
+      "version": ""
+    }
+  ],
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "grafana",
+          "uid": "-- Grafana --"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [
+    {
+      "asDropdown": false,
+      "icon": "doc",
+      "includeVars": false,
+      "keepTime": false,
+      "tags": [],
+      "targetBlank": true,
+      "title": "Runbook",
+      "tooltip": "matrix-bridge-dagi-ops.md",
+      "type": "link",
+      "url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md"
+    }
+  ],
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "Bridge Up",
+      "gridPos": {
+        "x": 0,
+        "y": 0,
+        "w": 4,
+        "h": 4
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(matrix_bridge_up)",
+          "legendFormat": "up (all nodes)",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ]
+        },
+        "colorMode": "background",
+        "graphMode": "none",
+        "textMode": "auto",
+        "orientation": "auto"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [
+            {
+              "type": "value",
+              "options": {
+                "0": {
+                  "text": "DOWN",
+                  "color": "red"
+                },
+                "1": {
+                  "text": "UP",
+                  "color": "green"
+                }
+              }
+            }
+          ],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "red",
+                "value": null
+              },
+              {
+                "color": "green",
+                "value": 1
+              }
+            ]
+          },
+          "color": {
+            "mode": "thresholds"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "Queue Size",
+      "gridPos": {
+        "x": 4,
+        "y": 0,
+        "w": 4,
+        "h": 4
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "matrix_bridge_queue_size",
+          "legendFormat": "queue",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ]
+        },
+        "colorMode": "background",
+        "graphMode": "area",
+        "textMode": "auto"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 50
+              },
+              {
+                "color": "red",
+                "value": 100
+              }
+            ]
+          },
+          "color": {
+            "mode": "thresholds"
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "Active Rate-Limiter Rooms",
+      "gridPos": {
+        "x": 8,
+        "y": 0,
+        "w": 4,
+        "h": 4
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "matrix_bridge_rate_limiter_active_rooms",
+          "legendFormat": "rooms",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ]
+        },
+        "colorMode": "value",
+        "graphMode": "none"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short",
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 4,
+      "type": "stat",
+      "title": "Active Room-Agent Locks",
+      "gridPos": {
+        "x": 12,
+        "y": 0,
+        "w": 4,
+        "h": 4
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "matrix_bridge_active_room_agent_locks",
+          "legendFormat": "locks",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ]
+        },
+        "colorMode": "value",
+        "graphMode": "none"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "short",
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 5,
+      "type": "stat",
+      "title": "Drops (5m)",
+      "gridPos": {
+        "x": 16,
+        "y": 0,
+        "w": 4,
+        "h": 4
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))",
+          "legendFormat": "dropped",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ]
+        },
+        "colorMode": "background",
+        "graphMode": "none"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 1
+              }
+            ]
+          },
+          "color": {
+            "mode": "thresholds"
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 6,
+      "type": "stat",
+      "title": "Errors (5m)",
+      "gridPos": {
+        "x": 20,
+        "y": 0,
+        "w": 4,
+        "h": 4
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))",
+          "legendFormat": "errors",
+          "refId": "A",
+          "instant": true
+        }
+      ],
+      "options": {
+        "reduceOptions": {
+          "calcs": [
+            "lastNotNull"
+          ]
+        },
+        "colorMode": "background",
+        "graphMode": "none"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 1
+              },
+              {
+                "color": "red",
+                "value": 5
+              }
+            ]
+          },
+          "color": {
+            "mode": "thresholds"
+          },
+          "unit": "short"
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 10,
+      "type": "timeseries",
+      "title": "Traffic: Received & Replied (rate/5m)",
+      "gridPos": {
+        "x": 0,
+        "y": 4,
+        "w": 12,
+        "h": 8
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(matrix_bridge_messages_received_total[5m]))",
+          "legendFormat": "received",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))",
+          "legendFormat": "replied ok",
+          "refId": "B"
+        },
+        {
+          "expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))",
+          "legendFormat": "replied error",
+          "refId": "C"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 10,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": [
+          {
+            "matcher": {
+              "id": "byName",
+              "options": "replied error"
+            },
+            "properties": [
+              {
+                "id": "color",
+                "value": {
+                  "mode": "fixed",
+                  "fixedColor": "red"
+                }
+              }
+            ]
+          }
+        ]
+      }
+    },
+    {
+      "id": 11,
+      "type": "timeseries",
+      "title": "Errors / Drops / Rate-Limited (rate/5m)",
+      "gridPos": {
+        "x": 12,
+        "y": 4,
+        "w": 12,
+        "h": 8
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))",
+          "legendFormat": "gw_error: {{ error_type }}",
+          "refId": "A"
+        },
+        {
+          "expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))",
+          "legendFormat": "queue_dropped",
+          "refId": "B"
+        },
+        {
+          "expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))",
+          "legendFormat": "rate_limited",
+          "refId": "C"
+        },
+        {
+          "expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))",
+          "legendFormat": "route_rejected: {{ reason }}",
+          "refId": "D"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 15,
+            "drawStyle": "line",
+            "stacking": {
+              "mode": "none"
+            },
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 20,
+      "type": "timeseries",
+      "title": "Invoke Latency P50 / P95 by Node",
+      "gridPos": {
+        "x": 0,
+        "y": 12,
+        "w": 12,
+        "h": 8
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
+          "legendFormat": "p50 {{ node_id }}",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
+          "legendFormat": "p95 {{ node_id }}",
+          "refId": "B"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max",
+            "last"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 5,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "yellow",
+                "value": 10
+              },
+              {
+                "color": "red",
+                "value": 20
+              }
+            ]
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 21,
+      "type": "timeseries",
+      "title": "Queue Wait P50 / P95",
+      "gridPos": {
+        "x": 12,
+        "y": 12,
+        "w": 12,
+        "h": 8
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
+          "legendFormat": "wait p50 {{ agent_id }}",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
+          "legendFormat": "wait p95 {{ agent_id }}",
+          "refId": "B"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 5,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 30,
+      "type": "timeseries",
+      "title": "Node Routing: Routed & Rejected by Node (rate/5m)",
+      "gridPos": {
+        "x": 0,
+        "y": 20,
+        "w": 12,
+        "h": 7
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))",
+          "legendFormat": "routed {{ node_id }}",
+          "refId": "A"
+        },
+        {
+          "expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))",
+          "legendFormat": "rejected {{ node_id }}",
+          "refId": "B"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 10,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 31,
+      "type": "timeseries",
+      "title": "Persistent Dedupe Hits / Inserts (rate/10m)",
+      "gridPos": {
+        "x": 12,
+        "y": 20,
+        "w": 12,
+        "h": 7
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))",
+          "legendFormat": "dedupe_hits",
+          "refId": "A"
+        },
+        {
+          "expr": "rate(matrix_bridge_dedupe_persistent_inserts_total[10m])",
+          "legendFormat": "dedupe_inserts",
+          "refId": "B"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 10,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 40,
+      "type": "timeseries",
+      "title": "Control Commands (rate/5m)",
+      "gridPos": {
+        "x": 0,
+        "y": 27,
+        "w": 12,
+        "h": 7
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))",
+          "legendFormat": "cmd {{ verb }}",
+          "refId": "A"
+        },
+        {
+          "expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))",
+          "legendFormat": "ctrl_ratelimited {{ scope }}",
+          "refId": "B"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 10,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 41,
+      "type": "timeseries",
+      "title": "Traffic by Agent (received rate/5m)",
+      "gridPos": {
+        "x": 12,
+        "y": 27,
+        "w": 12,
+        "h": 7
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))",
+          "legendFormat": "{{ agent_id }}",
+          "refId": "A"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max",
+            "last"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 10,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "id": 42,
+      "type": "timeseries",
+      "title": "Routing Reasons by Agent (rate/5m)",
+      "description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown",
+      "gridPos": {
+        "x": 0,
+        "y": 34,
+        "w": 24,
+        "h": 7
+      },
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))",
+          "legendFormat": "{{ agent_id }} / {{ reason }}",
+          "refId": "A"
+        }
+      ],
+      "options": {
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        },
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": [
+            "mean",
+            "max"
+          ]
+        }
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 10,
+            "drawStyle": "line",
+            "spanNulls": false
+          },
+          "color": {
+            "mode": "palette-classic"
+          }
+        },
+        "overrides": []
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "tags": [
+    "matrix-bridge",
+    "dagi",
+    "daarion"
+  ],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "hide": 0,
+        "includeAll": false,
+        "label": "Datasource",
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "type": "datasource"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "UTC",
+  "title": "Matrix Bridge DAGI",
+  "uid": "matrix-bridge-dagi-v1",
+  "version": 1
+}
\ No newline at end of file
diff --git a/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml b/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
new file mode 100644
index 00000000..cd7c9d4b
--- /dev/null
+++ b/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
@@ -0,0 +1,158 @@
+---
+# Prometheus alert rules — Matrix Bridge DAGI
+# Phase M7.1 (metrics contract hardening)
+#
+# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
+# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
+#
+# Usage:
+#   promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
+#   docker run --rm -v $PWD:/w prom/prometheus:latest \
+#     promtool check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
+
+groups:
+  - name: matrix_bridge_dagi
+    interval: 30s
+    rules:
+
+      # ── A1: Bridge process down ─────────────────────────────────────────────
+      # metric: matrix_bridge_up{node_id} (Gauge, M7.1: per node); absent() catches a bridge that exports no series
+      - alert: BridgeDown
+        expr: sum(matrix_bridge_up) == 0 or absent(matrix_bridge_up)
+        for: 1m
+        labels:
+          severity: critical
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "Matrix Bridge DAGI is down"
+          description: >
+            `matrix_bridge_up` is 0 or absent on all nodes — bridge process has
+            not started or has crashed. No messages are being processed.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"
+
+      # ── A2: Matrix sync errors spike ────────────────────────────────────────
+      # metric: matrix_bridge_gateway_errors_total{error_type}   (Counter)
+      - alert: MatrixSyncErrors
+        expr: >
+          increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "Matrix sync errors elevated"
+          description: >
+            More than 3 Matrix `/sync` errors (error_type=sync_error) in the last
+            5 minutes. May indicate Matrix homeserver problems or network issues.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"
+
+      # ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
+      # metric: matrix_bridge_messages_replied_total{status}   (Counter)
+      - alert: GatewayInvokeErrors
+        expr: >
+          increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
+        for: 2m
+        labels:
+          severity: warning
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "Router invoke errors elevated (node={{ $labels.node_id }})"
+          description: >
+            More than 5 agent invocation errors (status=error) in the last 5 minutes.
+            Check Router/DeepSeek connectivity and logs.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"
+
+      # ── A4: Queue drops ─────────────────────────────────────────────────────
+      # metric: matrix_bridge_queue_dropped_total{room_id, agent_id}   (Counter)
+      - alert: QueueDropsHigh
+        expr: >
+          rate(matrix_bridge_queue_dropped_total[5m]) > 0
+        for: 1m
+        labels:
+          severity: warning
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "Bridge queue is dropping messages"
+          description: >
+            `matrix_bridge_queue_dropped_total` is increasing — work queue is full
+            and incoming messages are being dropped. Increase
+            `BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"
+
+      # ── A5: User-level rate limiting spike ──────────────────────────────────
+      # metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type}   (Counter)
+      - alert: RateLimitedSpike
+        expr: >
+          rate(matrix_bridge_rate_limited_total[5m]) > 2
+        for: 3m
+        labels:
+          severity: warning
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "User rate limiting spike"
+          description: >
+            More than 2 messages/second are being rate-limited over 3 minutes.
+            May indicate a flood attack, misbehaving client, or limits too low.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"
+
+      # ── A6: Control channel rate limiting spike ──────────────────────────────
+      # metric: matrix_bridge_control_rate_limited_total{scope}   (Counter)
+      - alert: ControlRateLimitedSpike
+        expr: >
+          rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
+        for: 3m
+        labels:
+          severity: warning
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "Control channel rate limiting elevated"
+          description: >
+            More than 0.5 control commands/second rejected by rate limiter over
+            3 minutes. May indicate operator tooling issues or abuse attempt.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"
+
+      # ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
+      # metric: matrix_bridge_dedupe_persistent_hits_total{room_id}   (Counter)
+      - alert: DedupeHitStorm
+        expr: >
+          rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "Persistent deduplication hit rate elevated"
+          description: >
+            High rate of persistent dedupe hits — may indicate a Matrix resend
+            storm or a client repeatedly retrying the same event_id.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"
+
+      # ── A8: Invoke latency P95 high (per node) ───────────────────────────────
+      # metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id}   (Histogram)
+      - alert: InvokeLatencyP95High
+        expr: >
+          histogram_quantile(
+            0.95,
+            sum by (node_id, le) (
+              rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
+            )
+          ) > 15
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+          service: matrix-bridge-dagi
+        annotations:
+          summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
+          description: >
+            95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds
+            15 seconds over the last 5 minutes. Check Router load, DeepSeek API,
+            Ollama/Swapper queue.
+          runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"
diff --git a/ops/runbook-matrix-bridge-soak.md b/ops/runbook-matrix-bridge-soak.md
new file mode 100644
index 00000000..ecf8ffa2
--- /dev/null
+++ b/ops/runbook-matrix-bridge-soak.md
@@ -0,0 +1,401 @@
+# matrix-bridge-dagi — Soak & Failure Rehearsal Runbook (M11)
+
+**Phase:** M11  
+**Applies to:** `matrix-bridge-dagi` service on NODA1  
+**When to run:** Before any production traffic increase, after major code changes, or on a recurring monthly basis.
+
+---
+
+## 1. Goals
+
+| Goal | Measurable pass criterion |
+|------|--------------------------|
+| Latency under load | p95 invoke < 5 000 ms |
+| Queue stability | drop rate < 1% |
+| Failover correctness | failover fires on NODA1 outage; NODA2 serves all remaining messages |
+| Sticky anti-flap | sticky set after first failover; no re-tries to degraded node |
+| Restart recovery | sticky + health snapshot reloads within 10 s of restart |
+| Policy operations safe under load | `!policy history` / `!policy change` work while messages in-flight |
+
+---
+
+## 2. Prerequisites
+
+```bash
+# On NODA1 or local machine with network access to bridge
+pip install httpx
+
+# Verify bridge is up
+curl -s http://localhost:9400/health | jq '.ok'
+# Expected: true
+
+# Verify /metrics endpoint
+curl -s http://localhost:9400/metrics | grep matrix_bridge_up
+# Expected: matrix_bridge_up{...} 1
+```
+
+---
+
+## 2a. Enabling the Soak Inject Endpoint
+
+The soak script uses `POST /v1/debug/inject_event` which is **disabled by default**.
+Enable it only on staging/NODA1 soak runs:
+
+```bash
+# On NODA1 — edit docker-compose override or pass env inline:
+# Option 1: temporary inline restart
+DEBUG_INJECT_ENABLED=true docker-compose \
+  -f docker-compose.matrix-bridge-node1.yml \
+  up -d --no-deps matrix-bridge-dagi
+
+# Option 2: .env file override
+echo "DEBUG_INJECT_ENABLED=true" >> .env.soak
+docker-compose --env-file .env.soak \
+  -f docker-compose.matrix-bridge-node1.yml \
+  up -d --no-deps matrix-bridge-dagi
+
+# Verify it's enabled (should return 200, not 403)
+curl -s -X POST http://localhost:9400/v1/debug/inject_event \
+  -H 'Content-Type: application/json' \
+  -d '{"room_id":"!test:test","event":{}}' | jq .
+# Expected: {"ok":false,"error":"no mapping for room_id=..."}  ← 200, not 403
+
+# IMPORTANT: disable after soak
+docker-compose -f docker-compose.matrix-bridge-node1.yml up -d --no-deps matrix-bridge-dagi
+# (DEBUG_INJECT_ENABLED defaults to false)
+```
+
+---
+
+## 2b. Step 0 (WORKERS=2 / QUEUE=100) — Record True Baseline
+
+**Goal:** snapshot the "before any tuning" numbers to have a comparison point.
+
+```bash
+# 0. Confirm current config (should be defaults)
+curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
+# Expected: {"workers": 2, "queue_max": 100}
+
+# 1. DB path for WAL check (adjust to your BRIDGE_DATA_DIR)
+DB=/opt/microdao-daarion/data/matrix_bridge.db
+
+# 2. WAL size before (manual check)
+ls -lh ${DB}-wal 2>/dev/null || echo "(no WAL file yet — first run)"
+sqlite3 $DB "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || echo "(no sqlite3)"
+
+# 3. Run Step 0 soak
+python3 ops/scripts/matrix_bridge_soak.py \
+  --url   http://localhost:9400 \
+  --messages   100 \
+  --concurrency  4 \
+  --agent  sofiia \
+  --room-id "!your-room-id:your-server" \
+  --max-p95-ms  5000 \
+  --max-drop-rate 0.001 \
+  --db-path $DB \
+  --report-file /tmp/soak_step0_baseline.json
+
+# 4. Record result in "Baseline numbers" table (section 10) below.
+jq '.summary, .latency_ms, .metrics_delta, .wal' /tmp/soak_step0_baseline.json
+```
+
+**v1 Go/No-Go thresholds for Step 0:**
+
+| Metric | Green ✅ | Yellow ⚠️ | Red ❌ |
+|--------|---------|-----------|-------|
+| `p95_invoke_ms` | < 3000 | 3000–5000 | > 5000 |
+| `drop_rate` | 0.00% (mandatory) | — | > 0.1% |
+| `error_rate` | < 1% | 1–3% | > 3% |
+| `failovers` | 0 | — | ≥ 1 without cause |
+| WAL delta | < 2 MB | 2–10 MB | > 10 MB |
+
+**If Step 0 is Green → proceed to Step 1 tuning.**
+**If Step 0 is Yellow/Red → investigate before touching WORKER_CONCURRENCY.**
+
+---
+
+## 2c. Step 1 (WORKERS=4 / QUEUE=200) — Tune-1
+
+**Goal:** verify that doubling workers gives headroom without Router saturation.
+
+```bash
+# 1. Apply tuning
+WORKER_CONCURRENCY=4 QUEUE_MAX_EVENTS=200 docker-compose \
+  -f docker-compose.matrix-bridge-node1.yml \
+  --env-file .env.soak \
+  up -d --no-deps matrix-bridge-dagi
+
+sleep 3
+curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
+# Expected: {"workers": 4, "queue_max": 200}
+
+# 2. Run Step 1 soak (higher concurrency to stress the new headroom)
+python3 ops/scripts/matrix_bridge_soak.py \
+  --url   http://localhost:9400 \
+  --messages   100 \
+  --concurrency  8 \
+  --agent  sofiia \
+  --room-id "!your-room-id:your-server" \
+  --max-p95-ms  3000 \
+  --max-drop-rate 0.001 \
+  --db-path $DB \
+  --report-file /tmp/soak_step1_tune1.json
+
+# 3. Compare Step 0 vs Step 1
+python3 - <<'EOF'
+import json
+s0 = json.load(open('/tmp/soak_step0_baseline.json'))
+s1 = json.load(open('/tmp/soak_step1_tune1.json'))
+for k in ('p50', 'p95', 'p99'):
+    print(f"{k}: {s0['latency_ms'][k]}ms → {s1['latency_ms'][k]}ms")
+print(f"drops: {s0['metrics_delta']['queue_drops']} → {s1['metrics_delta']['queue_drops']}")
+print(f"WAL: {s0['wal'].get('delta_mb')} → {s1['wal'].get('delta_mb')} MB delta")
+EOF
+```
+
+**Decision:**
+- Step 1 Green → **freeze, tag v1.0, ship to production.**
+- p95 within 5% of Step 0 → Router is bottleneck (not workers); don't go to Step 2.
+- Queue drops > 0 at WORKERS=4 → try Step 2 (WORKERS=8, QUEUE=300).
+
+---
+
+## 3. Scenario A — Baseline load (100 messages, concurrency 4)
+
+**Goal:** establish latency baseline, verify no drops under normal load.
+
+```bash
+python3 ops/scripts/matrix_bridge_soak.py \
+  --url http://localhost:9400 \
+  --messages 100 \
+  --concurrency 4 \
+  --max-p95-ms 3000 \
+  --report-file /tmp/soak_baseline.json
+```
+
+**Expected output:**
+```
+matrix-bridge-dagi Soak Report  ✅ PASSED
+  Messages:    100  concurrency=4
+  Latency: p50=<500ms  p95=<3000ms
+  Queue drops:  0  (rate 0.000%)
+  Failovers:    0
+```
+
+**If FAILED:**
+- `p95 too high` → check router `/health`, DeepSeek API latency, `docker stats`
+- `drop_rate > 0` → check `QUEUE_MAX_EVENTS` env var (increase if needed), inspect bridge logs
+
+---
+
+## 4. Scenario B — Queue saturation test
+
+**Goal:** confirm drop metric fires cleanly and bridge doesn't crash.
+
+```bash
+# Reduce queue via env override, then flood:
+QUEUE_MAX_EVENTS=5 docker-compose -f docker-compose.matrix-bridge-node1.yml \
+  up -d matrix-bridge-dagi
+
+# Wait for restart
+sleep 5
+
+python3 ops/scripts/matrix_bridge_soak.py \
+  --url http://localhost:9400 \
+  --messages 30 \
+  --concurrency 10 \
+  --max-drop-rate 0.99 \
+  --report-file /tmp/soak_queue_sat.json
+
+# Restore normal queue size
+docker-compose -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi
+```
+
+**Expected:** `queue_drops > 0`, bridge still running after the test.
+
+**Verify in Prometheus/Grafana:**
+```promql
+rate(matrix_bridge_queue_dropped_total[1m])
+```
+Should spike and then return to 0.
+
+---
+
+## 5. Scenario C — Node failover rehearsal
+
+**Goal:** simulate NODA1 router becoming unavailable, verify NODA2 takes over.
+
+```bash
+# Step 1: stop the router on NODA1 temporarily
+docker pause dagi-router-node1
+
+# Step 2: run soak against bridge (bridge will failover to NODA2)
+python3 ops/scripts/matrix_bridge_soak.py \
+  --url http://localhost:9400 \
+  --messages 20 \
+  --concurrency 2 \
+  --max-p95-ms 10000 \
+  --report-file /tmp/soak_failover.json
+
+# Step 3: restore router
+docker unpause dagi-router-node1
+```
+
+**Expected:**
+```
+  Failovers:   1..20  (at least 1)
+  Sticky sets: 1+
+  Errors:      0  (fallback to NODA2 serves all messages)
+```
+
+**Check sticky in control room:**
+```
+!nodes
+```
+Should show `NODA2` sticky with remaining TTL.
+
+**Check health tracker:**
+```
+!status
+```
+Should show `NODA1 state=degraded|down`.
+
+---
+
+## 6. Scenario D — Restart recovery
+
+**Goal:** after restart, sticky and health state reload within one polling cycle.
+
+```bash
+# After Scenario C: sticky is set to NODA2
+# Restart the bridge
+docker restart dagi-matrix-bridge-node1
+
+# Wait for startup (up to 30s)
+sleep 15
+
+# Verify sticky reloaded
+curl -s http://localhost:9400/health | jq '.ha_state'
+# Expected: {"sticky_loaded": N, ...}
+
+# Verify routing still uses NODA2 sticky
+python3 ops/scripts/matrix_bridge_soak.py \
+  --url http://localhost:9400 \
+  --messages 10 \
+  --concurrency 2 \
+  --report-file /tmp/soak_restart.json
+```
+
+**Expected:** p95 similar to post-failover run, `Failovers: 0` (sticky already applied).
+
+---
+
+## 7. Scenario E — Rate limit burst
+
+**Goal:** verify rate limiting fires and bridge doesn't silently drop below-limit messages.
+
+```bash
+# Set RPM very low for test, then flood from same sender
+# This is best done in control room by observing !status rate_limited count
+# rather than the soak script (which uses different senders per message).
+
+# In Matrix control room:
+# Send 30+ messages from the same user account in quick succession in a mixed room.
+# Then:
+!status
+# Check: rate_limited_total increased, no queue drops.
+```
+
+---
+
+## 8. Scenario F — Policy operations under load
+
+**Goal:** `!policy history`, `!policy change`, and `!policy export` work while messages are in-flight.
+
+```bash
+# Run a background soak
+python3 ops/scripts/matrix_bridge_soak.py \
+  --url http://localhost:9400 \
+  --messages 200 \
+  --concurrency 2 \
+  --report-file /tmp/soak_concurrent_policy.json &
+
+# While soak is running, in Matrix control room:
+!policy history limit=5
+!policy export
+!status
+```
+
+**Expected:** all three commands respond immediately (< 2s), soak completes without extra drops.
+
+---
+
+## 9. Prometheus / Grafana during soak
+
+Key queries for the Grafana dashboard:
+
+```promql
+# Throughput (messages/s)
+rate(matrix_bridge_routed_total[30s])
+
+# Error rate
+rate(matrix_bridge_errors_total[30s])
+
+# p95 invoke latency per node
+histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[1m])))
+
+# Queue drops rate
+rate(matrix_bridge_queue_dropped_total[1m])
+
+# Failovers
+rate(matrix_bridge_failover_total[5m])
+```
+
+Use the `matrix-bridge-dagi` Grafana dashboard at:  
+`ops/grafana/dashboards/matrix-bridge-dagi.json`
+
+---
+
+## 10. Baseline numbers (reference)
+
+| Metric | Cold start | Warm (sticky set) |
+|--------|-----------|-------------------|
+| p50 latency | ~200ms | ~150ms |
+| p95 latency | ~2 000ms | ~1 500ms |
+| Queue drops | 0 (queue=100) | 0 |
+| Failover fires | 1 per degradation | 0 after sticky |
+| Policy ops response | < 500ms | < 500ms |
+
+*Update this table after each soak run with actual measured values.*
+
+---
+
+## 11. CI soak (mocked, no network)
+
+For CI pipelines, use the mocked soak scenarios:
+
+```bash
+python3 -m pytest tests/test_matrix_bridge_m11_soak_scenarios.py -v
+```
+
+Covers (all deterministic, no network):
+- **S1** Queue saturation → drop counter
+- **S2** Failover under load → on_failover callback, health tracker
+- **S3** Sticky routing under burst → sticky set, burst routed to NODA2
+- **S4** Multi-room isolation → separate rooms don't interfere
+- **S5** Rate-limit burst → RL callback wired, no panic
+- **S6** HA restart recovery → sticky + health snapshot persisted and reloaded
+- **Perf baseline** 100-msg + 50-msg failover burst < 5s wall clock
+
+---
+
+## 12. Known failure modes & mitigations
+
+| Symptom | Likely cause | Mitigation |
+|---------|-------------|------------|
+| `p95 > 5000ms` | Router/LLM slow | Increase `ROUTER_TIMEOUT_S`, check DeepSeek API |
+| `drop_rate > 1%` | Queue too small | Increase `QUEUE_MAX_EVENTS` |
+| `failovers > 0` but errors > 0 | Both nodes degraded | Check NODA1 + NODA2 health; scale router |
+| Bridge crash during soak | Memory leak / bug | `docker logs` → file GitHub issue |
+| Sticky not set after failover | `FAILOVER_STICKY_TTL_S=0` | Set to 300+ |
+| Restart doesn't load sticky | `HA_HEALTH_MAX_AGE_S` too small | Increase or set to 3600 |
diff --git a/ops/scripts/matrix_bridge_soak.py b/ops/scripts/matrix_bridge_soak.py
new file mode 100644
index 00000000..ed774705
--- /dev/null
+++ b/ops/scripts/matrix_bridge_soak.py
@@ -0,0 +1,476 @@
+#!/usr/bin/env python3
+"""
+matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
+
+Usage:
+  python3 ops/scripts/matrix_bridge_soak.py \
+      --url http://localhost:9400 \
+      --messages 100 \
+      --concurrency 4 \
+      --report-file /tmp/soak_report.json
+
+Requires: httpx (pip install httpx)
+
+What it does:
+  1. Sends --messages synthetic messages to the bridge /v1/debug/inject_event endpoint
+     (or directly to the router if --direct-router is set).
+  2. Measures latency (p50, p95, p99, max) per batch.
+  3. After the run, fetches /metrics and extracts key counters:
+       - matrix_bridge_queue_dropped_total
+       - matrix_bridge_rate_limited_total
+       - matrix_bridge_failover_total
+       - matrix_bridge_sticky_node_total
+       - matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
+  4. Prints a human-readable report and optionally writes JSON.
+
+Exit codes:
+  0 = all pass criteria met
+  1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
+"""
+import argparse
+import asyncio
+import json
+import sys
+import time
+from typing import Any, Dict, List, Optional
+
+try:
+    import httpx
+except ImportError:
+    print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
+    sys.exit(2)
+
+# ── Pass/fail defaults ─────────────────────────────────────────────────────────
+_DEFAULT_MAX_P95_MS  = 5000   # milliseconds: 5 s p95 per invoke (generous for cold start)
+_DEFAULT_MAX_DROP_RATE = 0.01  # fraction, not percent: 1% queue drops allowed
+
+
+# ── Metrics parsing ────────────────────────────────────────────────────────────
+def _parse_counter(text: str, name: str) -> float:
+    """Sum a Prometheus counter over all of its label sets; 0.0 if absent."""
+    total = 0.0  # labelled counters expose one sample line per label set
+    for line in text.splitlines():
+        if line.startswith((name + " ", name + "{")):
+            try:
+                total += float(line.rsplit(None, 1)[-1])
+            except (ValueError, IndexError):
+                continue  # malformed sample line: skip it, keep the rest
+    return total
+
+
+def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
+    """
+    Approximate PromQL histogram_quantile() from raw exposition text.
+
+    Cumulative `<name>_bucket` samples are summed per `le` bound across every
+    label set (separate series would otherwise be interleaved and the
+    interpolation would run over unrelated buckets); `<name>_count` is summed
+    the same way. Returns the estimate in the metric's own unit, or None if
+    the histogram is missing or empty.
+    """
+    per_le: Dict[float, float] = {}
+    total_count = 0.0
+    bucket_prefix, count_prefix = f"{name}_bucket{{", f"{name}_count"
+    for line in text.splitlines():
+        try:
+            if line.startswith(bucket_prefix) and 'le="' in line:
+                le = float(line.split('le="')[1].split('"')[0])  # "+Inf" -> inf
+                per_le[le] = per_le.get(le, 0.0) + float(line.rsplit(None, 1)[-1])
+            elif line.startswith((count_prefix + " ", count_prefix + "{")):
+                total_count += float(line.rsplit(None, 1)[-1])
+        except (ValueError, IndexError):
+            continue  # malformed sample line
+
+    if not per_le or total_count == 0:
+        return None
+
+    target = quantile * total_count
+    prev_le, prev_count = 0.0, 0.0
+    for le, count in sorted(per_le.items()):
+        if count >= target:
+            if le == float("inf"):
+                return prev_le  # quantile lies in the overflow bucket
+            if count == prev_count:
+                return le
+            # Linear interpolation inside the bucket
+            fraction = (target - prev_count) / (count - prev_count)
+            return prev_le + fraction * (le - prev_le)
+        prev_le, prev_count = le, count
+    return prev_le
+
+
+# ── Soak runner ────────────────────────────────────────────────────────────────
+async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
+    """
+    Verify the inject endpoint is reachable, enabled and mapped for room_id.
+    Returns "" on success, or a printable error message on failure.
+    """
+    probe = {"event_id": "!preflight", "sender": "@soak:test",
+             "content": {"msgtype": "m.text", "body": "ping"}}
+    try:
+        resp = await client.post(
+            f"{url.rstrip('/')}/v1/debug/inject_event",
+            json={"room_id": room_id, "event": probe}, timeout=5.0,
+        )
+        if resp.status_code == 403:
+            return (
+                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
+                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
+                "   NEVER enable in production!"
+            )
+        # Reject every other 4xx/5xx too: a 404 from a build without the endpoint
+        # must not pass preflight, or the soak would only time error replies.
+        if resp.status_code >= 400:
+            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
+        data = resp.json()
+        if not data.get("ok") and "no mapping" in data.get("error", ""):
+            return (f"❌ No room mapping for room_id={room_id!r}. "
+                    "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry.")
+        return ""
+    except httpx.ConnectError:
+        return f"❌ Cannot connect to bridge at {url}. Is it running?"
+    except Exception as exc:  # noqa: BLE001
+        return f"❌ Preflight failed: {exc}"
+
+
+async def _check_wal(db_path: str) -> Dict[str, Any]:
+    """
+    Report the size of the SQLite WAL file next to db_path and run a PASSIVE
+    checkpoint through the sqlite3 CLI (skipped when the CLI is not on PATH).
+    Returns a dict with db_path, ok, wal_bytes, wal_mb (-1 if the size cannot
+    be read) and, when available, wal_checkpoint or sqlite3_missing.
+    """
+    # `os` is absent from the module's top-level imports; import it locally.
+    import os
+    import shutil
+    import subprocess
+
+    result: Dict[str, Any] = {"db_path": db_path, "ok": False}
+    wal_path = db_path + "-wal"
+    try:
+        wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
+        result["wal_bytes"] = wal_bytes
+        result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
+    except OSError:
+        result["wal_bytes"] = result["wal_mb"] = -1
+
+    if not shutil.which("sqlite3"):
+        result["sqlite3_missing"] = True
+        return result
+    try:
+        cp = subprocess.run(
+            ["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
+            capture_output=True, text=True, timeout=5,
+        )
+        parts = cp.stdout.strip().split("|")  # CLI prints busy|log|checkpointed
+        if len(parts) == 3:
+            busy, log, done = (int(p) for p in parts)
+            result["wal_checkpoint"] = {"busy": busy, "log": log, "checkpointed": done}
+        result["ok"] = True
+    except Exception:  # noqa: BLE001
+        result["ok"] = False
+    return result
+
+
+async def _send_one(
+    client: httpx.AsyncClient,
+    url: str,
+    agent_id: str,
+    message: str,
+    room_id: str,
+    sender: str,
+) -> tuple:
+    """
+    Inject one synthetic m.room.message event via the bridge debug endpoint.
+    Returns (latency_ms, status_code, error): error is None unless the reply
+    was 5xx, the request timed out, or it raised (status_code 0 for the latter two).
+    """
+    # event_id carries the monotonic clock reading in microseconds
+    event = {
+        "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
+        "sender": sender,
+        "type": "m.room.message",
+        "content": {"msgtype": "m.text", "body": message},
+    }
+    started = time.monotonic()
+
+    def elapsed_ms() -> float:
+        return (time.monotonic() - started) * 1000
+
+    try:
+        resp = await client.post(
+            f"{url.rstrip('/')}/v1/debug/inject_event",
+            json={"room_id": room_id, "event": event},
+            timeout=30.0,
+        )
+    except httpx.TimeoutException:
+        return elapsed_ms(), 0, "timeout"
+    except Exception as exc:  # noqa: BLE001
+        return elapsed_ms(), 0, str(exc)
+    took = elapsed_ms()
+    code = resp.status_code
+    return took, code, (f"HTTP {code}" if code >= 500 else None)
+
+
+async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
+    try:  # best-effort probe: any failure or non-200 yields an empty dict
+        reply = await client.get(url.rstrip("/") + "/health", timeout=10.0)
+        return {} if reply.status_code != 200 else reply.json()
+    except Exception:  # noqa: BLE001
+        return {}
+
+
+async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
+    try:  # best-effort scrape: any failure or non-200 yields an empty string
+        reply = await client.get(url.rstrip("/") + "/metrics", timeout=10.0)
+        return "" if reply.status_code != 200 else reply.text
+    except Exception:  # noqa: BLE001
+        return ""
+
+
+def _percentile(values: List[float], p: float) -> float:
+    """Return the p-th percentile (0-100) of values by rank lookup; 0.0 if empty."""
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    return ordered[min(int(len(ordered) * p / 100), len(ordered) - 1)]
+
+
+async def run_soak(
+    url: str,
+    n_messages: int,
+    concurrency: int,
+    agent_id: str,
+    room_id: str,
+    sender: str,
+    max_p95_ms: float,
+    max_drop_rate: float,
+    db_path: str = "",
+) -> Dict[str, Any]:
+    results: List[tuple] = []
+    semaphore = asyncio.Semaphore(concurrency)
+
+    async with httpx.AsyncClient() as client:
+        # Pre-check: inject endpoint + health
+        preflight_err = await _preflight_inject(client, url, room_id)
+        if preflight_err:
+            print(preflight_err, file=sys.stderr)
+            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}
+
+        # WAL check before soak
+        wal_before: Dict[str, Any] = {}
+        if db_path:
+            wal_before = await _check_wal(db_path)
+            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")
+
+        # Pre-check: health
+        health_before = await _fetch_health(client, url)
+        metrics_before = await _fetch_metrics(client, url)
+
+        drops_before  = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
+        rl_before     = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
+        fo_before     = _parse_counter(metrics_before, "matrix_bridge_failover_total")
+
+        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
+        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")
+
+        t_start = time.monotonic()
+
+        async def worker(i: int):
+            async with semaphore:
+                msg = f"soak-msg-{i:04d}"
+                lat, status, err = await _send_one(
+                    client, url, agent_id, msg, room_id, sender
+                )
+                results.append((lat, status, err))
+                if (i + 1) % max(1, n_messages // 10) == 0:
+                    print(f"  [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")
+
+        await asyncio.gather(*[worker(i) for i in range(n_messages)])
+
+        elapsed_s = time.monotonic() - t_start
+        metrics_after = await _fetch_metrics(client, url)
+        health_after  = await _fetch_health(client, url)
+
+        # WAL check after soak
+        wal_after: Dict[str, Any] = {}
+        if db_path:
+            wal_after = await _check_wal(db_path)
+            print(f"[soak] WAL after:  {wal_after.get('wal_mb', '?')} MB "
+                  f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)")
+
+    latencies  = [r[0] for r in results]
+    errors     = [r for r in results if r[2] is not None]
+    successes  = len(results) - len(errors)
+    error_rate = len(errors) / len(results) if results else 0.0
+
+    drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
+    rl_after    = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
+    fo_after    = _parse_counter(metrics_after, "matrix_bridge_failover_total")
+    sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")
+
+    delta_drops = drops_after - drops_before
+    delta_rl    = rl_after    - rl_before
+    delta_fo    = fo_after    - fo_before
+
+    p50 = _percentile(latencies, 50)
+    p95 = _percentile(latencies, 95)
+    p99 = _percentile(latencies, 99)
+    p_max = max(latencies) if latencies else 0.0
+
+    # Histogram quantile from Prometheus
+    hist_p95 = _parse_histogram_quantile(
+        metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
+    )
+    hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None
+
+    drop_rate = delta_drops / len(results) if results else 0.0
+
+    report = {
+        "wal": {
+            "before_mb": wal_before.get("wal_mb"),
+            "after_mb":  wal_after.get("wal_mb"),
+            "delta_mb":  round(
+                (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
+            ) if wal_before and wal_after else None,
+            "checkpoint_after": wal_after.get("wal_checkpoint"),
+            "threshold_mb": 10,
+        },
+        "summary": {
+            "total_messages": n_messages,
+            "concurrency": concurrency,
+            "elapsed_s": round(elapsed_s, 2),
+            "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
+            "successes": successes,
+            "errors": len(errors),
+            "error_rate": round(error_rate, 4),
+        },
+        "latency_ms": {
+            "p50": round(p50, 1),
+            "p95": round(p95, 1),
+            "p99": round(p99, 1),
+            "max": round(p_max, 1),
+        },
+        "metrics_delta": {
+            "queue_drops": int(delta_drops),
+            "rate_limited": int(delta_rl),
+            "failovers": int(delta_fo),
+            "sticky_sets": int(sticky_after),
+            "drop_rate": round(drop_rate, 4),
+        },
+        "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms else None,
+        "health_before": health_before.get("ok"),
+        "health_after":  health_after.get("ok"),
+        "pass_criteria": {
+            "max_p95_ms": max_p95_ms,
+            "max_drop_rate": max_drop_rate,
+        },
+    }
+
+    # Pass/fail evaluation
+    failures = []
+    if p95 > max_p95_ms:
+        failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
+    if drop_rate > max_drop_rate:
+        failures.append(
+            f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
+        )
+    wal_delta = report["wal"]["delta_mb"]
+    if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
+        failures.append(
+            f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
+            "— possible SQLite write pressure (Bottleneck #2)"
+        )
+
+    report["passed"] = len(failures) == 0
+    report["failures"] = failures
+    return report
+
+
+def _print_report(r: Dict[str, Any]) -> None:
+    """Pretty-print a soak report dict (as built by run_soak) to stdout."""
+    summary = r["summary"]
+    latency = r["latency_ms"]
+    deltas = r["metrics_delta"]
+    verdict = "✅ PASSED" if r["passed"] else "❌ FAILED"
+    wal = r.get("wal", {})
+    rule = "=" * 60
+
+    print()
+    print(rule)
+    print(f"  matrix-bridge-dagi Soak Report  {verdict}")
+    print(rule)
+    print(f"  Messages:    {summary['total_messages']}  concurrency={summary['concurrency']}")
+    print(f"  Elapsed:     {summary['elapsed_s']}s  ({summary['throughput_rps']} rps)")
+    print(f"  Successes:   {summary['successes']}  errors={summary['errors']} ({summary['error_rate']:.1%})")
+    print()
+    print(f"  Latency (client-side):  p50={latency['p50']}ms  p95={latency['p95']}ms  "
+          f"p99={latency['p99']}ms  max={latency['max']}ms")
+    if r["prometheus_invoke_p95_ms"] is not None:
+        print(f"  Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
+    print()
+    print(f"  Queue drops:   {deltas['queue_drops']}  (rate {deltas['drop_rate']:.3%})")
+    print(f"  Rate-limited:  {deltas['rate_limited']}")
+    print(f"  Failovers:     {deltas['failovers']}")
+    print(f"  Sticky sets:   {deltas['sticky_sets']}")
+    if wal.get("before_mb") is not None:
+        delta_mb = wal.get("delta_mb")
+        delta_txt = "" if delta_mb is None else f"Δ{delta_mb:+.2f}MB"
+        warn = " ⚠️" if (delta_mb or 0) > wal.get("threshold_mb", 10) else ""
+        print(f"  WAL:           {wal['before_mb']}MB → {wal['after_mb']}MB {delta_txt}{warn}")
+    print()
+    for failure in (r["failures"] or ()):
+        print(f"  ❌ {failure}")
+    if not r["failures"]:
+        print("  All pass criteria met.")
+    print(rule)
+
+
+def main() -> int:
+    """CLI entry point: run the soak, print (optionally save) the report, return exit code."""
+    ap = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
+    add = ap.add_argument
+    add("--url",          default="http://localhost:9400",
+        help="Bridge base URL (default: http://localhost:9400)")
+    add("--messages",     type=int, default=100,
+        help="Total messages to send (default: 100)")
+    add("--concurrency",  type=int, default=4,
+        help="Concurrent requests (default: 4)")
+    add("--agent-id",     default="sofiia",
+        help="Agent id for synthetic events (default: sofiia)")
+    add("--room-id",      default="!soak-room:home.invalid",
+        help="Room id for synthetic events")
+    add("--sender",       default="@soak-user:home.invalid",
+        help="Sender for synthetic events")
+    add("--max-p95-ms",   type=float, default=_DEFAULT_MAX_P95_MS,
+        help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
+    add("--max-drop-rate",type=float, default=_DEFAULT_MAX_DROP_RATE,
+        help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
+    add("--report-file",  default="",
+        help="Optional path to write JSON report")
+    add("--db-path",      default="",
+        help="Path to policy_store.db for WAL check "
+             "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
+    opts = ap.parse_args()
+
+    report = asyncio.run(run_soak(
+        url=opts.url, n_messages=opts.messages, concurrency=opts.concurrency,
+        agent_id=opts.agent_id, room_id=opts.room_id, sender=opts.sender,
+        max_p95_ms=opts.max_p95_ms, max_drop_rate=opts.max_drop_rate,
+        db_path=opts.db_path,
+    ))
+    _print_report(report)
+
+    # The JSON copy is optional; an empty --report-file skips it.
+    if opts.report_file:
+        with open(opts.report_file, "w", encoding="utf-8") as fh:
+            json.dump(report, fh, indent=2)
+        print(f"\n  Report saved: {opts.report_file}")
+
+    # Exit status mirrors the verdict: 0 = passed, 1 = failed.
+    exit_code = 0 if report["passed"] else 1
+    return exit_code
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/services/matrix-bridge-dagi/app/config.py b/services/matrix-bridge-dagi/app/config.py
index c418a72f..699fe783 100644
--- a/services/matrix-bridge-dagi/app/config.py
+++ b/services/matrix-bridge-dagi/app/config.py
@@ -1,5 +1,5 @@
 """
-matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0)
+matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0 + M3.1)
 """
 import os
 from dataclasses import dataclass, field
@@ -54,6 +54,54 @@ class BridgeConfig:
     # "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt)
     control_unauthorized_behavior: str
 
+    # M3.1: Runbook runner — sofiia-console control token
+    sofiia_control_token: str          # X-Control-Token for /api/runbooks/internal/runs
+
+    # M3.4: Control channel safety — rate limiting + cooldown
+    control_room_rpm: int          # Max commands per room per minute (0 = unlimited)
+    control_operator_rpm: int      # Max commands per operator per minute
+    control_run_next_rpm: int      # Max !runbook next calls per run_id per minute
+    control_cooldown_s: float      # Anti-double-click debounce per (operator, verb, subcmd)
+
+    # M2.3: Persistent event deduplication
+    persistent_dedupe: bool        # Enable SQLite-backed dedupe across restarts
+    bridge_data_dir: str           # Directory for SQLite DB and other bridge data
+    processed_events_ttl_h: int    # TTL for processed events (hours)
+    processed_events_prune_batch: int       # Max rows to prune per prune run
+    processed_events_prune_interval_s: int  # Prune interval in seconds (0 = disable periodic)
+
+    # M4.0: agent discovery
+    discovery_rpm: int             # Max !agents replies per room per minute (0 = unlimited)
+
+    # M5.0: node-aware routing
+    bridge_allowed_nodes: str      # Comma-separated: "NODA1,NODA2"
+    bridge_default_node: str       # Default node when none specified
+    bridge_room_node_map: str      # Optional: "!roomA:server=NODA2;!roomB:server=NODA1"
+
+    # M8.0: node health + soft-failover thresholds
+    node_fail_consecutive: int     # Consecutive failures before node marked "down"
+    node_lat_ewma_s: float         # EWMA latency threshold (seconds) → "degraded"
+    node_ewma_alpha: float         # EWMA smoothing factor (0..1)
+
+    # M8.1: sticky failover cache
+    failover_sticky_ttl_s: float   # Seconds to hold sticky node preference after failover (0 = disabled)
+
+    # M8.2: HA state persistence
+    ha_health_snapshot_interval_s: int  # Seconds between node health writes to DB (0 = disabled)
+    ha_health_max_age_s: int            # Max age of health snapshot to load on startup (seconds)
+
+    # M9.0: Two-step confirmation TTL
+    confirm_ttl_s: float  # Seconds a pending !confirm nonce is valid (0 = disabled)
+
+    # M10.0: Policy export retention
+    policy_export_retention_days: int  # Days to keep policy exports (0 = keep forever)
+
+    # M10.2: Policy change history
+    policy_history_limit: int  # Max rows in policy_changes table (0 = unlimited)
+
+    # M11 soak: synthetic event injection (NEVER enable in production)
+    debug_inject_enabled: bool  # POST /v1/debug/inject_event (default: False)
+
     # Service identity
     node_id: str
     build_sha: str
@@ -99,6 +147,35 @@ def load_config() -> BridgeConfig:
         bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""),
         bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""),
         control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"),
+        sofiia_control_token=_optional("SOFIIA_CONTROL_TOKEN", ""),
+        control_room_rpm=max(0, int(_optional("CONTROL_ROOM_RPM", "60"))),
+        control_operator_rpm=max(0, int(_optional("CONTROL_OPERATOR_RPM", "30"))),
+        control_run_next_rpm=max(0, int(_optional("CONTROL_RUN_NEXT_RPM", "20"))),
+        control_cooldown_s=max(0.0, float(_optional("CONTROL_COOLDOWN_S", "2.0"))),
+        persistent_dedupe=_optional("PERSISTENT_DEDUPE", "1").strip() not in ("0", "false", ""),
+        bridge_data_dir=_optional("BRIDGE_DATA_DIR", "/app/data"),
+        processed_events_ttl_h=max(1, int(_optional("PROCESSED_EVENTS_TTL_H", "48"))),
+        processed_events_prune_batch=max(1, int(_optional("PROCESSED_EVENTS_PRUNE_BATCH", "5000"))),
+        processed_events_prune_interval_s=max(0, int(_optional("PROCESSED_EVENTS_PRUNE_INTERVAL_S", "3600"))),
+        discovery_rpm=max(0, int(_optional("DISCOVERY_RPM", "20"))),
+        bridge_allowed_nodes=_optional("BRIDGE_ALLOWED_NODES", "NODA1"),
+        bridge_default_node=_optional("BRIDGE_DEFAULT_NODE", "NODA1"),
+        bridge_room_node_map=_optional("BRIDGE_ROOM_NODE_MAP", ""),
+        # M8.0: node health thresholds
+        node_fail_consecutive=max(1, int(_optional("NODE_FAIL_CONSEC", "3"))),
+        node_lat_ewma_s=max(0.5, float(_optional("NODE_LAT_EWMA_S", "12.0"))),
+        node_ewma_alpha=min(1.0, max(0.01, float(_optional("NODE_EWMA_ALPHA", "0.3")))),
+        # M8.1: sticky failover TTL (0 = disabled)
+        failover_sticky_ttl_s=max(0.0, float(_optional("FAILOVER_STICKY_TTL_S", "300.0"))),
+        # M8.2: HA state persistence
+        ha_health_snapshot_interval_s=max(0, int(_optional("HA_HEALTH_SNAPSHOT_INTERVAL_S", "60"))),
+        ha_health_max_age_s=max(0, int(_optional("HA_HEALTH_MAX_AGE_S", "600"))),
+        # M9.0: Two-step confirmation TTL (0 = disabled)
+        confirm_ttl_s=max(0.0, float(_optional("CONFIRM_TTL_S", "120.0"))),
+        policy_export_retention_days=max(0, int(_optional("POLICY_EXPORT_RETENTION_DAYS", "30"))),
+        policy_history_limit=max(0, int(_optional("POLICY_HISTORY_LIMIT", "100"))),
+        debug_inject_enabled=_optional("DEBUG_INJECT_ENABLED", "false").lower()
+            in ("1", "true", "yes"),
         node_id=_optional("NODE_ID", "NODA1"),
         build_sha=_optional("BUILD_SHA", "dev"),
         build_time=_optional("BUILD_TIME", "local"),
diff --git a/services/matrix-bridge-dagi/app/confirm_store.py b/services/matrix-bridge-dagi/app/confirm_store.py
new file mode 100644
index 00000000..217e39d7
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/confirm_store.py
@@ -0,0 +1,167 @@
+"""
+confirm_store — M9.0: Two-step confirmation for dangerous control commands.
+
+Flow:
+  1. Operator issues a dangerous command (e.g. !node set, !policy import mode=replace).
+  2. Bridge calls ConfirmStore.add(..., callback=<coroutine>) → returns a nonce.
+  3. Bridge replies: "Type !confirm <nonce> within Ns to apply."
+  4. Operator sends !confirm <nonce>.
+  5. Bridge calls ConfirmStore.pop(nonce, sender_hash) → returns PendingConfirmation.
+  6. Bridge executes callback() → (reply_text, diff_summary).
+  7. Audit trail: matrix.control.intent / matrix.control.confirmed / matrix.control.applied.
+
+Safety:
+  - One pending entry per sender (new request replaces old).
+  - Nonce is sender-bound: wrong sender_hash → pop returns None.
+  - TTL enforced via monotonic time; expired entries not returned.
+  - Nonce: 6 uppercase alphanumeric (NONCE_LEN chars from NONCE_CHARS).
+"""
+from __future__ import annotations
+
+import secrets
+import string
+import threading
+import time
+from dataclasses import dataclass, field
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
+
+NONCE_LEN = 6
+NONCE_CHARS = string.ascii_uppercase + string.digits
+
+_DEFAULT_TTL_S = 120.0
+
+
+def make_nonce() -> str:
+    """Return NONCE_LEN characters, each drawn from NONCE_CHARS with `secrets.choice`."""
+    return "".join(map(secrets.choice, [NONCE_CHARS] * NONCE_LEN))
+
+
+@dataclass
+class PendingConfirmation:
+    """A pending two-step confirmation waiting for !confirm <nonce>."""
+    nonce: str               # code the operator echoes back; key in ConfirmStore._by_nonce
+    sender_hash: str         # owner of the entry; ConfirmStore.pop() requires an exact match
+    verb: str                # e.g. "node.set", "room.agents set", "policy.import"
+    normalized_args: str     # human-readable args for audit
+    action_summary: str      # "!node set room=!x:s node=NODA2"
+    room_id: str             # Matrix room_id where the intent was issued
+    callback: Callable[[], Awaitable[Tuple[str, str]]]  # async () → (reply_text, diff_summary)
+    expires_at: float        # time.monotonic() deadline
+
+
+class ConfirmStore:
+    """
+    In-memory, thread-safe store for pending two-step confirmation entries.
+
+    One pending entry per sender at a time.  If the same sender issues a new
+    dangerous command before confirming the previous one, the old entry is
+    replaced (new nonce issued).
+    """
+
+    def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
+        self.ttl_s = ttl_s
+        self._lock = threading.RLock()
+        self._by_nonce: Dict[str, PendingConfirmation] = {}
+        self._by_sender: Dict[str, str] = {}  # sender_hash → nonce
+
+    # ── Public API ────────────────────────────────────────────────────────────
+
+    def add(
+        self,
+        sender_hash: str,
+        verb: str,
+        normalized_args: str,
+        action_summary: str,
+        room_id: str,
+        callback: Callable[[], Awaitable[Tuple[str, str]]],
+    ) -> str:
+        """
+        Create a pending confirmation entry.  Returns the nonce string.
+
+        Any earlier entry of this sender is dropped first (its nonce stops
+        working).  The nonce is re-drawn until no stored entry already uses it.
+        """
+        with self._lock:
+            # Evict any previous pending entry for this sender
+            old_nonce = self._by_sender.pop(sender_hash, None)
+            if old_nonce is not None:
+                self._by_nonce.pop(old_nonce, None)
+            nonce = make_nonce()
+            while nonce in self._by_nonce:  # a collision would clobber another sender's entry
+                nonce = make_nonce()
+            self._by_nonce[nonce] = PendingConfirmation(
+                nonce=nonce,
+                sender_hash=sender_hash,
+                verb=verb,
+                normalized_args=normalized_args,
+                action_summary=action_summary,
+                room_id=room_id,
+                callback=callback,
+                expires_at=time.monotonic() + self.ttl_s,
+            )
+            self._by_sender[sender_hash] = nonce
+        return nonce
+
+    def pop(self, nonce: str, sender_hash: str) -> Optional[PendingConfirmation]:
+        """
+        Retrieve and atomically remove a pending confirmation.
+
+        Returns None if:
+          - nonce does not exist,
+          - sender_hash does not match the entry owner,
+          - or the entry has expired.
+        """
+        nonce = nonce.upper()
+        with self._lock:
+            entry = self._by_nonce.get(nonce)
+            if entry is None:
+                return None
+            if entry.sender_hash != sender_hash:
+                # Wrong sender — deny without disclosing any detail
+                return None
+            if time.monotonic() > entry.expires_at:
+                # Expired — clean up and deny
+                self._by_nonce.pop(nonce, None)
+                self._by_sender.pop(entry.sender_hash, None)
+                return None
+            # Valid confirmation — consume the entry
+            self._by_nonce.pop(nonce)
+            self._by_sender.pop(sender_hash, None)
+            return entry
+
+    def pending_nonce(self, sender_hash: str) -> Optional[str]:
+        """
+        Return the current pending nonce for a sender (non-destructive peek).
+        Returns None if no entry or the entry has expired.
+        """
+        with self._lock:
+            nonce = self._by_sender.get(sender_hash)
+            if nonce is None:
+                return None
+            entry = self._by_nonce.get(nonce)
+            if entry is None or time.monotonic() > entry.expires_at:
+                # Lazy eviction
+                self._by_nonce.pop(nonce, None)
+                self._by_sender.pop(sender_hash, None)
+                return None
+            return nonce
+
+    def pending_count(self) -> int:
+        """Number of non-expired pending entries (for /health, metrics)."""
+        now = time.monotonic()
+        with self._lock:
+            return sum(1 for e in self._by_nonce.values() if now <= e.expires_at)
+
+    def cleanup(self) -> int:
+        """Eagerly remove all expired entries.  Returns count removed."""
+        now = time.monotonic()
+        removed = 0
+        with self._lock:
+            expired_nonces = [
+                n for n, e in self._by_nonce.items() if now > e.expires_at
+            ]
+            for n in expired_nonces:
+                entry = self._by_nonce.pop(n)
+                self._by_sender.pop(entry.sender_hash, None)
+                removed += 1
+        return removed
diff --git a/services/matrix-bridge-dagi/app/control.py b/services/matrix-bridge-dagi/app/control.py
index 3495843b..d2f8a169 100644
--- a/services/matrix-bridge-dagi/app/control.py
+++ b/services/matrix-bridge-dagi/app/control.py
@@ -23,18 +23,124 @@ Audit events emitted:
 import logging
 import re
 from dataclasses import dataclass, field
-from typing import Dict, FrozenSet, List, Optional, Tuple
+from typing import Any, Dict, FrozenSet, List, Optional, Tuple
 
 logger = logging.getLogger(__name__)
 
 # ── Constants ─────────────────────────────────────────────────────────────────
 
-# Supported control verbs (M3.1+ will implement them fully)
+# Supported control verbs
 VERB_RUNBOOK = "runbook"
 VERB_STATUS = "status"
+VERB_NODES = "nodes"   # M5.1: node policy overview
+VERB_NODE  = "node"    # M6.0: dynamic room-node override commands
+VERB_ROOM   = "room"    # M6.1: dynamic mixed room agent overrides
+VERB_POLICY  = "policy"   # M6.2: policy snapshot export/import
+VERB_CONFIRM = "confirm"  # M9.0: two-step confirmation for dangerous commands
 VERB_HELP = "help"
 
-KNOWN_VERBS: FrozenSet[str] = frozenset({VERB_RUNBOOK, VERB_STATUS, VERB_HELP})
+KNOWN_VERBS: FrozenSet[str] = frozenset({
+    VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE,
+    VERB_ROOM, VERB_POLICY, VERB_CONFIRM, VERB_HELP,
+})
+
+# ── M9.0: Dangerous command detection ─────────────────────────────────────────
+
+def is_dangerous_cmd(cmd: "ControlCommand") -> bool:
+    """
+    Return True if the command requires two-step confirmation before applying.
+
+    Dangerous verbs:
+      !node set room=... node=...          — changes room routing
+      !room agents set room=... agents=... — replaces all agents for a room
+      !policy import ...                   — overwrites policy DB (both modes)
+      !policy prune_exports dry_run=0      — M10.0: only when not a dry run (actual deletion)
+      !policy restore ...                  — M10.1: always (no dry_run option)
+    """
+    v = cmd.verb
+    sub = (cmd.subcommand or "").strip().lower()
+    if v == VERB_NODE and sub == "set":
+        return True
+    if v == VERB_ROOM and sub == "agents" and cmd.args and cmd.args[0].lower() == "set":
+        return True
+    if v == VERB_POLICY and sub == "import":
+        return True
+    if v == VERB_POLICY and sub == "prune_exports":
+        # Lower-cased like `sub` above so "dry_run=FALSE" cannot skip confirmation.
+        dry_raw = cmd.kwargs.get("dry_run", "1").strip().lower()
+        return dry_raw in ("0", "false", "no")
+    if v == VERB_POLICY and sub == "restore":
+        return True
+    return False
+
+
+def build_normalized_args(cmd: "ControlCommand") -> str:
+    """
+    Render the command's arguments as one space-separated string.
+    Used in audit events and confirmation prompts.
+
+    Positionals keep their order; keyword args follow as ``key=value`` sorted
+    by key.  For ``!room agents ...`` the first positional (the action word,
+    e.g. "set") is left out.
+    """
+    drop_first = cmd.verb == VERB_ROOM and cmd.subcommand == "agents"
+    positional = [
+        arg for pos, arg in enumerate(cmd.args) if not (drop_first and pos == 0)
+    ]
+    keyword = [f"{key}={val}" for key, val in sorted(cmd.kwargs.items())]
+    return " ".join(positional + keyword)
+
+
+def confirm_intent_reply(action_summary: str, nonce: str, ttl_s: int) -> str:
+    """Build the M9.0 prompt asking the sender to `!confirm <nonce>` within ttl_s seconds."""
+    return "\n".join((
+        "⚠️ **Confirm required**",
+        f"Action: `{action_summary}`",
+        f"Type `!confirm {nonce}` within {ttl_s}s to apply.",
+        "_(Only you can confirm this action.)_",
+    ))
+
+
+def confirm_success_reply(action_result: str) -> str:
+    """Prefix the applied action's result text with the M9.0 confirmation banner."""
+    return "✅ Confirmed and applied.\n{}".format(action_result)
+
+
+def confirm_expired_reply() -> str:
+    """Return the M9.0 refusal for an invalid, expired, or wrong-sender nonce."""
+    return "\n".join((
+        "❌ Invalid or expired confirmation code.  "
+        "The action was **not** applied.",
+        "Re-issue the original command to get a new code.",
+    ))
+
+# M6.1: !room subcommand + actions
+ROOM_SUBCMD_AGENTS  = "agents"
+ROOM_ACTION_SET     = "set"
+ROOM_ACTION_ADD     = "add"
+ROOM_ACTION_REMOVE  = "remove"
+ROOM_ACTION_GET     = "get"
+ROOM_ACTION_LIST    = "list"
+ROOM_ACTION_UNSET   = "unset"   # remove full override
+_VALID_ROOM_ACTIONS = frozenset({
+    ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
+    ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET,
+})
+
+# M6.0: !node subcommands
+NODE_SUBCMD_SET   = "set"
+NODE_SUBCMD_UNSET = "unset"
+NODE_SUBCMD_GET   = "get"
+NODE_SUBCMD_LIST  = "list"
+_VALID_NODE_SUBCMDS = frozenset({NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST})
+
+# Runbook subcommands (M3.x)
+SUBCOMMAND_START = "start"           # M3.1 — implemented
+SUBCOMMAND_NEXT = "next"             # M3.2 — implemented
+SUBCOMMAND_COMPLETE = "complete"     # M3.2 — implemented
+SUBCOMMAND_EVIDENCE = "evidence"     # M3.3 — implemented
+SUBCOMMAND_STATUS = "status"         # M3.3 — implemented
+SUBCOMMAND_POST_REVIEW = "post_review"  # M3.3 — implemented
 
 # Max command line length to guard against garbage injection
 _MAX_CMD_LEN = 512
@@ -225,10 +331,814 @@ def check_authorization(
 # ── Reply helpers ─────────────────────────────────────────────────────────────
 
 def not_implemented_reply(cmd: ControlCommand) -> str:
-    """Reply for known commands not yet implemented (M3.0 stub)."""
+    """Reply for known commands not yet implemented."""
     return (
         f"✅ Command acknowledged: `{cmd.raw}`\n"
-        f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending (M3.1+)."
+        f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending."
+    )
+
+
+def next_usage_reply() -> str:
+    """Return the usage hint for `!runbook next` issued without a run_id."""
+    return "\n".join((
+        "⚠️ Usage: `!runbook next <run_id>`",
+        "Example: `!runbook next abc-123`",
+    ))
+
+
+def complete_usage_reply() -> str:
+    """Return the usage hint for `!runbook complete` with missing required args."""
+    return "\n".join((
+        "⚠️ Usage: `!runbook complete <run_id> step=<n> status=ok|warn|fail [notes=...]`",
+        "Example: `!runbook complete abc-123 step=3 status=ok notes=done`",
+        "Notes with spaces: join without quotes — `notes=done_and_verified`.",
+    ))
+
+
+def start_usage_reply() -> str:
+    """Return the usage hint for `!runbook start` with a missing/invalid runbook_path."""
+    return "\n".join((
+        "⚠️ Usage: `!runbook start <runbook_path> [node=NODA1]`",
+        "Example: `!runbook start runbooks/rehearsal-v1-checklist.md node=NODA1`",
+        "runbook_path must be a relative path without `..`.",
+    ))
+
+
+def runbook_started_reply(run_id: str, steps_total: int, status: str) -> str:
+    """Announce the run created by sofiia-console and point at `!runbook next`."""
+    return "\n".join((
+        f"✅ runbook started: `run_id={run_id}` steps={steps_total} status={status}",
+        f"Next: `!runbook next {run_id}`",
+    ))
+
+
+def runbook_start_error_reply(reason: str) -> str:
+    """Format the failure text for `!runbook start`; `reason` is appended verbatim."""
+    return "❌ failed to start runbook: {}".format(reason)
+
+
+# ── M3.2 reply helpers ────────────────────────────────────────────────────────
+
+# Max chars of instructions_md to include in Matrix message before truncating
+_INSTRUCTIONS_EXCERPT_MAX = 1500
+
+
+def next_manual_reply(
+    run_id: str,
+    step_index: int,
+    steps_total: Optional[int],
+    title: str,
+    instructions_md: str,
+) -> str:
+    """
+    Render a manual step handed back by `!runbook next`.
+
+    The header shows the 1-based step number; the trailing `complete` hint
+    echoes `step_index` unchanged.  Instructions longer than
+    _INSTRUCTIONS_EXCERPT_MAX are cut at the limit, then at the last newline (if any).
+    """
+    position = f"{step_index + 1}/{steps_total}" if steps_total else f"{step_index + 1}"
+    body = instructions_md.strip()
+    was_cut = len(body) > _INSTRUCTIONS_EXCERPT_MAX
+    if was_cut:
+        # rsplit drops the partial last line that the hard cut may leave behind
+        body = body[:_INSTRUCTIONS_EXCERPT_MAX].rsplit("\n", 1)[0]
+
+    cut_note = ["_...(truncated — open in console for full instructions)_"] if was_cut else []
+    return "\n".join([
+        f"🧭 Step {position}: **{title}**",
+        "",
+        body,
+        *cut_note,
+        "",
+        f"Complete: `!runbook complete {run_id} step={step_index} status=ok`",
+    ])
+
+
+def next_auto_reply(
+    run_id: str,
+    step_index: int,
+    action_type: str,
+    step_status: str,
+    duration_ms: Optional[int],
+    completed: bool,
+) -> str:
+    """Report an auto step (http_check/script) that `!runbook next` just ran."""
+    icon = {"ok": "✅", "warn": "⚠️", "fail": "❌"}.get(step_status, "ℹ️")
+    lines = [f"{icon} step {step_index + 1} ({action_type}) {step_status}"]
+    if duration_ms is not None:
+        lines[0] += f"  duration={duration_ms}ms"
+    # A finished run points at evidence; otherwise point at the next step.
+    if completed:
+        lines.append("🎉 All steps completed!")
+        lines.append(f"Get evidence: `!runbook evidence {run_id}`")
+    else:
+        lines.append(f"Next: `!runbook next {run_id}`")
+    return "\n".join(lines)
+
+
+def next_error_reply(run_id: str, reason: str) -> str:
+    """Format the failure text for `!runbook next`; `run_id` is accepted but not shown."""
+    return "❌ failed to advance runbook: {}".format(reason)
+
+
+def complete_ok_reply(run_id: str, step_index: int, status: str, run_completed: bool) -> str:
+    """Acknowledge a recorded step (shown 1-based) and suggest the follow-up command."""
+    icon = {"ok": "✅", "warn": "⚠️", "fail": "❌", "skipped": "⏭️"}.get(status, "✅")
+    evidence_hint = f"🎉 All steps completed!\nGet evidence: `!runbook evidence {run_id}`"
+    next_hint = f"Next: `!runbook next {run_id}`"
+    follow_up = evidence_hint if run_completed else next_hint
+    return f"{icon} recorded step {step_index + 1}: {status}\n{follow_up}"
+
+
+def complete_error_reply(run_id: str, reason: str) -> str:
+    """Format the failure text for `!runbook complete`; `run_id` is accepted but not shown."""
+    return "❌ failed to complete step: {}".format(reason)
+
+
+# ── M3.3 reply helpers ────────────────────────────────────────────────────────
+
+def status_usage_reply() -> str:
+    """Return the usage hint for `!runbook status` (expects a run_id)."""
+    usage = "⚠️ Usage: `!runbook status <run_id>`"
+    example = "Example: `!runbook status abc-123`"
+    return usage + "\n" + example
+
+
+def evidence_usage_reply() -> str:
+    """Return the usage hint for `!runbook evidence` (expects a run_id)."""
+    usage = "⚠️ Usage: `!runbook evidence <run_id>`"
+    example = "Example: `!runbook evidence abc-123`"
+    return usage + "\n" + example
+
+
+def post_review_usage_reply() -> str:
+    """Return the usage hint for `!runbook post_review` (expects a run_id)."""
+    usage = "⚠️ Usage: `!runbook post_review <run_id>`"
+    example = "Example: `!runbook post_review abc-123`"
+    return usage + "\n" + example
+
+
+def status_reply(run: dict) -> str:
+    """Format the `!runbook status` reply from a get_run response dict."""
+    run_id = run.get("run_id", "?")
+    status = run.get("status", "?")
+    # `or` (not a .get default) so an explicit null in the payload also falls
+    # back — a null "steps" would otherwise break len() and the loops below.
+    current = run.get("current_step") or 0
+    steps = run.get("steps") or []
+    steps_total = run.get("steps_total") or len(steps)
+    runbook_path = run.get("runbook_path", "?")
+    node_id = run.get("node_id", "?")
+    evidence_path = run.get("evidence_path")
+
+    warn_count = sum(1 for s in steps if s.get("status") == "warn")
+    fail_count = sum(1 for s in steps if s.get("status") == "fail")
+    status_emoji = {
+        "running": "🔄", "completed": "✅", "aborted": "🛑", "paused": "⏸️",
+    }.get(status, "ℹ️")
+
+    step_label = f"{current}/{steps_total}" if steps_total else str(current)
+    lines = [
+        f"{status_emoji} `run_id={run_id}` status={status} step={step_label}",
+        f"runbook: `{runbook_path}`  node: {node_id}",
+    ]
+    if warn_count or fail_count:
+        lines.append(f"warn={warn_count}  fail={fail_count}")
+    if evidence_path:
+        lines.append(f"evidence: `{evidence_path}`")
+
+    if status == "completed" and not evidence_path:
+        lines.append(f"Get evidence: `!runbook evidence {run_id}`")
+    elif status == "completed" and evidence_path:
+        lines.append(f"Post-review: `!runbook post_review {run_id}`")
+
+    return "\n".join(lines)
+
+
+def status_error_reply(run_id: str, reason: str) -> str:
+    return "❌ failed to get status: {}".format(reason)  # run_id is accepted but not shown
+
+
+def evidence_reply(result: dict) -> str:
+    """Summarise a generated evidence file; timestamp and next-step lines are optional."""
+    evidence_path = result.get("evidence_path", "?")
+    byte_count = result.get("bytes", 0)
+    created_at = result.get("created_at", "")
+    run_id = result.get("run_id", "")
+    out = f"📄 evidence created: `{evidence_path}` (bytes={byte_count})"
+    if created_at:
+        out += f"\ncreated_at: {created_at}"
+    if run_id:
+        out += f"\nNext: `!runbook post_review {run_id}`"
+    return out
+
+
+def evidence_error_reply(run_id: str, reason: str) -> str:
+    return "❌ failed to generate evidence: {}".format(reason)  # run_id is accepted but not shown
+
+
+def post_review_reply(result: dict) -> str:
+    """Summarise a generated post-review file, adding its timestamp when present."""
+    review_path = result.get("path", "?")
+    byte_count = result.get("bytes", 0)
+    created_at = result.get("created_at", "")
+    headline = f"🧾 post-review created: `{review_path}` (bytes={byte_count})"
+    if not created_at:
+        return headline
+    return f"{headline}\ncreated_at: {created_at}"
+
+
+def post_review_error_reply(run_id: str, reason: str) -> str:
+    return "❌ failed to generate post-review: {}".format(reason)  # run_id is accepted but not shown
+
+
+# ── M3.4 safety helpers ───────────────────────────────────────────────────────
+
+#: Maximum length of notes/free-text operator input accepted before truncation.
+MAX_NOTES_LEN: int = 500
+
+#: Control characters (U+0000–U+001F minus tab, newline and carriage return) that must be stripped.
+_CTRL_CHARS = "".join(chr(i) for i in range(32) if i not in (9, 10, 13))
+
+
+def sanitize_notes(notes: str) -> str:
+    """
+    Remove control characters from *notes* and cap its length.
+
+    Falsy input yields ""; text longer than MAX_NOTES_LEN is cut there and gets "…" appended.
+    """
+    if not notes:
+        return ""
+    kept = "".join(ch for ch in notes if ch not in _CTRL_CHARS)
+    if len(kept) <= MAX_NOTES_LEN:
+        return kept
+    return kept[:MAX_NOTES_LEN] + "…"
+
+
+def rate_limited_reply(scope: str, retry_after_s: float) -> str:
+    """Tell the operator a command was throttled (rate limit or cooldown) and when to retry."""
+    wait_hint = "{:.0f}s".format(retry_after_s) if retry_after_s >= 1 else "a moment"
+    return "⏳ rate limited ({}), retry after {}".format(scope, wait_hint)
+
+
+def status_not_available_reply() -> str:  # fixed warning text; takes no inputs
+    return "⚠️ Bridge status not available (service initialising or config missing)."
+
+
+# M5.1: !nodes reply
+_MAX_ROOM_OVERRIDES_SHOWN = 10
+
+
+def nodes_reply(
+    policy_info: dict,
+    node_stats: Optional[dict] = None,
+    sticky_info: Optional[dict] = None,
+) -> str:
+    """
+    Compact Markdown reply for `!nodes`: policy, room overrides, per-node stats, sticky cache.
+
+    policy_info:  from NodePolicy.as_info_dict(); reads default_node, allowed_nodes, room_overrides
+    node_stats:   optional dict {node_id: {"routed": N, "rejected": M, "health": ..., ...}}
+    sticky_info:  optional dict from StickyNodeCache (M8.1); reads active_keys, ttl_s, entries, truncated
+    """
+    default = policy_info.get("default_node", "?")
+    allowed = sorted(policy_info.get("allowed_nodes") or [])
+    overrides = policy_info.get("room_overrides", {}) or {}  # any falsy value (None, 0, {}) becomes {}
+
+    allowed_str = ", ".join(f"`{n}`" for n in allowed)
+    lines = [
+        "🌐 **Node policy**",
+        f"Default: `{default}`  Allowed: {allowed_str}",
+    ]
+
+    if isinstance(overrides, dict) and overrides:
+        lines.append(f"\n**Room overrides** ({len(overrides)}):")
+        items = list(overrides.items())[:_MAX_ROOM_OVERRIDES_SHOWN]  # cap the listing
+        for room_id, node in items:
+            lines.append(f"  `{room_id}` → `{node}`")
+        if len(overrides) > _MAX_ROOM_OVERRIDES_SHOWN:
+            lines.append(f"  _(+{len(overrides) - _MAX_ROOM_OVERRIDES_SHOWN} more)_")
+    elif isinstance(overrides, int):
+        # room_overrides supplied as an int count rather than a dict (0 already became {} above)
+        if overrides:
+            lines.append(f"\nRoom overrides: {overrides}")
+        else:
+            lines.append("\nNo room overrides configured.")
+    else:
+        lines.append("\nNo room overrides configured.")
+
+    if node_stats:
+        lines.append("\n**Per-node stats** (since last restart):")
+        for node_id in sorted(node_stats):
+            ns = node_stats[node_id]
+            routed = ns.get("routed", 0)
+            rejected = ns.get("rejected", 0)
+            health = ns.get("health", "")
+            ewma = ns.get("ewma_latency_s")
+            consec = ns.get("consecutive_failures", 0)
+            stat_parts = [f"routed={routed}", f"rejected={rejected}"]  # always shown
+            if health:
+                stat_parts.append(f"health={health}")
+            if ewma is not None:
+                stat_parts.append(f"ewma={ewma:.2f}s")
+            if consec:
+                stat_parts.append(f"consec_fail={consec}")
+            lines.append(f"  `{node_id}`: " + "  ".join(stat_parts))
+
+    # M8.1: sticky cache section (skipped entirely when sticky_info is None)
+    if sticky_info is not None:
+        active = sticky_info.get("active_keys", 0)
+        ttl = sticky_info.get("ttl_s", 0)
+        if active:
+            lines.append(f"\n**Sticky routing** (anti-flap): {active} active  ttl={ttl:.0f}s")
+            for entry in sticky_info.get("entries", []):  # each entry must carry "key" and "node"
+                rem = entry.get("remaining_s", 0)
+                lines.append(
+                    f"  `{entry['key']}` → `{entry['node']}` ({rem:.0f}s left)"
+                )
+            if sticky_info.get("truncated"):
+                lines.append(f"  _(+{sticky_info['truncated']} more)_")
+        else:
+            lines.append(f"\nSticky routing: none active  ttl={ttl:.0f}s")
+
+    return "\n".join(lines)
+
+
+# ── M6.0: !node subcommand parser + reply helpers ──────────────────────────────
+
+import re as _re
+
+_ROOM_KWARG_RE    = _re.compile(r"\broom=(\S+)", _re.IGNORECASE)
+_NODE_VAL_RE      = _re.compile(r"\bnode=(\w+)", _re.IGNORECASE)
+_ROOM_ID_RE       = _re.compile(r"^![a-zA-Z0-9._\-]+:[a-zA-Z0-9._\-]+$")
+
+
+def parse_node_cmd(args_text: str) -> Tuple[str, Optional[str], Optional[str]]:
+    """
+    Split the text after `!node` into a subcommand plus optional room=/node= values.
+
+    Returns (subcmd, room_id, node_id): subcmd is lower-cased ("" for blank input),
+    node_id is upper-cased, and a keyword that is not present yields None in its slot.
+    """
+    tokens = args_text.strip().split(None, 1)
+    if not tokens:
+        return ("", None, None)
+    tail = tokens[1] if len(tokens) == 2 else ""
+
+    room_match = _ROOM_KWARG_RE.search(tail)
+    room_id = None if room_match is None else room_match.group(1)
+
+    node_match = _NODE_VAL_RE.search(tail)
+    node_id = None if node_match is None else node_match.group(1).upper()
+
+    return (tokens[0].lower(), room_id, node_id)
+
+
+def node_cmd_validate_room(room_id: str) -> bool:
+    """Return True if room_id is exactly `!localpart:server`; fullmatch rejects a trailing newline."""
+    return bool(_ROOM_ID_RE.fullmatch(room_id)) if room_id else False
+
+
+def node_cmd_reply_set(room_id: str, node_id: str) -> str:  # confirmation for `!node set`
+    return "✅ Override set: `{}` → `{}`".format(room_id, node_id)
+
+
+def node_cmd_reply_unset_ok(room_id: str) -> str:  # confirmation for `!node unset`
+    return "✅ Override removed for `{}`".format(room_id)
+
+
+def node_cmd_reply_unset_not_found(room_id: str) -> str:  # `!node unset` with nothing to remove
+    return "ℹ️ No override was set for `{}`".format(room_id)
+
+
+def node_cmd_reply_get(
+    room_id: str,
+    node_id: Optional[str],
+    env_node: Optional[str],
+    default_node: str,
+) -> str:
+    # Resolution order: dynamic override, then env map, then default.
+    winner = node_id or env_node or default_node
+    override_line = (
+        f"Dynamic override: `{node_id}` _(set by operator)_" if node_id else "Dynamic override: _none_"
+    )
+    report = [f"📌 **Node info for** `{room_id}`", override_line]
+    if env_node:
+        report.append(f"Env map: `{env_node}`")
+    report.append(f"Default: `{default_node}`")
+    report.append(f"\nEffective node: **`{winner}`**")
+    return "\n".join(report)
+
+
+def node_cmd_reply_list(
+    overrides: List[Tuple[str, str, int]],
+    total: int,
+) -> str:
+    """List (room_id, node_id, updated_at epoch seconds) rows; `total` may exceed the rows shown."""
+    from datetime import datetime, timezone  # aware UTC; utcfromtimestamp() is deprecated (3.12+)
+    lines = [f"📋 **Dynamic node overrides** ({total} total)"]
+    if not overrides:
+        return "\n".join(lines + ["_None set._"])
+    for room_id, node_id, updated_at in overrides:
+        ts = datetime.fromtimestamp(updated_at, tz=timezone.utc).strftime("%Y-%m-%d %H:%M")
+        lines.append(f"  `{room_id}` → `{node_id}` _(at {ts} UTC)_")
+    if total > len(overrides):  # more rows exist than were passed in
+        lines.append(f"  _(+{total - len(overrides)} more)_")
+    return "\n".join(lines)
+
+
+def node_cmd_reply_error(msg: str) -> str:
+    """Prefix the error text with ❌ and append the `!node` usage cheat-sheet."""
+    usage = (
+        "  `!node set room=!room:server node=NODA2`",
+        "  `!node unset room=!room:server`",
+        "  `!node get room=!room:server`",
+        "  `!node list`",
+    )
+    return f"❌ {msg}\n\nUsage:\n" + "\n".join(usage)
+
+
+# ── M6.1: !room agents reply helpers ──────────────────────────────────────────
+
+_AGENTS_KWARG_RE = _re.compile(r"\bagents=(\S+)", _re.IGNORECASE)
+_AGENT_KWARG_RE  = _re.compile(r"\bagent=(\w+)", _re.IGNORECASE)
+_DEFAULT_KWARG_RE = _re.compile(r"\bdefault=(\w+)", _re.IGNORECASE)
+
+
+def parse_room_agents_cmd(
+    subcommand: str,
+    args: tuple,
+    kwargs: Dict[str, str],
+) -> Tuple[str, Optional[str], Optional[List[str]], Optional[str], Optional[str]]:
+    """
+    Interpret the already-tokenised arguments of a `!room agents ...` command.
+
+    Returns (action, room_id, agents_or_None, single_agent_or_None, default_agent_or_None).
+    action:        args[0] lower-cased, or the lower-cased subcommand when args[0] is absent/blank
+    room_id:       kwargs["room"], passed through unchanged (None when absent)
+    agents:        kwargs["agents"] split on commas, trimmed and lower-cased (None when absent/empty)
+    single_agent:  kwargs["agent"], trimmed and lower-cased (None when absent/blank)
+    default_agent: kwargs["default"], trimmed and lower-cased (None when absent/blank)
+    """
+    def _clean(key: str) -> Optional[str]:
+        # Normalise a single-valued kwarg; blank or missing becomes None.
+        return kwargs.get(key, "").strip().lower() or None
+
+    # The first positional arg names the action; otherwise use the subcommand itself.
+    action = args[0].lower().strip() if args else ""
+    if not action:
+        action = subcommand.lower()
+
+    agents: Optional[List[str]] = None
+    csv = kwargs.get("agents", "")
+    if csv:  # comma-separated; empty items are dropped
+        agents = [name for name in (part.strip().lower() for part in csv.split(",")) if name]
+    return action, kwargs.get("room"), agents, _clean("agent"), _clean("default")
+
+
+def room_agents_reply_set(room_id: str, agents: List[str], default_agent: str) -> str:
+    """Confirm a full agent-list override; agents are shown sorted and back-ticked."""
+    quoted = ", ".join(map("`{}`".format, sorted(agents)))
+    header = f"✅ Agent override set for `{room_id}`"
+    body = f"Agents: {quoted}"
+    footer = f"Default: `{default_agent}`"
+    return "\n".join((header, body, footer))
+
+
+def room_agents_reply_add(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str:
+    """Confirm that one agent was added; the default line appears only when a default is given."""
+    quoted = ", ".join(map("`{}`".format, sorted(agents)))
+    parts = [f"✅ Agent `{agent}` added to `{room_id}`", f"Current agents: {quoted}"]
+    if default_agent:
+        parts.append(f"Default: `{default_agent}`")
+    return "\n".join(parts)
+
+
+def room_agents_reply_remove(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str:
+    """Confirm removal of one agent; an empty remaining list gets the "override cleared" message."""
+    if not agents:
+        return f"✅ Agent `{agent}` removed — no agents left, override cleared for `{room_id}`"
+    quoted = ", ".join(map("`{}`".format, sorted(agents)))
+    parts = [f"✅ Agent `{agent}` removed from `{room_id}`", f"Remaining: {quoted}"]
+    if default_agent:
+        parts.append(f"Default: `{default_agent}`")
+    return "\n".join(parts)
+
+
+def room_agents_reply_unset_ok(room_id: str) -> str:  # confirmation for `!room agents unset`
+    return "✅ Agent override cleared for `{}` (using env/default config)".format(room_id)
+
+
+def room_agents_reply_unset_not_found(room_id: str) -> str:  # unset with nothing to remove
+    return "ℹ️ No agent override was set for `{}`".format(room_id)
+
+
+def room_agents_reply_get(
+    room_id: str,
+    override_agents: Optional[List[str]],
+    override_default: Optional[str],
+    env_agents: Optional[List[str]],
+    env_default: Optional[str],
+) -> str:
+    """Show the dynamic override, the env config and the resulting effective agent policy."""
+    def _quoted(names: List[str]) -> str:
+        return ", ".join(map("`{}`".format, sorted(names)))
+
+    override_line = "Dynamic override: _none_"
+    if override_agents:
+        override_line = f"Dynamic override: {_quoted(override_agents)}  default=`{override_default or '?'}`"
+    env_line = "Env config: _not configured_"
+    if env_agents:
+        env_line = f"Env config: {_quoted(env_agents)}  default=`{env_default or '?'}`"
+    # Dynamic overrides win over env config; "?" marks an unknown default.
+    winners = override_agents or env_agents or []
+    fallback = override_default or env_default or "?"
+    summary = f"\nEffective agents: **{_quoted(winners)}**  default=**`{fallback}`**"
+    return "\n".join([f"📌 **Agent policy for** `{room_id}`", override_line, env_line, summary])
+
+
+def room_agents_reply_list(
+    overrides: List[Tuple[str, List[str], Optional[str], int]],
+    total: int,
+) -> str:
+    """List (room_id, agents, default_agent, updated_at epoch seconds) rows; `total` may exceed them."""
+    from datetime import datetime, timezone  # aware UTC; utcfromtimestamp() is deprecated (3.12+)
+    lines = [f"📋 **Dynamic agent overrides** ({total} total)"]
+    if not overrides:
+        return "\n".join(lines + ["_None set._"])
+    for room_id, agents, default_agent, updated_at in overrides:
+        ts = datetime.fromtimestamp(updated_at, tz=timezone.utc).strftime("%Y-%m-%d %H:%M")
+        agents_str = ", ".join(agents)
+        lines.append(f"  `{room_id}`: [{agents_str}]  default=`{default_agent or '?'}` _(at {ts} UTC)_")
+    if total > len(overrides):  # more rows exist than were passed in
+        lines.append(f"  _(+{total - len(overrides)} more)_")
+    return "\n".join(lines)
+
+
+def room_agents_reply_error(msg: str) -> str:
+    """Prefix the error text with ❌ and append the `!room agents` usage cheat-sheet."""
+    usage = (
+        "  `!room agents set room=!X agents=sofiia,helion [default=sofiia]`",
+        "  `!room agents add room=!X agent=druid`",
+        "  `!room agents remove room=!X agent=helion`",
+        "  `!room agents get room=!X`",
+        "  `!room agents unset room=!X`",
+        "  `!room agents list`",
+    )
+    return f"❌ {msg}\n\nUsage:\n" + "\n".join(usage)
+
+
+# ── M6.2: !policy export/import reply helpers + path validator ────────────────
+
+import os as _os
+import json as _json
+
+
+POLICY_EXPORTS_SUBDIR = "policy_exports"
+
+
+def validate_export_path(exports_dir: str, filename: str) -> Optional[str]:
+    """
+    Validate an export filename and join it onto exports_dir.
+
+    Security: plain [a-zA-Z0-9._-] names only; the resolved target must stay in exports_dir.
+    Returns the joined path (exports_dir/filename), or None if invalid.
+    """
+    if not filename:
+        return None
+    # Reject anything with directory separators or traversal sequences
+    if "/" in filename or "\\" in filename or ".." in filename:
+        return None
+    # fullmatch, not match + `$`: `$` would also accept a trailing newline ("x.json\n")
+    if not _re.fullmatch(r"[a-zA-Z0-9._\-]+", filename):
+        return None
+    full_path = _os.path.join(exports_dir, filename)
+    try:
+        resolved = _os.path.realpath(full_path)
+        exports_resolved = _os.path.realpath(exports_dir)
+    except Exception:  # noqa: BLE001 — a path that cannot be resolved is treated as invalid
+        return None
+    if not resolved.startswith(exports_resolved + _os.sep):
+        return None
+    return full_path
+
+
+def policy_export_reply(path: str, node_count: int, agent_count: int) -> str:
+    """Confirm an export; only the file's base name is shown, not its directory."""
+    return "\n".join((
+        "✅ **Policy exported**",
+        f"File: `{_os.path.basename(path)}`",
+        f"Node overrides: {node_count}  Agent overrides: {agent_count}",
+    ))
+
+
+def policy_import_dry_run_reply(stats: dict, mode: str) -> str:
+    """Summarise what an import would change (+added ~updated -deleted); missing counters read as 0."""
+    get = stats.get
+    nodes = f"Node overrides: +{get('node_added', 0)} ~{get('node_updated', 0)} -{get('node_deleted', 0)}"
+    agents = f"Agent overrides: +{get('agent_added', 0)} ~{get('agent_updated', 0)} -{get('agent_deleted', 0)}"
+    head = f"🔍 **Import dry-run** (mode=`{mode}`, no changes applied)"
+    return "\n".join((head, nodes, agents, "_Use `dry_run=0` to apply._"))
+
+
+def format_import_diff(diff: Any) -> str:
+    """
+    Render an ImportDiff (policy_store) as Markdown rows for a preview message (M9.1).
+    Reads the node_*/agent_* counters plus the optional `sample_keys` and `is_replace`.
+    """
+
+    def _row(title: str, added: int, updated: int, deleted: int) -> str:
+        # Zero counters are omitted; deletions carry a warning marker.
+        bits = [
+            text
+            for count, text in (
+                (added, f"+{added} added"),
+                (updated, f"~{updated} updated"),
+                (deleted, f"-{deleted} deleted ⚠️"),
+            )
+            if count
+        ]
+        return f"**{title}:** " + (", ".join(bits) if bits else "no changes")
+
+    out: List[str] = [
+        _row("Node overrides", diff.node_added, diff.node_updated, diff.node_deleted),
+        _row("Agent overrides", diff.agent_added, diff.agent_updated, diff.agent_deleted),
+    ]
+
+    sample = getattr(diff, "sample_keys", None)
+    if sample:  # show a few affected rooms and how many are not listed
+        extra = diff.total_changes() - len(sample)
+        tail = f" _(+{extra} more)_" if extra > 0 else ""
+        out.append("**Affected rooms:** " + ", ".join(f"`{k}`" for k in sample) + tail)
+
+    if getattr(diff, "is_replace", False):
+        out.append("⚠️ **REPLACE mode** — existing overrides NOT in the file will be **deleted**.")
+    return "\n".join(out)
+
+
+def policy_import_intent_reply(
+    diff: Any,
+    action_summary: str,
+    nonce: str,
+    ttl_s: int,
+) -> str:
+    """Confirm prompt for `!policy import`: action, diff preview and the `!confirm` nonce (M9.1)."""
+    preview = format_import_diff(diff)
+    # An empty diff is still confirmable, but say so explicitly.
+    noop_note = ["_(No policy changes would be made.)_", ""] if diff.total_changes() == 0 else []
+
+    message = [
+        "⚠️ **Confirm required**",
+        f"Action: `{action_summary}`",
+        "",
+        "**Preview:**",
+        preview,
+        "",
+        *noop_note,
+        f"Type `!confirm {nonce}` within {ttl_s}s to apply.",
+        "_(Only you can confirm. If the file changes, this confirm will be rejected.)_",
+    ]
+    return "\n".join(message)
+
+
+def policy_import_reply(stats: dict, mode: str) -> str:
+    """Confirm an applied import (+added ~updated -deleted per kind); missing counters read as 0."""
+    get = stats.get
+    nodes = f"Node overrides: +{get('node_added', 0)} ~{get('node_updated', 0)} -{get('node_deleted', 0)}"
+    agents = f"Agent overrides: +{get('agent_added', 0)} ~{get('agent_updated', 0)} -{get('agent_deleted', 0)}"
+    return "\n".join((f"✅ **Policy imported** (mode=`{mode}`)", nodes, agents))
+
+
+def policy_restore_intent_reply(
+    diff: Any,
+    action_summary: str,
+    nonce: str,
+    ttl_s: int,
+) -> str:
+    """Confirm prompt for `!policy restore`: rollback diff preview plus the `!confirm` nonce (M10.1)."""
+    sections = (
+        "🔄 **Policy restore (rollback) preview**\n" + format_import_diff(diff),
+        f"⚠️ **Rollback action:** `{action_summary}`",
+        f"Type `!confirm {nonce}` to apply restore (expires in {ttl_s}s)",
+    )
+    # Blank line between the preview, the action and the confirm prompt.
+    return "\n\n".join(sections)
+
+
+def policy_restore_applied_reply(
+    stats: Any,
+    mode: str,
+    autobackup_basename: str = "",
+) -> str:
+    """Confirmation sent once a `!policy restore` has been confirmed and applied (M10.1)."""
+    # Non-dict stats (e.g. None) are reported as all zeros.
+    counts = stats if isinstance(stats, dict) else {}
+    node_row = "Node overrides: +{} ~{} -{}".format(
+        counts.get("node_added", 0),
+        counts.get("node_updated", 0),
+        counts.get("node_deleted", 0),
+    )
+    agent_row = "Agent overrides: +{} ~{} -{}".format(
+        counts.get("agent_added", 0),
+        counts.get("agent_updated", 0),
+        counts.get("agent_deleted", 0),
+    )
+    reply = f"✅ **Policy restored** (mode={mode})\n{node_row}\n{agent_row}"
+    if autobackup_basename:
+        reply += f"\n\n💾 Pre-restore backup saved: `{autobackup_basename}`"
+    return reply
+
+
+def policy_history_reply(changes: List[Any]) -> str:
+    """
+    Render policy_changes records as the `!policy history` reply (M10.2).
+
+    Each line: {n}. [id:NN] [when] `verb/mode`  changes [⚠️]  `file`  op:`hash8`
+    File names over 40 characters are cut; the footer points at `!policy change id=<n>`.
+    """
+    if not changes:
+        return "📋 **Policy change history**\nNo policy changes recorded yet."
+
+    def _entry(position: int, change: Any) -> str:
+        source = change.source_file
+        if len(source) > 40:  # keep long file names from blowing up the line
+            source = source[:40] + "…"
+        warn = " ⚠️" if change.is_destructive else ""
+        stamp = f"[id:{change.id}] [{change.when_str()}] `{change.verb}/{change.mode}`"
+        return f"{position}. {stamp}  {change.changes_short()}{warn}  `{source}`  op:`{change.sender_hash[:8]}`"
+
+    header = "📋 **Policy change history** (most recent first)\n"
+    footer = "\nUse `!policy change id=<n>` for full details of a specific change."
+    rows = [_entry(n, item) for n, item in enumerate(changes, 1)]
+    return "\n".join([header, *rows, footer])
+
+
+def policy_change_detail_reply(change: Any) -> str:
+    """
+    Render every field of one PolicyChange for `!policy change id=<n>` (M10.3).
+    """
+    # File names longer than 60 characters are cut and marked with an ellipsis.
+    source = change.source_file
+    shown_file = source if len(source) <= 60 else source[:60] + "…"
+    return "\n".join((
+        f"🔍 **Policy change #{change.id}**\n",
+        f"**Verb:**        `{change.verb}`",
+        f"**Mode:**        `{change.mode}`",
+        f"**Applied:**     {change.when_str()} UTC",
+        f"**Operator:**    op:`{change.sender_hash[:8]}`",
+        f"**File:**        `{shown_file}`",
+        f"**Destructive:** {'⚠️ Yes' if change.is_destructive else 'No'}",
+        "",
+        "**Changes:**",
+        f"  Nodes:   +{change.node_added} added  ~{change.node_updated} updated  -{change.node_deleted} deleted",
+        f"  Agents:  +{change.agent_added} added  ~{change.agent_updated} updated  -{change.agent_deleted} deleted",
+        "",
+        "**Summary:**",
+        f"  {change.diff_summary}",
+    ))
+
+
+def policy_prune_preview_reply(result: Any, retention_days: int) -> str:
+    """Dry-run reply for `!policy prune_exports`: what would be deleted, with up to 5 samples (M10.0)."""
+    title = f"🗑️ **Policy exports prune preview** (retention={retention_days}d)"
+    if result.count == 0:
+        return title + "\nNo files older than the retention window found. Nothing to prune."
+    shown = result.sample_filenames(5)
+    listing = "\n".join(f"  - `{name}`" for name in shown)
+    hidden = result.count - len(shown)
+    if hidden > 0:  # not every file fits in the sample
+        listing += f"\n  _(+{hidden} more)_"
+    size_kb = result.total_bytes // 1024
+    return "\n".join((
+        title,
+        f"Would delete **{result.count}** file(s) (~{size_kb} KB):",
+        listing,
+        "",
+        "To actually prune: `!policy prune_exports dry_run=0`",
+    ))
+
+
+def policy_prune_applied_reply(result: Any, retention_days: int) -> str:
+    """Reply once `!policy prune_exports dry_run=0` has run: number of files and size removed (M10.0)."""
+    heading = f"**Policy exports pruned** (retention={retention_days}d)"
+    deleted = result.count
+    if deleted == 0:
+        return f"🗑️ {heading}\nNo files matched the retention window."
+
+    # Size is reported in whole 1024-byte units (floor division).
+    freed_kb = result.total_bytes // 1024
+    return "\n".join(
+        (f"✅ {heading}", f"Deleted **{deleted}** file(s) (~{freed_kb} KB freed).")
+    )
+
+
+def policy_cmd_error(msg: str) -> str:  # ❌ + message, then `!policy` export/import usage
+    return (
+        f"❌ {msg}\n\n"
+        "Usage:\n"
+        "  `!policy export`\n"
+        "  `!policy import path=policy-YYYYMMDD-HHMMSS.json [mode=merge|replace] [dry_run=0]`"
     )
 
 
@@ -252,12 +1162,26 @@ def help_reply() -> str:
     """Brief help text."""
     return (
         "**DAGI Bridge — Control Commands**\n\n"
-        "`!runbook start <path> [node=NODA1]` — Start a runbook run\n"
-        "`!runbook next <run_id>` — Advance to next step\n"
-        "`!runbook complete <run_id> step=<n> status=ok` — Mark step complete\n"
-        "`!runbook evidence <run_id>` — Get evidence artifact path\n"
-        "`!runbook status <run_id>` — Show current run state\n"
-        "`!status` — Bridge health summary\n"
+        "`!runbook start <path> [node=NODA1]` — Start a runbook run ✅\n"
+        "`!runbook next <run_id>` — Advance to next step ✅\n"
+        "`!runbook complete <run_id> step=<n> status=ok [notes=...]` — Mark step complete ✅\n"
+        "`!runbook status <run_id>` — Show run status ✅\n"
+        "`!runbook evidence <run_id>` — Generate release evidence ✅\n"
+        "`!runbook post_review <run_id>` — Generate post-release review ✅\n"
+        "`!status` — Bridge health summary ✅\n"
+        "`!nodes` — Node policy overview ✅\n"
+        "`!node set room=!room:server node=NODA2` — Set room-node override ✅\n"
+        "`!node unset room=!room:server` — Remove room-node override ✅\n"
+        "`!node get room=!room:server` — Show current override ✅\n"
+        "`!node list` — List dynamic overrides (top 10) ✅\n"
+        "`!room agents set room=!X agents=sofiia,helion [default=sofiia]` — Set agent list ✅\n"
+        "`!room agents add room=!X agent=druid` — Add agent to room ✅\n"
+        "`!room agents remove room=!X agent=helion` — Remove agent from room ✅\n"
+        "`!room agents get room=!X` — Show current agent policy ✅\n"
+        "`!room agents list` — List all rooms with agent overrides ✅\n"
+        "`!room agents unset room=!X` — Remove all agent overrides for room ✅\n"
+        "`!policy export` — Export policy snapshot to file ✅\n"
+        "`!policy import path=<file> [mode=merge|replace] [dry_run=0]` — Import policy snapshot ✅\n"
         "`!help` — This message\n\n"
         "_Only authorised operators can issue control commands._"
     )
diff --git a/services/matrix-bridge-dagi/app/control_limiter.py b/services/matrix-bridge-dagi/app/control_limiter.py
new file mode 100644
index 00000000..303ff0c8
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/control_limiter.py
@@ -0,0 +1,138 @@
+"""
+control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel.
+
+Protection layers:
+  1. Per-room sliding window     — CONTROL_ROOM_RPM (default 60)
+  2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30)
+  3. Per-run sliding window      — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next)
+  4. Per-(operator, verb, subcommand) cooldown — CONTROL_COOLDOWN_S (default 2s, anti-double-click)
+
+All state is in-memory (lost on restart), which is intentional — limits reset with the bridge.
+
+Thread safety: not needed (asyncio single-threaded event loop).
+"""
+from __future__ import annotations
+
+import time
+from collections import defaultdict, deque
+from typing import Dict, Tuple
+
+
+# Sentinel value for "unlimited" (rpm <= 0 → skip check)
+_UNLIMITED = 0
+
+
+class ControlRateLimiter:
+    """
+    Sliding-window rate limiter + cooldown for the Matrix control channel.
+
+    All rpm values are requests-per-minute over a 60-second rolling window; rpm <= 0 disables a limit.
+    cooldown_s is a per-{operator, verb, subcommand} debounce window (anti-double-click).
+    """
+
+    def __init__(
+        self,
+        room_rpm: int = 60,
+        operator_rpm: int = 30,
+        run_next_rpm: int = 20,
+        cooldown_s: float = 2.0,
+    ) -> None:
+        self.room_rpm = room_rpm
+        self.operator_rpm = operator_rpm
+        self.run_next_rpm = run_next_rpm
+        self.cooldown_s = cooldown_s
+
+        # Sliding-window storage: key → deque[float] (monotonic timestamps)
+        self._room_windows: Dict[str, deque] = defaultdict(deque)
+        self._op_windows: Dict[str, deque] = defaultdict(deque)
+        self._run_windows: Dict[str, deque] = defaultdict(deque)  # NOTE: keys are never removed
+
+        # Cooldown: "sender_hash:verb:subcommand" string → last accepted timestamp
+        self._cooldown_times: Dict[str, float] = {}
+
+    # ── Sliding window helpers ─────────────────────────────────────────────────
+
+    @staticmethod
+    def _check_window(
+        windows: Dict[str, deque],
+        key: str,
+        rpm: int,
+    ) -> Tuple[bool, float]:
+        """
+        Sliding-window check over a 60-second window.
+
+        Returns (allowed, retry_after_seconds).
+        If rpm <= 0, the limit is disabled and the call is always allowed.
+        """
+        if rpm <= _UNLIMITED:  # a negative rpm would otherwise index an empty deque below
+            return True, 0.0
+
+        now = time.monotonic()
+        window = windows[key]
+        cutoff = now - 60.0
+
+        # Evict expired entries
+        while window and window[0] < cutoff:
+            window.popleft()
+
+        if len(window) >= rpm:
+            # Time until oldest entry expires
+            retry_after = max(0.0, 60.0 - (now - window[0]))
+            return False, retry_after
+
+        window.append(now)  # only accepted requests count toward the window
+        return True, 0.0
+
+    # ── Public check methods ───────────────────────────────────────────────────
+
+    def check_room(self, room_id: str) -> Tuple[bool, float]:
+        """Per-room rate limit check. Returns (allowed, retry_after_s)."""
+        return self._check_window(self._room_windows, room_id, self.room_rpm)
+
+    def check_operator(self, sender_hash: str) -> Tuple[bool, float]:
+        """Per-operator rate limit check. Returns (allowed, retry_after_s)."""
+        return self._check_window(self._op_windows, sender_hash, self.operator_rpm)
+
+    def check_run_next(self, run_id: str) -> Tuple[bool, float]:
+        """
+        Per-run rate limit for !runbook next — prevents rapid-fire advancement.
+        Returns (allowed, retry_after_s).
+        """
+        return self._check_window(self._run_windows, run_id, self.run_next_rpm)
+
+    def check_cooldown(
+        self,
+        sender_hash: str,
+        verb: str,
+        subcommand: str,
+    ) -> Tuple[bool, float]:
+        """
+        Anti-double-click cooldown per (operator, verb, subcommand).
+
+        Returns (allowed, wait_s). On first call → records timestamp and allows.
+        On subsequent calls within cooldown_s → blocks and returns remaining wait.
+        """
+        if self.cooldown_s <= 0:
+            return True, 0.0
+
+        key = f"{sender_hash}:{verb}:{subcommand}"
+        now = time.monotonic()
+        last = self._cooldown_times.get(key)
+
+        if last is not None:
+            elapsed = now - last
+            if elapsed < self.cooldown_s:
+                return False, self.cooldown_s - elapsed  # blocked calls do not reset the timer
+
+        self._cooldown_times[key] = now
+        return True, 0.0
+
+    # ── Summary ───────────────────────────────────────────────────────────────
+
+    def as_health_dict(self) -> dict:
+        return {
+            "room_rpm": self.room_rpm,
+            "operator_rpm": self.operator_rpm,
+            "run_next_rpm": self.run_next_rpm,
+            "cooldown_s": self.cooldown_s,
+        }
diff --git a/services/matrix-bridge-dagi/app/control_runner.py b/services/matrix-bridge-dagi/app/control_runner.py
new file mode 100644
index 00000000..22dc78a6
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/control_runner.py
@@ -0,0 +1,296 @@
+"""
+control_runner — M3.1 + M3.2 + M3.3
+
+Thin async HTTP client that calls the sofiia-console internal runbook API
+on behalf of the Matrix bridge control channel.
+
+All functions are stateless; callers supply the pre-built AsyncClient.
+"""
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# Runbook path guards (fail-fast in the bridge, before calling the console)
+_MAX_PATH_LEN = 256
+_FORBIDDEN_SEGMENTS = {"..", "~"}
+
+
+class RunnerError(Exception):
+    """Raised when a sofiia-console call fails: transport error, non-200 status, or an invalid JSON body."""
+
+
+def validate_runbook_path(path: str) -> Optional[str]:
+    """
+    Return None if valid, or an error string describing the problem.
+    Checks (after trimming and treating "\\" as "/"): non-empty, max length,
+    no absolute paths, no traversal segments.
+    """
+    path = path.strip().replace("\\", "/")  # normalise first so "\\etc" counts as absolute
+    if not path:
+        return "runbook_path is required"
+    if len(path) > _MAX_PATH_LEN:
+        return f"runbook_path too long (max {_MAX_PATH_LEN} chars)"
+    if path.startswith("/"):
+        return "absolute paths are not allowed"
+    for part in path.split("/"):
+        if part in _FORBIDDEN_SEGMENTS:
+            return f"forbidden path segment: {part!r}"
+    return None
+
+
+async def start_runbook_run(
+    http_client: httpx.AsyncClient,
+    console_url: str,
+    control_token: str,
+    runbook_path: str,
+    operator_id: str,
+    node_id: str = "NODA1",
+    timeout: float = 15.0,
+) -> dict:
+    """
+    Start a run: POST /api/runbooks/internal/runs → {run_id, status, current_step, steps_total}
+
+    Raises RunnerError on a transport error, any status other than 200, or a non-JSON body.
+    """
+    endpoint = console_url.rstrip("/") + "/api/runbooks/internal/runs"
+    body = dict(runbook_path=runbook_path, operator_id=operator_id, node_id=node_id)
+    auth_headers = {"X-Control-Token": control_token}
+
+    try:
+        response = await http_client.post(
+            endpoint,
+            json=body,
+            headers=auth_headers,
+            timeout=timeout,
+        )
+    except httpx.RequestError as err:
+        raise RunnerError(f"connection error: {err}") from err
+
+    # Only a plain 200 counts as success; every other status is surfaced
+    # to the caller together with a short detail taken from the body.
+    if response.status_code != 200:
+        reason = _extract_error_detail(response)
+        raise RunnerError(f"HTTP {response.status_code}: {reason}")
+
+    try:
+        parsed = response.json()
+    except Exception as err:
+        raise RunnerError(f"invalid JSON response: {err}") from err
+    return parsed
+
+
+def _extract_error_detail(resp: httpx.Response) -> str:
+    """Return at most 200 chars of error text: JSON "detail" if present, else the raw body, else "<no detail>". Never raises."""
+    try:
+        body = resp.json()
+        if isinstance(body, dict) and body.get("detail"):
+            return str(body["detail"])[:200]
+    except Exception:
+        pass  # body is not JSON (or detail is unusable) — fall through to the raw text
+    try:
+        return (resp.text or "")[:200]
+    except Exception:
+        return "<no detail>"
+
+
+async def get_runbook_run(
+    http_client: httpx.AsyncClient,
+    console_url: str,
+    control_token: str,
+    run_id: str,
+    timeout: float = 10.0,
+) -> dict:
+    """
+    Fetch one run with its steps: GET /api/runbooks/internal/runs/{run_id}. Raises RunnerError on any failure.
+    """
+    base = console_url.rstrip("/")
+    endpoint = f"{base}/api/runbooks/internal/runs/{run_id}"
+    try:
+        response = await http_client.get(
+            endpoint,
+            headers={"X-Control-Token": control_token},
+            timeout=timeout,
+        )
+    except httpx.RequestError as err:
+        raise RunnerError(f"connection error: {err}") from err
+
+    code = response.status_code
+    if code == 404:
+        raise RunnerError(f"run {run_id!r} not found")
+    if code != 200:
+        raise RunnerError(f"HTTP {code}: {_extract_error_detail(response)}")
+    try:
+        return response.json()
+    except Exception as err:
+        raise RunnerError(f"invalid JSON response: {err}") from err
+
+
+# ── M3.2 ──────────────────────────────────────────────────────────────────────
+
+async def next_runbook_step(
+    http_client: httpx.AsyncClient,
+    console_url: str,
+    control_token: str,
+    run_id: str,
+    operator_id: str = "",
+    timeout: float = 30.0,
+) -> dict:
+    """
+    Advance a run: POST /api/runbooks/internal/runs/{run_id}/next
+
+    Returns either:
+      {type:"manual", step_index, title, section, instructions_md, steps_total?}
+      {type:"http_check"|"script", step_index, title, result, step_status, next_step, completed}
+
+    Raises RunnerError on a transport error, 404 (run not found / not active), any other non-200 status, or a non-JSON body.
+    """
+    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/next"
+    body = {}
+    if operator_id:
+        body["operator_id"] = operator_id
+    try:
+        response = await http_client.post(
+            endpoint,
+            json=body,
+            headers={"X-Control-Token": control_token},
+            timeout=timeout,
+        )
+    except httpx.RequestError as err:
+        raise RunnerError(f"connection error: {err}") from err
+
+    code = response.status_code
+    if code == 404:
+        raise RunnerError(f"run not found or not active: {_extract_error_detail(response)}")
+    if code != 200:
+        raise RunnerError(f"HTTP {code}: {_extract_error_detail(response)}")
+    try:
+        return response.json()
+    except Exception as err:
+        raise RunnerError(f"invalid JSON response: {err}") from err
+
+
+async def complete_runbook_step(
+    http_client: httpx.AsyncClient,
+    console_url: str,
+    control_token: str,
+    run_id: str,
+    step_index: int,
+    status: str,
+    notes: str = "",
+    operator_id: str = "",
+    timeout: float = 15.0,
+) -> dict:
+    """
+    Record a step outcome: POST /api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete
+
+    Returns: {ok, run_id, step_index, status, next_step, steps_total, run_completed}
+
+    Raises RunnerError on a transport error, 404 (run/step not found or wrong current step), any other non-200 status, or a non-JSON body.
+    """
+    run_url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}"
+    endpoint = f"{run_url}/steps/{step_index}/complete"
+
+    # "status" is always sent; notes and operator_id are added only when
+    # non-empty, keeping the key order status, notes, operator_id.
+    optional = {"notes": notes, "operator_id": operator_id}
+    body = {"status": status, **{key: val for key, val in optional.items() if val}}
+
+    try:
+        response = await http_client.post(
+            endpoint,
+            json=body,
+            headers={"X-Control-Token": control_token},
+            timeout=timeout,
+        )
+    except httpx.RequestError as err:
+        raise RunnerError(f"connection error: {err}") from err
+
+    code = response.status_code
+    if code == 404:
+        reason = _extract_error_detail(response)
+        raise RunnerError(f"step not found or not current: {reason}")
+    if code != 200:
+        raise RunnerError(f"HTTP {code}: {_extract_error_detail(response)}")
+
+    try:
+        parsed = response.json()
+    except Exception as err:
+        raise RunnerError(f"invalid JSON response: {err}") from err
+    return parsed
+
+
+# ── M3.3 ──────────────────────────────────────────────────────────────────────
+
+async def generate_evidence(
+    http_client: httpx.AsyncClient,
+    console_url: str,
+    control_token: str,
+    run_id: str,
+    timeout: float = 30.0,
+) -> dict:
+    """
+    Request evidence generation for a run: POST /api/runbooks/internal/runs/{run_id}/evidence
+
+    Returns: {evidence_path, bytes, created_at, run_id}
+
+    Raises RunnerError on a transport error, 404 (run not found), any other non-200 status, or a non-JSON body.
+    """
+    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/evidence"
+    try:
+        response = await http_client.post(
+            endpoint,
+            headers={"X-Control-Token": control_token},
+            timeout=timeout,
+        )
+    except httpx.RequestError as err:
+        raise RunnerError(f"connection error: {err}") from err
+
+    code = response.status_code
+    if code == 404:
+        raise RunnerError(f"run {run_id!r} not found")
+    if code != 200:
+        raise RunnerError(f"HTTP {code}: {_extract_error_detail(response)}")
+    try:
+        return response.json()
+    except Exception as err:
+        raise RunnerError(f"invalid JSON response: {err}") from err
+
+
+async def generate_post_review(
+    http_client: httpx.AsyncClient,
+    console_url: str,
+    control_token: str,
+    run_id: str,
+    timeout: float = 30.0,
+) -> dict:
+    """
+    Request the post-review document for a run: POST /api/runbooks/internal/runs/{run_id}/post_review
+
+    Returns: {path, bytes, created_at, run_id}
+
+    Raises RunnerError on a transport error, 404 (run not found), any other non-200 status, or a non-JSON body.
+    """
+    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/post_review"
+    try:
+        response = await http_client.post(
+            endpoint,
+            headers={"X-Control-Token": control_token},
+            timeout=timeout,
+        )
+    except httpx.RequestError as err:
+        raise RunnerError(f"connection error: {err}") from err
+
+    code = response.status_code
+    if code == 404:
+        raise RunnerError(f"run {run_id!r} not found")
+    if code != 200:
+        raise RunnerError(f"HTTP {code}: {_extract_error_detail(response)}")
+    try:
+        return response.json()
+    except Exception as err:
+        raise RunnerError(f"invalid JSON response: {err}") from err
diff --git a/services/matrix-bridge-dagi/app/discovery.py b/services/matrix-bridge-dagi/app/discovery.py
new file mode 100644
index 00000000..14d4ecab
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/discovery.py
@@ -0,0 +1,210 @@
+"""
+discovery — M4.0: Agent discovery helpers for Matrix user rooms.
+
+Provides formatted replies for `!agents` and `!agents status` commands.
+These commands are available to all room members (no auth required) and
+are processed BEFORE routing to the LLM agent.
+
+Supports:
+  - Mixed rooms: list all agents, default, usage examples
+  - Direct rooms: show single agent mapping
+  - Unknown rooms: "no mapping" notice
+"""
+from __future__ import annotations
+
+import datetime
+from typing import Optional
+
+from .mixed_routing import MixedRoomConfig
+from .room_mapping import RoomMappingConfig  # noqa: F401 — used in type hints
+
+
+def _fmt_ts(ts: int) -> str:
+    """Render a Unix timestamp as `YYYY-MM-DDTHH:MM:SSZ` in UTC; on any failure return str(ts)."""
+    try:
+        return f"{datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc):%Y-%m-%dT%H:%M:%SZ}"
+    except Exception:  # noqa: BLE001
+        return str(ts)
+
+# Discovery command prefix
+DISCOVERY_CMD = "!agents"
+
+# Reply length cap: replies are kept short by design; bridge_status_reply() truncates anything longer
+_MAX_REPLY_LEN = 3500
+
+
+def is_discovery_message(text: str) -> bool:
+    """True for `!agents` alone or followed by a space and arguments (case-insensitive)."""
+    first_word = text.strip().lower().partition(" ")[0]  # relies on DISCOVERY_CMD containing no space
+    return first_word == DISCOVERY_CMD
+
+
+def agents_reply(
+    room_id: str,
+    room_map: RoomMappingConfig,
+    mixed_room_config: Optional[MixedRoomConfig],
+) -> str:
+    """
+    Pick the `!agents` reply that matches how room_id is configured.
+
+    Mixed room  → list agents, default, usage examples.
+    Direct room → single agent mapping.
+    Unknown     → 'no mapping' notice.
+    """
+    # Mixed rooms take precedence over direct mappings.
+    is_mixed = bool(mixed_room_config) and mixed_room_config.is_mixed(room_id)
+    mixed_room = mixed_room_config.rooms.get(room_id) if is_mixed else None
+    if mixed_room is not None:
+        return _mixed_room_reply(room_id, mixed_room)
+
+    # Otherwise use the direct room → agent mapping, if there is one.
+    direct_agent = room_map.agent_for_room(room_id)
+    if direct_agent is None:
+        return _unknown_room_reply()
+
+    return _direct_room_reply(direct_agent)
+
+
+def _mixed_room_reply(room_id: str, room) -> str:
+    """Render the `!agents` reply for a mixed room: roster, default agent, addressing syntax."""
+    roster = room.agents
+    fallback = room.default_agent or (roster[0] if roster else "?")
+    header = [
+        "🤖 **Agents available in this room:** " + ", ".join(f"**{name}**" for name in roster),
+        f"⭐ **Default:** {fallback}",
+        "",
+        "**How to address an agent:**",
+    ]
+    examples = [
+        example
+        for name in roster[:5]  # usage examples for at most five agents
+        for example in (
+            f"  • `/{name} <message>` — slash command",
+            f"  • `@{name} <message>` — mention",
+            f"  • `{name}: <message>` — colon prefix",
+        )
+    ]
+    footer = ["", f"_Messages without prefix go to **{fallback}** by default._"]
+    return "\n".join(header + examples + footer)
+
+
+def _direct_room_reply(agent_id: str) -> str:
+    """Render the `!agents` reply for a room mapped to exactly one agent."""
+    return "\n".join([
+        f"🤖 This room is mapped to agent: **{agent_id}**\n",
+        f"All messages are forwarded to **{agent_id}** automatically.",
+        "No prefix needed — just write your message.",
+    ])
+
+
+def _unknown_room_reply() -> str:
+    """Render the notice for a room that has no agent mapping."""
+    # Two paragraphs: the warning itself, then what the user can do about it.
+    warning = "⚠️ This room has no agent mapping."
+    action = "Contact an operator to configure an agent for this room."
+    return f"{warning}\n\n{action}"
+
+
+# ── Bridge status reply (M4.1) ────────────────────────────────────────────────
+
+def bridge_status_reply(snapshot: dict) -> str:
+    """
+    Format a concise bridge health snapshot for `!status` in control room.
+
+    snapshot keys read here (all optional; missing ones get defaults or are skipped):
+      node_id, queue_size, queue_max, worker_count, room_count, mixed_room_count,
+      operators_count, node_policy / control_safety / persistent_dedupe /
+      policy_store / nodes (dicts), policy_agent_overrides_count,
+      policy_last_export_at, policy_last_import_at, policy_db_mtime (passed to _fmt_ts)
+    """
+    node_id = snapshot.get("node_id", "?")
+    q_size = snapshot.get("queue_size", "?")
+    q_max = snapshot.get("queue_max", "?")
+    workers = snapshot.get("worker_count", "?")
+    rooms = snapshot.get("room_count", 0)
+    mixed = snapshot.get("mixed_room_count", 0)
+    ops = snapshot.get("operators_count", 0)
+
+    safety = snapshot.get("control_safety") or {}
+    dedupe = snapshot.get("persistent_dedupe") or {}
+
+    node_policy = snapshot.get("node_policy") or {}
+    default_node = node_policy.get("default_node", node_id)
+    allowed_nodes = node_policy.get("allowed_nodes") or []
+    room_overrides = node_policy.get("room_overrides", 0)
+
+    lines = [
+        f"📡 **Bridge status** — node: `{node_id}`",
+        "",
+        f"**Queue:** {q_size}/{q_max}  workers: {workers}",
+        f"**Rooms:** {rooms} direct  {mixed} mixed  ops: {ops} operators",
+        "",
+    ]
+
+    # M5.0: node policy (line is emitted only when allowed_nodes is non-empty)
+    if allowed_nodes:
+        allowed_str = ", ".join(f"`{n}`" for n in sorted(allowed_nodes))
+        lines.append(
+            f"**Node policy:** default=`{default_node}`  allowed={allowed_str}  room_overrides={room_overrides}"
+        )
+
+    # Control safety
+    if safety:
+        enabled = "✅" if safety.get("enabled") else "⬜"
+        lines.append(
+            f"**Control safety {enabled}:** "
+            f"room={safety.get('room_rpm', '?')}rpm  "
+            f"op={safety.get('operator_rpm', '?')}rpm  "
+            f"cooldown={safety.get('cooldown_s', '?')}s"
+        )
+
+    # Persistent dedupe
+    if dedupe:
+        ok_emoji = "✅" if dedupe.get("ok") else "❌"
+        pruned = dedupe.get("pruned_rows_last", 0)
+        ttl = dedupe.get("ttl_h", "?")
+        lines.append(
+            f"**Dedupe {ok_emoji}:** ttl={ttl}h  pruned_last={pruned}  "
+            f"db=`{dedupe.get('db_path') or 'n/a'}`"
+        )
+
+    # M6.0/M6.1: policy store status
+    ps = snapshot.get("policy_store") or {}
+    if ps:
+        ps_ok = "✅" if ps.get("ok") else "❌"
+        ps_node_count = ps.get("overrides_count", 0)
+        ps_agent_count = ps.get("agent_overrides_count", snapshot.get("policy_agent_overrides_count", 0))
+        ps_path = ps.get("policy_store_path") or ps.get("path") or "n/a"
+        lines.append(
+            f"**Policy store {ps_ok}:** node_overrides={ps_node_count}  "
+            f"agent_overrides={ps_agent_count}  db=`{ps_path}`"
+        )
+
+    # M6.2: last export/import timestamps + DB mtime (falsy values, including 0, are skipped)
+    _last_export = snapshot.get("policy_last_export_at")
+    _last_import = snapshot.get("policy_last_import_at")
+    _db_mtime    = snapshot.get("policy_db_mtime")
+    _snap_parts: list = []
+    if _last_export:
+        _snap_parts.append(f"last_export=`{_fmt_ts(_last_export)}`")
+    if _last_import:
+        _snap_parts.append(f"last_import=`{_fmt_ts(_last_import)}`")
+    if _db_mtime:
+        _snap_parts.append(f"db_mtime=`{_fmt_ts(_db_mtime)}`")
+    if _snap_parts:
+        lines.append("**Policy snapshots:** " + "  ".join(_snap_parts))
+
+    # M5.1: per-node routed/rejected breakdown
+    node_stats = snapshot.get("nodes") or {}
+    if node_stats:
+        lines.append("\n**Per-node stats:**")
+        for nid in sorted(node_stats):
+            ns = node_stats[nid]
+            lines.append(
+                f"  `{nid}`: routed={ns.get('routed', 0)}  rejected={ns.get('rejected', 0)}"
+            )
+
+    reply = "\n".join(lines)
+    if len(reply) > _MAX_REPLY_LEN:
+        reply = reply[:_MAX_REPLY_LEN - 3] + "…"  # "…" is one character, so the result is _MAX_REPLY_LEN - 2 long
+    return reply
diff --git a/services/matrix-bridge-dagi/app/event_store.py b/services/matrix-bridge-dagi/app/event_store.py
new file mode 100644
index 00000000..a461f66c
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/event_store.py
@@ -0,0 +1,213 @@
+"""
+event_store — M2.3: Persistent event deduplication via SQLite.
+
+Stores processed Matrix event_ids so that bridge restarts do not reprocess
+events still returned by /sync (within TTL window).
+
+Schema:
+  processed_events (room_id, event_id, ts, sender_hash)
+    PK: (room_id, event_id)
+    IDX: idx_processed_events_ts (ts)
+
+Design notes:
+  - Uses aiosqlite for non-blocking async access from the ingress event loop.
+  - Prune is best-effort: failures are logged but do NOT abort processing.
+  - If the DB is unavailable (init error, corruption), EventStore degrades to
+    a no-op: is_processed() returns False, mark_processed() is a no-op.
+    The in-memory LRU dedupe (H1) continues to protect within a single run.
+  - WAL mode is enabled for better concurrent read performance.
+"""
+from __future__ import annotations
+
+import logging
+import time
+from pathlib import Path
+from typing import Optional, Tuple
+
+try:
+    import aiosqlite
+    _AIOSQLITE_OK = True
+except ImportError:  # pragma: no cover
+    aiosqlite = None  # type: ignore
+    _AIOSQLITE_OK = False
+
+logger = logging.getLogger(__name__)
+
+_SCHEMA = """
+CREATE TABLE IF NOT EXISTS processed_events (
+    room_id     TEXT NOT NULL,
+    event_id    TEXT NOT NULL,
+    ts          INTEGER NOT NULL,
+    sender_hash TEXT,
+    PRIMARY KEY (room_id, event_id)
+);
+CREATE INDEX IF NOT EXISTS idx_processed_events_ts ON processed_events (ts);
+"""
+
+
+class EventStore:
+    """
+    Async SQLite-backed deduplication store for Matrix event_ids.
+
+    Usage:
+        store = EventStore("/app/data/matrix_bridge.db", ttl_h=48)
+        await store.open()
+        ...
+        hit = await store.is_processed(room_id, event_id)
+        if not hit:
+            await store.mark_processed(room_id, event_id, sender_hash)
+        ...
+        pruned = await store.prune(batch=5000)
+        await store.close()
+    """
+
+    def __init__(
+        self,
+        db_path: str,
+        ttl_h: int = 48,
+        prune_batch: int = 5000,
+    ) -> None:
+        self.db_path = db_path
+        self.ttl_h = ttl_h
+        self.prune_batch = prune_batch
+        self._db: Optional["aiosqlite.Connection"] = None
+        self._ok: bool = False
+        self._last_prune_at: Optional[float] = None
+        self._pruned_rows_last: int = 0
+
+    # ── Lifecycle ─────────────────────────────────────────────────────────────
+
+    async def open(self) -> bool:
+        """
+        Open the SQLite connection and apply schema.
+
+        Returns True on success; False on failure (degraded mode).
+        """
+        if not _AIOSQLITE_OK:
+            logger.warning("aiosqlite not available — persistent dedupe disabled")
+            return False
+        try:
+            Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
+            self._db = await aiosqlite.connect(self.db_path)
+            # WAL mode: better concurrent read, non-blocking writes
+            await self._db.execute("PRAGMA journal_mode=WAL")
+            await self._db.execute("PRAGMA synchronous=NORMAL")
+            await self._db.executescript(_SCHEMA)
+            await self._db.commit()
+            self._ok = True
+            logger.info("EventStore opened: %s (ttl_h=%d)", self.db_path, self.ttl_h)
+            return True
+        except Exception as exc:
+            logger.error("EventStore.open failed — degraded: %s", exc)
+            self._ok = False
+            return False
+
+    async def close(self) -> None:
+        """Close the SQLite connection gracefully."""
+        if self._db is not None:
+            try:
+                await self._db.close()
+            except Exception as exc:  # pragma: no cover
+                logger.warning("EventStore.close error: %s", exc)
+            self._db = None
+        self._ok = False
+
+    # ── Core operations ───────────────────────────────────────────────────────
+
+    async def is_processed(self, room_id: str, event_id: str) -> bool:
+        """
+        Return True if (room_id, event_id) has already been processed.
+
+        Safe to call even when degraded (returns False → no false deduplication).
+        """
+        if not self._ok or self._db is None:
+            return False
+        try:
+            async with self._db.execute(
+                "SELECT 1 FROM processed_events WHERE room_id=? AND event_id=? LIMIT 1",
+                (room_id, event_id),
+            ) as cursor:
+                row = await cursor.fetchone()
+            return row is not None
+        except Exception as exc:
+            logger.warning("EventStore.is_processed error (degraded): %s", exc)
+            return False
+
+    async def mark_processed(
+        self,
+        room_id: str,
+        event_id: str,
+        sender_hash: str = "",
+    ) -> bool:
+        """
+        Insert (room_id, event_id) as processed.
+
+        Returns True when the insert ran (even if the row already existed and
+        was ignored by INSERT OR IGNORE); False when degraded or on error.
+        """
+        if not self._ok or self._db is None:
+            return False
+        ts = int(time.time())
+        try:
+            await self._db.execute(
+                "INSERT OR IGNORE INTO processed_events (room_id, event_id, ts, sender_hash) "
+                "VALUES (?, ?, ?, ?)",
+                (room_id, event_id, ts, sender_hash or None),
+            )
+            await self._db.commit()
+            return True
+        except Exception as exc:
+            logger.warning("EventStore.mark_processed error (degraded): %s", exc)
+            return False
+
+    # ── Prune ─────────────────────────────────────────────────────────────────
+
+    async def prune(self, batch: Optional[int] = None) -> int:
+        """
+        Delete events older than ttl_h.
+
+        Returns the number of rows deleted (0 on error or degraded).
+        Uses LIMIT batch to avoid long locks on large tables.
+        """
+        if not self._ok or self._db is None:
+            return 0
+
+        cutoff = int(time.time()) - self.ttl_h * 3600
+        effective_batch = batch or self.prune_batch
+        deleted = 0
+
+        try:
+            # SQLite DELETE with LIMIT requires compiling with SQLITE_ENABLE_UPDATE_DELETE_LIMIT,
+            # which may not be available. Use a subquery approach instead.
+            cursor = await self._db.execute(
+                "DELETE FROM processed_events "
+                "WHERE rowid IN ("
+                "  SELECT rowid FROM processed_events WHERE ts < ? LIMIT ?"
+                ")",
+                (cutoff, effective_batch),
+            )
+            # Count from the DELETE's own cursor; "SELECT changes()" can see a concurrent INSERT.
+            removed = max(cursor.rowcount, 0)
+            await cursor.close()
+            await self._db.commit()
+            deleted = removed  # only reported once the delete is committed
+            self._last_prune_at = time.time()
+            self._pruned_rows_last = deleted
+            if deleted:
+                logger.info("EventStore pruned %d rows (cutoff=%d)", deleted, cutoff)
+        except Exception as exc:
+            logger.warning("EventStore.prune error: %s", exc)
+
+        return deleted
+
+    # ── Health / introspection ─────────────────────────────────────────────────
+
+    def as_health_dict(self) -> dict:
+        return {
+            "enabled": self._ok,
+            "db_path": self.db_path,
+            "ttl_h": self.ttl_h,
+            "ok": self._ok,
+            "last_prune_at": self._last_prune_at,
+            "pruned_rows_last": self._pruned_rows_last,
+        }
diff --git a/services/matrix-bridge-dagi/app/ingress.py b/services/matrix-bridge-dagi/app/ingress.py
index 8659c49b..238d5bb0 100644
--- a/services/matrix-bridge-dagi/app/ingress.py
+++ b/services/matrix-bridge-dagi/app/ingress.py
@@ -1,5 +1,5 @@
 """
-Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 (control channel)
+Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 + M3.1 + M3.3 + SessionScopeV2
 
 Architecture (H2):
   Reader task  → asyncio.Queue(maxsize) → N Worker tasks
@@ -26,7 +26,10 @@ Queue entry: _QueueEntry(event, room_id, agent_id, enqueue_time, routing_reason,
 """
 
 import asyncio
+import hashlib
+import json as _json
 import logging
+import os as _os
 import time
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional
@@ -37,11 +40,57 @@ from .control import (
     ControlConfig, ControlCommand,
     check_authorization, parse_command, is_control_message,
     not_implemented_reply, unknown_command_reply, unauthorized_reply, help_reply,
-    VERB_HELP,
+    start_usage_reply, runbook_started_reply, runbook_start_error_reply,
+    next_usage_reply, next_manual_reply, next_auto_reply, next_error_reply,
+    complete_usage_reply, complete_ok_reply, complete_error_reply,
+    status_usage_reply, status_reply, status_error_reply,
+    evidence_usage_reply, evidence_reply, evidence_error_reply,
+    post_review_usage_reply, post_review_reply, post_review_error_reply,
+    rate_limited_reply, sanitize_notes, MAX_NOTES_LEN,
+    status_not_available_reply, nodes_reply,
+    VERB_HELP, VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE, VERB_CONFIRM,
+    is_dangerous_cmd, build_normalized_args,
+    confirm_intent_reply, confirm_success_reply, confirm_expired_reply,
+    NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST,
+    parse_node_cmd, node_cmd_validate_room,
+    node_cmd_reply_set, node_cmd_reply_unset_ok, node_cmd_reply_unset_not_found,
+    node_cmd_reply_get, node_cmd_reply_list, node_cmd_reply_error,
+    VERB_ROOM, ROOM_SUBCMD_AGENTS,
+    ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
+    ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET,
+    parse_room_agents_cmd,
+    room_agents_reply_set, room_agents_reply_add, room_agents_reply_remove,
+    room_agents_reply_unset_ok, room_agents_reply_unset_not_found,
+    room_agents_reply_get, room_agents_reply_list, room_agents_reply_error,
+    VERB_POLICY, POLICY_EXPORTS_SUBDIR, validate_export_path,
+    policy_import_intent_reply, format_import_diff as _format_import_diff,
+    policy_export_reply, policy_import_dry_run_reply, policy_import_reply,
+    policy_cmd_error,
+    policy_prune_preview_reply, policy_prune_applied_reply,
+    policy_restore_intent_reply, policy_restore_applied_reply,
+    policy_history_reply, policy_change_detail_reply,
+    SUBCOMMAND_START, SUBCOMMAND_NEXT, SUBCOMMAND_COMPLETE,
+    SUBCOMMAND_STATUS, SUBCOMMAND_EVIDENCE, SUBCOMMAND_POST_REVIEW,
 )
+from .control_limiter import ControlRateLimiter
+from .discovery import agents_reply, bridge_status_reply, is_discovery_message
+from .event_store import EventStore
+from .node_policy import (
+    NodePolicy, NodeResolution,
+    NODE_SOURCE_DEFAULT, NODE_SOURCE_EXPLICIT, NODE_SOURCE_ROOM_MAP,
+    extract_node_kwarg, node_rejected_reply,
+)
+from .node_health import (
+    NodeHealthTracker, NodeHealthConfig,
+    NODE_STATE_HEALTHY, NODE_STATE_DEGRADED, NODE_STATE_DOWN,
+    FAILOVER_REASON_TIMEOUT, FAILOVER_REASON_HTTP_5XX, FAILOVER_REASON_NETWORK,
+)
+from .sticky_cache import StickyNodeCache, make_sticky_key
+from .confirm_store import ConfirmStore
+from . import control_runner as _ctrl_runner
 from .matrix_client import MatrixClient
 from .mixed_routing import (
-    MixedRoomConfig, route_message, reply_prefix,
+    MixedRoomConfig, MixedRoom, route_message, reply_prefix, build_override_config,
     REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING,
 )
 from .rate_limit import InMemoryRateLimiter
@@ -71,6 +120,35 @@ class _QueueEntry:
     is_mixed: bool = False        # True for mixed-room entries (reply tagging, session isolation)
 
 
+# ── Session Scope v2 ───────────────────────────────────────────────────────────
+# Invariants:
+#   1. Control room messages never reach the Router (no session_key needed there).
+#   2. Matrix and Telegram never share a key namespace (prefix "matrix:").
+#   3. Mixed rooms: each (room_id, agent_id) pair has its own key — no cross-agent leakage.
+#   4. Logs/metrics receive sender_hash (sha256[:16]), never raw Matrix user_id.
+
+SCOPE_ROOM_AGENT = "room_agent"    # default: shared room context per agent
+SCOPE_OPS_RUNBOOK = "ops_runbook"  # future: control/ops room invocations
+SCOPE_DM_USER = "dm_agent_user"    # future: per-user DM isolation
+
+
+def _sender_hash(sender: str) -> str:
+    """First 16 hex chars of SHA-256 over the UTF-8 encoded Matrix user_id (e.g. @alice:server)."""
+    return hashlib.new("sha256", sender.encode("utf-8")).hexdigest()[:16]
+
+
+def _build_session_key(room_id: str, agent_id: str, scope: str = SCOPE_ROOM_AGENT) -> str:
+    """
+    Canonical session key v2: matrix:{scope}:{room_key}:{agent_id}
+
+    room_key is room_id with "!" removed and ":" replaced by "_", e.g.
+      ("!roomX:server.example", "sofiia") → matrix:room_agent:roomX_server.example:sofiia
+      same room with scope="ops_runbook"  → matrix:ops_runbook:roomX_server.example:sofiia
+    """
+    room_key = room_id.translate(str.maketrans({"!": None, ":": "_"}))
+    return f"matrix:{scope}:{room_key}:{agent_id}"
+
+
 # ── Router invoke ──────────────────────────────────────────────────────────────
 
 async def _invoke_router(
@@ -80,6 +158,9 @@ async def _invoke_router(
     node_id: str,
     prompt: str,
     session_id: str,
+    sender_hash: str = "",
+    scope: str = SCOPE_ROOM_AGENT,
+    node_source: str = NODE_SOURCE_DEFAULT,
 ) -> str:
     """POST /v1/agents/{agent_id}/infer → response text. Raises httpx.HTTPError on failure."""
     url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer"
@@ -87,7 +168,14 @@ async def _invoke_router(
         "prompt": prompt,
         "session_id": session_id,
         "user_id": "matrix_bridge",
-        "metadata": {"transport": "matrix", "node_id": node_id},
+        "metadata": {
+            "transport": "matrix",
+            "node_id": node_id,
+            "node_source": node_source,  # M5.0: how node was resolved
+            "session_key": session_id,  # explicit for Router/Memory to index on
+            "sender_hash": sender_hash,  # PII-safe sender fingerprint
+            "scope": scope,
+        },
     }
     resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S)
     resp.raise_for_status()
@@ -102,6 +190,20 @@ async def _invoke_router(
     return (text if isinstance(text, str) else str(text)).strip()
 
 
+# ── M6.2: File helpers (run in thread) ────────────────────────────────────────
+
+def _write_json_file(path: str, data: Any) -> None:
+    """Blocking helper: serialise data to path as UTF-8 JSON (non-ASCII kept as-is, 2-space indent)."""
+    with open(path, mode="w", encoding="utf-8") as out:
+        _json.dump(data, out, indent=2, ensure_ascii=False)
+
+
+def _read_json_file(path: str) -> Any:
+    """Blocking helper: read path as UTF-8 text and return the parsed JSON value."""
+    with open(path, mode="r", encoding="utf-8") as src:
+        return _json.loads(src.read())
+
+
 # ── Audit write ────────────────────────────────────────────────────────────────
 
 async def _write_audit(
@@ -187,6 +289,16 @@ class MatrixIngressLoop:
         # M3.0: control channel
         control_config: Optional[ControlConfig] = None,
         control_unauthorized_behavior: str = "ignore",   # "ignore" | "reply_error"
+        # M3.1: runbook runner integration
+        sofiia_control_token: str = "",
+        # M3.4: control channel safety
+        control_limiter: Optional["ControlRateLimiter"] = None,
+        # M2.3: persistent event deduplication
+        event_store: Optional["EventStore"] = None,
+        # M4.0: agent discovery
+        discovery_rpm: int = 20,
+        # M5.0: node-aware routing
+        node_policy: Optional["NodePolicy"] = None,
         # Callbacks
         on_message_received: Optional[Callable[[str, str], None]] = None,
         on_message_replied: Optional[Callable[[str, str, str], None]] = None,
@@ -194,12 +306,39 @@ class MatrixIngressLoop:
         on_rate_limited: Optional[Callable[[str, str, str], None]] = None,
         on_queue_dropped: Optional[Callable[[str, str], None]] = None,
         on_queue_size: Optional[Callable[[int], None]] = None,
-        on_invoke_latency: Optional[Callable[[str, float], None]] = None,
+        on_invoke_latency: Optional[Callable[..., None]] = None,   # (agent_id, duration_s, node_id="")
         on_send_latency: Optional[Callable[[str, float], None]] = None,
         on_queue_wait: Optional[Callable[[str, float], None]] = None,
         on_routed: Optional[Callable[[str, str], None]] = None,
         on_route_rejected: Optional[Callable[[str, str], None]] = None,
         on_control_command: Optional[Callable[[str, str, str], None]] = None,
+        on_control_rate_limited: Optional[Callable[[str], None]] = None,
+        on_dedupe_persistent_hit: Optional[Callable[[str, str], None]] = None,
+        on_dedupe_persistent_insert: Optional[Callable[[], None]] = None,
+        # M5.0: node routing callbacks
+        on_node_selected: Optional[Callable[[str, str, str], None]] = None,  # (agent_id, node_id, source)
+        on_node_rejected: Optional[Callable[[str], None]] = None,            # (rejected_node)
+        # M5.1: per-node stats for !status reply
+        node_stats_getter: Optional[Callable[[], Dict[str, Any]]] = None,
+        # M6.0: persistent policy store for dynamic room-node overrides
+        policy_store: Optional[Any] = None,   # app.policy_store.PolicyStore
+        # M6.2: data directory for policy exports/imports
+        bridge_data_dir: Optional[str] = None,
+        # M8.0: node health tracker for soft-failover
+        node_health_tracker: Optional[NodeHealthTracker] = None,
+        on_failover: Optional[Callable[[str, str, str], None]] = None,  # (from_node, to_node, reason)
+        # M8.1: sticky failover cache (anti-flap)
+        sticky_cache: Optional[StickyNodeCache] = None,
+        on_sticky_set: Optional[Callable[[str, str], None]] = None,  # (node_id, scope)
+        # M8.2: HA state persistence config
+        ha_health_snapshot_interval_s: int = 60,
+        ha_health_max_age_s: int = 600,
+        # M9.0: Two-step confirmation store for dangerous commands
+        confirm_store: Optional[ConfirmStore] = None,
+        # M10.0: Auto-backup retention policy (days; 0 = keep forever)
+        policy_export_retention_days: int = 30,
+        # M10.2: max rows to keep in policy_changes history (0 = unlimited)
+        policy_history_limit: int = 100,
     ) -> None:
         self._hs_url = matrix_homeserver_url
         self._token = matrix_access_token
@@ -226,12 +365,54 @@ class MatrixIngressLoop:
         self._mixed_room_config = mixed_room_config
         self._control_config = control_config
         self._control_unauthorized_behavior = control_unauthorized_behavior
+        self._control_token = sofiia_control_token
         self._unknown_agent_behavior = unknown_agent_behavior
         self._max_slash_len = max_slash_len
         self._mixed_concurrency_cap = mixed_concurrency_cap
         self._on_routed = on_routed
         self._on_route_rejected = on_route_rejected
         self._on_control_command = on_control_command
+        self._on_control_rate_limited = on_control_rate_limited
+        # M3.4: control channel safety
+        self._control_limiter = control_limiter
+        # M2.3: persistent event deduplication
+        self._event_store: Optional[EventStore] = event_store
+        self._on_dedupe_persistent_hit = on_dedupe_persistent_hit
+        self._on_dedupe_persistent_insert = on_dedupe_persistent_insert
+        # M5.0: node routing callbacks
+        self._on_node_selected = on_node_selected
+        self._on_node_rejected = on_node_rejected
+        # M5.1: per-node stats getter for !status reply
+        self._node_stats_getter = node_stats_getter
+        # M6.0: persistent policy store
+        self._policy_store = policy_store
+        # M6.2: policy exports directory
+        self._bridge_data_dir: Optional[str] = bridge_data_dir
+        self._policy_last_export_at: Optional[int] = None
+        self._policy_last_import_at: Optional[int] = None
+        # M10.0: auto-backup retention
+        self._policy_export_retention_days: int = policy_export_retention_days
+        # M10.2: history table row limit
+        self._policy_history_limit: int = policy_history_limit
+        # M8.0: node health + soft-failover
+        self._node_health_tracker: Optional[NodeHealthTracker] = node_health_tracker
+        self._on_failover = on_failover
+        # M8.1: sticky failover cache
+        self._sticky_cache: Optional[StickyNodeCache] = sticky_cache
+        self._on_sticky_set = on_sticky_set
+        # M9.0: two-step confirmation store
+        self._confirm_store: Optional[ConfirmStore] = confirm_store
+        # M8.2: HA state persistence
+        self._ha_health_snapshot_interval_s: int = ha_health_snapshot_interval_s
+        self._ha_health_max_age_s: int = ha_health_max_age_s
+        self._ha_sticky_loaded: int = 0      # count of sticky entries loaded on startup
+        self._ha_health_loaded: bool = False  # whether health state was loaded on startup
+        # M4.0: agent discovery — simple per-room sliding window (reuses InMemoryRateLimiter logic)
+        self._discovery_rpm = discovery_rpm
+        # M5.0: node-aware routing policy
+        self._node_policy: Optional[NodePolicy] = node_policy
+        from collections import defaultdict, deque
+        self._discovery_windows: dict = defaultdict(deque)
         # Lazily populated semaphores keyed by "{room_id}:{agent_id}"
         self._concurrency_locks: Dict[str, asyncio.Semaphore] = {}
         self._next_batch: Optional[str] = None
@@ -249,6 +430,69 @@ class MatrixIngressLoop:
     def worker_count(self) -> int:
         return self._worker_count
 
+    def get_status(self) -> Dict[str, Any]:
+        """Build the bridge status snapshot used by health/ops queries.
+
+        Queue and HA-persistence fields are always present; every other section
+        is added only when the corresponding optional component is configured.
+        """
+        report: Dict[str, Any] = {
+            "queue_size": self._queue.qsize() if self._queue else 0,
+            "queue_max": self._queue_max,
+            "worker_count": self._worker_count,
+        }
+        store = self._policy_store
+        if self._node_policy is not None:
+            report["node_policy"] = self._node_policy.as_info_dict()
+        if self._node_stats_getter is not None:  # M5.1: per-node routed/rejected counters
+            report["nodes"] = self._node_stats_getter()
+        if store is not None:
+            # M6.0: policy store info; a failing store call flips policy_store_ok to False
+            try:
+                report["policy_store_ok"] = store.is_open
+                report["policy_store_path"] = store.db_path
+                report["policy_overrides_count"] = store.count_overrides()
+                report["policy_agent_overrides_count"] = store.count_agent_overrides()  # M6.1
+            except Exception as exc:  # noqa: BLE001
+                report["policy_store_ok"] = False
+                report["policy_store_error"] = str(exc)
+        # M6.2: policy snapshot timestamps (omitted while still None)
+        if self._policy_last_export_at is not None:
+            report["policy_last_export_at"] = self._policy_last_export_at
+        if self._policy_last_import_at is not None:
+            report["policy_last_import_at"] = self._policy_last_import_at
+        if store is not None:
+            # M6.2: policy DB mtime (best-effort)
+            try:
+                db_file = store.db_path
+                if db_file and _os.path.exists(db_file):
+                    report["policy_db_mtime"] = int(_os.path.getmtime(db_file))
+            except Exception:  # noqa: BLE001
+                pass
+            if store.is_open:
+                # M10.2: policy change history count (best-effort)
+                try:
+                    report["policy_changes_count"] = store.get_policy_changes_count()
+                except Exception:  # noqa: BLE001
+                    pass
+        if self._node_health_tracker is not None:  # M8.0: node health tracker state
+            node_filter = None if self._node_policy is None else self._node_policy.allowed_nodes
+            report["node_health"] = self._node_health_tracker.all_info(node_filter)
+        sticky = self._sticky_cache
+        if sticky is not None:  # M8.1: sticky failover cache info
+            report["sticky_active_keys"] = sticky.active_count()
+            report["sticky_ttl_s"] = sticky.ttl_s
+        confirms = self._confirm_store
+        if confirms is not None:  # M9.0: pending confirmations
+            report["confirm_pending"] = confirms.pending_count()
+            report["confirm_ttl_s"] = confirms.ttl_s
+        report.update(  # M8.2: HA persistence info
+            ha_sticky_loaded=self._ha_sticky_loaded,
+            ha_health_loaded=self._ha_health_loaded,
+            ha_health_snapshot_interval_s=self._ha_health_snapshot_interval_s,
+        )
+        return report
+
     @property
     def active_lock_count(self) -> int:
         """Number of room-agent pairs currently holding a concurrency lock."""
@@ -307,6 +551,9 @@ class MatrixIngressLoop:
                 )
 
             async with httpx.AsyncClient() as http_client:
+                # M8.2: Load persisted HA state before processing any messages
+                await self._load_ha_state()
+
                 # Start workers
                 worker_tasks = [
                     asyncio.create_task(
@@ -316,6 +563,18 @@ class MatrixIngressLoop:
                     for i in range(self._worker_count)
                 ]
 
+                # M8.2: Start periodic node health snapshot task
+                _health_snapshot_task = None
+                if (
+                    self._ha_health_snapshot_interval_s > 0
+                    and self._policy_store is not None
+                    and self._node_health_tracker is not None
+                ):
+                    _health_snapshot_task = asyncio.create_task(
+                        self._node_health_snapshot_loop(),
+                        name="ha_health_snapshot",
+                    )
+
                 # Run reader until stop_event
                 await self._reader(client, queue, http_client, stop_event)
 
@@ -337,6 +596,9 @@ class MatrixIngressLoop:
                 # Cancel workers
                 for task in worker_tasks:
                     task.cancel()
+                # M8.2: Cancel health snapshot task if running
+                if _health_snapshot_task is not None and not _health_snapshot_task.done():
+                    _health_snapshot_task.cancel()
                 results = await asyncio.gather(*worker_tasks, return_exceptions=True)
                 cancelled = sum(1 for r in results if isinstance(r, asyncio.CancelledError))
                 logger.info("Workers stopped (%d cancelled)", cancelled)
@@ -392,6 +654,11 @@ class MatrixIngressLoop:
                 continue
             messages = client.extract_room_messages(sync_resp, mapping.room_id)
             for event in messages:
+                text = event.get("content", {}).get("body", "").strip()
+                # M4.0: agent discovery before routing
+                if is_discovery_message(text):
+                    await self._handle_discovery(client, http_client, event, mapping.room_id)
+                    continue
                 await self._try_enqueue(client, queue, http_client, event, mapping)
 
         # Mixed rooms: 1 room → N agents, routing per message (M2.1)
@@ -399,6 +666,11 @@ class MatrixIngressLoop:
             for room_id in self._mixed_room_config.rooms:
                 messages = client.extract_room_messages(sync_resp, room_id)
                 for event in messages:
+                    text = event.get("content", {}).get("body", "").strip()
+                    # M4.0: agent discovery before routing
+                    if is_discovery_message(text):
+                        await self._handle_discovery(client, http_client, event, room_id)
+                        continue
                     await self._try_enqueue_mixed(client, queue, http_client, event, room_id)
 
     async def _try_enqueue(
@@ -441,6 +713,28 @@ class MatrixIngressLoop:
         # Dedupe — mark before enqueue (prevents double-enqueue on retry)
         client.mark_seen(event_id)
 
+        # M2.3: Persistent dedupe (cross-restart protection)
+        if self._event_store is not None:
+            try:
+                already = await self._event_store.is_processed(room_id, event_id)
+            except Exception as exc:
+                logger.warning("EventStore.is_processed error (degraded): %s", exc)
+                already = False
+
+            if already:
+                logger.debug("Persistent dedupe hit: event=%s room=%s", event_id, room_id)
+                if self._on_dedupe_persistent_hit:
+                    self._on_dedupe_persistent_hit(room_id, agent_id)
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.dedupe.persistent_hit",
+                    agent_id=agent_id, node_id=self._node_id,
+                    room_id=room_id, event_id=event_id,
+                    status="ok",
+                    data={"sender": sender},
+                )
+                return
+
         # H2: Enqueue or drop
         entry = _QueueEntry(
             event=event,
@@ -448,8 +742,10 @@ class MatrixIngressLoop:
             agent_id=agent_id,
             enqueue_time=time.monotonic(),
         )
+        enqueued = False
         try:
             queue.put_nowait(entry)
+            enqueued = True
             qsize = queue.qsize()
             logger.debug("Enqueued event=%s qsize=%d", event_id, qsize)
             if self._on_queue_size:
@@ -470,6 +766,16 @@ class MatrixIngressLoop:
                 data={"queue_max": self._queue_max, "sender": sender},
             )
 
+        # M2.3: Mark as processed ONLY after successful enqueue
+        if enqueued and self._event_store is not None:
+            sender_hash = _sender_hash(sender)
+            try:
+                await self._event_store.mark_processed(room_id, event_id, sender_hash)
+                if self._on_dedupe_persistent_insert:
+                    self._on_dedupe_persistent_insert()
+            except Exception as exc:
+                logger.warning("EventStore.mark_processed error (degraded): %s", exc)
+
     async def _try_enqueue_mixed(
         self,
         client: MatrixClient,
@@ -487,9 +793,27 @@ class MatrixIngressLoop:
         if not text:
             return
 
+        # M6.1: look up dynamic agent override for this room
+        _routing_config = self._mixed_room_config
+        if self._policy_store is not None and self._policy_store.is_open:
+            try:
+                _agent_ov = await asyncio.to_thread(
+                    self._policy_store.get_agent_override, room_id
+                )
+                if _agent_ov is not None:
+                    _ov_agents, _ov_default = _agent_ov
+                    _effective_default = _ov_default or (_ov_agents[0] if _ov_agents else None)
+                    if _ov_agents and _effective_default:
+                        _routing_config = build_override_config(
+                            self._mixed_room_config, room_id,
+                            _ov_agents, _effective_default,
+                        )
+            except Exception as _exc:  # noqa: BLE001
+                logger.warning("PolicyStore get_agent_override failed: %s", _exc)
+
         # Route message to determine target agent
         agent_id, routing_reason, effective_text = route_message(
-            text, room_id, self._mixed_room_config, self._room_map.allowed_agents,
+            text, room_id, _routing_config, self._room_map.allowed_agents,
             max_slash_len=self._max_slash_len,
         )
 
@@ -552,6 +876,28 @@ class MatrixIngressLoop:
 
         client.mark_seen(event_id)
 
+        # M2.3: Persistent dedupe (cross-restart protection, mixed rooms)
+        if self._event_store is not None:
+            try:
+                already = await self._event_store.is_processed(room_id, event_id)
+            except Exception as exc:
+                logger.warning("EventStore.is_processed error mixed (degraded): %s", exc)
+                already = False
+
+            if already:
+                logger.debug("Persistent dedupe hit (mixed): event=%s room=%s agent=%s", event_id, room_id, agent_id)
+                if self._on_dedupe_persistent_hit:
+                    self._on_dedupe_persistent_hit(room_id, agent_id)
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.dedupe.persistent_hit",
+                    agent_id=agent_id, node_id=self._node_id,
+                    room_id=room_id, event_id=event_id,
+                    status="ok",
+                    data={"sender": sender, "routing_reason": routing_reason},
+                )
+                return
+
         # Store effective_text (stripped of routing token) in a patched event copy
         effective_event = dict(event)
         effective_event["content"] = dict(event.get("content", {}))
@@ -565,8 +911,10 @@ class MatrixIngressLoop:
             routing_reason=routing_reason,
             is_mixed=True,
         )
+        enqueued_mixed = False
         try:
             queue.put_nowait(entry)
+            enqueued_mixed = True
             qsize = queue.qsize()
             logger.debug(
                 "Enqueued (mixed): event=%s agent=%s reason=%s qsize=%d",
@@ -590,6 +938,1531 @@ class MatrixIngressLoop:
                 data={"queue_max": self._queue_max, "sender": sender},
             )
 
+        # M2.3: Mark as processed ONLY after successful enqueue
+        if enqueued_mixed and self._event_store is not None:
+            sender_hash = _sender_hash(sender)
+            try:
+                await self._event_store.mark_processed(room_id, event_id, sender_hash)
+                if self._on_dedupe_persistent_insert:
+                    self._on_dedupe_persistent_insert()
+            except Exception as exc:
+                logger.warning("EventStore.mark_processed error mixed (degraded): %s", exc)
+
+    # ── M4.0: Agent discovery ──────────────────────────────────────────────────
+
+    def _check_discovery_rate(self, room_id: str) -> bool:
+        """Return True if a discovery reply for *room_id* fits the per-room 60 s sliding window."""
+        limit = self._discovery_rpm
+        if limit <= 0:  # a non-positive limit turns throttling off
+            return True
+        stamp = time.monotonic()
+        hits = self._discovery_windows[room_id]
+        while hits and hits[0] < stamp - 60.0:  # evict timestamps older than 60 s
+            hits.popleft()
+        permitted = len(hits) < limit
+        if permitted:
+            hits.append(stamp)  # only granted requests count against the window
+        return permitted
+
+    async def _handle_discovery(
+        self,
+        client: MatrixClient,
+        http_client: httpx.AsyncClient,
+        event: Dict[str, Any],
+        room_id: str,
+    ) -> None:
+        """
+        Reply to a discovery command in any user room (no auth required); never enqueues.
+        Rate-limited per room; skips persisted events; store errors are logged, not raised.
+        """
+        event_id = event.get("event_id", "")
+        client.mark_seen(event_id)  # in-memory dedupe applies on every path below
+        store = self._event_store
+        if store is not None:  # skip replays that were already answered before a restart
+            try:
+                if await store.is_processed(room_id, event_id):
+                    return
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("EventStore.is_processed error discovery (degraded): %s", exc)
+        if not self._check_discovery_rate(room_id):
+            logger.debug("Discovery rate limited: room=%s", room_id)
+            return
+
+        # M6.1: use store-based agent config if available for accurate discovery
+        _disc_config = self._mixed_room_config
+        if self._policy_store is not None and self._policy_store.is_open:
+            try:
+                _disc_ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
+                if _disc_ov is not None:
+                    _d_agents, _d_default = _disc_ov
+                    _d_eff_default = _d_default or (_d_agents[0] if _d_agents else None)
+                    if _d_agents and _d_eff_default and self._mixed_room_config is not None:
+                        _disc_config = build_override_config(
+                            self._mixed_room_config, room_id, _d_agents, _d_eff_default,
+                        )
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("PolicyStore get_agent_override failed (discovery): %s", exc)
+
+        reply = agents_reply(room_id, self._room_map, _disc_config)
+        txn_id = MatrixClient.make_txn_id(room_id, event_id + "_discovery")
+        try:
+            await client.send_text(room_id, reply, txn_id)
+        except Exception as exc:
+            logger.warning("Could not send discovery reply: %s", exc)
+        if store is not None:  # best-effort persistence, degraded on failure like _try_enqueue
+            try:
+                await store.mark_processed(room_id, event_id, _sender_hash(event.get("sender", "")))
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("EventStore.mark_processed error discovery (degraded): %s", exc)
+
+    # ── M6.1: Dynamic mixed room agent overrides via !room agents command ─────
+
+    async def _handle_room_cmd(
+        self,
+        http_client: httpx.AsyncClient,
+        sender: str,
+        ctrl_room_id: str,
+        event_id: str,
+        cmd_subcommand: str,
+        cmd_args: tuple,
+        cmd_kwargs: Dict[str, str],
+    ) -> str:
+        """Handle `!room agents <set|add|remove|get|list|unset>` from authorized operator.
+
+        Returns the reply text; authorization is not checked here (presumably by the caller).
+        Completed actions are audited; validation and policy-store failures become replies.
+        """
+        if self._policy_store is None or not self._policy_store.is_open:
+            return "⚠️ Policy store not available."
+        if cmd_subcommand != ROOM_SUBCMD_AGENTS:
+            return room_agents_reply_error(
+                f"Unknown subcommand: `{cmd_subcommand or '?'}`. Use `!room agents <action>`."
+            )
+        action, room_id, agents_list, single_agent, default_agent = parse_room_agents_cmd(
+            cmd_subcommand, cmd_args, cmd_kwargs,
+        )
+        if action not in (ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
+                          ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET):
+            return room_agents_reply_error(f"Unknown action: `{action or '?'}`")
+        # Globally allowed agents; `add` and `set` validate against this set (`remove` does not)
+        allowed_all = self._room_map.allowed_agents  # global allowed agents set
+
+        # ── list ──────────────────────────────────────────────────────────────
+        if action == ROOM_ACTION_LIST:
+            try:
+                rows = await asyncio.to_thread(self._policy_store.list_agent_overrides, 10)
+                total = await asyncio.to_thread(self._policy_store.count_agent_overrides)
+            except Exception as exc:
+                logger.warning("PolicyStore list_agent_overrides error: %s", exc)
+                return "⚠️ Could not read policy store."
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.room.agents.list",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "total": total},
+            )
+            return room_agents_reply_list(rows, total)
+
+        # ── subcommands that require room_id ──────────────────────────────────
+        if not room_id:
+            return room_agents_reply_error("Missing `room=` argument.")
+        if not node_cmd_validate_room(room_id):
+            return room_agents_reply_error(
+                f"Invalid room ID format: `{room_id}`\nExpected: `!localpart:server`"
+            )
+
+        # ── get ───────────────────────────────────────────────────────────────
+        if action == ROOM_ACTION_GET:
+            try:
+                ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
+            except Exception as exc:
+                logger.warning("PolicyStore get_agent_override error: %s", exc)
+                return "⚠️ Could not read policy store."
+            ov_agents, ov_default = (ov if ov else (None, None))
+            env_room = (
+                self._mixed_room_config.rooms.get(room_id) if self._mixed_room_config else None
+            )
+            env_agents = list(env_room.agents) if env_room else None
+            env_default = env_room.default_agent if env_room else None
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.room.agents.get",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "queried_room": room_id},
+            )
+            return room_agents_reply_get(room_id, ov_agents, ov_default, env_agents, env_default)
+
+        # ── unset ─────────────────────────────────────────────────────────────
+        if action == ROOM_ACTION_UNSET:
+            try:
+                deleted = await asyncio.to_thread(self._policy_store.delete_agent_override, room_id)
+            except Exception as exc:
+                logger.warning("PolicyStore delete_agent_override error: %s", exc)
+                return "⚠️ Could not write to policy store."
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.room.agents.unset",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted},
+            )
+            return room_agents_reply_unset_ok(room_id) if deleted else room_agents_reply_unset_not_found(room_id)
+
+        # ── add ───────────────────────────────────────────────────────────────
+        if action == ROOM_ACTION_ADD:
+            if not single_agent:
+                return room_agents_reply_error("Missing `agent=` argument for `add`.")
+            if single_agent not in allowed_all:
+                allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
+                return room_agents_reply_error(f"Agent `{single_agent}` not in allowed agents: {allowed_str}")
+            try:
+                new_agents, new_default = await asyncio.to_thread(
+                    self._policy_store.add_agent_to_room, room_id, single_agent, sender
+                )
+            except Exception as exc:
+                logger.warning("PolicyStore add_agent_to_room error: %s", exc)
+                return "⚠️ Could not write to policy store."
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.room.agents.add",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent},
+            )
+            return room_agents_reply_add(room_id, single_agent, new_agents, new_default)
+
+        # ── remove ────────────────────────────────────────────────────────────
+        if action == ROOM_ACTION_REMOVE:
+            if not single_agent:
+                return room_agents_reply_error("Missing `agent=` argument for `remove`.")
+            try:
+                removed, err = await asyncio.to_thread(
+                    self._policy_store.remove_agent_from_room, room_id, single_agent, sender
+                )
+            except Exception as exc:
+                logger.warning("PolicyStore remove_agent_from_room error: %s", exc)
+                return "⚠️ Could not write to policy store."
+            if not removed:
+                return room_agents_reply_error(err or "Could not remove agent.")
+            # Re-read the override for the reply; a failed read is reported as an empty agent list
+            try:
+                ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
+            except Exception:  # noqa: BLE001
+                ov = None
+            remaining = ov[0] if ov else []
+            new_default_r = ov[1] if ov else None
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.room.agents.remove",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent},
+            )
+            return room_agents_reply_remove(room_id, single_agent, remaining, new_default_r)
+
+        # ── set ───────────────────────────────────────────────────────────────
+        if action == ROOM_ACTION_SET:
+            if not agents_list:
+                return room_agents_reply_error("Missing `agents=` argument for `set`.")
+            invalid = [a for a in agents_list if a not in allowed_all]
+            if invalid:
+                allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
+                return room_agents_reply_error(
+                    f"Unknown agents: {', '.join(f'`{a}`' for a in invalid)}\nAllowed: {allowed_str}"
+                )
+            effective_default = default_agent if default_agent else agents_list[0]  # first listed agent when no default given
+            if effective_default not in agents_list:
+                return room_agents_reply_error(
+                    f"Default agent `{effective_default}` not in provided agents list."
+                )
+            try:
+                await asyncio.to_thread(
+                    self._policy_store.set_agent_override,
+                    room_id, agents_list, effective_default, sender,
+                )
+            except Exception as exc:
+                logger.warning("PolicyStore set_agent_override error: %s", exc)
+                return "⚠️ Could not write to policy store."
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.room.agents.set",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={
+                    "sender": sender, "target_room": room_id,
+                    "agents": agents_list, "default": effective_default,
+                },
+            )
+            return room_agents_reply_set(room_id, agents_list, effective_default)
+
+        return room_agents_reply_error("Unhandled action.")
+
+    # ── M6.2: Policy snapshot export / import ─────────────────────────────────
+
+    async def _handle_policy_cmd(
+        self,
+        http_client: httpx.AsyncClient,
+        sender: str,
+        ctrl_room_id: str,
+        event_id: str,
+        subcommand: Optional[str],
+        cmd_kwargs: Dict[str, str],
+    ) -> str:
+        """
+        Handle the `!policy` subcommands: `export`, `import`, `restore`,
+        `prune_exports`, `history` and `change`.
+        Requires policy_store and bridge_data_dir to be configured.
+
+        `import` / `restore` read a file resolved by `validate_export_path` that must
+        hold a JSON object; `import` defaults to `dry_run=1`. Returns the reply text;
+        every successfully handled subcommand also writes an audit event.
+        """
+        if self._policy_store is None or not self._policy_store.is_open:
+            return policy_cmd_error("Policy store is not available.")
+
+        if not self._bridge_data_dir:
+            return policy_cmd_error("BRIDGE_DATA_DIR is not configured.")
+
+        exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
+        # Counter keys of an import stats dict, in record_policy_change() argument order.
+        stat_keys = (
+            "node_added", "node_updated", "node_deleted",
+            "agent_added", "agent_updated", "agent_deleted",
+        )
+
+        # ── export ────────────────────────────────────────────────────────────
+        if subcommand == "export":
+            try:
+                snapshot = await asyncio.to_thread(self._policy_store.export_all)
+                node_count  = len(snapshot.get("room_node_overrides",  []))
+                agent_count = len(snapshot.get("room_agent_overrides", []))
+
+                import datetime as _dt
+                ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d-%H%M%S")
+                filename = f"policy-{ts}.json"
+
+                await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
+                export_path = _os.path.join(exports_dir, filename)
+                await asyncio.to_thread(
+                    _write_json_file, export_path, snapshot
+                )
+
+                self._policy_last_export_at = int(time.time())
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.control.policy.export",
+                    agent_id="control", node_id=self._node_id,
+                    room_id=ctrl_room_id, event_id=event_id,
+                    status="ok", data={
+                        "sender": sender, "file": filename,
+                        "node_overrides": node_count, "agent_overrides": agent_count,
+                    },
+                )
+                return policy_export_reply(export_path, node_count, agent_count)
+
+            except Exception as exc:  # noqa: BLE001
+                logger.exception("_handle_policy_cmd export error: %s", exc)
+                return policy_cmd_error(f"Export failed: {exc}")
+
+        # ── import ────────────────────────────────────────────────────────────
+        if subcommand == "import":
+            filename = cmd_kwargs.get("path", "").strip()
+            if not filename:
+                return policy_cmd_error("Missing `path=` argument.")
+
+            safe_path = validate_export_path(exports_dir, filename)
+            if safe_path is None:
+                return policy_cmd_error(
+                    f"Invalid path `{filename}`. Only simple filenames within the exports "
+                    "directory are allowed."
+                )
+
+            mode_raw = cmd_kwargs.get("mode", "merge").strip().lower()
+            if mode_raw not in ("merge", "replace"):
+                return policy_cmd_error("mode must be `merge` or `replace`.")
+
+            dry_raw = cmd_kwargs.get("dry_run", "1").strip()
+            dry_run = dry_raw not in ("0", "false", "no")
+
+            try:
+                raw_text = await asyncio.to_thread(_read_json_file, safe_path)
+            except FileNotFoundError:
+                return policy_cmd_error(f"File not found: `{filename}`")
+            except Exception as exc:  # noqa: BLE001
+                return policy_cmd_error(f"Cannot read file: {exc}")
+
+            if not isinstance(raw_text, dict):
+                # Do not fall back to an empty snapshot — presumably destructive with mode=replace.
+                return policy_cmd_error("Invalid JSON format (expected object).")
+            try:
+                stats = await asyncio.to_thread(
+                    self._policy_store.import_snapshot,
+                    raw_text, mode_raw, dry_run, sender,
+                )
+            except ValueError as ve:
+                return policy_cmd_error(str(ve))
+            except Exception as exc:  # noqa: BLE001
+                logger.exception("_handle_policy_cmd import error: %s", exc)
+                return policy_cmd_error(f"Import failed: {exc}")
+
+            if not dry_run:
+                self._policy_last_import_at = int(time.time())
+                # M10.2: record in policy change history
+                _na, _nu, _nd, _aa, _au, _ad = (stats.get(k, 0) for k in stat_keys)
+                _is_destr = (_nd + _ad) > 0
+                _ds = f"node: +{_na} ~{_nu} -{_nd}; agent: +{_aa} ~{_au} -{_ad}"
+                try:
+                    await asyncio.to_thread(
+                        self._policy_store.record_policy_change,
+                        "policy.import", mode_raw, filename,
+                        _sender_hash(sender), _ds, _is_destr,
+                        _na, _nu, _nd, _aa, _au, _ad,
+                        self._policy_history_limit,
+                    )
+                except Exception as _exc:  # noqa: BLE001
+                    logger.warning("Failed to record import history (non-fatal): %s", _exc)
+
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.policy.import",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={
+                    "sender": sender, "file": filename,
+                    "mode": mode_raw, "dry_run": dry_run, "stats": stats,
+                },
+            )
+            return (
+                policy_import_dry_run_reply(stats, mode_raw)
+                if dry_run
+                else policy_import_reply(stats, mode_raw)
+            )
+
+        # ── restore (M10.1) ───────────────────────────────────────────────────
+        if subcommand == "restore":
+            filename = cmd_kwargs.get("path", "").strip()
+            if not filename:
+                return policy_cmd_error("Missing `path=` argument.")
+
+            safe_path = validate_export_path(exports_dir, filename)
+            if safe_path is None:
+                return policy_cmd_error(
+                    f"Invalid path `{filename}`. Only simple filenames within the exports "
+                    "directory are allowed."
+                )
+
+            mode_raw = cmd_kwargs.get("mode", "replace").strip().lower()
+            if mode_raw not in ("merge", "replace"):
+                return policy_cmd_error("mode must be `merge` or `replace`.")
+
+            try:
+                raw_data = await asyncio.to_thread(_read_json_file, safe_path)
+            except FileNotFoundError:
+                return policy_cmd_error(f"File not found: `{filename}`")
+            except Exception as exc:  # noqa: BLE001
+                return policy_cmd_error(f"Cannot read file: {exc}")
+
+            if not isinstance(raw_data, dict):
+                return policy_cmd_error("Invalid JSON format (expected object).")
+            try:
+                stats = await asyncio.to_thread(
+                    self._policy_store.import_snapshot,
+                    raw_data, mode_raw, False, sender,
+                )
+                self._policy_last_import_at = int(time.time())
+            except ValueError as ve:
+                return policy_cmd_error(str(ve))
+            except Exception as exc:  # noqa: BLE001
+                logger.exception("_handle_policy_cmd restore error: %s", exc)
+                return policy_cmd_error(f"Restore failed: {exc}")
+
+            # M10.2: record in policy change history
+            _na, _nu, _nd, _aa, _au, _ad = (stats.get(k, 0) for k in stat_keys)
+            _is_destr = (_nd + _ad) > 0
+            _rds = (
+                f"restore/{mode_raw}: "
+                f"node: +{_na} ~{_nu} -{_nd}; agent: +{_aa} ~{_au} -{_ad}"
+            )
+            try:
+                await asyncio.to_thread(
+                    self._policy_store.record_policy_change,
+                    "policy.restore", mode_raw, filename,
+                    _sender_hash(sender), _rds, _is_destr,
+                    _na, _nu, _nd, _aa, _au, _ad,
+                    self._policy_history_limit,
+                )
+            except Exception as _exc:  # noqa: BLE001
+                logger.warning("Failed to record restore history (non-fatal): %s", _exc)
+
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.policy.restore",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok",
+                data={
+                    "sender": sender, "file": filename,
+                    "mode": mode_raw, "stats": stats,
+                },
+            )
+            return policy_restore_applied_reply(stats, mode_raw)
+
+        # ── prune_exports ─────────────────────────────────────────────────────
+        if subcommand == "prune_exports":
+            dry_raw = cmd_kwargs.get("dry_run", "1").strip()
+            dry_run = dry_raw not in ("0", "false", "no")
+            try:
+                retention_days = int(
+                    cmd_kwargs.get(
+                        "retention_days", str(self._policy_export_retention_days)
+                    )
+                )
+            except (ValueError, TypeError):
+                return policy_cmd_error("`retention_days` must be a positive integer.")
+            if retention_days < 1:
+                # Enforce the "positive" part of the contract stated in the error text.
+                return policy_cmd_error("`retention_days` must be a positive integer.")
+
+            try:
+                await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
+                result = await asyncio.to_thread(
+                    self._policy_store.prune_exports,
+                    exports_dir, retention_days, dry_run,
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.exception("_handle_policy_cmd prune error: %s", exc)
+                return policy_cmd_error(f"Prune failed: {exc}")
+
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.policy.prune_exports",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok",
+                data={
+                    "sender": sender,
+                    "dry_run": dry_run,
+                    "retention_days": retention_days,
+                    "files_to_delete": result.count,
+                    "bytes_to_free": result.total_bytes,
+                },
+            )
+            return (
+                policy_prune_preview_reply(result, retention_days)
+                if dry_run
+                else policy_prune_applied_reply(result, retention_days)
+            )
+
+        # ── history (M10.2) ───────────────────────────────────────────────────
+        if subcommand == "history":
+            try:
+                limit_raw = int(cmd_kwargs.get("limit", "10"))
+            except (ValueError, TypeError):
+                return policy_cmd_error("`limit` must be a positive integer.")
+            safe_limit = max(1, min(limit_raw, 20))
+            try:
+                changes = await asyncio.to_thread(
+                    self._policy_store.list_policy_changes, safe_limit,
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.exception("_handle_policy_cmd history error: %s", exc)
+                return policy_cmd_error(f"History fetch failed: {exc}")
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.policy.history",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "limit": safe_limit, "count": len(changes)},
+            )
+            return policy_history_reply(changes)
+
+        # ── change (M10.3) ────────────────────────────────────────────────────
+        if subcommand == "change":
+            try:
+                change_id = int(cmd_kwargs.get("id", "0"))
+            except (ValueError, TypeError):
+                return policy_cmd_error("`id` must be a positive integer (DB change id).")
+            if change_id <= 0:
+                return policy_cmd_error("Missing or invalid `id=` argument. "
+                                        "Use `!policy history` to get change ids.")
+            try:
+                change = await asyncio.to_thread(
+                    self._policy_store.get_policy_change_by_id, change_id,
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.exception("_handle_policy_cmd change detail error: %s", exc)
+                return policy_cmd_error(f"DB error: {exc}")
+            if change is None:
+                return policy_cmd_error(
+                    f"Change id={change_id} not found. "
+                    "Use `!policy history` to see available ids."
+                )
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.policy.change_detail",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "change_id": change_id},
+            )
+            return policy_change_detail_reply(change)
+
+        return policy_cmd_error(f"Unknown subcommand: `{subcommand!r}`.")
+
+    # ── M5.1: Nodes overview for operators ────────────────────────────────────
+
+    async def _handle_nodes(
+        self,
+        http_client: httpx.AsyncClient,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """
+        Build the `!nodes` reply for the control room.
+
+        Gathers node policy info and per-node stats, overlays health-tracker
+        fields (M8.0), summarises the sticky cache (M8.1), writes a
+        `matrix.control.nodes` audit event and returns the `nodes_reply` text.
+        Any exception is logged and answered with a generic warning message.
+        """
+        max_sticky_shown = 5
+        try:
+            policy = self._node_policy
+            policy_info = {} if policy is None else policy.as_info_dict()
+            stats_getter = self._node_stats_getter
+            node_stats = {} if stats_getter is None else stats_getter()
+
+            # M8.0: overlay health state onto node_stats (entries are created on demand)
+            tracker = self._node_health_tracker
+            if tracker is not None:
+                allowed = None if policy is None else policy.allowed_nodes
+                for nid, info in tracker.all_info(allowed).items():
+                    if nid not in node_stats:
+                        node_stats[nid] = {}
+                    per_node = node_stats[nid]
+                    per_node["health"] = info.get("state", NODE_STATE_HEALTHY)
+                    per_node["ewma_latency_s"] = info.get("ewma_latency_s")
+                    per_node["consecutive_failures"] = info.get("consecutive_failures", 0)
+
+            # M8.1: sticky cache summary; only the first few entries are listed
+            sticky_info = None
+            cache = self._sticky_cache
+            if cache is not None:
+                active = cache.active_entries()
+                shown = [
+                    {"key": k, "node": n, "remaining_s": round(r, 0)}
+                    for k, n, r in active[:max_sticky_shown]
+                ]
+                sticky_info = {"active_keys": len(active), "ttl_s": cache.ttl_s, "entries": shown}
+                hidden = len(active) - max_sticky_shown
+                if hidden > 0:
+                    sticky_info["truncated"] = hidden
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.nodes",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="ok", data={"sender": sender},
+            )
+            return nodes_reply(policy_info, node_stats, sticky_info=sticky_info)
+        except Exception as exc:
+            logger.warning("_handle_nodes error: %s", exc)
+            return "⚠️ Node info not available."
+
+    # ── M8.2: HA state persistence helpers ────────────────────────────────────
+
+    async def _load_ha_state(self) -> None:
+        """
+        Startup hook: restore sticky entries and node health from PolicyStore (M8.2).
+        Best-effort: errors are logged as warnings and in-memory state is kept as is.
+        """
+        store = self._policy_store
+        if store is None or not store.is_open:
+            return
+
+        # Sticky entries: re-insert unexpired ones with their remaining time as TTL.
+        cache = self._sticky_cache
+        if cache is not None:
+            try:
+                rows = await asyncio.to_thread(store.load_sticky_entries)
+                now_unix = int(time.time())
+                restored = 0
+                for key, node_id, expires_at_unix in rows:
+                    ttl_left = expires_at_unix - now_unix
+                    if ttl_left <= 0:
+                        continue
+                    cache.set(key, node_id, ttl_s=float(ttl_left))
+                    restored += 1
+                self._ha_sticky_loaded = restored
+                logger.info("HA: loaded %d sticky entries from DB", restored)
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("HA: failed to load sticky entries (non-fatal): %s", exc)
+
+        # Node health: skipped when no tracker is set or the max age is not positive.
+        tracker = self._node_health_tracker
+        if tracker is None or self._ha_health_max_age_s <= 0:
+            return
+        try:
+            snapshot = await asyncio.to_thread(store.load_node_health, self._ha_health_max_age_s)
+            if not snapshot:
+                logger.info("HA: no fresh node health snapshot found in DB")
+                return
+            for node_id, info in snapshot.items():
+                tracker.restore_node(
+                    node_id, ewma_latency_s=info.get("ewma_latency_s"),
+                    consecutive_failures=int(info.get("consecutive_failures", 0)),
+                )
+            self._ha_health_loaded = True
+            logger.info("HA: loaded node health for %d nodes from DB", len(snapshot))
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("HA: failed to load node health (non-fatal): %s", exc)
+
+    async def _sticky_persist_set(self, key: str, node_id: str) -> None:
+        """Mirror an in-memory sticky entry into the policy store (M8.2).
+
+        No-op without an open store or a sticky cache; write errors are only logged.
+        """
+        store, cache = self._policy_store, self._sticky_cache
+        if store is None or not store.is_open or cache is None:
+            return
+        deadline_unix = int(time.time()) + int(cache.ttl_s)
+        try:
+            await asyncio.to_thread(store.upsert_sticky, key, node_id, deadline_unix)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("HA: failed to persist sticky key=%s (non-fatal): %s", key, exc)
+
+    async def _sticky_persist_delete(self, key: str) -> None:
+        """Best-effort removal of sticky entry `key` from the policy store (M8.2)."""
+        store = self._policy_store
+        if store is not None and store.is_open:
+            try:
+                await asyncio.to_thread(store.delete_sticky, key)
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("HA: failed to delete sticky key=%s (non-fatal): %s", key, exc)
+
+    async def _node_health_snapshot_loop(self) -> None:
+        """
+        Background task: every `_ha_health_snapshot_interval_s` seconds, write each
+        allowed node's health info to the policy store (M8.2).
+
+        Returns immediately when the interval is <= 0. Exits for good once the
+        store is closed/absent or the health tracker / node policy is missing.
+        Write errors are logged and the loop keeps running.
+        """
+        interval_s = self._ha_health_snapshot_interval_s
+        if interval_s <= 0:
+            return
+        logger.debug("HA: health snapshot loop started (interval=%ds)", interval_s)
+        while True:
+            await asyncio.sleep(self._ha_health_snapshot_interval_s)
+            store = self._policy_store
+            if store is None or not store.is_open:
+                return
+            tracker, policy = self._node_health_tracker, self._node_policy
+            if tracker is None or policy is None:
+                return
+            try:
+                for node_id in sorted(policy.allowed_nodes):
+                    info = tracker.as_info_dict(node_id)
+                    failures = int(info.get("consecutive_failures", 0))
+                    await asyncio.to_thread(
+                        store.upsert_node_health, node_id, info.get("ewma_latency_s"), failures,
+                    )
+                logger.debug("HA: health snapshot written for %d nodes", len(policy.allowed_nodes))
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("HA: health snapshot write failed (non-fatal): %s", exc)
+
+    # ── M9.0: Two-step confirmation for dangerous control commands ─────────────
+
+    async def _handle_policy_import_intent(
+        self,
+        http_client: "httpx.AsyncClient",
+        cmd: "ControlCommand",
+        sender: str,
+        room_id: str,
+        event_id: str,
+        action_summary: str,
+        normalized: str,
+    ) -> str:
+        """
+        M9.1: Intent step for !policy import with diff preview and hash binding.
+
+        Reads the file, computes a diff preview, stores a hash-bound callback,
+        and returns a formatted preview reply containing the nonce.
+        The confirm callback verifies the file hasn't changed before applying.
+        On success the callback returns `(reply, diff_summary)`; on rejection or
+        failure it returns `(error_text, "")`. Before applying, it attempts an
+        auto-backup of the current policy (M10.0); backup failure is non-fatal.
+        """
+        assert self._confirm_store is not None
+
+        # ── Validate args ──────────────────────────────────────────────────────
+        if self._policy_store is None or not self._policy_store.is_open:
+            return policy_cmd_error("Policy store not available.")
+        if not self._bridge_data_dir:
+            return policy_cmd_error("BRIDGE_DATA_DIR not configured.")
+
+        filename = cmd.kwargs.get("path", "").strip()
+        if not filename:
+            return policy_cmd_error("Missing `path=` argument.")
+
+        exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
+        safe_path = validate_export_path(exports_dir, filename)
+        if safe_path is None:
+            return policy_cmd_error(
+                f"Invalid path `{filename}`. Only simple filenames within the exports "
+                "directory are allowed."
+            )
+
+        mode_raw = cmd.kwargs.get("mode", "merge").strip().lower()
+        if mode_raw not in ("merge", "replace"):
+            return policy_cmd_error("mode must be `merge` or `replace`.")
+
+        # ── Read file + compute diff preview ───────────────────────────────────
+        try:
+            raw_data = await asyncio.to_thread(_read_json_file, safe_path)
+        except FileNotFoundError:
+            return policy_cmd_error(f"File not found: `{filename}`")
+        except Exception as exc:  # noqa: BLE001
+            return policy_cmd_error(f"Cannot read file: {exc}")
+
+        if not isinstance(raw_data, dict):
+            return policy_cmd_error("Invalid JSON format (expected object).")
+
+        try:
+            diff = await asyncio.to_thread(
+                self._policy_store.compute_import_diff, raw_data, mode_raw,
+            )
+        except ValueError as ve:
+            return policy_cmd_error(str(ve))
+        except Exception as exc:  # noqa: BLE001
+            return policy_cmd_error(f"Preview failed: {exc}")
+
+        # ── Compute snapshot hash for confirm binding ─────────────────────────
+        _content_bytes = _json.dumps(
+            raw_data, sort_keys=True, ensure_ascii=True
+        ).encode("utf-8")
+        snapshot_hash = hashlib.sha256(
+            (filename + ":" + mode_raw + ":").encode("utf-8") + _content_bytes
+        ).hexdigest()[:32]
+
+        sender_hash = _sender_hash(sender)
+        _captured_hash  = snapshot_hash
+        _captured_data  = raw_data
+        _captured_mode  = mode_raw
+        _captured_path  = safe_path
+        _captured_fname = filename
+        _captured_sender = sender
+
+        # ── Build hash-bound callback ──────────────────────────────────────────
+        # Late-capture of nonce for autobackup filename (set after add() below)
+        _nonce_holder: list = []
+
+        async def _callback():
+            # Re-read file and verify hash to detect tampering (anti-TOCTOU)
+            try:
+                fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
+            except Exception as exc:  # noqa: BLE001
+                return (
+                    f"❌ Cannot re-read file `{_captured_fname}` at apply time: {exc}\n"
+                    "Re-issue the command.",
+                    "",
+                )
+
+            fresh_bytes = _json.dumps(
+                fresh_data if isinstance(fresh_data, dict) else {},
+                sort_keys=True, ensure_ascii=True,
+            ).encode("utf-8")
+            fresh_hash = hashlib.sha256(
+                (_captured_fname + ":" + _captured_mode + ":").encode("utf-8") + fresh_bytes
+            ).hexdigest()[:32]
+
+            if fresh_hash != _captured_hash:
+                logger.warning(
+                    "Policy import confirm rejected: file changed since preview "
+                    "(sender=%s file=%s)", _captured_sender, _captured_fname,
+                )
+                return (
+                    f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
+                    "Re-issue `!policy import ...` to get a new preview.",
+                    "",
+                )
+
+            # M10.0: Auto-backup current policy before applying changes
+            _autobackup_basename = ""
+            _autobackup_hash = ""
+            if self._policy_store is not None and self._bridge_data_dir:
+                _exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
+                _nonce_suffix = _nonce_holder[0] if _nonce_holder else "BACKUP"
+                _sender_hash8 = _sender_hash(_captured_sender)[:8]
+                try:
+                    await asyncio.to_thread(_os.makedirs, _exports_dir, exist_ok=True)
+                    _ab_path, _autobackup_hash = await asyncio.to_thread(
+                        self._policy_store.write_autobackup,
+                        _exports_dir, _sender_hash8, _nonce_suffix,
+                    )
+                    _autobackup_basename = _os.path.basename(_ab_path)
+                    logger.info(
+                        "Policy auto-backup written: %s hash=%s",
+                        _autobackup_basename, _autobackup_hash,
+                    )
+                except Exception as exc:  # noqa: BLE001
+                    logger.warning("Policy auto-backup failed (non-fatal): %s", exc)
+
+            # Apply the import using the captured (previewed) data
+            try:
+                stats = await asyncio.to_thread(
+                    self._policy_store.import_snapshot,
+                    _captured_data, _captured_mode, False, _captured_sender,
+                )
+                self._policy_last_import_at = int(time.time())
+                # .get(): a missing counter must not turn an applied import into "failed"
+                _na, _nu, _nd, _aa, _au, _ad = (
+                    stats.get(k, 0) for k in (
+                        "node_added", "node_updated", "node_deleted",
+                        "agent_added", "agent_updated", "agent_deleted",
+                    )
+                )
+                diff_summary = f"node: +{_na} ~{_nu} -{_nd}; agent: +{_aa} ~{_au} -{_ad}"
+                if _autobackup_basename:
+                    diff_summary += f"; autobackup={_autobackup_basename}"
+                # M10.2: record in policy change history
+                _is_destr = (_nd + _ad) > 0
+                try:
+                    await asyncio.to_thread(
+                        self._policy_store.record_policy_change,
+                        "policy.import", _captured_mode, _captured_fname,
+                        _sender_hash(_captured_sender), diff_summary, _is_destr,
+                        _na, _nu, _nd, _aa, _au, _ad,
+                        self._policy_history_limit,
+                    )
+                except Exception as _exc:  # noqa: BLE001
+                    logger.warning("Failed to record policy change history: %s", _exc)
+                reply = policy_import_reply(stats, _captured_mode)
+                if _autobackup_basename:
+                    reply += (
+                        f"\n\n💾 Auto-backup saved: `{_autobackup_basename}` "
+                        f"(hash `{_autobackup_hash}`)"
+                    )
+                return reply, diff_summary
+            except Exception as exc:  # noqa: BLE001
+                logger.exception(
+                    "Policy import apply failed: sender=%s file=%s", _captured_sender, _captured_fname,
+                )
+                return f"❌ Import failed: {exc}", ""
+
+        # ── Store pending confirmation ─────────────────────────────────────────
+        nonce = self._confirm_store.add(
+            sender_hash=sender_hash,
+            verb="policy.import",
+            normalized_args=normalized,
+            action_summary=action_summary,
+            room_id=room_id,
+            callback=_callback,
+        )
+        # M10.0: make nonce available inside _callback for backup filename
+        _nonce_holder.append(nonce)
+
+        await _write_audit(
+            http_client, self._console_url, self._internal_token,
+            event="matrix.control.intent",
+            agent_id="control", node_id=self._node_id,
+            room_id=room_id, event_id=event_id,
+            status="ok",
+            data={
+                "sender_hash": sender_hash,
+                "verb": "policy.import",
+                "normalized": normalized,
+                "nonce": nonce,
+                "expires_in_s": int(self._confirm_store.ttl_s),
+                "snapshot_hash_prefix": snapshot_hash[:8],
+                "diff": {
+                    "node_added": diff.node_added,
+                    "node_updated": diff.node_updated,
+                    "node_deleted": diff.node_deleted,
+                    "agent_added": diff.agent_added,
+                    "agent_updated": diff.agent_updated,
+                    "agent_deleted": diff.agent_deleted,
+                    "sample_keys": diff.sample_keys,
+                },
+            },
+        )
+
+        logger.info(
+            "Confirm policy import intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
+            sender, mode_raw, filename, snapshot_hash[:8], nonce,
+        )
+        return policy_import_intent_reply(
+            diff=diff,
+            action_summary=action_summary,
+            nonce=nonce,
+            ttl_s=int(self._confirm_store.ttl_s),
+        )
+
+    async def _handle_policy_restore_intent(
+        self,
+        http_client: "httpx.AsyncClient",
+        cmd: "ControlCommand",
+        sender: str,
+        room_id: str,
+        event_id: str,
+        *,
+        action_summary: str,
+        normalized: str,
+    ) -> str:
+        """
+        Intent (preview) step for `!policy restore path=<file> [mode=merge|replace]` (M10.1).
+
+        Nothing is applied here: the snapshot is read from the exports directory, a
+        diff preview is computed, and a callback bound to a truncated SHA-256 of
+        filename + mode + canonical JSON is stored; the reply carries the nonce.
+        """
+        assert self._confirm_store is not None
+        assert self._policy_store is not None
+        assert self._bridge_data_dir is not None
+
+        filename = cmd.kwargs.get("path", "").strip()
+        if not filename:
+            return policy_cmd_error("Missing `path=` argument.")
+
+        exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
+        safe_path = validate_export_path(exports_dir, filename)
+        if safe_path is None:
+            return policy_cmd_error(
+                f"Invalid path `{filename}`. Only simple filenames within the exports "
+                "directory are allowed."
+            )
+
+        mode_raw = cmd.kwargs.get("mode", "replace").strip().lower()
+        if mode_raw not in ("merge", "replace"):
+            return policy_cmd_error("mode must be `merge` or `replace`.")
+
+        # ── Read file + compute diff preview ──────────────────────────────────
+        try:
+            raw_data = await asyncio.to_thread(_read_json_file, safe_path)
+        except FileNotFoundError:
+            return policy_cmd_error(f"File not found: `{filename}`")
+        except Exception as exc:  # noqa: BLE001
+            return policy_cmd_error(f"Cannot read file: {exc}")
+
+        if not isinstance(raw_data, dict):
+            return policy_cmd_error("Invalid JSON format (expected object).")
+
+        try:
+            diff = await asyncio.to_thread(
+                self._policy_store.compute_import_diff, raw_data, mode_raw,
+            )
+        except ValueError as ve:
+            return policy_cmd_error(str(ve))
+        except Exception as exc:  # noqa: BLE001
+            return policy_cmd_error(f"Preview failed: {exc}")
+
+        # ── Compute snapshot hash for confirm binding (anti-TOCTOU) ──────────
+        _content_bytes = _json.dumps(
+            raw_data, sort_keys=True, ensure_ascii=True
+        ).encode("utf-8")
+        snapshot_hash = hashlib.sha256(
+            (filename + ":" + mode_raw + ":restore:").encode("utf-8") + _content_bytes
+        ).hexdigest()[:32]
+
+        sender_hash    = _sender_hash(sender)
+        _captured_hash  = snapshot_hash
+        _captured_data  = raw_data
+        _captured_mode  = mode_raw
+        _captured_path  = safe_path
+        _captured_fname = filename
+        _captured_sender = sender
+
+        # Filled after _confirm_store.add() returns below; _callback reads it for the autobackup filename
+        _nonce_holder: list = []
+
+        # ── Build hash-bound callback ─────────────────────────────────────────
+        async def _callback():
+            # Re-read the file and recompute the hash; a mismatch means it changed since the preview
+            try:
+                fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
+            except Exception as exc:  # noqa: BLE001
+                return (
+                    f"❌ Cannot re-read `{_captured_fname}` at apply time: {exc}\n"
+                    "Re-issue the command.",
+                    "",
+                )
+
+            fresh_bytes = _json.dumps(
+                fresh_data if isinstance(fresh_data, dict) else {},
+                sort_keys=True, ensure_ascii=True,
+            ).encode("utf-8")
+            fresh_hash = hashlib.sha256(
+                (_captured_fname + ":" + _captured_mode + ":restore:").encode("utf-8")
+                + fresh_bytes
+            ).hexdigest()[:32]
+
+            if fresh_hash != _captured_hash:
+                logger.warning(
+                    "Policy restore confirm rejected: file changed since preview "
+                    "(sender=%s file=%s)", _captured_sender, _captured_fname,
+                )
+                return (
+                    f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
+                    "Re-issue `!policy restore ...` to get a new preview.",
+                    "",
+                )
+
+            # Auto-backup current state before overwriting (best-effort: a failure is logged, restore proceeds)
+            _autobackup_basename = ""
+            _autobackup_hash = ""
+            if self._policy_store is not None and self._bridge_data_dir:
+                _exp_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
+                _nonce_suffix = _nonce_holder[0] if _nonce_holder else "RESTORE"
+                _sender_hash8 = _sender_hash(_captured_sender)[:8]
+                try:
+                    _os.makedirs(_exp_dir, exist_ok=True)
+                    _ab_path, _autobackup_hash = await asyncio.to_thread(
+                        self._policy_store.write_autobackup,
+                        _exp_dir, _sender_hash8, _nonce_suffix,
+                    )
+                    _autobackup_basename = _os.path.basename(_ab_path)
+                    logger.info(
+                        "Pre-restore backup written: %s hash=%s",
+                        _autobackup_basename, _autobackup_hash,
+                    )
+                except Exception as exc:  # noqa: BLE001
+                    logger.warning("Pre-restore backup failed (non-fatal): %s", exc)
+
+            # Apply the snapshot captured at preview time (its hash was just matched against the file on disk)
+            try:
+                stats = await asyncio.to_thread(
+                    self._policy_store.import_snapshot,
+                    _captured_data, _captured_mode, False, _captured_sender,
+                )
+                self._policy_last_import_at = int(time.time())
+                diff_summary = (
+                    f"restore/{_captured_mode}: "
+                    f"node: +{stats['node_added']} ~{stats['node_updated']} "
+                    f"-{stats['node_deleted']}; "
+                    f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
+                    f"-{stats['agent_deleted']}"
+                )
+                if _autobackup_basename:
+                    diff_summary += f"; autobackup={_autobackup_basename}"
+                # M10.2: record in policy change history (a failure here is only logged, the restore still succeeds)
+                _is_destr = (
+                    stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
+                ) > 0
+                try:
+                    await asyncio.to_thread(
+                        self._policy_store.record_policy_change,
+                        "policy.restore", _captured_mode, _captured_fname,
+                        _sender_hash(_captured_sender), diff_summary, _is_destr,
+                        stats.get("node_added", 0), stats.get("node_updated", 0),
+                        stats.get("node_deleted", 0),
+                        stats.get("agent_added", 0), stats.get("agent_updated", 0),
+                        stats.get("agent_deleted", 0),
+                        self._policy_history_limit,
+                    )
+                except Exception as _exc:  # noqa: BLE001
+                    logger.warning("Failed to record restore history: %s", _exc)
+                reply = policy_restore_applied_reply(
+                    stats, _captured_mode, _autobackup_basename
+                )
+                return reply, diff_summary
+            except Exception as exc:  # noqa: BLE001
+                logger.exception(
+                    "Policy restore apply failed: sender=%s file=%s",
+                    _captured_sender, _captured_fname,
+                )
+                return f"❌ Restore failed: {exc}", ""
+
+        # ── Store pending confirmation ────────────────────────────────────────
+        nonce = self._confirm_store.add(
+            sender_hash=sender_hash,
+            verb="policy.restore",
+            normalized_args=normalized,
+            action_summary=action_summary,
+            room_id=room_id,
+            callback=_callback,
+        )
+        _nonce_holder.append(nonce)
+
+        await _write_audit(
+            http_client, self._console_url, self._internal_token,
+            event="matrix.control.policy.restore",
+            agent_id="control", node_id=self._node_id,
+            room_id=room_id, event_id=event_id,
+            status="ok",
+            data={
+                "sender_hash": sender_hash,
+                "verb": "policy.restore",
+                "normalized": normalized,
+                "nonce": nonce,
+                "expires_in_s": int(self._confirm_store.ttl_s),
+                "mode": mode_raw,
+                "snapshot_hash_prefix": snapshot_hash[:8],
+                "diff": {
+                    "node_added":   diff.node_added,
+                    "node_updated": diff.node_updated,
+                    "node_deleted": diff.node_deleted,
+                    "agent_added":   diff.agent_added,
+                    "agent_updated": diff.agent_updated,
+                    "agent_deleted": diff.agent_deleted,
+                    "sample_keys":   diff.sample_keys,
+                },
+            },
+        )
+
+        logger.info(
+            "Confirm policy restore intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
+            sender, mode_raw, filename, snapshot_hash[:8], nonce,
+        )
+        return policy_restore_intent_reply(
+            diff=diff,
+            action_summary=action_summary,
+            nonce=nonce,
+            ttl_s=int(self._confirm_store.ttl_s),
+        )
+
+    async def _handle_dangerous_intent(
+        self,
+        http_client: "httpx.AsyncClient",
+        cmd: "ControlCommand",
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """
+        First leg of the two-step confirm flow (M9.0).
+
+        Does NOT apply the command.  `!policy import` / `!policy restore` are
+        delegated to their diff-preview handlers; any other command is stored with
+        a callback that re-runs its handler once the operator sends !confirm <nonce>.
+        """
+        assert self._confirm_store is not None
+
+        sender_hash = _sender_hash(sender)
+        normalized = build_normalized_args(cmd)
+        action_summary = (
+            f"!{cmd.verb} {cmd.subcommand or ''} {normalized}".strip()
+        )
+
+        # M9.1: policy import gets a richer preview with diff + hash binding
+        if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "import":
+            return await self._handle_policy_import_intent(
+                http_client, cmd, sender, room_id, event_id,
+                action_summary=action_summary, normalized=normalized,
+            )
+
+        # M10.1: policy restore — rollback with diff preview + hash binding
+        if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "restore":
+            if self._policy_store is None or not self._bridge_data_dir:
+                return policy_cmd_error(
+                    "Policy store or data directory not configured."
+                )
+            return await self._handle_policy_restore_intent(
+                http_client, cmd, sender, room_id, event_id,
+                action_summary=action_summary, normalized=normalized,
+            )
+
+        # Build the callback: calls the actual handler when confirmed.
+        # We capture all args by closure so the callback is self-contained.
+        _verb = cmd.verb
+        _subcmd = cmd.subcommand
+        _args  = cmd.args
+        _kw    = dict(cmd.kwargs)
+
+        async def _callback():
+            if _verb == VERB_NODE:
+                # Reconstruct args_text (same as _try_control does)
+                _parts = []
+                if _subcmd:
+                    _parts.append(_subcmd)
+                _parts.extend(_args)
+                _parts.extend(f"{k}={v}" for k, v in _kw.items())
+                reply = await self._handle_node_cmd(
+                    http_client, sender, room_id, event_id, " ".join(_parts),
+                )
+            elif _verb == VERB_ROOM:
+                reply = await self._handle_room_cmd(
+                    http_client, sender, room_id, event_id,
+                    _subcmd, tuple(_args), _kw,
+                )
+            elif _verb == VERB_POLICY:
+                reply = await self._handle_policy_cmd(
+                    http_client, sender, room_id, event_id, _subcmd, _kw,
+                )
+            else:
+                reply = f"❌ Unknown dangerous verb: {_verb}"
+            return reply, action_summary  # (reply_text, diff_summary) pair unpacked by _handle_confirm_cmd
+
+        nonce = self._confirm_store.add(
+            sender_hash=sender_hash,
+            verb=f"{cmd.verb}.{cmd.subcommand or ''}",
+            normalized_args=normalized,
+            action_summary=action_summary,
+            room_id=room_id,
+            callback=_callback,
+        )
+
+        await _write_audit(
+            http_client, self._console_url, self._internal_token,
+            event="matrix.control.intent",
+            agent_id="control", node_id=self._node_id,
+            room_id=room_id, event_id=event_id,
+            status="ok",
+            data={
+                "sender_hash": sender_hash,
+                "verb": cmd.verb,
+                "subcommand": cmd.subcommand or "",
+                "normalized": normalized,
+                "nonce": nonce,
+                "expires_in_s": int(self._confirm_store.ttl_s),
+            },
+        )
+
+        logger.info(
+            "Confirm intent: sender=%s verb=%s/%s nonce=%s ttl=%.0fs",
+            sender, cmd.verb, cmd.subcommand, nonce, self._confirm_store.ttl_s,
+        )
+        return confirm_intent_reply(action_summary, nonce, int(self._confirm_store.ttl_s))
+
+    async def _handle_confirm_cmd(
+        self,
+        http_client: "httpx.AsyncClient",
+        cmd: "ControlCommand",
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """
+        Second leg of the two-step confirm flow (M9.0).
+
+        Pops the pending entry matching nonce + sender hash, awaits its callback, and
+        emits `confirmed` then `applied` audit events (no `applied` if the callback raises).
+        """
+        if self._confirm_store is None:
+            return "❌ Confirmation store not active."
+
+        # Nonce may come as subcommand (token right after !confirm); otherwise use the first positional arg
+        nonce = (cmd.subcommand or "").strip().upper()
+        if not nonce and cmd.args:
+            nonce = cmd.args[0].strip().upper()
+        if not nonce:
+            return "❌ Usage: `!confirm <code>` — provide the confirmation code."
+
+        sender_hash = _sender_hash(sender)
+        entry = self._confirm_store.pop(nonce, sender_hash)
+
+        if entry is None:
+            logger.info(
+                "Confirm rejected: sender=%s nonce=%s (invalid/expired/wrong-sender)",
+                sender, nonce,
+            )
+            return confirm_expired_reply()
+
+        await _write_audit(
+            http_client, self._console_url, self._internal_token,
+            event="matrix.control.confirmed",
+            agent_id="control", node_id=self._node_id,
+            room_id=room_id, event_id=event_id,
+            status="ok",
+            data={
+                "sender_hash": sender_hash,
+                "nonce": nonce,
+                "verb": entry.verb,
+                "action_summary": entry.action_summary,
+            },
+        )
+
+        logger.info(
+            "Confirm accepted: sender=%s nonce=%s verb=%s",
+            sender, nonce, entry.verb,
+        )
+
+        try:
+            reply_text, diff_summary = await entry.callback()
+        except Exception as exc:
+            logger.exception(
+                "Confirm callback failed: sender=%s nonce=%s verb=%s",
+                sender, nonce, entry.verb,
+            )
+            return f"❌ Apply failed: {exc}"
+
+        await _write_audit(
+            http_client, self._console_url, self._internal_token,
+            event="matrix.control.applied",
+            agent_id="control", node_id=self._node_id,
+            room_id=room_id, event_id=event_id,
+            status="ok",
+            data={
+                "sender_hash": sender_hash,
+                "verb": entry.verb,
+                "normalized": entry.normalized_args,
+                "diff_summary": diff_summary,
+            },
+        )
+
+        return confirm_success_reply(reply_text)
+
+    # ── M6.0: Dynamic room-node overrides via !node command ───────────────────
+
+    async def _handle_node_cmd(
+        self,
+        http_client: httpx.AsyncClient,
+        sender: str,
+        ctrl_room_id: str,
+        event_id: str,
+        args_text: str,
+    ) -> str:
+        """Handle `!node <set|unset|get|list>` from an operator; returns the reply text and audits each completed subcommand."""
+        if self._policy_store is None or not self._policy_store.is_open:
+            return "⚠️ Policy store not available."
+
+        subcmd, room_id, node_id = parse_node_cmd(args_text)
+
+        if subcmd not in (NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST):
+            return node_cmd_reply_error(
+                f"Unknown subcommand: `{subcmd or '?'}`"
+            )
+
+        # ── list ──────────────────────────────────────────────────────────────
+        if subcmd == NODE_SUBCMD_LIST:
+            try:
+                rows = await asyncio.to_thread(self._policy_store.list_overrides, 10)
+                total = await asyncio.to_thread(self._policy_store.count_overrides)
+            except Exception as exc:
+                logger.warning("PolicyStore list_overrides error: %s", exc)
+                return "⚠️ Could not read policy store."
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.node.list",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "total": total},
+            )
+            return node_cmd_reply_list(rows, total)
+
+        # ── subcommands that require room_id ──────────────────────────────────
+        if not room_id:
+            return node_cmd_reply_error("Missing `room=` argument.")
+        if not node_cmd_validate_room(room_id):
+            return node_cmd_reply_error(
+                f"Invalid room ID format: `{room_id}`\n"
+                "Expected: `!localpart:server`"
+            )
+
+        # ── get ───────────────────────────────────────────────────────────────
+        if subcmd == NODE_SUBCMD_GET:
+            try:
+                override = await asyncio.to_thread(self._policy_store.get_override, room_id)
+            except Exception as exc:
+                logger.warning("PolicyStore get_override error: %s", exc)
+                return "⚠️ Could not read policy store."
+            # env map lookup for context: node policy's room→node entry and default node are passed to the reply
+            env_node: Optional[str] = None
+            if self._node_policy is not None:
+                env_node = self._node_policy.room_node_map.get(room_id)
+            default = self._node_policy.default_node if self._node_policy else self._node_id
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.node.get",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "queried_room": room_id},
+            )
+            return node_cmd_reply_get(room_id, override, env_node, default)
+
+        # ── unset ─────────────────────────────────────────────────────────────
+        if subcmd == NODE_SUBCMD_UNSET:
+            try:
+                deleted = await asyncio.to_thread(self._policy_store.delete_override, room_id)
+            except Exception as exc:
+                logger.warning("PolicyStore delete_override error: %s", exc)
+                return "⚠️ Could not write to policy store."
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.node.unset",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted},
+            )
+            return node_cmd_reply_unset_ok(room_id) if deleted else node_cmd_reply_unset_not_found(room_id)
+
+        # ── set ───────────────────────────────────────────────────────────────
+        if subcmd == NODE_SUBCMD_SET:
+            if not node_id:
+                return node_cmd_reply_error("Missing `node=` argument for `set`.")
+            allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset([self._node_id])
+            if node_id not in allowed:
+                allowed_list = ", ".join(f"`{n}`" for n in sorted(allowed))
+                return node_cmd_reply_error(
+                    f"Node `{node_id}` is not in allowed list: {allowed_list}"
+                )
+            try:
+                await asyncio.to_thread(self._policy_store.set_override, room_id, node_id, sender)
+            except Exception as exc:
+                logger.warning("PolicyStore set_override error: %s", exc)
+                return "⚠️ Could not write to policy store."
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.node.set",
+                agent_id="control", node_id=self._node_id,
+                room_id=ctrl_room_id, event_id=event_id,
+                status="ok", data={"sender": sender, "target_room": room_id, "set_node": node_id},
+            )
+            return node_cmd_reply_set(room_id, node_id)
+
+        return node_cmd_reply_error("Unhandled subcommand.")
+
+    # ── M4.1: Bridge status for operators ─────────────────────────────────────
+
+    async def _handle_bridge_status(
+        self,
+        http_client: httpx.AsyncClient,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """Build a bridge health snapshot reply for `!status` in control room; any error yields the "not available" reply."""
+        try:
+            snapshot: Dict[str, Any] = {
+                "node_id": self._node_id,
+                "worker_count": self._worker_count,
+                "room_count": len(self._room_map.mappings),
+                "mixed_room_count": (
+                    len(self._mixed_room_config.rooms)
+                    if self._mixed_room_config
+                    else 0
+                ),
+                "operators_count": (
+                    len(self._control_config.operator_allowlist)
+                    if self._control_config and self._control_config.operator_allowlist
+                    else 0
+                ),
+            }
+            # Queue info (exposed by MatrixIngressLoop via get_status); "?" stands in for a missing key
+            status = self.get_status()
+            snapshot["queue_size"] = status.get("queue_size", "?")
+            snapshot["queue_max"] = status.get("queue_max", "?")
+            # Control safety: limiter health, included only when a control limiter is configured
+            if self._control_limiter is not None:
+                snapshot["control_safety"] = self._control_limiter.as_health_dict()
+            # Persistent dedupe: event store health, included only when an event store is configured
+            if self._event_store is not None:
+                snapshot["persistent_dedupe"] = self._event_store.as_health_dict()
+
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.bridge_status",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="ok", data={"sender": sender},
+            )
+            return bridge_status_reply(snapshot)
+        except Exception as exc:
+            logger.warning("_handle_bridge_status error: %s", exc)
+            return status_not_available_reply()
+
     # ── Control command handler ────────────────────────────────────────────────
 
     async def _try_control(
@@ -639,12 +2512,82 @@ class MatrixIngressLoop:
                     logger.warning("Could not send unauthorized reply: %s", exc)
             return
 
+        # M3.4: Rate limiting + cooldown (after auth, before parse/dispatch)
+        if self._control_limiter is not None:
+            sender_hash_ctrl = _sender_hash(sender)
+
+            allowed_room, retry_room = self._control_limiter.check_room(room_id)
+            if not allowed_room:
+                scope = "room"
+                logger.info("Control rate limited: scope=%s room=%s sender=%s", scope, room_id, sender)
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.control.rate_limited",
+                    agent_id="control", node_id=self._node_id,
+                    room_id=room_id, event_id=event_id,
+                    status="error", error_code="rate_limited_room",
+                    data={"sender": sender, "scope": scope, "retry_after_s": retry_room},
+                )
+                if self._on_control_rate_limited:
+                    self._on_control_rate_limited(scope)
+                txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl")
+                await client.send_text(room_id, rate_limited_reply(scope, retry_room), txn_id)
+                return
+
+            allowed_op, retry_op = self._control_limiter.check_operator(sender_hash_ctrl)
+            if not allowed_op:
+                scope = "operator"
+                logger.info("Control rate limited: scope=%s sender=%s", scope, sender)
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.control.rate_limited",
+                    agent_id="control", node_id=self._node_id,
+                    room_id=room_id, event_id=event_id,
+                    status="error", error_code="rate_limited_operator",
+                    data={"sender": sender, "scope": scope, "retry_after_s": retry_op},
+                )
+                if self._on_control_rate_limited:
+                    self._on_control_rate_limited(scope)
+                txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl")
+                await client.send_text(room_id, rate_limited_reply(scope, retry_op), txn_id)
+                return
+
         # Parse command
         cmd = parse_command(text)
         if cmd is None:
             logger.warning("Control message from %s could not be parsed: %r", sender, text[:60])
             return
 
+        # M3.4: Cooldown check (anti-double-click, per operator+verb+subcommand)
+        if self._control_limiter is not None:
+            sender_hash_ctrl = _sender_hash(sender)
+            allowed_cd, wait_cd = self._control_limiter.check_cooldown(
+                sender_hash_ctrl, cmd.verb, cmd.subcommand or "",
+            )
+            if not allowed_cd:
+                scope = "cooldown"
+                logger.info(
+                    "Control cooldown: sender=%s verb=%s sub=%s wait=%.1fs",
+                    sender, cmd.verb, cmd.subcommand, wait_cd,
+                )
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.control.rate_limited",
+                    agent_id="control", node_id=self._node_id,
+                    room_id=room_id, event_id=event_id,
+                    status="error", error_code="cooldown",
+                    data={
+                        "sender": sender, "scope": scope,
+                        "verb": cmd.verb, "subcommand": cmd.subcommand,
+                        "wait_s": wait_cd,
+                    },
+                )
+                if self._on_control_rate_limited:
+                    self._on_control_rate_limited(scope)
+                txn_id = MatrixClient.make_txn_id(room_id, event_id + "_cd")
+                await client.send_text(room_id, rate_limited_reply(scope, wait_cd), txn_id)
+                return
+
         # Metric callback
         if self._on_control_command:
             self._on_control_command(sender, cmd.verb, cmd.subcommand)
@@ -671,10 +2614,91 @@ class MatrixIngressLoop:
             sender, cmd.verb, cmd.subcommand, cmd.args,
         )
 
-        # Build reply
+        # Dispatch command
         txn_id = MatrixClient.make_txn_id(room_id, event_id + "_ctrl")
-        if cmd.verb == VERB_HELP:
+
+        # M9.0: Dangerous commands → two-step confirmation (intent leg)
+        if (
+            self._confirm_store is not None
+            and is_dangerous_cmd(cmd)
+            and cmd.verb != VERB_CONFIRM
+        ):
+            reply_text = await self._handle_dangerous_intent(
+                http_client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_CONFIRM:
+            # M9.0: !confirm <nonce> (second leg)
+            reply_text = await self._handle_confirm_cmd(
+                http_client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_HELP:
             reply_text = help_reply()
+        elif cmd.verb == VERB_CONFIRM:
+            # Fallback if confirm_store is None (disabled) — inform the operator
+            reply_text = "❌ Confirmation system is disabled."
+        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_START:
+            reply_text = await self._handle_runbook_start(
+                http_client, client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_NEXT:
+            # M3.2: advance to next step
+            reply_text = await self._handle_runbook_next(
+                http_client, client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_COMPLETE:
+            # M3.2: mark manual step complete
+            reply_text = await self._handle_runbook_complete(
+                http_client, client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_STATUS:
+            # M3.3: show run status
+            reply_text = await self._handle_runbook_status(
+                http_client, client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_EVIDENCE:
+            # M3.3: generate release evidence
+            reply_text = await self._handle_runbook_evidence(
+                http_client, client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_POST_REVIEW:
+            # M3.3: generate post-release review
+            reply_text = await self._handle_runbook_post_review(
+                http_client, client, cmd, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_STATUS:
+            # M4.1: bridge health snapshot for operators
+            reply_text = await self._handle_bridge_status(
+                http_client, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_NODES:
+            # M5.1: node policy overview for operators
+            reply_text = await self._handle_nodes(
+                http_client, sender, room_id, event_id,
+            )
+        elif cmd.verb == VERB_NODE:
+            # M6.0: dynamic room-node override commands
+            # Reconstruct args_text from parsed command parts
+            _node_args_parts = []
+            if cmd.subcommand:
+                _node_args_parts.append(cmd.subcommand)
+            _node_args_parts.extend(cmd.args)
+            _node_args_parts.extend(f"{k}={v}" for k, v in cmd.kwargs.items())
+            _node_args_text = " ".join(_node_args_parts)
+            reply_text = await self._handle_node_cmd(
+                http_client, sender, room_id, event_id, _node_args_text,
+            )
+        elif cmd.verb == VERB_ROOM:
+            # M6.1: dynamic mixed room agent overrides
+            reply_text = await self._handle_room_cmd(
+                http_client, sender, room_id, event_id,
+                cmd.subcommand, cmd.args, cmd.kwargs,
+            )
+        elif cmd.verb == VERB_POLICY:
+            # M6.2: policy snapshot export/import
+            reply_text = await self._handle_policy_cmd(
+                http_client, sender, room_id, event_id,
+                cmd.subcommand, cmd.kwargs,
+            )
         elif not cmd.is_known:
             reply_text = unknown_command_reply(cmd)
             await _write_audit(
@@ -686,7 +2710,6 @@ class MatrixIngressLoop:
                 data={"verb": cmd.verb, "sender": sender},
             )
         else:
-            # M3.1+ will implement actual runbook/status commands
             reply_text = not_implemented_reply(cmd)
 
         try:
@@ -694,6 +2717,439 @@ class MatrixIngressLoop:
         except Exception as exc:
             logger.error("Could not send control reply: %s", exc)
 
+    async def _handle_runbook_start(
+        self,
+        http_client: httpx.AsyncClient,
+        client: "MatrixClient",
+        cmd: ControlCommand,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """
+        M3.1: Execute !runbook start <path> [node=NODA1] and return the reply text.
+        The path is cmd.args[0]; the node is the "node" kwarg (default "NODA1").
+        Delegates to _ctrl_runner.start_runbook_run (original note: sofiia-console
+        POST /api/runbooks/internal/runs). Audits matrix.control.runbook.start for
+        an invalid path, success and RunnerError; other exceptions propagate.
+        """
+        # Positional runbook path ("" when absent) and the node requested for the run
+        runbook_path = cmd.args[0].strip() if cmd.args else ""
+        node_id = cmd.kwargs.get("node", "NODA1").strip()
+
+        # Validate locally first: a bad path is logged, audited and answered with usage text
+        path_error = _ctrl_runner.validate_runbook_path(runbook_path)
+        if path_error:
+            logger.warning(
+                "!runbook start invalid path: sender=%s path=%r error=%s",
+                sender, runbook_path, path_error,
+            )
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.start",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="invalid_path",
+                data={"sender": sender, "runbook_path": runbook_path, "error": path_error},
+            )
+            return start_usage_reply()
+
+        # Ask the runner to start the run; only RunnerError is handled below
+        run_id: Optional[str] = None
+        http_status: Optional[int] = None  # becomes 200 only after the runner call returns
+        try:
+            result = await _ctrl_runner.start_runbook_run(
+                http_client=http_client,
+                console_url=self._console_url,
+                control_token=self._control_token,
+                runbook_path=runbook_path,
+                operator_id=sender,
+                node_id=node_id,
+            )
+            run_id = result.get("run_id", "")
+            steps_total = result.get("steps_total", 0)
+            status = result.get("status", "running")
+            http_status = 200
+
+            logger.info(
+                "Runbook started: run_id=%s path=%s node=%s steps=%d by sender=%s",
+                run_id, runbook_path, node_id, steps_total, sender,
+            )
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.start",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="ok",
+                data={
+                    "sender": sender,
+                    "runbook_path": runbook_path,
+                    "node_id": node_id,  # requested target node; the audit itself uses self._node_id
+                    "run_id": run_id,
+                    "steps_total": steps_total,
+                    "http_status": http_status,
+                },
+            )
+            return runbook_started_reply(run_id, steps_total, status)
+
+        except _ctrl_runner.RunnerError as exc:
+            reason = str(exc)
+            logger.error(
+                "!runbook start failed: sender=%s path=%r node=%s error=%s",
+                sender, runbook_path, node_id, reason,
+            )
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.start",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="runner_error",
+                data={
+                    "sender": sender,
+                    "runbook_path": runbook_path,
+                    "node_id": node_id,
+                    "error": reason,
+                    "http_status": http_status,  # still None when the runner call itself raised
+                },
+            )
+            return runbook_start_error_reply(reason)
+
+    async def _handle_runbook_next(
+        self,
+        http_client: httpx.AsyncClient,
+        client: "MatrixClient",
+        cmd: ControlCommand,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """
+        M3.2: Execute !runbook next <run_id> and return the reply text.
+        A missing run_id yields the usage reply; a per-run rate-limit hit yields the
+        rate-limited reply (neither is audited). Otherwise delegates to
+        _ctrl_runner.next_runbook_step and audits matrix.control.runbook.next for
+        success and RunnerError. A null/missing duration_ms is reported as None.
+        """
+        run_id = cmd.args[0].strip() if cmd.args else ""
+        if not run_id:
+            return next_usage_reply()
+
+        # M3.4: per-run rate limit for !runbook next (skipped when no limiter is configured)
+        if self._control_limiter is not None:
+            allowed_run, retry_run = self._control_limiter.check_run_next(run_id)
+            if not allowed_run:
+                scope = "run"
+                if self._on_control_rate_limited:
+                    self._on_control_rate_limited(scope)
+                return rate_limited_reply(scope, retry_run)
+
+        http_status: Optional[int] = None  # becomes 200 only after the runner call returns
+        try:
+            result = await _ctrl_runner.next_runbook_step(
+                http_client=http_client,
+                console_url=self._console_url,
+                control_token=self._control_token,
+                run_id=run_id,
+                operator_id=sender,
+            )
+            http_status = 200
+            step_type = result.get("type", "unknown")
+            step_index = result.get("step_index", 0)
+
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.next",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="ok",
+                data={
+                    "sender": sender,
+                    "run_id": run_id,
+                    "step_index": step_index,
+                    "step_type": step_type,
+                    "http_status": http_status,
+                },
+            )
+
+            if step_type == "manual":
+                return next_manual_reply(
+                    run_id=run_id,
+                    step_index=step_index,
+                    steps_total=result.get("steps_total"),
+                    title=result.get("title", ""),
+                    instructions_md=result.get("instructions_md", ""),
+                )
+            else:
+                # http_check / script. `or 0` keeps a JSON null duration from reaching int(None)
+                result_dict = result.get("result") or {}
+                duration_ms = int(result_dict.get("duration_ms") or 0) if isinstance(result_dict, dict) else 0
+                return next_auto_reply(
+                    run_id=run_id,
+                    step_index=step_index,
+                    action_type=step_type,
+                    step_status=result.get("step_status", "ok"),
+                    duration_ms=duration_ms or None,
+                    completed=bool(result.get("completed", False)),
+                )
+
+        except _ctrl_runner.RunnerError as exc:
+            reason = str(exc)
+            logger.error(
+                "!runbook next failed: sender=%s run_id=%r error=%s",
+                sender, run_id, reason,
+            )
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.next",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="runner_error",
+                data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
+            )
+            return next_error_reply(run_id, reason)
+
+    async def _handle_runbook_complete(
+        self,
+        http_client: httpx.AsyncClient,
+        client: "MatrixClient",
+        cmd: ControlCommand,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """
+        M3.2: Execute !runbook complete <run_id> step=<n> status=ok|warn|fail|skipped [notes=...]
+        Missing run_id, a non-decimal step or an unknown status return the usage reply.
+        Delegates to _ctrl_runner.complete_runbook_step and audits
+        matrix.control.runbook.complete for success and RunnerError.
+        """
+        run_id = cmd.args[0].strip() if cmd.args else ""
+        if not run_id:
+            return complete_usage_reply()
+
+        # step kwarg required; isdecimal() (not isdigit()) so int() cannot raise on input like "²"
+        step_raw = cmd.kwargs.get("step", "").strip()
+        if not step_raw or not step_raw.isdecimal():
+            return complete_usage_reply()
+        step_index = int(step_raw)
+
+        # status kwarg required (case-insensitive); anything outside this set is a usage error
+        status = cmd.kwargs.get("status", "").strip().lower()
+        if status not in ("ok", "warn", "fail", "skipped"):
+            return complete_usage_reply()
+
+        # notes: kwarg or remaining positional args (joined with space)
+        notes = cmd.kwargs.get("notes", "").strip()
+        if not notes and len(cmd.args) > 1:
+            notes = " ".join(cmd.args[1:])
+        notes = sanitize_notes(notes)  # M3.4: strip control chars + truncate to MAX_NOTES_LEN
+
+        http_status: Optional[int] = None  # becomes 200 only after the runner call returns
+        try:
+            result = await _ctrl_runner.complete_runbook_step(
+                http_client=http_client,
+                console_url=self._console_url,
+                control_token=self._control_token,
+                run_id=run_id,
+                step_index=step_index,
+                status=status,
+                notes=notes,
+                operator_id=sender,
+            )
+            http_status = 200
+            run_completed = bool(result.get("run_completed", False))
+
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.complete",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="ok",
+                data={
+                    "sender": sender,
+                    "run_id": run_id,
+                    "step_index": step_index,
+                    "status": status,
+                    "run_completed": run_completed,
+                    "http_status": http_status,
+                },
+            )
+            return complete_ok_reply(run_id, step_index, status, run_completed)
+
+        except _ctrl_runner.RunnerError as exc:
+            reason = str(exc)
+            logger.error(
+                "!runbook complete failed: sender=%s run_id=%r step=%d error=%s",
+                sender, run_id, step_index, reason,
+            )
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.complete",
+                agent_id="control", node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="runner_error",
+                data={
+                    "sender": sender,
+                    "run_id": run_id,
+                    "step_index": step_index,
+                    "error": reason,
+                    "http_status": http_status,
+                },
+            )
+            return complete_error_reply(run_id, reason)
+
+    async def _handle_runbook_status(
+        self,
+        http_client: httpx.AsyncClient,
+        client: "MatrixClient",
+        cmd: ControlCommand,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """M3.3: serve ``!runbook status <run_id>``: fetch the run, audit the outcome, return reply text."""
+        requested_run = cmd.args[0].strip() if cmd.args else ""
+        if requested_run == "":
+            return status_usage_reply()   # no run id: usage help, nothing audited
+
+        # Stays None unless the runner call returns without raising.
+        observed_http: Optional[int] = None
+        try:
+            run_info = await _ctrl_runner.get_runbook_run(
+                http_client=http_client,
+                console_url=self._console_url,
+                control_token=self._control_token,
+                run_id=requested_run,
+            )
+            observed_http = 200
+            ok_payload = {
+                "sender": sender, "run_id": requested_run,
+                "run_status": run_info.get("status"),
+                "http_status": observed_http,
+            }
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.status", agent_id="control",
+                node_id=self._node_id, room_id=room_id, event_id=event_id,
+                status="ok", data=ok_payload,
+            )
+            return status_reply(run_info)
+
+        except _ctrl_runner.RunnerError as exc:
+            # Runner failure: log it, audit it as runner_error, answer with the error reply.
+            failure = str(exc)
+            logger.error("!runbook status failed: sender=%s run_id=%r error=%s", sender, requested_run, failure)
+            err_payload = {"sender": sender, "run_id": requested_run, "error": failure, "http_status": observed_http}
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.status", agent_id="control",
+                node_id=self._node_id, room_id=room_id, event_id=event_id,
+                status="error", error_code="runner_error", data=err_payload,
+            )
+            return status_error_reply(requested_run, failure)
+
+    async def _handle_runbook_evidence(
+        self,
+        http_client: httpx.AsyncClient,
+        client: "MatrixClient",
+        cmd: ControlCommand,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """M3.3: serve ``!runbook evidence <run_id>``: request evidence generation, audit, return reply text."""
+        requested_run = cmd.args[0].strip() if cmd.args else ""
+        if requested_run == "":
+            return evidence_usage_reply()   # no run id: usage help, nothing audited
+
+        # Stays None unless the runner call returns without raising.
+        observed_http: Optional[int] = None
+        try:
+            outcome = await _ctrl_runner.generate_evidence(
+                http_client=http_client,
+                console_url=self._console_url,
+                control_token=self._control_token,
+                run_id=requested_run,
+            )
+            observed_http = 200
+            ok_payload = {
+                "sender": sender, "run_id": requested_run,
+                "evidence_path": outcome.get("evidence_path"),
+                "bytes": outcome.get("bytes"),
+                "http_status": observed_http,
+            }
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.evidence", agent_id="control",
+                node_id=self._node_id, room_id=room_id, event_id=event_id,
+                status="ok", data=ok_payload,
+            )
+            return evidence_reply(outcome)
+
+        except _ctrl_runner.RunnerError as exc:
+            # Runner failure: log it, audit it as runner_error, answer with the error reply.
+            failure = str(exc)
+            logger.error("!runbook evidence failed: sender=%s run_id=%r error=%s", sender, requested_run, failure)
+            err_payload = {"sender": sender, "run_id": requested_run, "error": failure, "http_status": observed_http}
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.evidence", agent_id="control",
+                node_id=self._node_id, room_id=room_id, event_id=event_id,
+                status="error", error_code="runner_error", data=err_payload,
+            )
+            return evidence_error_reply(requested_run, failure)
+
+    async def _handle_runbook_post_review(
+        self,
+        http_client: httpx.AsyncClient,
+        client: "MatrixClient",
+        cmd: ControlCommand,
+        sender: str,
+        room_id: str,
+        event_id: str,
+    ) -> str:
+        """M3.3: serve ``!runbook post_review <run_id>``: request the post-release review, audit, return reply text."""
+        requested_run = cmd.args[0].strip() if cmd.args else ""
+        if requested_run == "":
+            return post_review_usage_reply()   # no run id: usage help, nothing audited
+
+        # Stays None unless the runner call returns without raising.
+        observed_http: Optional[int] = None
+        try:
+            outcome = await _ctrl_runner.generate_post_review(
+                http_client=http_client,
+                console_url=self._console_url,
+                control_token=self._control_token,
+                run_id=requested_run,
+            )
+            observed_http = 200
+            ok_payload = {
+                "sender": sender, "run_id": requested_run,
+                "path": outcome.get("path"),
+                "bytes": outcome.get("bytes"),
+                "http_status": observed_http,
+            }
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.post_review", agent_id="control",
+                node_id=self._node_id, room_id=room_id, event_id=event_id,
+                status="ok", data=ok_payload,
+            )
+            return post_review_reply(outcome)
+
+        except _ctrl_runner.RunnerError as exc:
+            # Runner failure: log it, audit it as runner_error, answer with the error reply.
+            failure = str(exc)
+            logger.error("!runbook post_review failed: sender=%s run_id=%r error=%s", sender, requested_run, failure)
+            err_payload = {"sender": sender, "run_id": requested_run, "error": failure, "http_status": observed_http}
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.control.runbook.post_review", agent_id="control",
+                node_id=self._node_id, room_id=room_id, event_id=event_id,
+                status="error", error_code="runner_error", data=err_payload,
+            )
+            return post_review_error_reply(requested_run, failure)
+
     # ── Worker ─────────────────────────────────────────────────────────────────
 
     async def _worker(
@@ -760,12 +3216,76 @@ class MatrixIngressLoop:
             },
         )
 
-        # M2.1: session isolation per (room, agent) for mixed rooms
-        room_key = room_id.replace("!", "").replace(":", "_")
-        if is_mixed:
-            session_id = f"matrix:{room_key}:{agent_id}"
-        else:
-            session_id = f"matrix:{room_key}"
+        # Session Scope v2: canonical key + PII-safe sender hash.
+        # Scope is always room_agent for user-initiated messages.
+        # Control room messages never reach _process_entry (handled by _try_control).
+        scope = SCOPE_ROOM_AGENT
+        session_id = _build_session_key(room_id, agent_id, scope=scope)
+        sender_hash = _sender_hash(sender)
+
+        logger.debug(
+            "Session scope v2: session_key=%s scope=%s sender_hash=%s",
+            session_id, scope, sender_hash,
+        )
+
+        # M5.0: Node-aware routing
+        # Extract node=X kwarg from body (mixed rooms only, to avoid breaking direct rooms)
+        explicit_node: Optional[str] = None
+        effective_text = text
+        if is_mixed and self._node_policy is not None:
+            explicit_node, effective_text = extract_node_kwarg(text)
+
+        # M6.0: look up dynamic policy store override for this room
+        store_override: Optional[str] = None
+        if self._policy_store is not None and self._policy_store.is_open:
+            try:
+                store_override = await asyncio.to_thread(
+                    self._policy_store.get_override, room_id
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("PolicyStore get_override failed: %s", exc)
+
+        node_res = (
+            self._node_policy.resolve(room_id, explicit_node, store_override=store_override)
+            if self._node_policy is not None
+            else NodeResolution(node_id=self._node_id, source=NODE_SOURCE_DEFAULT)
+        )
+
+        if node_res.rejected_node:
+            logger.info(
+                "Node kwarg rejected: requested=%s allowed=%s room=%s agent=%s",
+                node_res.rejected_node, self._node_policy.allowed_nodes if self._node_policy else {}, room_id, agent_id,
+            )
+            if self._on_node_rejected:
+                self._on_node_rejected(node_res.rejected_node)
+            txn_rej = MatrixClient.make_txn_id(room_id, event_id + "_node_rej")
+            allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset()
+            reply_rej = node_rejected_reply(node_res.rejected_node, allowed)
+            try:
+                await client.send_text(room_id, reply_rej, txn_rej)
+            except Exception as exc:
+                logger.warning("Could not send node rejection reply: %s", exc)
+            await _write_audit(
+                http_client, self._console_url, self._internal_token,
+                event="matrix.route.node_rejected",
+                agent_id=agent_id, node_id=self._node_id,
+                room_id=room_id, event_id=event_id,
+                status="error", error_code="node_rejected",
+                data={"requested_node": node_res.rejected_node, "resolved_node": node_res.node_id},
+            )
+            # Continue with fallback node (do not drop the message)
+
+        if self._on_node_selected:
+            self._on_node_selected(agent_id, node_res.node_id, node_res.source)
+
+        await _write_audit(
+            http_client, self._console_url, self._internal_token,
+            event="matrix.route.node_selected",
+            agent_id=agent_id, node_id=node_res.node_id,
+            room_id=room_id, event_id=event_id,
+            status="ok",
+            data={"node_id": node_res.node_id, "source": node_res.source},
+        )
 
         # M2.2: per-room-agent concurrency cap (only for mixed rooms; single-agent rooms unaffected)
         _lock = self._get_concurrency_lock(room_id, agent_id) if is_mixed and self._mixed_concurrency_cap > 0 else None
@@ -774,6 +3294,9 @@ class MatrixIngressLoop:
         try:
             await self._invoke_and_send(
                 client, http_client, entry, session_id, wait_s, is_mixed, routing_reason,
+                sender_hash=sender_hash, scope=scope,
+                effective_node_id=node_res.node_id, node_source=node_res.source,
+                effective_text=effective_text,
             )
         finally:
             if _lock is not None:
@@ -788,76 +3311,212 @@ class MatrixIngressLoop:
         wait_s: float,
         is_mixed: bool,
         routing_reason: str,
+        sender_hash: str = "",
+        scope: str = SCOPE_ROOM_AGENT,
+        # M5.0: resolved node
+        effective_node_id: Optional[str] = None,
+        node_source: str = NODE_SOURCE_DEFAULT,
+        effective_text: Optional[str] = None,   # text with node=X kwarg stripped
     ) -> None:
         """Inner: invoke Router + send reply (separated for concurrency lock wrapping)."""
         event = entry.event
         event_id = event.get("event_id", "")
-        text = event.get("content", {}).get("body", "").strip()
+        # Use effective_text if provided (node kwarg stripped), otherwise original body
+        text = effective_text if effective_text is not None else event.get("content", {}).get("body", "").strip()
         room_id = entry.room_id
         agent_id = entry.agent_id
+        node_id = effective_node_id if effective_node_id is not None else self._node_id
 
-        # H3: Invoke with latency
+        # H3 + M8.0 + M8.1: Invoke with latency tracking, soft-failover, and sticky routing
         t0 = time.monotonic()
         reply_text: Optional[str] = None
         invoke_ok = False
         invoke_duration_s = 0.0
+        used_node_id = node_id   # may change on failover
 
-        try:
-            reply_text = await _invoke_router(
-                http_client, self._router_url,
-                agent_id=agent_id, node_id=self._node_id,
-                prompt=text, session_id=session_id,
-            )
-            invoke_ok = True
-            invoke_duration_s = time.monotonic() - t0
-            if self._on_invoke_latency:
-                self._on_invoke_latency(agent_id, invoke_duration_s)
+        # M8.1: check sticky cache (skip primary if sticky is set for this room:agent)
+        sticky_key = make_sticky_key(room_id, agent_id)
+        sticky_node: Optional[str] = None
+        if node_source != NODE_SOURCE_EXPLICIT and self._sticky_cache is not None:
+            sticky_node = self._sticky_cache.get(sticky_key)
+
+        async def _do_invoke(target_node: str, target_source: str) -> Optional[str]:
+            """Single invoke attempt; returns reply text or None on failure."""
+            nonlocal invoke_duration_s
+            _t = time.monotonic()
+            try:
+                result = await _invoke_router(
+                    http_client, self._router_url,
+                    agent_id=agent_id, node_id=target_node,
+                    prompt=text, session_id=session_id,
+                    sender_hash=sender_hash, scope=scope,
+                    node_source=target_source,
+                )
+                invoke_duration_s = time.monotonic() - _t
+                if self._node_health_tracker is not None:
+                    self._node_health_tracker.record_ok(target_node, invoke_duration_s)
+                if self._on_invoke_latency:
+                    self._on_invoke_latency(agent_id, invoke_duration_s, target_node)
+                logger.info(
+                    "Invoke ok: agent=%s node=%s event=%s reply_len=%d duration=%dms",
+                    agent_id, target_node, event_id, len(result or ""), int(invoke_duration_s * 1000),
+                )
+                return result
+            except httpx.HTTPStatusError as exc:
+                invoke_duration_s = time.monotonic() - _t
+                _reason = FAILOVER_REASON_HTTP_5XX if exc.response.status_code >= 500 else "http_4xx"
+                logger.error(
+                    "Router HTTP %d agent=%s node=%s event=%s duration=%dms",
+                    exc.response.status_code, agent_id, target_node, event_id,
+                    int(invoke_duration_s * 1000),
+                )
+                if self._node_health_tracker and exc.response.status_code >= 500:
+                    self._node_health_tracker.record_error(target_node, _reason)
+                if self._on_gateway_error:
+                    self._on_gateway_error(f"http_{exc.response.status_code}")
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.error", agent_id=agent_id, node_id=target_node,
+                    room_id=room_id, event_id=event_id,
+                    status="error", error_code=f"router_http_{exc.response.status_code}",
+                    duration_ms=int(invoke_duration_s * 1000),
+                )
+                if exc.response.status_code >= 500:
+                    raise  # eligible for failover
+                return None  # 4xx: not a node issue, don't failover
+
+            except (httpx.ConnectError, httpx.TimeoutException) as exc:
+                invoke_duration_s = time.monotonic() - _t
+                _reason = (
+                    FAILOVER_REASON_TIMEOUT
+                    if isinstance(exc, httpx.TimeoutException)
+                    else FAILOVER_REASON_NETWORK
+                )
+                logger.error(
+                    "Router network error agent=%s node=%s event=%s: %s",
+                    agent_id, target_node, event_id, exc,
+                )
+                if self._node_health_tracker:
+                    self._node_health_tracker.record_error(target_node, _reason)
+                if self._on_gateway_error:
+                    self._on_gateway_error("network_error")
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.error", agent_id=agent_id, node_id=target_node,
+                    room_id=room_id, event_id=event_id,
+                    status="error", error_code="router_network_error",
+                    duration_ms=int(invoke_duration_s * 1000),
+                )
+                raise  # eligible for failover
+
+            except Exception as exc:
+                invoke_duration_s = time.monotonic() - _t
+                logger.error(
+                    "Unexpected invoke error agent=%s node=%s event=%s: %s",
+                    agent_id, target_node, event_id, exc,
+                )
+                if self._node_health_tracker:
+                    self._node_health_tracker.record_error(target_node, "unexpected")
+                if self._on_gateway_error:
+                    self._on_gateway_error("unexpected")
+                await _write_audit(
+                    http_client, self._console_url, self._internal_token,
+                    event="matrix.error", agent_id=agent_id, node_id=target_node,
+                    room_id=room_id, event_id=event_id,
+                    status="error", error_code="router_unexpected",
+                    duration_ms=int(invoke_duration_s * 1000),
+                )
+                return None  # unexpected errors: no failover (could be code bug)
+
+        if sticky_node is not None:
+            # M8.1: sticky path — route directly to known-good fallback, skip primary
             logger.info(
-                "Invoke ok: agent=%s event=%s reply_len=%d duration=%dms",
-                agent_id, event_id, len(reply_text or ""), int(invoke_duration_s * 1000),
+                "Sticky: routing %s→%s (skipping primary=%s) agent=%s event=%s",
+                sticky_key, sticky_node, node_id, agent_id, event_id,
             )
+            try:
+                reply_text = await _do_invoke(sticky_node, NODE_SOURCE_DEFAULT)
+                invoke_ok = reply_text is not None
+                used_node_id = sticky_node
+            except Exception:  # noqa: BLE001
+                # Sticky node also failed — clear sticky and leave reply_text=None
+                self._sticky_cache.delete(sticky_key)  # type: ignore[union-attr]
+                logger.warning(
+                    "Sticky node %s failed for %s — cleared (agent=%s event=%s)",
+                    sticky_node, sticky_key, agent_id, event_id,
+                )
+                # M8.2: remove from DB as well
+                await self._sticky_persist_delete(sticky_key)
 
-        except httpx.HTTPStatusError as exc:
-            invoke_duration_s = time.monotonic() - t0
-            logger.error(
-                "Router HTTP %d agent=%s event=%s duration=%dms",
-                exc.response.status_code, agent_id, event_id, int(invoke_duration_s * 1000),
-            )
-            if self._on_gateway_error:
-                self._on_gateway_error(f"http_{exc.response.status_code}")
-            await _write_audit(
-                http_client, self._console_url, self._internal_token,
-                event="matrix.error", agent_id=agent_id, node_id=self._node_id,
-                room_id=room_id, event_id=event_id,
-                status="error", error_code=f"router_http_{exc.response.status_code}",
-                duration_ms=int(invoke_duration_s * 1000),
-            )
+        else:
+            # Normal path: try primary; attempt failover on eligible errors
+            try:
+                reply_text = await _do_invoke(node_id, node_source)
+                invoke_ok = reply_text is not None
+                used_node_id = node_id
 
-        except (httpx.ConnectError, httpx.TimeoutException) as exc:
-            invoke_duration_s = time.monotonic() - t0
-            logger.error("Router network error agent=%s event=%s: %s", agent_id, event_id, exc)
-            if self._on_gateway_error:
-                self._on_gateway_error("network_error")
-            await _write_audit(
-                http_client, self._console_url, self._internal_token,
-                event="matrix.error", agent_id=agent_id, node_id=self._node_id,
-                room_id=room_id, event_id=event_id,
-                status="error", error_code="router_network_error",
-                duration_ms=int(invoke_duration_s * 1000),
-            )
+            except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError):
+                # Primary failed with a failover-eligible error.
+                # Failover only for non-explicit routing (explicit = user chose node).
+                if node_source == NODE_SOURCE_EXPLICIT:
+                    logger.info(
+                        "Node %s failed for explicit routing — no failover (agent=%s event=%s)",
+                        node_id, agent_id, event_id,
+                    )
+                    # reply_text stays None; error already audited
+                else:
+                    # Attempt failover
+                    fallback_node: Optional[str] = None
+                    if self._node_health_tracker is not None and self._node_policy is not None:
+                        fallback_node = self._node_health_tracker.pick_fallback(
+                            node_id, self._node_policy.allowed_nodes
+                        )
+                    elif self._node_policy is not None:
+                        # No tracker — pick any other allowed node deterministically
+                        others = sorted(
+                            n for n in self._node_policy.allowed_nodes if n != node_id
+                        )
+                        fallback_node = others[0] if others else None
 
-        except Exception as exc:
-            invoke_duration_s = time.monotonic() - t0
-            logger.error("Unexpected invoke error agent=%s event=%s: %s", agent_id, event_id, exc)
-            if self._on_gateway_error:
-                self._on_gateway_error("unexpected")
-            await _write_audit(
-                http_client, self._console_url, self._internal_token,
-                event="matrix.error", agent_id=agent_id, node_id=self._node_id,
-                room_id=room_id, event_id=event_id,
-                status="error", error_code="router_unexpected",
-                duration_ms=int(invoke_duration_s * 1000),
-            )
+                    if fallback_node:
+                        logger.warning(
+                            "Failover: %s → %s agent=%s event=%s",
+                            node_id, fallback_node, agent_id, event_id,
+                        )
+                        try:
+                            reply_text = await _do_invoke(fallback_node, NODE_SOURCE_DEFAULT)
+                            invoke_ok = reply_text is not None
+                            used_node_id = fallback_node
+                            if invoke_ok:
+                                # Fire failover callback and audit
+                                if self._on_failover:
+                                    self._on_failover(node_id, fallback_node, "invoke_error")
+                                await _write_audit(
+                                    http_client, self._console_url, self._internal_token,
+                                    event="matrix.node.failover",
+                                    agent_id=agent_id, node_id=fallback_node,
+                                    room_id=room_id, event_id=event_id,
+                                    status="ok", data={
+                                        "from_node": node_id,
+                                        "to_node": fallback_node,
+                                        "original_source": node_source,
+                                    },
+                                )
+                                # M8.1: set sticky — future messages skip primary
+                                if self._sticky_cache is not None:
+                                    self._sticky_cache.set(sticky_key, fallback_node)
+                                    scope = "mixed" if is_mixed else "direct"
+                                    if self._on_sticky_set:
+                                        self._on_sticky_set(fallback_node, scope)
+                                    logger.info(
+                                        "Sticky set: %s → %s scope=%s ttl=%.0fs",
+                                        sticky_key, fallback_node, scope,
+                                        self._sticky_cache.ttl_s,
+                                    )
+                                    # M8.2: persist sticky to DB
+                                    await self._sticky_persist_set(sticky_key, fallback_node)
+                        except Exception:  # noqa: BLE001
+                            pass  # errors already audited inside _do_invoke
 
         if not invoke_ok or not reply_text:
             if invoke_ok:
@@ -881,7 +3540,7 @@ class MatrixIngressLoop:
                 self._on_message_replied(room_id, agent_id, "ok")
             await _write_audit(
                 http_client, self._console_url, self._internal_token,
-                event="matrix.agent.replied", agent_id=agent_id, node_id=self._node_id,
+                event="matrix.agent.replied", agent_id=agent_id, node_id=used_node_id,
                 room_id=room_id, event_id=event_id, status="ok",
                 duration_ms=int(send_duration_s * 1000),
                 data={
@@ -891,6 +3550,8 @@ class MatrixIngressLoop:
                     "queue_wait_ms": int(wait_s * 1000),
                     "routing_reason": routing_reason,
                     "is_mixed": is_mixed,
+                    "node_source": node_source,
+                    "failover": used_node_id != node_id,  # M8.0: failover flag
                 },
             )
             logger.info(
@@ -907,7 +3568,7 @@ class MatrixIngressLoop:
                 self._on_gateway_error("matrix_send_error")
             await _write_audit(
                 http_client, self._console_url, self._internal_token,
-                event="matrix.error", agent_id=agent_id, node_id=self._node_id,
+                event="matrix.error", agent_id=agent_id, node_id=node_id,
                 room_id=room_id, event_id=event_id,
                 status="error", error_code="matrix_send_failed",
                 duration_ms=int(send_duration_s * 1000),
diff --git a/services/matrix-bridge-dagi/app/main.py b/services/matrix-bridge-dagi/app/main.py
index a297931e..f91713a7 100644
--- a/services/matrix-bridge-dagi/app/main.py
+++ b/services/matrix-bridge-dagi/app/main.py
@@ -33,6 +33,9 @@ except ImportError:  # pragma: no cover
 
 from .config import BridgeConfig, load_config
 from .control import ControlConfig, parse_control_config
+from .control_limiter import ControlRateLimiter
+from .event_store import EventStore
+from .node_policy import parse_node_policy
 from .ingress import MatrixIngressLoop
 from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
 from .rate_limit import InMemoryRateLimiter
@@ -69,7 +72,7 @@ if _PROM_OK:
     _invoke_latency = Histogram(
         "matrix_bridge_invoke_duration_seconds",
         "Latency of DAGI Router infer call",
-        ["agent_id"],
+        ["agent_id", "node_id"],   # M5.1: per-node latency breakdown
         buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
     )
     _send_latency = Histogram(
@@ -80,7 +83,8 @@ if _PROM_OK:
     )
     _bridge_up = Gauge(
         "matrix_bridge_up",
-        "1 if bridge started successfully",
+        "1 if bridge started successfully; 0 on config error",
+        ["node_id"],   # M7.1: per-node label for multi-node deployments
     )
     _rate_limiter_active_rooms = Gauge(
         "matrix_bridge_rate_limiter_active_rooms",
@@ -106,10 +110,11 @@ if _PROM_OK:
         ["agent_id"],
         buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
     )
-    # M2.2: Mixed room routing metrics
-    _routed_total = Counter(
-        "matrix_bridge_routed_total",
-        "Successful message routing by reason",
+    # M2.2: Mixed room routing — reason breakdown (slash/mention/name/default/direct)
+    # M7.1: Renamed from matrix_bridge_routed_total to avoid collision with M5.0 counter
+    _routing_reasons_total = Counter(
+        "matrix_bridge_routing_reasons_total",
+        "Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
         ["agent_id", "reason"],
     )
     _route_rejected_total = Counter(
@@ -127,20 +132,74 @@ if _PROM_OK:
         "Total control commands received from authorized operators",
         ["sender", "verb", "subcommand"],
     )
+    _control_rate_limited_total = Counter(
+        "matrix_bridge_control_rate_limited_total",
+        "Total control commands rejected by rate limiter or cooldown",
+        ["scope"],
+    )
+    _dedupe_persistent_hits_total = Counter(
+        "matrix_bridge_dedupe_persistent_hits_total",
+        "Total events dropped by persistent (SQLite) deduplication",
+        ["room_id"],
+    )
+    _dedupe_persistent_inserts_total = Counter(
+        "matrix_bridge_dedupe_persistent_inserts_total",
+        "Total events marked as processed in persistent dedupe store",
+    )
+    # M5.0: node-aware routing — primary routed counter (unique name, no collision)
+    _routed_total = Counter(
+        "matrix_bridge_routed_total",
+        "Total messages successfully routed, by agent, resolved node, and node source",
+        ["agent_id", "node_id", "source"],
+    )
+    _node_rejected_total = Counter(
+        "matrix_bridge_node_rejected_total",
+        "Total messages with rejected (non-allowlisted) node kwarg",
+        ["node_id"],
+    )
+    # M8.0: soft-failover metrics
+    _failover_total = Counter(
+        "matrix_bridge_failover_total",
+        "Total successful soft-failovers by node transition and reason",
+        ["from_node", "to_node", "reason"],
+    )
+    _node_health_state = Gauge(
+        "matrix_bridge_node_health_state",
+        "Node health state: 1=healthy 0.5=degraded 0=down",
+        ["node_id"],
+    )
+    # M8.1: sticky routing metrics
+    _sticky_set_total = Counter(
+        "matrix_bridge_sticky_node_total",
+        "Total sticky routing entries set after failover, by preferred node and scope",
+        ["node_id", "scope"],
+    )
+    _sticky_active = Gauge(
+        "matrix_bridge_sticky_node_active",
+        "Current count of active sticky routing entries",
+        [],
+    )
 
 # ── Startup state ─────────────────────────────────────────────────────────────
 _START_TIME = time.monotonic()
 _cfg: Optional[BridgeConfig] = None
+# M5.1: in-memory per-node counters (lightweight, for !status reply)
+from collections import defaultdict as _defaultdict
+_node_stats: Dict[str, Dict[str, int]] = _defaultdict(lambda: {"routed": 0, "rejected": 0})
 _config_error: Optional[str] = None
 _matrix_reachable: Optional[bool] = None
 _gateway_reachable: Optional[bool] = None
 _room_map: Optional[RoomMappingConfig] = None
 _mixed_room_config: Optional[MixedRoomConfig] = None
 _control_config: Optional[ControlConfig] = None
+_event_store: Optional[EventStore] = None
 _rate_limiter: Optional[InMemoryRateLimiter] = None
 _ingress_loop: Optional["MatrixIngressLoop"] = None   # for /health queue_size
 _ingress_task: Optional[asyncio.Task] = None
 _ingress_stop: Optional[asyncio.Event] = None
+_sticky_cache: Optional[Any] = None   # M8.1: StickyNodeCache instance
+_confirm_store: Optional[Any] = None  # M9.0: ConfirmStore instance
+_dummy_http_client: Optional[Any] = None  # M11: soak inject endpoint (debug only)
 
 
 async def _probe_url(url: str, timeout: float = 5.0) -> bool:
@@ -230,7 +289,7 @@ async def lifespan(app_: Any):
         else:
             logger.warning("⚠️  DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
         if _PROM_OK:
-            _bridge_up.set(1)
+            _bridge_up.labels(node_id=_cfg.node_id or "").set(1)  # M7.1: labeled
 
         # Start ingress loop (fire-and-forget asyncio task)
         _has_rooms = (_room_map and _room_map.total_mappings > 0) or (
@@ -263,9 +322,9 @@ async def lifespan(app_: Any):
                         _rate_limiter_active_rooms.set(stats["active_rooms"])
                         _rate_limiter_active_senders.set(stats["active_senders"])
 
-            def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
+            def _on_invoke_latency(agent_id: str, duration_s: float, node_id: str = "") -> None:
                 if _PROM_OK:
-                    _invoke_latency.labels(agent_id=agent_id).observe(duration_s)
+                    _invoke_latency.labels(agent_id=agent_id, node_id=node_id or "unknown").observe(duration_s)
 
             def _on_send_latency(agent_id: str, duration_s: float) -> None:
                 if _PROM_OK:
@@ -287,7 +346,7 @@ async def lifespan(app_: Any):
             # M2.2 callbacks
             def _on_routed(agent_id: str, reason: str) -> None:
                 if _PROM_OK:
-                    _routed_total.labels(agent_id=agent_id, reason=reason).inc()
+                    _routing_reasons_total.labels(agent_id=agent_id, reason=reason).inc()  # M7.1: renamed
 
             def _on_route_rejected(room_id: str, reason: str) -> None:
                 if _PROM_OK:
@@ -300,6 +359,164 @@ async def lifespan(app_: Any):
                         sender=sender, verb=verb, subcommand=subcommand
                     ).inc()
 
+            # M3.4: control safety rate limiter
+            _control_limiter = ControlRateLimiter(
+                room_rpm=_cfg.control_room_rpm,
+                operator_rpm=_cfg.control_operator_rpm,
+                run_next_rpm=_cfg.control_run_next_rpm,
+                cooldown_s=_cfg.control_cooldown_s,
+            ) if _control_config and _control_config.is_enabled else None
+
+            def _on_control_rate_limited(scope: str) -> None:
+                if _PROM_OK:
+                    _control_rate_limited_total.labels(scope=scope).inc()
+
+            # M2.3: Persistent event deduplication
+            _prune_task: Optional[asyncio.Task] = None
+            if _cfg.persistent_dedupe:
+                import os
+                db_path = os.path.join(_cfg.bridge_data_dir, "matrix_bridge.db")
+                _event_store = EventStore(
+                    db_path=db_path,
+                    ttl_h=_cfg.processed_events_ttl_h,
+                    prune_batch=_cfg.processed_events_prune_batch,
+                )
+                store_ok = await _event_store.open()
+                if store_ok:
+                    logger.info(
+                        "✅ Persistent dedupe: %s (ttl_h=%d)",
+                        db_path, _cfg.processed_events_ttl_h,
+                    )
+                    # Best-effort prune on startup
+                    pruned = await _event_store.prune()
+                    if pruned:
+                        logger.info("Startup prune removed %d stale events", pruned)
+                    # Periodic prune task
+                    if _cfg.processed_events_prune_interval_s > 0:
+                        async def _prune_loop() -> None:
+                            while True:
+                                await asyncio.sleep(_cfg.processed_events_prune_interval_s)
+                                if _event_store:
+                                    await _event_store.prune()
+                        _prune_task = asyncio.create_task(_prune_loop(), name="event_store_prune")
+                else:
+                    logger.warning("⚠️  EventStore init failed — persistent dedupe disabled (degraded)")
+                    _event_store = None
+            else:
+                logger.info("Persistent dedupe disabled (PERSISTENT_DEDUPE=0)")
+
+            def _on_dedupe_hit(room_id: str, agent_id: str) -> None:
+                if _PROM_OK:
+                    _dedupe_persistent_hits_total.labels(room_id=room_id).inc()
+
+            def _on_dedupe_insert() -> None:
+                if _PROM_OK:
+                    _dedupe_persistent_inserts_total.inc()
+
+            # M5.0: node-aware routing policy
+            _node_policy = parse_node_policy(
+                raw_allowed=_cfg.bridge_allowed_nodes,
+                default_node=_cfg.bridge_default_node,
+                raw_room_map=_cfg.bridge_room_node_map,
+            )
+            logger.info(
+                "✅ Node policy: default=%s allowed=%s room_overrides=%d",
+                _node_policy.default_node,
+                sorted(_node_policy.allowed_nodes),
+                len(_node_policy.room_node_map),
+            )
+
+            # M6.0: Persistent policy store for dynamic room-node overrides
+            _policy_store: Optional[Any] = None
+            try:
+                from .policy_store import PolicyStore as _PolicyStore
+                import os
+                _ps_path = os.path.join(_cfg.bridge_data_dir, "policy_overrides.db")
+                _policy_store = _PolicyStore(db_path=_ps_path)
+                _policy_store.open()
+                logger.info(
+                    "✅ Policy store: %s (%d overrides)",
+                    _ps_path, _policy_store.count_overrides(),
+                )
+            except Exception as _ps_exc:
+                logger.warning("Policy store init failed (non-fatal): %s", _ps_exc)
+                _policy_store = None
+
+            def _on_node_selected(agent_id: str, node_id: str, source: str) -> None:
+                if _PROM_OK:
+                    _routed_total.labels(agent_id=agent_id, node_id=node_id, source=source).inc()
+                _node_stats[node_id]["routed"] += 1
+
+            def _on_node_rejected(rejected_node: str) -> None:
+                if _PROM_OK:
+                    _node_rejected_total.labels(node_id=rejected_node).inc()
+                _node_stats[rejected_node]["rejected"] += 1
+
+            # M8.0: Node health tracker + soft-failover
+            from .node_health import NodeHealthTracker as _NodeHealthTracker, parse_node_health_config as _parse_nhc
+            _health_cfg = _parse_nhc(
+                fail_consecutive=_cfg.node_fail_consecutive,
+                lat_ewma_s=_cfg.node_lat_ewma_s,
+                ewma_alpha=_cfg.node_ewma_alpha,
+            )
+            _node_health_tracker = _NodeHealthTracker(_health_cfg)
+            logger.info(
+                "✅ Node health tracker: fail_consecutive=%d lat_ewma_s=%.1f ewma_alpha=%.2f",
+                _cfg.node_fail_consecutive, _cfg.node_lat_ewma_s, _cfg.node_ewma_alpha,
+            )
+
+            def _on_failover(from_node: str, to_node: str, reason: str) -> None:
+                # Count the node transition and refresh the health gauges (Prometheus only).
+                if _PROM_OK:
+                    transition = {"from_node": from_node, "to_node": to_node, "reason": reason}
+                    _failover_total.labels(**transition).inc()
+                    _update_health_gauges()
+                # The log line is emitted whether or not Prometheus is available.
+                logger.info("⚡ Failover: %s → %s reason=%s", from_node, to_node, reason)
+
+            def _update_health_gauges() -> None:
+                # Mirror each allowed node's tracker state into the per-node health gauge.
+                if _PROM_OK and _node_health_tracker is not None and _node_policy is not None:
+                    gauge_values = {"healthy": 1.0, "degraded": 0.5, "down": 0.0}
+                    for allowed in _node_policy.allowed_nodes:
+                        value = gauge_values.get(_node_health_tracker.state(allowed), 1.0)  # unknown → 1.0
+                        _node_health_state.labels(node_id=allowed).set(value)
+
+            # M8.1: Sticky failover cache
+            from .sticky_cache import StickyNodeCache as _StickyNodeCache
+            global _sticky_cache
+            if _cfg.failover_sticky_ttl_s > 0:
+                _sticky_cache = _StickyNodeCache(ttl_s=_cfg.failover_sticky_ttl_s)
+                logger.info("✅ Sticky failover cache: ttl=%.0fs", _cfg.failover_sticky_ttl_s)
+            else:
+                _sticky_cache = None
+                logger.info("ℹ️  Sticky failover disabled (FAILOVER_STICKY_TTL_S=0)")
+
+            # M9.0: Confirm store
+            from .confirm_store import ConfirmStore as _ConfirmStore
+            global _confirm_store
+            if _cfg.confirm_ttl_s > 0:
+                _confirm_store = _ConfirmStore(ttl_s=_cfg.confirm_ttl_s)
+                logger.info("✅ Confirm store: ttl=%.0fs", _cfg.confirm_ttl_s)
+            else:
+                _confirm_store = None
+                logger.info("ℹ️  Confirm store disabled (CONFIRM_TTL_S=0)")
+
+            # M11: debug inject client (only created when inject is enabled)
+            global _dummy_http_client
+            if _cfg.debug_inject_enabled and _HTTPX_OK:
+                _dummy_http_client = _httpx.AsyncClient(timeout=30.0)
+                logger.warning(
+                    "⚠️  DEBUG_INJECT_ENABLED=true — synthetic event injection active. "
+                    "NEVER use in production!"
+                )
+
+            def _on_sticky_set(node_id: str, scope: str) -> None:
+                if _PROM_OK:  # metrics only: count the new sticky entry, then refresh the active gauge
+                    _sticky_set_total.labels(node_id=node_id, scope=scope).inc()
+                    if _sticky_cache is not None:  # gauge has no label names: .labels() would raise ValueError
+                        _sticky_active.set(_sticky_cache.active_count())
+
             ingress = MatrixIngressLoop(
                 matrix_homeserver_url=_cfg.matrix_homeserver_url,
                 matrix_access_token=_cfg.matrix_access_token,
@@ -330,7 +547,38 @@ async def lifespan(app_: Any):
                 on_route_rejected=_on_route_rejected,
                 control_config=_control_config,
                 control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
+                sofiia_control_token=_cfg.sofiia_control_token,
+                control_limiter=_control_limiter,
                 on_control_command=_on_control_command,
+                on_control_rate_limited=_on_control_rate_limited,
+                event_store=_event_store,
+                on_dedupe_persistent_hit=_on_dedupe_hit,
+                on_dedupe_persistent_insert=_on_dedupe_insert,
+                # M4.0: agent discovery
+                discovery_rpm=_cfg.discovery_rpm,
+                # M5.0: node-aware routing
+                node_policy=_node_policy,
+                on_node_selected=_on_node_selected,
+                on_node_rejected=_on_node_rejected,
+                # M5.1: node stats getter for !status
+                node_stats_getter=lambda: {k: dict(v) for k, v in _node_stats.items()},
+                # M6.0: dynamic room-node policy store
+                policy_store=_policy_store,
+                # M6.2: data directory for policy exports/imports
+                bridge_data_dir=_cfg.bridge_data_dir,
+                # M8.0: node health tracker + failover callback
+                node_health_tracker=_node_health_tracker,
+                on_failover=_on_failover,
+                # M8.1: sticky failover cache
+                sticky_cache=_sticky_cache,
+                on_sticky_set=_on_sticky_set,
+                # M8.2: HA persistence config
+                ha_health_snapshot_interval_s=_cfg.ha_health_snapshot_interval_s,
+                ha_health_max_age_s=_cfg.ha_health_max_age_s,
+                # M9.0: Two-step confirmation store
+                confirm_store=_confirm_store,
+                policy_export_retention_days=_cfg.policy_export_retention_days,
+                policy_history_limit=_cfg.policy_history_limit,
             )
             logger.info(
                 "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
@@ -349,7 +597,8 @@ async def lifespan(app_: Any):
         _config_error = str(exc)
         logger.error("❌ Config error: %s", _config_error)
         if _PROM_OK:
-            _bridge_up.set(0)
+            _cfg_node = _cfg.node_id if _cfg else ""
+            _bridge_up.labels(node_id=_cfg_node or "").set(0)  # M7.1: labeled
     yield
     # Shutdown: cancel ingress loop
     if _ingress_stop:
@@ -360,6 +609,23 @@ async def lifespan(app_: Any):
             await asyncio.wait_for(_ingress_task, timeout=5.0)
         except (asyncio.CancelledError, asyncio.TimeoutError):
             pass
+    # Shutdown: cancel prune task + close EventStore
+    if "_prune_task" in dir() and _prune_task and not _prune_task.done():  # type: ignore[name-defined]
+        _prune_task.cancel()  # type: ignore[name-defined]
+    if _event_store is not None:
+        await _event_store.close()
+    # M6.0: close policy store
+    if "_policy_store" in dir() and _policy_store is not None:  # type: ignore[name-defined]
+        try:
+            _policy_store.close()  # type: ignore[name-defined]
+        except Exception:  # noqa: BLE001
+            pass
+    # M11: close debug http client if open
+    if _dummy_http_client is not None:
+        try:
+            await _dummy_http_client.aclose()
+        except Exception:  # noqa: BLE001
+            pass
     logger.info("matrix-bridge-dagi shutting down")
 
 # ── App ───────────────────────────────────────────────────────────────────────
@@ -435,6 +701,89 @@ async def health() -> Dict[str, Any]:
             "operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
             "unauthorized_behavior": _cfg.control_unauthorized_behavior,
         },
+        "control_safety": {
+            "enabled": _cfg.control_room_rpm > 0 or _cfg.control_operator_rpm > 0,
+            "room_rpm": _cfg.control_room_rpm,
+            "operator_rpm": _cfg.control_operator_rpm,
+            "run_next_rpm": _cfg.control_run_next_rpm,
+            "cooldown_s": _cfg.control_cooldown_s,
+        },
+        "persistent_dedupe": _event_store.as_health_dict() if _event_store else {
+            "enabled": False,
+            "db_path": None,
+            "ttl_h": _cfg.processed_events_ttl_h,
+            "ok": False,
+            "last_prune_at": None,
+            "pruned_rows_last": 0,
+        },
+        # M6.0: policy store health
+        "policy_store": _health_policy_store_dict(),
+        # M8.1: sticky failover cache health
+        "sticky_cache": _health_sticky_dict(),
+        # M8.2: HA state persistence info
+        "ha_state": _health_ha_dict(),
+        # M9.0: confirm store
+        "confirm_store": _health_confirm_dict(),
+    }
+
+
+def _health_confirm_dict() -> Dict[str, Any]:
+    """Describe the M9.0 confirm store for the /health payload."""
+    store = _confirm_store
+    if store is not None:
+        pending = store.pending_count()
+        # pending_count() is read before ttl_s, matching the payload's key order.
+        return {"enabled": True, "pending": pending, "ttl_s": store.ttl_s}
+    # No store instance (disabled or not started): only the flag is reported.
+    return {"enabled": False}
+
+
+def _health_ha_dict() -> Dict[str, Any]:
+    """Summarise HA persistence state (M8.2) for /health; defaults on any failure."""
+    ha_info: Dict[str, Any] = {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0}
+    if _ingress_loop is not None:
+        try:
+            status = _ingress_loop.get_status()
+            sticky = status.get("ha_sticky_loaded", 0)
+            health = status.get("ha_health_loaded", False)
+            interval = status.get("ha_health_snapshot_interval_s", 0)
+            ha_info = {"sticky_loaded": sticky, "health_loaded": health, "snapshot_interval_s": interval}
+        except Exception:  # noqa: BLE001
+            pass  # keep the zeroed defaults
+    return ha_info
+
+
+def _health_sticky_dict() -> Dict[str, Any]:
+    """Describe the M8.1 sticky failover cache for the /health payload."""
+    cache = _sticky_cache
+    if cache is not None:
+        active = cache.active_count()
+        # active_count() is read before ttl_s, matching the payload's key order.
+        return {"enabled": True, "active_keys": active, "ttl_s": cache.ttl_s}
+    # No cache instance (disabled or not started): report an empty, disabled cache.
+    return {"enabled": False, "active_keys": 0, "ttl_s": 0}
+
+
+def _health_policy_store_dict() -> Dict[str, Any]:
+    """Summarise the policy store for /health, falling back to defaults on failure."""
+    loop = _ingress_loop
+    if loop is not None:
+        try:
+            status = loop.get_status()
+            return {
+                "ok": status.get("policy_store_ok", False),
+                "path": status.get("policy_store_path"),
+                "overrides_count": status.get("policy_overrides_count", 0),
+                "agent_overrides_count": status.get("policy_agent_overrides_count", 0),  # M6.1
+                "last_export_at": status.get("policy_last_export_at"),  # M6.2
+                "last_import_at": status.get("policy_last_import_at"),  # M6.2
+                "db_mtime": status.get("policy_db_mtime"),  # M6.2
+            }
+        except Exception:  # noqa: BLE001
+            pass  # fall through to the defaults below
+    return {
+        "ok": False, "path": None, "overrides_count": 0, "agent_overrides_count": 0,
+        "last_export_at": None, "last_import_at": None, "db_mtime": None,
     }
 
 
@@ -464,6 +813,101 @@ async def bridge_mappings() -> Dict[str, Any]:
     }
 
 
+# ── Debug / Soak (M11) ────────────────────────────────────────────────────────
+@app.post("/v1/debug/inject_event")
+async def debug_inject_event(body: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Synthetic event injection for soak/load testing.
+
+    Enabled ONLY when DEBUG_INJECT_ENABLED=true (never in production).
+
+    Body: { "room_id": "!room:server", "event": { Matrix event dict } }
+    The event is passed straight to the ingress loop's _try_enqueue, bypassing
+    Matrix polling; replies go to a no-op Matrix client and are discarded.
+    Missing event_id / type / content fields are filled with defaults.
+
+    Returns: { "ok": bool, "enqueued": bool, "room_id": str, "event_id": str }
+    or { "ok": false, "error": str } on failure (HTTP 403 when disabled).
+    """
+    if _cfg is None or not _cfg.debug_inject_enabled:
+        return Response(  # type: ignore[return-value]
+            '{"ok":false,"error":"debug inject disabled"}',
+            status_code=403,
+            media_type="application/json",
+        )
+    if _ingress_loop is None:
+        return {"ok": False, "error": "ingress loop not running"}
+
+    room_id = body.get("room_id", "")
+    event = body.get("event", {})
+    # Reject a non-dict "event" here; it would otherwise crash on .get() below.
+    if not room_id or not isinstance(event, dict) or not event:
+        return {"ok": False, "error": "missing room_id or event"}
+
+    # Ensure event has minimum required fields for ingress processing
+    if not event.get("event_id"):
+        # uuid4, not time.monotonic(): the monotonic origin is undefined, so IDs
+        # built from it can repeat (e.g. after a reboot) and look like duplicates.
+        import uuid as _uuid
+        event["event_id"] = f"!inject-{_uuid.uuid4().hex}"
+    if not event.get("type"):
+        event["type"] = "m.room.message"
+    if not event.get("content"):
+        event["content"] = {"msgtype": "m.text", "body": event.get("body", "soak-ping")}
+
+    try:
+        # Find the matching room mapping (direct rooms only for soak)
+        mapping = None
+        if _ingress_loop._room_map is not None:
+            for m in _ingress_loop._room_map.mappings:
+                if m.room_id == room_id:
+                    mapping = m
+                    break
+
+        if mapping is None:
+            return {"ok": False, "error": f"no mapping for room_id={room_id!r}"}
+
+        # Build a minimal stub Matrix client — replies are discarded for soak events
+        from .matrix_client import MatrixClient
+
+        class _SoakMatrixClient(MatrixClient):  # type: ignore[misc]
+            """No-op Matrix client for synthetic soak events."""
+            def __init__(self) -> None:  # noqa: D107
+                pass  # skip real __init__
+
+            async def mark_seen(self, room_id: str, event_id: str) -> None:  # type: ignore[override]
+                pass
+
+            async def send_text(self, room_id: str, text: str,  # type: ignore[override]
+                                txn: Optional[str] = None) -> None:
+                pass
+
+        if _dummy_http_client is None:
+            return {"ok": False, "error": "debug http client not initialised"}
+
+        await _ingress_loop._try_enqueue(
+            _SoakMatrixClient(),       # type: ignore[arg-type]
+            _ingress_loop._queue,
+            _dummy_http_client,
+            event,
+            mapping,
+        )
+    except Exception as exc:  # noqa: BLE001
+        return {"ok": False, "error": str(exc), "enqueued": False}
+
+    # Reaching here only means _try_enqueue did not raise; its result is not inspected.
+    return {
+        "ok": True,
+        "enqueued": True,
+        "room_id": room_id,
+        "event_id": event.get("event_id"),
+    }
+
+
+async def _noop_send(room_id: str, text: str, txn: Optional[str] = None) -> None:
+    """Discard a reply from an injected soak event: accept the arguments and do nothing."""
+
+
 # ── Metrics ───────────────────────────────────────────────────────────────────
 @app.get("/metrics")
 async def metrics():
diff --git a/services/matrix-bridge-dagi/app/metrics_contract.py b/services/matrix-bridge-dagi/app/metrics_contract.py
new file mode 100644
index 00000000..ce346def
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/metrics_contract.py
@@ -0,0 +1,224 @@
+"""
+Metrics Contract — Matrix Bridge DAGI
+Phase M7.1
+
+Single source of truth for all Prometheus metric names and their label sets.
+Used by:
+  - main.py (registers metrics against this contract)
+  - tests/test_matrix_bridge_m71_metrics_contract.py (static validation)
+  - ops/prometheus/alerts/matrix-bridge-dagi.rules.yml (PromQL expressions)
+  - ops/grafana/dashboards/matrix-bridge-dagi.json (panel queries)
+
+Format:
+  METRICS_CONTRACT: Dict[metric_name, MetricSpec]
+
+MetricSpec fields:
+  kind   : "counter" | "histogram" | "gauge"
+  labels : list of label names (empty list = no labels)
+  help   : one-line description
+  phase  : originating milestone
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Dict, List
+
+
+@dataclass(frozen=True)
+class MetricSpec:
+    kind: str                    # metric type: "counter" | "histogram" | "gauge"
+    labels: List[str]            # label names; empty list = metric has no labels
+    help: str                    # one-line description of the metric
+    phase: str = "M1"            # originating milestone for traceability; defaults to "M1"
+
+
+# ── Contract ──────────────────────────────────────────────────────────────────
+
+METRICS_CONTRACT: Dict[str, MetricSpec] = {
+
+    # ── Core message traffic ──────────────────────────────────────────────────
+    "matrix_bridge_messages_received_total": MetricSpec(
+        kind="counter",
+        labels=["room_id", "agent_id"],
+        help="Total Matrix messages received",
+        phase="M1",
+    ),
+    "matrix_bridge_messages_replied_total": MetricSpec(
+        kind="counter",
+        labels=["room_id", "agent_id", "status"],
+        help="Total agent replies sent to Matrix (status=ok|error)",
+        phase="M1",
+    ),
+    "matrix_bridge_rate_limited_total": MetricSpec(
+        kind="counter",
+        labels=["room_id", "agent_id", "limit_type"],
+        help="Messages dropped by rate limiter",
+        phase="H1",
+    ),
+    "matrix_bridge_gateway_errors_total": MetricSpec(
+        kind="counter",
+        labels=["error_type"],
+        help="Bridge errors by stage: sync_error, network_error, http_<status>, matrix_send_error, unexpected",
+        phase="M1",
+    ),
+
+    # ── Latency histograms ────────────────────────────────────────────────────
+    "matrix_bridge_invoke_duration_seconds": MetricSpec(
+        kind="histogram",
+        labels=["agent_id", "node_id"],
+        help="Latency of DAGI Router infer call, per agent and node",
+        phase="H3",
+    ),
+    "matrix_bridge_send_duration_seconds": MetricSpec(
+        kind="histogram",
+        labels=["agent_id"],
+        help="Latency of Matrix send_text call",
+        phase="H3",
+    ),
+    "matrix_bridge_queue_wait_seconds": MetricSpec(
+        kind="histogram",
+        labels=["agent_id"],
+        help="Time between enqueue and worker start processing",
+        phase="H3",
+    ),
+
+    # ── Queue ─────────────────────────────────────────────────────────────────
+    "matrix_bridge_queue_size": MetricSpec(
+        kind="gauge",
+        labels=[],
+        help="Current number of pending items in the work queue",
+        phase="H2",
+    ),
+    "matrix_bridge_queue_dropped_total": MetricSpec(
+        kind="counter",
+        labels=["room_id", "agent_id"],
+        help="Messages dropped because queue was full",
+        phase="H2",
+    ),
+
+    # ── Rate limiter gauges ───────────────────────────────────────────────────
+    "matrix_bridge_rate_limiter_active_rooms": MetricSpec(
+        kind="gauge",
+        labels=[],
+        help="Rooms with activity in the current rate-limit window",
+        phase="H1",
+    ),
+    "matrix_bridge_rate_limiter_active_senders": MetricSpec(
+        kind="gauge",
+        labels=[],
+        help="Senders with activity in the current rate-limit window",
+        phase="H1",
+    ),
+
+    # ── Routing ───────────────────────────────────────────────────────────────
+    "matrix_bridge_routing_reasons_total": MetricSpec(
+        kind="counter",
+        labels=["agent_id", "reason"],
+        help="Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
+        phase="M2.2",
+    ),
+    "matrix_bridge_route_rejected_total": MetricSpec(
+        kind="counter",
+        labels=["room_id", "reason"],
+        help="Messages rejected during routing (unknown agent, bad slash, no mapping, etc.)",
+        phase="M2.2",
+    ),
+    "matrix_bridge_active_room_agent_locks": MetricSpec(
+        kind="gauge",
+        labels=[],
+        help="Number of room-agent pairs currently holding a concurrency lock",
+        phase="M2.2",
+    ),
+
+    # ── Control channel ───────────────────────────────────────────────────────
+    "matrix_bridge_control_commands_total": MetricSpec(
+        kind="counter",
+        labels=["sender", "verb", "subcommand"],
+        help="Total control commands received from authorized operators",
+        phase="M3.0",
+    ),
+    "matrix_bridge_control_rate_limited_total": MetricSpec(
+        kind="counter",
+        labels=["scope"],
+        help="Total control commands rejected by rate limiter or cooldown",
+        phase="M3.4",
+    ),
+
+    # ── Persistent deduplication ─────────────────────────────────────────────
+    "matrix_bridge_dedupe_persistent_hits_total": MetricSpec(
+        kind="counter",
+        labels=["room_id"],
+        help="Total events dropped by persistent (SQLite) deduplication",
+        phase="M2.3",
+    ),
+    "matrix_bridge_dedupe_persistent_inserts_total": MetricSpec(
+        kind="counter",
+        labels=[],
+        help="Total events marked as processed in persistent dedupe store",
+        phase="M2.3",
+    ),
+
+    # ── Node-aware routing (M5.0) ─────────────────────────────────────────────
+    "matrix_bridge_routed_total": MetricSpec(
+        kind="counter",
+        labels=["agent_id", "node_id", "source"],
+        help="Total messages successfully routed, by agent, resolved node, and node source",
+        phase="M5.0",
+    ),
+    "matrix_bridge_node_rejected_total": MetricSpec(
+        kind="counter",
+        labels=["node_id"],
+        help="Total messages with rejected (non-allowlisted) node kwarg",
+        phase="M5.0",
+    ),
+
+    # ── Bridge health (M7.1) ──────────────────────────────────────────────────
+    "matrix_bridge_up": MetricSpec(
+        kind="gauge",
+        labels=["node_id"],
+        help="1 if bridge started successfully; 0 on config error",
+        phase="M7.1",
+    ),
+
+    # ── Soft-failover (M8.0) ─────────────────────────────────────────────────
+    "matrix_bridge_failover_total": MetricSpec(
+        kind="counter",
+        labels=["from_node", "to_node", "reason"],
+        help="Total successful soft-failovers by node transition and reason",
+        phase="M8.0",
+    ),
+    "matrix_bridge_node_health_state": MetricSpec(
+        kind="gauge",
+        labels=["node_id"],
+        help="Node health state gauge: 1=healthy 0.5=degraded 0=down",
+        phase="M8.0",
+    ),
+
+    # ── Sticky routing anti-flap (M8.1) ──────────────────────────────────────
+    "matrix_bridge_sticky_node_total": MetricSpec(
+        kind="counter",
+        labels=["node_id", "scope"],
+        help="Total sticky routing entries set after failover, by preferred node and scope",
+        phase="M8.1",
+    ),
+    "matrix_bridge_sticky_node_active": MetricSpec(
+        kind="gauge",
+        labels=[],
+        help="Current count of active sticky routing entries",
+        phase="M8.1",
+    ),
+}
+
+# ── Alert metric references ────────────────────────────────────────────────────
+# These are the metric base-names referenced in alert rules.
+# All must exist in METRICS_CONTRACT.
+ALERT_METRIC_REFS = frozenset({
+    "matrix_bridge_up",
+    "matrix_bridge_gateway_errors_total",
+    "matrix_bridge_messages_replied_total",
+    "matrix_bridge_queue_dropped_total",
+    "matrix_bridge_rate_limited_total",
+    "matrix_bridge_control_rate_limited_total",
+    "matrix_bridge_dedupe_persistent_hits_total",
+    "matrix_bridge_invoke_duration_seconds",
+})
diff --git a/services/matrix-bridge-dagi/app/mixed_routing.py b/services/matrix-bridge-dagi/app/mixed_routing.py
index 040a6a1c..937fa038 100644
--- a/services/matrix-bridge-dagi/app/mixed_routing.py
+++ b/services/matrix-bridge-dagi/app/mixed_routing.py
@@ -309,3 +309,25 @@ def reply_prefix(agent_id: str, is_mixed: bool) -> str:
         return ""
     # Capitalise first letter of agent name: "sofiia" → "Sofiia"
     return f"{agent_id.capitalize()}: "
+
+
+def build_override_config(
+    base_config: MixedRoomConfig,
+    room_id: str,
+    agents: List[str],
+    default_agent: str,
+) -> MixedRoomConfig:
+    """
+    M6.1: Return a new MixedRoomConfig in which room_id is replaced by a
+    dynamic-store override, with every other room copied from base_config.
+
+    The room mapping is shallow-copied into a fresh dict, so the shared
+    base_config is never mutated by the override.
+    """
+    override = MixedRoom(
+        room_id=room_id,
+        agents=agents,
+        default_agent=default_agent,
+    )
+    merged = {**base_config.rooms, room_id: override}
+    return MixedRoomConfig(rooms=merged)
diff --git a/services/matrix-bridge-dagi/app/node_health.py b/services/matrix-bridge-dagi/app/node_health.py
new file mode 100644
index 00000000..7dd66f67
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/node_health.py
@@ -0,0 +1,262 @@
+"""
+NodeHealthTracker — M8.0: per-node health state tracking for soft-failover.
+
+Tracks invoke outcomes per node and maintains:
+  - EWMA latency estimate
+  - consecutive failure counter
+  - last ok / last error timestamps
+  - derived health state: "healthy" | "degraded" | "down"
+
+State transitions
+-----------------
+  Any state → "down"      : consecutive_failures >= fail_consecutive
+  Any state → "degraded"  : ewma_latency_s >= lat_ewma_s
+                             (and not yet "down")
+  "down"/"degraded" → "healthy"  : record_ok() resets consecutive_failures to 0
+                                    and ewma is updated towards the actual latency
+
+Thread safety
+-------------
+  All mutations are protected by a threading.RLock so this can be called from
+  asyncio callbacks (e.g. in `_invoke_and_send` on the event loop thread).
+  Use `record_ok` / `record_error` from within coroutines; they are synchronous
+  (no blocking I/O) so they are safe to call directly without to_thread.
+"""
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from dataclasses import dataclass, field
+from typing import Dict, FrozenSet, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# ── State constants ────────────────────────────────────────────────────────────
+
+NODE_STATE_HEALTHY  = "healthy"
+NODE_STATE_DEGRADED = "degraded"
+NODE_STATE_DOWN     = "down"
+
+# Failover-triggering error classes
+FAILOVER_REASON_TIMEOUT  = "timeout"
+FAILOVER_REASON_HTTP_5XX = "http_5xx"
+FAILOVER_REASON_NETWORK  = "network"
+
+
+# ── Config ────────────────────────────────────────────────────────────────────
+
+@dataclass(frozen=True)
+class NodeHealthConfig:
+    """Immutable thresholds deciding when a node counts as degraded or down.
+
+    fail_consecutive : int   consecutive failures at which a node is "down"
+    lat_ewma_s       : float EWMA latency (seconds) at which a node is "degraded"
+    ewma_alpha       : float EWMA smoothing factor in (0, 1]; higher = more reactive
+    """
+    fail_consecutive: int   = 3
+    lat_ewma_s:       float = 12.0
+    ewma_alpha:       float = 0.3
+
+    def __post_init__(self) -> None:
+        alpha, fails, lat = self.ewma_alpha, self.fail_consecutive, self.lat_ewma_s
+        if not 0 < alpha <= 1:
+            raise ValueError(f"ewma_alpha must be in (0, 1], got {alpha}")
+        if fails < 1:
+            raise ValueError(f"fail_consecutive must be ≥ 1, got {fails}")
+        if lat <= 0:
+            raise ValueError(f"lat_ewma_s must be > 0, got {lat}")
+
+
+# ── Per-node state ────────────────────────────────────────────────────────────
+
+@dataclass
+class _NodeState:
+    invoke_ok_total:      int   = 0                # successes counted by record_ok
+    invoke_err_total:     int   = 0                # failures counted by record_error
+    consecutive_failures: int   = 0                # failures since the last success; record_ok resets it to 0
+    last_ok_ts:           Optional[float] = None   # time.monotonic() reading at the last success
+    last_err_ts:          Optional[float] = None   # time.monotonic() reading at the last failure
+    ewma_latency_s:       Optional[float] = None   # None until the first ok record (or a restore)
+
+
+# ── Tracker ───────────────────────────────────────────────────────────────────
+
+class NodeHealthTracker:
+    """
+    Thread-safe per-node health tracker.
+
+    Usage:
+        tracker = NodeHealthTracker(NodeHealthConfig())
+
+        # On successful invoke
+        tracker.record_ok("NODA1", latency_s=1.4)
+
+        # On failed invoke
+        tracker.record_error("NODA1", reason=FAILOVER_REASON_TIMEOUT)
+
+        # Read health state
+        state = tracker.state("NODA1")       # "healthy" | "degraded" | "down"
+        fallback = tracker.pick_fallback("NODA1", allowed_nodes=frozenset({"NODA1","NODA2"}))
+    """
+
+    def __init__(self, config: Optional[NodeHealthConfig] = None) -> None:
+        self._cfg = config or NodeHealthConfig()
+        self._nodes: Dict[str, _NodeState] = {}
+        self._lock = threading.RLock()   # RLock: re-entrant (needed for all_info → as_info_dict)
+
+    # ── Public mutation API ────────────────────────────────────────────────────
+
+    def record_ok(self, node_id: str, latency_s: float) -> None:
+        """Record a successful invoke for node_id with given latency."""
+        with self._lock:
+            ns = self._get_or_create(node_id)
+            ns.invoke_ok_total += 1
+            ns.consecutive_failures = 0
+            ns.last_ok_ts = time.monotonic()
+            if ns.ewma_latency_s is None:
+                ns.ewma_latency_s = latency_s
+            else:
+                alpha = self._cfg.ewma_alpha
+                ns.ewma_latency_s = alpha * latency_s + (1 - alpha) * ns.ewma_latency_s
+
+    def record_error(self, node_id: str, reason: str = "unknown") -> None:
+        """Record a failed invoke for node_id."""
+        with self._lock:
+            ns = self._get_or_create(node_id)
+            ns.invoke_err_total += 1
+            ns.consecutive_failures += 1
+            ns.last_err_ts = time.monotonic()
+            logger.debug(
+                "NodeHealth: node=%s consecutive_failures=%d reason=%s",
+                node_id, ns.consecutive_failures, reason,
+            )
+
+    # ── Public read API ───────────────────────────────────────────────────────
+
+    def state(self, node_id: str) -> str:
+        """Return current health state for node_id."""
+        with self._lock:
+            return self._state_unlocked(node_id)
+
+    def pick_fallback(
+        self,
+        primary: str,
+        allowed_nodes: FrozenSet[str],
+    ) -> Optional[str]:
+        """
+        Return the best alternative node for failover.
+
+        Priority: healthy > degraded > (never down)
+        Returns None if no acceptable fallback exists.
+        """
+        with self._lock:
+            candidates = sorted(n for n in allowed_nodes if n != primary)
+            # Prefer healthy first
+            for n in candidates:
+                if self._state_unlocked(n) == NODE_STATE_HEALTHY:
+                    return n
+            # Accept degraded if no healthy available
+            for n in candidates:
+                if self._state_unlocked(n) == NODE_STATE_DEGRADED:
+                    return n
+            # Do not failover to "down" nodes
+            return None
+
+    def as_info_dict(self, node_id: str) -> dict:
+        """
+        Return a JSON-safe status dict for one node.
+
+        A node that was never recorded is reported as "healthy" with zeroed
+        counters and None for latency and timestamps. last_ok_ts and
+        last_err_ts are time.monotonic() readings, not wall-clock times.
+
+        ewma_latency_s is rounded to 3 decimals; it is None only when no
+        estimate exists yet, so a measured EWMA of 0.0 is reported as 0.0.
+        """
+        with self._lock:
+            # Unseen nodes get a throwaway default state; nothing is stored.
+            ns = self._nodes.get(node_id) or _NodeState()
+            ewma = ns.ewma_latency_s
+            return {
+                "node_id": node_id,
+                "state": self._state_unlocked(node_id),
+                "invoke_ok": ns.invoke_ok_total,
+                "invoke_err": ns.invoke_err_total,
+                "consecutive_failures": ns.consecutive_failures,
+                "ewma_latency_s": round(ewma, 3) if ewma is not None else None,
+                "last_ok_ts": ns.last_ok_ts,
+                "last_err_ts": ns.last_err_ts,
+            }
+
+    def all_info(self, allowed_nodes: Optional[FrozenSet[str]] = None) -> Dict[str, dict]:
+        """
+        Return status dicts for all tracked (or specified) nodes.
+        If allowed_nodes provided, also include entries for unseen nodes (state=healthy).
+        """
+        with self._lock:
+            keys = set(self._nodes.keys())
+            if allowed_nodes:
+                keys |= set(allowed_nodes)
+            return {n: self.as_info_dict(n) for n in sorted(keys)}
+
+    def reset(self, node_id: str) -> None:
+        """Reset health state for a node (e.g. after manual recovery)."""
+        with self._lock:
+            self._nodes.pop(node_id, None)
+
+    def restore_node(
+        self,
+        node_id: str,
+        ewma_latency_s: Optional[float],
+        consecutive_failures: int,
+    ) -> None:
+        """
+        Restore persisted node state after a restart (M8.2).
+
+        Only restores ewma_latency_s and consecutive_failures; counters
+        (invoke_ok_total, invoke_err_total) start from 0 since they are
+        runtime metrics for the current session.
+        """
+        with self._lock:
+            ns = self._get_or_create(node_id)
+            ns.ewma_latency_s = ewma_latency_s
+            ns.consecutive_failures = max(0, consecutive_failures)
+
+    # ── Internal ──────────────────────────────────────────────────────────────
+
+    def _get_or_create(self, node_id: str) -> _NodeState:
+        if node_id not in self._nodes:
+            self._nodes[node_id] = _NodeState()
+        return self._nodes[node_id]
+
+    def _state_unlocked(self, node_id: str) -> str:
+        ns = self._nodes.get(node_id)
+        if ns is None:
+            return NODE_STATE_HEALTHY   # unseen nodes are assumed healthy
+
+        if ns.consecutive_failures >= self._cfg.fail_consecutive:
+            return NODE_STATE_DOWN
+
+        if (
+            ns.ewma_latency_s is not None
+            and ns.ewma_latency_s >= self._cfg.lat_ewma_s
+        ):
+            return NODE_STATE_DEGRADED
+
+        return NODE_STATE_HEALTHY
+
+
+# ── Parser (env vars → NodeHealthConfig) ──────────────────────────────────────
+
+def parse_node_health_config(
+    fail_consecutive: int = 3,
+    lat_ewma_s: float = 12.0,
+    ewma_alpha: float = 0.3,
+) -> NodeHealthConfig:
+    """Build a NodeHealthConfig from already-parsed env values.
+
+    The three thresholds are passed through to the constructor unchanged.
+    """
+    thresholds = (fail_consecutive, lat_ewma_s, ewma_alpha)
+    return NodeHealthConfig(*thresholds)
diff --git a/services/matrix-bridge-dagi/app/node_policy.py b/services/matrix-bridge-dagi/app/node_policy.py
new file mode 100644
index 00000000..4c115a2f
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/node_policy.py
@@ -0,0 +1,179 @@
+"""
+node_policy — Node-aware routing for matrix-bridge-dagi.
+
+Resolves which NODA (NODA1, NODA2, …) a message should be tagged with based on:
+  1. Explicit `node=X` kwarg in the message body (mixed rooms only)
+  2. Dynamic store override (PolicyStore, set by operators via !node set)  ← M6.0
+  3. Static per-room mapping from BRIDGE_ROOM_NODE_MAP env
+  4. BRIDGE_DEFAULT_NODE (fallback)
+
+The resolved node_id is embedded in the Router metadata so downstream
+services (Router / Memory / Agent) can apply per-node policies.
+
+This module does NOT change the HTTP endpoint called — the Router URL
+stays the same.
+"""
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import Dict, FrozenSet, Optional, Tuple
+
+# Regex to find 'node=X' anywhere in message text (case-insensitive)
+_NODE_KWARG_RE = re.compile(r"\bnode=(\w+)\b", re.IGNORECASE)
+
+# Node resolution sources (priority order)
+NODE_SOURCE_EXPLICIT = "explicit"
+NODE_SOURCE_STORE    = "store"      # M6.0: dynamic PolicyStore override
+NODE_SOURCE_ROOM_MAP = "room_map"
+NODE_SOURCE_DEFAULT  = "default"
+
+
+@dataclass(frozen=True)
+class NodeResolution:
+    """Immutable outcome of resolving the target node for a message."""
+    node_id: str                         # resolved node name
+    source: str                          # which rule produced node_id (a NODE_SOURCE_* value)
+    rejected_node: Optional[str] = None  # upper-cased explicit node that was not allowlisted, else None
+
+
+@dataclass
+class NodePolicy:
+    """
+    Policy that decides which node a message is tagged with.
+
+    Attributes:
+        allowed_nodes: Allowlist of node names; candidates are upper-cased before lookup.
+        default_node:  Node used when no explicit, store or room-map candidate applies.
+        room_node_map: Static per-room mapping (room_id → node_id); empty by default.
+    """
+    allowed_nodes: FrozenSet[str]
+    default_node: str
+    room_node_map: Dict[str, str] = field(default_factory=dict)
+
+    def resolve(
+        self,
+        room_id: str,
+        explicit_node: Optional[str] = None,
+        store_override: Optional[str] = None,
+    ) -> NodeResolution:
+        """
+        Pick the target node for a message.
+
+        Candidates are tried from highest to lowest priority:
+          1. explicit_node  — user-supplied kwarg
+          2. store_override — dynamic PolicyStore entry (M6.0)
+          3. room_node_map  — static per-room entry
+          4. default_node
+        """
+        if explicit_node is None:
+            return self._fallback(room_id, store_override)
+
+        requested = explicit_node.upper()
+        if requested in self.allowed_nodes:
+            return NodeResolution(node_id=requested, source=NODE_SOURCE_EXPLICIT)
+        # Not allowlisted: resolve as if no kwarg was given, but report the bad value.
+        base = self._fallback(room_id, store_override)
+        return NodeResolution(
+            node_id=base.node_id,
+            source=base.source,
+            rejected_node=requested,
+        )
+
+    def _fallback(
+        self,
+        room_id: str,
+        store_override: Optional[str] = None,
+    ) -> NodeResolution:
+        """Resolve without an explicit kwarg: store override, then room map, then default."""
+        # Each candidate is upper-cased and must be allowlisted to be accepted.
+        store_node = None if store_override is None else store_override.upper()
+        if store_node is not None and store_node in self.allowed_nodes:
+            return NodeResolution(node_id=store_node, source=NODE_SOURCE_STORE)
+
+        room_node = self.room_node_map[room_id].upper() if room_id in self.room_node_map else None
+        if room_node is not None and room_node in self.allowed_nodes:
+            return NodeResolution(node_id=room_node, source=NODE_SOURCE_ROOM_MAP)
+
+        # Neither candidate is usable: the default node is the last resort.
+        return NodeResolution(node_id=self.default_node, source=NODE_SOURCE_DEFAULT)
+
+    def as_info_dict(self) -> dict:
+        """Summarise the policy for health/ops snapshots: default, sorted allowlist, map size."""
+        # Only the number of room overrides is exposed, not the room ids.
+        info = {"default_node": self.default_node}
+        info["allowed_nodes"] = sorted(self.allowed_nodes)
+        info["room_overrides"] = len(self.room_node_map)
+        return info
+
+
+def parse_node_policy(
+    raw_allowed: str,
+    default_node: str,
+    raw_room_map: str,
+) -> NodePolicy:
+    """
+    Build a NodePolicy from env-style config strings.
+
+    raw_allowed:  comma-separated node names, e.g. "NODA1,NODA2"
+    default_node: fallback node, e.g. "NODA1" (blank falls back to "NODA1")
+    raw_room_map: room=node pairs joined by ";",
+                  e.g. "!roomA:server=NODA2;!roomB:server=NODA1"
+
+    Node names are upper-cased; malformed room-map entries are skipped.
+    """
+    fallback_node = default_node.strip().upper() or "NODA1"
+
+    # The default is always part of the allowlist so it stays reachable,
+    # including when raw_allowed names no nodes at all.
+    names = (part.strip().upper() for part in raw_allowed.split(","))
+    allowlist: FrozenSet[str] = frozenset(
+        name for name in names if name
+    ) | {fallback_node}
+
+    overrides: Dict[str, str] = {}
+    for pair in raw_room_map.split(";"):
+        room_part, sep, node_part = pair.strip().partition("=")
+        if not sep:
+            continue  # blank entry or no "=" separator
+        room_id = room_part.strip()
+        node = node_part.strip().upper()
+        if room_id and node:
+            overrides[room_id] = node
+
+    return NodePolicy(
+        allowed_nodes=allowlist,
+        default_node=fallback_node,
+        room_node_map=overrides,
+    )
+
+
+def extract_node_kwarg(text: str) -> Tuple[Optional[str], str]:
+    """
+    Pull the first 'node=X' kwarg out of message text.
+
+    Returns (upper-cased node or None, text). When a kwarg is found it is
+    removed and the remaining text is trimmed, with every run of whitespace
+    collapsed to one space; when none is found the text is returned unchanged.
+
+    Example:
+        "/sofiia node=NODA2 Hello!"
+        → ("NODA2", "/sofiia Hello!")
+    """
+    hit = _NODE_KWARG_RE.search(text)
+    if hit is None:
+        return None, text
+
+    # Cut the matched span out, then normalise whitespace around the gap.
+    remainder = text[:hit.start()] + text[hit.end():]
+    return hit.group(1).upper(), " ".join(remainder.split())
+
+
+def node_rejected_reply(requested: str, allowed: FrozenSet[str]) -> str:
+    """Build the reply sent when the requested node is not in the allowlist."""
+    quoted = ", ".join(f"`{name}`" for name in sorted(allowed))
+    return "\n".join((
+        f"⚠️ Unknown node: `{requested}`",
+        f"Allowed: {quoted}",
+        "_Example: `/sofiia node=NODA1 Hello!`_",
+    ))
diff --git a/services/matrix-bridge-dagi/app/policy_store.py b/services/matrix-bridge-dagi/app/policy_store.py
new file mode 100644
index 00000000..49c685d0
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/policy_store.py
@@ -0,0 +1,1007 @@
+"""
+policy_store — M6.0: Persistent room-node override store.
+
+SQLite-backed store that allows operators to dynamically set a preferred
+node (NODA1, NODA2, …) for any Matrix room without redeploying the bridge.
+
+Resolution layer (in NodePolicy.resolve):
+  1. explicit node=X kwarg (highest priority)
+  2. dynamic store override  ← this module
+  3. static BRIDGE_ROOM_NODE_MAP env
+  4. BRIDGE_DEFAULT_NODE (lowest priority)
+
+All DB operations are synchronous/blocking.  Call via asyncio.to_thread
+in async contexts to avoid blocking the event loop.
+
+Security:
+  - operator identity is stored as SHA-256[:16] (no PII verbatim)
+  - room_id values validated against basic Matrix ID format by callers
+  - SQLite WAL mode, PRAGMA synchronous=NORMAL for durability+speed
+"""
+from __future__ import annotations
+
+import datetime
+import glob as _glob
+import hashlib
+import json as _json
+import logging
+import os as _os
+import sqlite3
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+POLICY_SNAPSHOT_VERSION = 1
+POLICY_IMPORT_MODE_MERGE   = "merge"
+POLICY_IMPORT_MODE_REPLACE = "replace"
+
+logger = logging.getLogger(__name__)
+
+_DDL = """
+CREATE TABLE IF NOT EXISTS room_node_overrides (
+    room_id          TEXT PRIMARY KEY,
+    node_id          TEXT NOT NULL,
+    updated_at       INTEGER NOT NULL,
+    updated_by_hash  TEXT NOT NULL
+);
+"""
+
+_IDX_TS = """
+CREATE INDEX IF NOT EXISTS idx_rno_updated_at
+    ON room_node_overrides (updated_at DESC);
+"""
+
+# M6.1: Dynamic mixed room agent overrides
+_DDL_AGENT = """
+CREATE TABLE IF NOT EXISTS room_agent_overrides (
+    room_id          TEXT PRIMARY KEY,
+    agents_csv       TEXT NOT NULL,
+    default_agent    TEXT,
+    updated_at       INTEGER NOT NULL,
+    updated_by_hash  TEXT NOT NULL
+);
+"""
+
+_IDX_AGENT_TS = """
+CREATE INDEX IF NOT EXISTS idx_rao_updated_at
+    ON room_agent_overrides (updated_at DESC);
+"""
+
+# M8.2: HA persistence tables
+_DDL_STICKY = """
+CREATE TABLE IF NOT EXISTS sticky_node_cache (
+    key         TEXT PRIMARY KEY,
+    node_id     TEXT NOT NULL,
+    expires_at  INTEGER NOT NULL,
+    updated_at  INTEGER NOT NULL
+);
+"""
+
+_DDL_NODE_HEALTH = """
+CREATE TABLE IF NOT EXISTS node_health_state (
+    node_id              TEXT PRIMARY KEY,
+    ewma_latency_s       REAL,
+    consecutive_failures INTEGER NOT NULL DEFAULT 0,
+    updated_at           INTEGER NOT NULL
+);
+"""
+
+# M10.2: Policy change history table
+_DDL_POLICY_CHANGES = """
+CREATE TABLE IF NOT EXISTS policy_changes (
+    id             INTEGER PRIMARY KEY AUTOINCREMENT,
+    applied_at     INTEGER NOT NULL,
+    verb           TEXT    NOT NULL DEFAULT '',
+    mode           TEXT    NOT NULL DEFAULT '',
+    source_file    TEXT    NOT NULL DEFAULT '',
+    sender_hash    TEXT    NOT NULL DEFAULT '',
+    diff_summary   TEXT    NOT NULL DEFAULT '',
+    is_destructive INTEGER NOT NULL DEFAULT 0,
+    node_added     INTEGER NOT NULL DEFAULT 0,
+    node_updated   INTEGER NOT NULL DEFAULT 0,
+    node_deleted   INTEGER NOT NULL DEFAULT 0,
+    agent_added    INTEGER NOT NULL DEFAULT 0,
+    agent_updated  INTEGER NOT NULL DEFAULT 0,
+    agent_deleted  INTEGER NOT NULL DEFAULT 0
+);
+"""
+
+_IDX_POLICY_CHANGES_TS = """
+CREATE INDEX IF NOT EXISTS idx_pc_applied_at
+    ON policy_changes (applied_at DESC);
+"""
+
+_POLICY_HISTORY_DEFAULT_LIMIT = 100
+
+# Maximum number of entries returned by list_* (safety cap)
+_LIST_HARD_LIMIT = 100
+
+
+# M9.1: Import diff result dataclass
+_SAMPLE_KEYS_MAX = 5
+
+
+@dataclass
+class ImportDiff:
+    """
+    Counts of what importing a snapshot would change (from compute_import_diff).
+
+    Feeds the preview reply and the confirm binding hash (M9.1).
+    """
+    node_added: int = 0
+    node_updated: int = 0
+    node_deleted: int = 0
+    agent_added: int = 0
+    agent_updated: int = 0
+    agent_deleted: int = 0
+    sample_keys: List[str] = field(default_factory=list)  # at most _SAMPLE_KEYS_MAX entries
+    is_replace: bool = False
+
+    def total_changes(self) -> int:
+        """Sum of the added/updated/deleted counters for nodes and agents."""
+        node_total = self.node_added + self.node_updated + self.node_deleted
+        agent_total = self.agent_added + self.agent_updated + self.agent_deleted
+        return node_total + agent_total
+
+    def is_destructive(self) -> bool:
+        """Report whether the import would delete any existing entry."""
+        return not (self.node_deleted <= 0 and self.agent_deleted <= 0)
+
+
+# M10.2: Policy change history entry
+@dataclass
+class PolicyChange:
+    """One policy-history entry: a single applied import or restore."""
+    id: int
+    applied_at: int         # unix timestamp
+    verb: str               # e.g. "policy.import", "policy.restore"
+    mode: str               # "merge" or "replace"
+    source_file: str        # snapshot filename (basename only)
+    sender_hash: str        # truncated hash of operator sender_id
+    diff_summary: str       # human-readable change summary string
+    is_destructive: bool    # True if any deletions occurred
+    node_added: int
+    node_updated: int
+    node_deleted: int
+    agent_added: int
+    agent_updated: int
+    agent_deleted: int
+
+    def when_str(self) -> str:
+        """Format applied_at as a UTC 'YYYY-MM-DD HH:MM' string."""
+        utc = datetime.timezone.utc
+        moment = datetime.datetime.fromtimestamp(self.applied_at, utc)
+        return moment.strftime("%Y-%m-%d %H:%M")
+
+    def changes_short(self) -> str:
+        """Compact change summary, e.g. '+2n -1n +1a'; 'no changes' if every counter is zero."""
+        # (sign, count) pairs in output order: nodes first, then agents.
+        # Counters equal to zero are left out of the summary.
+        node = (("+", self.node_added), ("~", self.node_updated), ("-", self.node_deleted))
+        agent = (("+", self.agent_added), ("~", self.agent_updated), ("-", self.agent_deleted))
+        tokens = [f"{sign}{count}n" for sign, count in node if count]
+        tokens += [f"{sign}{count}a" for sign, count in agent if count]
+        # An empty token list means nothing changed at all.
+        return " ".join(tokens) or "no changes"
+
+
+# M10.0: Auto-backup + prune result
+_AUTOBACKUP_PREFIX = "policy-autobackup-"  # filename prefix used by write_autobackup
+_EXPORT_GLOB = "policy-*.json"  # glob of files that prune_exports may consider
+_PRUNE_SAMPLE_MAX = 5  # default length of PruneResult.sample_filenames()
+
+
+@dataclass
+class PruneResult:
+    """Result of prune_exports — what was (or, for a dry run, would be) pruned (M10.0)."""
+    files_to_delete: List[str]  # basenames of the expired files that matched
+    total_bytes: int            # sum of st_size over those files
+    oldest_mtime: Optional[float] = None  # smallest st_mtime among them; None if there are none
+
+    @property
+    def count(self) -> int:
+        return len(self.files_to_delete)  # number of files selected for pruning
+
+    def sample_filenames(self, n: int = _PRUNE_SAMPLE_MAX) -> List[str]:
+        return sorted(self.files_to_delete)[:n]  # first n basenames in sorted order
+
+
+def _hash_sender(sender: str) -> str:
+    """Return the first 16 hex characters of SHA-256 over the UTF-8 encoded sender ID."""
+    return hashlib.sha256(sender.encode("utf-8")).hexdigest()[:16]
+
+
+class PolicyStore:
+    """
+    Synchronous SQLite store for room→node and room→agent overrides, plus the
+    HA state (sticky routing entries, node health) and the policy change history.
+    Usage pattern (async callers):
+        override = await asyncio.to_thread(store.get_override, room_id)
+        await asyncio.to_thread(store.set_override, room_id, "NODA2", sender)
+    """
+
+    def __init__(self, db_path: str) -> None:
+        self._db_path = db_path  # SQLite file path; nothing is opened until open() runs
+        self._conn: Optional[sqlite3.Connection] = None  # None while the store is closed
+
+    # ── Lifecycle ──────────────────────────────────────────────────────────────
+
+    def open(self) -> None:
+        """Connect to the SQLite file (creating parent dirs) and run the schema statements."""
+        db_file = Path(self._db_path)
+        db_file.parent.mkdir(parents=True, exist_ok=True)
+        # isolation_level=None puts the sqlite3 module in autocommit mode;
+        # check_same_thread=False lets the connection be used from other threads.
+        conn = sqlite3.connect(self._db_path, check_same_thread=False, isolation_level=None)
+        self._conn = conn
+        bootstrap = (
+            "PRAGMA journal_mode=WAL",
+            "PRAGMA synchronous=NORMAL",
+            _DDL,
+            _IDX_TS,
+            _DDL_AGENT,
+            _IDX_AGENT_TS,
+            _DDL_STICKY, _DDL_NODE_HEALTH,  # M8.2: HA persistence tables
+            _DDL_POLICY_CHANGES, _IDX_POLICY_CHANGES_TS,  # M10.2: policy change history
+        )
+        for statement in bootstrap:
+            conn.execute(statement)
+        logger.info("PolicyStore opened: %s", self._db_path)
+
+    def close(self) -> None:
+        """Close the connection; idempotent, and close errors are logged rather than raised."""
+        conn, self._conn = self._conn, None  # drop the handle first so state stays consistent
+        if conn is None:
+            return
+        try:
+            conn.close()
+        except Exception as exc:  # noqa: BLE001 — best-effort shutdown must not raise
+            logger.warning("PolicyStore close failed for %s: %s", self._db_path, exc)
+
+    # ── CRUD ───────────────────────────────────────────────────────────────────
+
+    def get_override(self, room_id: str) -> Optional[str]:
+        """Look up the node override for a room; None when no row exists.
+
+        Raises RuntimeError (via _require_open) if the store is closed."""
+        self._require_open()
+        query = "SELECT node_id FROM room_node_overrides WHERE room_id = ?"
+        found = self._conn.execute(query, (room_id,)).fetchone()  # type: ignore[union-attr]
+        return None if not found else found[0]
+
+    def set_override(self, room_id: str, node_id: str, updated_by: str) -> None:
+        """Insert the room→node override, or overwrite the existing row for room_id."""
+        self._require_open()
+        upsert_sql = """
+            INSERT INTO room_node_overrides (room_id, node_id, updated_at, updated_by_hash)
+            VALUES (?, ?, ?, ?)
+            ON CONFLICT(room_id) DO UPDATE SET
+                node_id         = excluded.node_id,
+                updated_at      = excluded.updated_at,
+                updated_by_hash = excluded.updated_by_hash
+            """
+        # Only a hash of the operator's ID is stored, together with the write time.
+        params = (room_id, node_id, int(time.time()), _hash_sender(updated_by))
+        self._conn.execute(upsert_sql, params)  # type: ignore[union-attr]
+
+    def delete_override(self, room_id: str) -> bool:
+        """Delete the node override for room_id.
+
+        Returns True when a row was removed, False when none existed."""
+        self._require_open()
+        delete_sql = "DELETE FROM room_node_overrides WHERE room_id = ?"
+        removed = self._conn.execute(delete_sql, (room_id,)).rowcount  # type: ignore[union-attr]
+        return removed > 0
+
+    def list_overrides(self, limit: int = 10) -> List[Tuple[str, str, int]]:
+        """Return up to `limit` overrides as (room_id, node_id, updated_at), newest first.
+
+        `limit` is clamped into 1.._LIST_HARD_LIMIT before it reaches the query."""
+        self._require_open()
+        # Clamp the caller's limit: never below 1, never above the hard cap.
+        clamped = max(1, limit)
+        if clamped > _LIST_HARD_LIMIT:
+            clamped = _LIST_HARD_LIMIT
+        select_sql = """
+            SELECT room_id, node_id, updated_at
+            FROM room_node_overrides
+            ORDER BY updated_at DESC
+            LIMIT ?
+            """
+        cursor = self._conn.execute(select_sql, (clamped,))  # type: ignore[union-attr]
+        return [(room, node, stamp) for room, node, stamp in cursor.fetchall()]
+
+    def count_overrides(self) -> int:
+        """Count every row in room_node_overrides."""
+        self._require_open()
+        count_sql = "SELECT COUNT(*) FROM room_node_overrides"
+        first = self._conn.execute(count_sql).fetchone()
+        # COUNT(*) yields a single row; 0 is only a defensive fallback.
+        return 0 if not first else int(first[0])
+
+    # ── Properties ─────────────────────────────────────────────────────────────
+
+    @property
+    def db_path(self) -> str:
+        return self._db_path  # the path given to __init__, returned unchanged
+
+    @property
+    def is_open(self) -> bool:
+        return self._conn is not None  # _conn is set by open() and reset to None by close()
+
+    # ── M6.1: Room agent overrides ─────────────────────────────────────────────
+
+    def get_agent_override(
+        self, room_id: str
+    ) -> Optional[Tuple[List[str], Optional[str]]]:
+        """
+        Fetch the agent override for room_id as (agents, default_agent).
+
+        Returns None when the room has no override row; an empty or NULL
+        default_agent is reported as None.
+        """
+        self._require_open()
+        lookup_sql = "SELECT agents_csv, default_agent FROM room_agent_overrides WHERE room_id = ?"
+        record = self._conn.execute(lookup_sql, (room_id,)).fetchone()  # type: ignore[union-attr]
+        if record is None:
+            return None
+        csv_value, default_value = record
+        return [part.strip() for part in csv_value.split(",") if part.strip()], (default_value or None)
+
+    def set_agent_override(
+        self, room_id: str, agents: List[str], default_agent: Optional[str], updated_by: str,
+    ) -> None:
+        """
+        Insert or overwrite the agent override for a room.
+
+        Agents are stored as one comma-joined string of the sorted, de-duplicated
+        names; default_agent is stored exactly as given.
+        """
+        self._require_open()
+        upsert_sql = """
+            INSERT INTO room_agent_overrides
+                (room_id, agents_csv, default_agent, updated_at, updated_by_hash)
+            VALUES (?, ?, ?, ?, ?)
+            ON CONFLICT(room_id) DO UPDATE SET
+                agents_csv      = excluded.agents_csv,
+                default_agent   = excluded.default_agent,
+                updated_at      = excluded.updated_at,
+                updated_by_hash = excluded.updated_by_hash
+            """
+        unique_sorted = sorted(set(agents))
+        params = (room_id, ",".join(unique_sorted), default_agent, int(time.time()), _hash_sender(updated_by))
+        self._conn.execute(upsert_sql, params)  # type: ignore[union-attr]
+
+    def delete_agent_override(self, room_id: str) -> bool:
+        """Delete the agent override for room_id.
+
+        Returns True when a row was removed, False when none existed."""
+        self._require_open()
+        delete_sql = "DELETE FROM room_agent_overrides WHERE room_id = ?"
+        removed = self._conn.execute(delete_sql, (room_id,)).rowcount  # type: ignore[union-attr]
+        return removed > 0
+
+    def add_agent_to_room(
+        self, room_id: str, agent: str, updated_by: str
+    ) -> Tuple[List[str], Optional[str]]:
+        """
+        Ensure `agent` is part of the room's override and return the resulting
+        (agents, default_agent). A room without an override gets a new one with
+        `agent` as its only member and as its default.
+        """
+        self._require_open()
+        current = self.get_agent_override(room_id)
+        if current is None:
+            # First agent for this room: it also becomes the default.
+            self.set_agent_override(room_id, [agent], agent, updated_by)
+            return [agent], agent
+        members, default_agent = current
+        merged = members if agent in members else sorted(set(members) | {agent})
+        self.set_agent_override(room_id, merged, default_agent, updated_by)  # rewritten even if unchanged
+        return merged, default_agent
+
+    def remove_agent_from_room(
+        self, room_id: str, agent: str, updated_by: str
+    ) -> Tuple[bool, Optional[str]]:
+        """
+        Drop `agent` from the room's override; returns (removed, error_message_or_None).
+        Removing the last agent deletes the override row; removing the default agent
+        promotes the first remaining one.
+        """
+        self._require_open()
+        current = self.get_agent_override(room_id)
+        if current is None:
+            return False, "No agent override set for this room"
+        members, default_agent = current
+        if agent not in members:
+            return False, f"Agent `{agent}` not in override list"
+        remaining = [name for name in members if name != agent]
+        if remaining:
+            promoted = remaining[0] if default_agent == agent else default_agent
+            self.set_agent_override(room_id, remaining, promoted, updated_by)
+        else:
+            self.delete_agent_override(room_id)
+        return True, None
+
+    def list_agent_overrides(
+        self, limit: int = 10
+    ) -> List[Tuple[str, List[str], Optional[str], int]]:
+        """
+        List agent overrides, most recently updated first, as
+        (room_id, agents_list, default_agent, updated_at) tuples.
+        `limit` is clamped into 1.._LIST_HARD_LIMIT.
+        """
+        self._require_open()
+        clamped = min(_LIST_HARD_LIMIT, max(limit, 1))
+        select_sql = """
+            SELECT room_id, agents_csv, default_agent, updated_at
+            FROM room_agent_overrides
+            ORDER BY updated_at DESC
+            LIMIT ?
+            """
+        cursor = self._conn.execute(select_sql, (clamped,))  # type: ignore[union-attr]
+        listing = []
+        for room, csv_value, default_agent, stamp in cursor.fetchall():
+            names = [part.strip() for part in csv_value.split(",") if part.strip()]
+            listing.append((room, names, default_agent or None, stamp))
+        return listing
+
+    def count_agent_overrides(self) -> int:
+        """Count every row in room_agent_overrides."""
+        self._require_open()
+        count_sql = "SELECT COUNT(*) FROM room_agent_overrides"
+        first = self._conn.execute(count_sql).fetchone()
+        # COUNT(*) yields a single row; 0 is only a defensive fallback.
+        return 0 if not first else int(first[0])
+
+    # ── M8.2: HA persistence — sticky node cache ──────────────────────────────
+
+    def upsert_sticky(self, key: str, node_id: str, expires_at_unix: int) -> None:
+        """Persist a sticky routing entry (upsert by key); RuntimeError if the store is closed."""
+        self._require_open()  # explicit check: unlike `assert`, it is not stripped under -O
+        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
+        self._conn.execute(  # type: ignore[union-attr]
+            """INSERT INTO sticky_node_cache (key, node_id, expires_at, updated_at)
+               VALUES (?, ?, ?, ?)
+               ON CONFLICT(key) DO UPDATE SET
+                 node_id=excluded.node_id,
+                 expires_at=excluded.expires_at,
+                 updated_at=excluded.updated_at""",
+            (key, node_id, expires_at_unix, now),
+        )
+
+    def delete_sticky(self, key: str) -> bool:
+        """Remove a sticky entry; True if it existed. RuntimeError if the store is closed."""
+        self._require_open()  # replaces an `assert`, which python -O would strip
+        cur = self._conn.execute(  # type: ignore[union-attr]
+            "DELETE FROM sticky_node_cache WHERE key=?", (key,)
+        )
+        return cur.rowcount > 0
+
+    def load_sticky_entries(self) -> List[Tuple[str, str, int]]:
+        """
+        Return all non-expired sticky entries as (key, node_id, expires_at_unix).
+        Expiry is compared against the current unix time; RuntimeError if the store is closed.
+        """
+        self._require_open()  # was an `assert`, which is stripped under python -O
+        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
+        rows = self._conn.execute(  # type: ignore[union-attr]
+            "SELECT key, node_id, expires_at FROM sticky_node_cache WHERE expires_at > ?",
+            (now,),
+        ).fetchall()
+        return [(r[0], r[1], int(r[2])) for r in rows]
+
+    def prune_sticky_expired(self) -> int:
+        """Delete sticky entries whose expires_at is <= unix now; returns the number removed."""
+        self._require_open()  # was an `assert`, which is stripped under python -O
+        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
+        cur = self._conn.execute(  # type: ignore[union-attr]
+            "DELETE FROM sticky_node_cache WHERE expires_at <= ?", (now,)
+        )
+        return cur.rowcount
+
+    # ── M8.2: HA persistence — node health state ──────────────────────────────
+
+    def upsert_node_health(
+        self,
+        node_id: str,
+        ewma_latency_s: Optional[float],
+        consecutive_failures: int,
+    ) -> None:
+        """Persist a node health snapshot (upsert by node_id); RuntimeError if the store is closed."""
+        self._require_open()  # explicit check: unlike `assert`, it is not stripped under -O
+        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
+        self._conn.execute(  # type: ignore[union-attr]
+            """INSERT INTO node_health_state
+                   (node_id, ewma_latency_s, consecutive_failures, updated_at)
+               VALUES (?, ?, ?, ?)
+               ON CONFLICT(node_id) DO UPDATE SET
+                 ewma_latency_s=excluded.ewma_latency_s,
+                 consecutive_failures=excluded.consecutive_failures,
+                 updated_at=excluded.updated_at""",
+            (node_id, ewma_latency_s, consecutive_failures, now),
+        )
+
+    def load_node_health(self, max_age_s: int = 600) -> Optional[Dict[str, Any]]:
+        """
+        Load the persisted node health snapshot, but only if every row is fresh
+        (updated_at >= now - max_age_s). Returns None when there are no rows or any row
+        is stale; otherwise {node_id: {ewma_latency_s, consecutive_failures, updated_at}}.
+        """
+        self._require_open()  # was an `assert`, which is stripped under python -O
+        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
+        cutoff = now - max_age_s
+        rows = self._conn.execute(  # type: ignore[union-attr]
+            """SELECT node_id, ewma_latency_s, consecutive_failures, updated_at
+               FROM node_health_state""",
+        ).fetchall()
+        if not rows:
+            return None
+        result: Dict[str, Any] = {}
+        for node_id, ewma, consec, updated_at in rows:
+            if int(updated_at) < cutoff:
+                logger.debug(
+                    "HA: node health snapshot for %s is stale (age=%ds > max=%ds) — ignoring",
+                    node_id, now - int(updated_at), max_age_s,
+                )
+                return None  # Any stale node → discard whole snapshot
+            result[node_id] = {
+                "ewma_latency_s": ewma,
+                "consecutive_failures": int(consec),
+                "updated_at": int(updated_at),
+            }
+        return result if result else None
+
+    # ── M6.2: Snapshot export / import (export_all … import_snapshot, defined further below) ──
+
+    # ── M10.2: Policy change history ──────────────────────────────────────────
+
+    def record_policy_change(
+        self,
+        verb: str,
+        mode: str,
+        source_file: str,
+        sender_hash: str,
+        diff_summary: str,
+        is_destructive: bool,
+        node_added: int,
+        node_updated: int,
+        node_deleted: int,
+        agent_added: int,
+        agent_updated: int,
+        agent_deleted: int,
+        history_limit: int = _POLICY_HISTORY_DEFAULT_LIMIT,
+    ) -> int:
+        """
+        Append one policy apply event to policy_changes, then trim the table.
+
+        The row is stamped with the current unix time. When history_limit > 0
+        only the history_limit newest rows (highest ids) are kept; 0 or a
+        negative value disables trimming. Returns the id of the new row.
+        Raises RuntimeError if the store is not open.
+        """
+        self._require_open()
+        insert_sql = """INSERT INTO policy_changes
+               (applied_at, verb, mode, source_file, sender_hash,
+                diff_summary, is_destructive,
+                node_added, node_updated, node_deleted,
+                agent_added, agent_updated, agent_deleted)
+               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""
+        # Parameter order must match the column list above.
+        values = (
+            int(time.time()), verb, mode, source_file, sender_hash,
+            diff_summary, int(is_destructive),  # flag stored as 0/1
+            node_added, node_updated, node_deleted,
+            agent_added, agent_updated, agent_deleted,
+        )
+        inserted = self._conn.execute(insert_sql, values)  # type: ignore[union-attr]
+        row_id: int = inserted.lastrowid  # type: ignore[assignment]
+
+        if history_limit > 0:
+            # Keep only the newest `history_limit` ids; everything older is deleted.
+            prune_sql = """DELETE FROM policy_changes
+                   WHERE id NOT IN (
+                       SELECT id FROM policy_changes ORDER BY id DESC LIMIT ?
+                   )"""
+            self._conn.execute(prune_sql, (history_limit,))  # type: ignore[union-attr]
+
+        logger.debug(
+            "Recorded policy change id=%d verb=%s mode=%s file=%s destr=%s",
+            row_id, verb, mode, source_file, is_destructive,
+        )
+        return row_id
+
+    def list_policy_changes(self, limit: int = 10) -> List[PolicyChange]:
+        """
+        Return policy change records, newest (highest id) first.
+
+        `limit` is clamped into 1.._LIST_HARD_LIMIT, so at least one and at most
+        _LIST_HARD_LIMIT records are requested.
+        Raises RuntimeError if the store is not open.
+        """
+        self._require_open()
+        clamped = max(1, limit)
+        if clamped > _LIST_HARD_LIMIT:
+            clamped = _LIST_HARD_LIMIT
+        select_sql = """SELECT id, applied_at, verb, mode, source_file, sender_hash,
+                      diff_summary, is_destructive,
+                      node_added, node_updated, node_deleted,
+                      agent_added, agent_updated, agent_deleted
+               FROM policy_changes
+               ORDER BY id DESC LIMIT ?"""
+        cursor = self._conn.execute(select_sql, (clamped,))  # type: ignore[union-attr]
+        history: List[PolicyChange] = []
+        for record in cursor.fetchall():
+            # Columns arrive in PolicyChange field order: seven descriptive fields,
+            # the integer is_destructive flag, then the six change counters.
+            *described, destructive_flag = record[:8]
+            counters = record[8:]
+            history.append(PolicyChange(*described, bool(destructive_flag), *counters))
+        return history
+
+    def get_policy_changes_count(self) -> int:
+        """Count the rows currently held in the policy_changes history table."""
+        self._require_open()
+        count_sql = "SELECT COUNT(*) FROM policy_changes"
+        first = self._conn.execute(count_sql).fetchone()  # type: ignore[union-attr]
+        # 0 is a defensive fallback; otherwise the value is returned as SQLite gave it.
+        return 0 if not first else first[0]
+
+    def get_policy_change_by_id(self, change_id: int) -> Optional["PolicyChange"]:
+        """
+        Fetch the history record whose id equals change_id; None when there is no such row.
+        """
+        self._require_open()
+        select_sql = """SELECT id, applied_at, verb, mode, source_file, sender_hash,
+                      diff_summary, is_destructive,
+                      node_added, node_updated, node_deleted,
+                      agent_added, agent_updated, agent_deleted
+               FROM policy_changes WHERE id = ?"""
+        record = self._conn.execute(select_sql, (change_id,)).fetchone()  # type: ignore[union-attr]
+        if record is None:
+            return None
+        (found_id, applied_at, verb, mode, source_file, sender_hash,
+         diff_summary, destructive_flag, *counters) = record
+        return PolicyChange(
+            found_id, applied_at, verb, mode, source_file, sender_hash, diff_summary,
+            bool(destructive_flag),  # stored as an integer flag
+            *counters,               # node_added … agent_deleted, in column order
+        )
+
+    # ── M10.0: Auto-backup + retention prune ──────────────────────────────────
+
+    def write_autobackup(
+        self,
+        exports_dir: str,
+        sender_hash8: str,
+        nonce: str,
+    ) -> tuple[str, str]:
+        """
+        Dump the current policy (export_all) to a new autobackup file in exports_dir.
+
+        The file is named policy-autobackup-<UTC>-<senderhash8>-<nonce>.json, where
+        <UTC> is the current time as YYYYmmddTHHMMSSZ and only the first 8 characters
+        of sender_hash8 are used.
+
+        Returns (file_path, first 8 hex chars of the SHA-256 of the written JSON).
+        """
+        self._require_open()
+        utc_now = datetime.datetime.now(datetime.timezone.utc)
+        ts = utc_now.strftime("%Y%m%dT%H%M%SZ")
+        filename = f"{_AUTOBACKUP_PREFIX}{ts}-{sender_hash8[:8]}-{nonce}.json"
+        file_path = _os.path.join(exports_dir, filename)
+        payload = _json.dumps(self.export_all(), sort_keys=True, ensure_ascii=True)
+        # Plain (non-atomic) write; the file is closed before the hash is computed.
+        with open(file_path, "w", encoding="utf-8") as handle:
+            handle.write(payload)
+        digest8 = hashlib.sha256(payload.encode("utf-8")).hexdigest()[:8]
+        logger.debug("Auto-backup written: %s hash=%s", filename, digest8)
+        return file_path, digest8
+
+    def prune_exports(
+        self,
+        exports_dir: str,
+        retention_days: int,
+        dry_run: bool = True,
+    ) -> PruneResult:
+        """
+        Remove policy export files older than retention_days.
+
+        Only regular files matching 'policy-*.json' directly inside exports_dir
+        are considered; subdirectories are never entered. A file is expired when
+        its mtime is older than now - retention_days * 86400 seconds.
+
+        retention_days <= 0 disables pruning (an empty result is returned).
+        dry_run=True: compute stats without deleting.
+        dry_run=False: delete the expired files; a failed delete is logged and
+        skipped, but the file is still listed in the result.
+
+        Returns PruneResult with basenames, total_bytes and oldest_mtime.
+        """
+        if retention_days <= 0:
+            return PruneResult(files_to_delete=[], total_bytes=0)
+
+        cutoff = time.time() - retention_days * 86400
+        # Normalise once: glob echoes exports_dir as given, so a relative directory
+        # must be compared in absolute form on both sides or nothing ever matches.
+        base_dir = _os.path.abspath(exports_dir)
+        to_delete: List[str] = []
+        total_bytes = 0
+        oldest_mtime: Optional[float] = None
+
+        for fpath in sorted(_glob.glob(_os.path.join(exports_dir, _EXPORT_GLOB))):
+            # Safety: only regular files located directly in exports_dir.
+            if _os.path.abspath(_os.path.dirname(fpath)) != base_dir or not _os.path.isfile(fpath):
+                continue
+            try:
+                stat = _os.stat(fpath)
+            except OSError:
+                continue  # vanished or unreadable since the glob: skip it
+            if stat.st_mtime >= cutoff:
+                continue
+            to_delete.append(_os.path.basename(fpath))
+            total_bytes += stat.st_size
+            if oldest_mtime is None or stat.st_mtime < oldest_mtime:
+                oldest_mtime = stat.st_mtime
+
+        if not dry_run:
+            for basename in to_delete:
+                try:
+                    _os.remove(_os.path.join(exports_dir, basename))
+                    logger.info("Pruned policy export: %s", basename)
+                except OSError as exc:
+                    logger.warning("Could not prune %s: %s", basename, exc)
+
+        return PruneResult(files_to_delete=to_delete, total_bytes=total_bytes, oldest_mtime=oldest_mtime)
+
+    def export_all(self) -> Dict[str, Any]:
+        """
+        Export all overrides as a JSON-serializable snapshot dict.
+
+        Format (version 1):
+        {
+          "version": 1,
+          "created_at": "<ISO8601>Z",
+          "room_node_overrides":  [{room_id, node_id, updated_at, updated_by}, ...],
+          "room_agent_overrides": [{room_id, agents, default_agent, updated_at, updated_by}, ...]
+        }
+        Both lists are ordered by room_id; "updated_by" carries the stored sender hash.
+        """
+        self._require_open()
+        node_sql = "SELECT room_id, node_id, updated_at, updated_by_hash FROM room_node_overrides ORDER BY room_id"
+        agent_sql = """SELECT room_id, agents_csv, default_agent, updated_at, updated_by_hash
+               FROM room_agent_overrides ORDER BY room_id"""
+        node_rows = self._conn.execute(node_sql).fetchall()  # type: ignore[union-attr]
+        agent_rows = self._conn.execute(agent_sql).fetchall()  # type: ignore[union-attr]
+        created_at = datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")
+        node_keys = ("room_id", "node_id", "updated_at", "updated_by")
+        node_entries = [dict(zip(node_keys, row)) for row in node_rows]
+        # agents_csv is split back into a list; an empty default_agent becomes None.
+        agent_entries = []
+        for room, csv_value, default_agent, stamp, by_hash in agent_rows:
+            agent_entries.append({
+                "room_id": room,
+                "agents": [part.strip() for part in csv_value.split(",") if part.strip()],
+                "default_agent": default_agent or None,
+                "updated_at": stamp,
+                "updated_by": by_hash,
+            })
+        return {
+            "version": POLICY_SNAPSHOT_VERSION,
+            "created_at": created_at,
+            "room_node_overrides": node_entries,
+            "room_agent_overrides": agent_entries,
+        }
+
+    def compute_import_diff(
+        self,
+        data: Dict[str, Any],
+        mode: str = POLICY_IMPORT_MODE_MERGE,
+    ) -> ImportDiff:
+        """
+        Preview what importing `data` would change, without touching the database (dry-run, M9.1).
+
+        A room listed in the snapshot counts as "added" when the DB has no row for
+        it and as "updated" otherwise — stored values are not compared. Deletions
+        are counted only in replace mode: DB rooms the snapshot does not list.
+        Snapshot entries lacking a required key are ignored.
+
+        Returns an ImportDiff holding the six counters, up to _SAMPLE_KEYS_MAX
+        affected room ids (sorted) and the is_replace flag.
+        Raises ValueError for an unsupported snapshot version.
+        """
+        if data.get("version") != POLICY_SNAPSHOT_VERSION:
+            raise ValueError(f"Unsupported snapshot version: {data.get('version')!r}")
+
+        self._require_open()
+        replacing = mode == POLICY_IMPORT_MODE_REPLACE
+
+        # Only room ids matter for the preview, so each side is reduced to a set.
+        db_node_rooms = {
+            row[0]
+            for row in self._conn.execute(  # type: ignore[union-attr]
+                "SELECT room_id, node_id FROM room_node_overrides"
+            ).fetchall()
+        }
+        db_agent_rooms = {
+            row[0]
+            for row in self._conn.execute(  # type: ignore[union-attr]
+                "SELECT room_id, agents_csv FROM room_agent_overrides"
+            ).fetchall()
+        }
+
+        # Node entries need room_id + node_id, agent entries need room_id + agents.
+        snap_node_rooms = {
+            entry["room_id"]
+            for entry in (data.get("room_node_overrides") or [])
+            if "room_id" in entry and "node_id" in entry
+        }
+        snap_agent_rooms = {
+            entry["room_id"]
+            for entry in (data.get("room_agent_overrides") or [])
+            if "room_id" in entry and "agents" in entry
+        }
+
+        # Per table: additions, in-place updates and (replace mode only) deletions.
+        node_added = len(snap_node_rooms - db_node_rooms)
+        node_updated = len(snap_node_rooms & db_node_rooms)
+        agent_added = len(snap_agent_rooms - db_agent_rooms)
+        agent_updated = len(snap_agent_rooms & db_agent_rooms)
+        node_deleted = len(db_node_rooms - snap_node_rooms) if replacing else 0
+        agent_deleted = len(db_agent_rooms - snap_agent_rooms) if replacing else 0
+
+        # Affected rooms: everything in the snapshot, plus (replace mode) every DB
+        # room, since a DB room absent from the snapshot would be deleted.
+        affected = snap_node_rooms | snap_agent_rooms
+        if replacing:
+            affected |= db_node_rooms | db_agent_rooms
+        sample_keys = sorted(affected)[:_SAMPLE_KEYS_MAX]
+
+        return ImportDiff(
+            node_added=node_added,
+            node_updated=node_updated,
+            node_deleted=node_deleted,
+            agent_added=agent_added,
+            agent_updated=agent_updated,
+            agent_deleted=agent_deleted,
+            sample_keys=sample_keys,
+            is_replace=replacing,
+        )
+
+    def import_snapshot(
+        self,
+        data: Dict[str, Any],
+        mode: str = POLICY_IMPORT_MODE_MERGE,
+        dry_run: bool = True,
+        imported_by: str = "import",
+    ) -> Dict[str, int]:
+        """
+        Import a policy snapshot.
+
+        mode=merge:   upsert entries from file; never delete existing entries not in file.
+        mode=replace: upsert entries from file AND delete entries in DB not present in file.
+
+        dry_run=True: compute stats without modifying DB.
+        dry_run=False: apply all upserts/deletes in one explicit transaction, so a
+        failure part-way through rolls the whole import back.
+
+        Returns:
+            {
+              "node_added": N, "node_updated": N, "node_deleted": N,
+              "agent_added": N, "agent_updated": N, "agent_deleted": N,
+            }
+        Raises ValueError for an unsupported snapshot version.
+        """
+        if data.get("version") != POLICY_SNAPSHOT_VERSION:
+            raise ValueError(f"Unsupported snapshot version: {data.get('version')!r}")
+
+        self._require_open()
+
+        # ── Current DB state ──────────────────────────────────────────────────
+        existing_nodes: Dict[str, str] = {
+            r[0]: r[1]
+            for r in self._conn.execute(  # type: ignore[union-attr]
+                "SELECT room_id, node_id FROM room_node_overrides"
+            ).fetchall()
+        }
+        existing_agents: Dict[str, str] = {
+            r[0]: r[1]
+            for r in self._conn.execute(  # type: ignore[union-attr]
+                "SELECT room_id, agents_csv FROM room_agent_overrides"
+            ).fetchall()
+        }
+
+        # ── Compute deltas ────────────────────────────────────────────────────
+        file_nodes = {
+            e["room_id"]: e["node_id"]
+            for e in (data.get("room_node_overrides") or [])
+            if "room_id" in e and "node_id" in e
+        }
+        file_agents = {
+            e["room_id"]: e
+            for e in (data.get("room_agent_overrides") or [])
+            if "room_id" in e and "agents" in e
+        }
+
+        node_added   = sum(1 for r in file_nodes if r not in existing_nodes)
+        node_updated = sum(1 for r in file_nodes if r in existing_nodes)
+        agent_added  = sum(1 for r in file_agents if r not in existing_agents)
+        agent_updated = sum(1 for r in file_agents if r in existing_agents)
+
+        # Deletions only happen (and are only counted) in replace mode.
+        replacing = mode == POLICY_IMPORT_MODE_REPLACE
+        stale_node_rooms = [r for r in existing_nodes if r not in file_nodes] if replacing else []
+        stale_agent_rooms = [r for r in existing_agents if r not in file_agents] if replacing else []
+
+        stats = {
+            "node_added": node_added,
+            "node_updated": node_updated,
+            "node_deleted": len(stale_node_rooms),
+            "agent_added": agent_added,
+            "agent_updated": agent_updated,
+            "agent_deleted": len(stale_agent_rooms),
+        }
+
+        if dry_run:
+            return stats
+
+        # ── Apply changes ─────────────────────────────────────────────────────
+        now = int(time.time())
+        by_hash = _hash_sender(imported_by)
+
+        # Plan every statement first, so a malformed entry fails before anything is written.
+        statements: List[Tuple[str, Tuple[Any, ...]]] = []
+        for entry in (data.get("room_node_overrides") or []):
+            rid = entry.get("room_id")
+            nid = entry.get("node_id")
+            if rid and nid:
+                statements.append((
+                    """
+                    INSERT INTO room_node_overrides (room_id, node_id, updated_at, updated_by_hash)
+                    VALUES (?, ?, ?, ?)
+                    ON CONFLICT(room_id) DO UPDATE SET
+                        node_id = excluded.node_id,
+                        updated_at = excluded.updated_at,
+                        updated_by_hash = excluded.updated_by_hash
+                    """,
+                    (rid, nid, now, by_hash),
+                ))
+
+        for entry in (data.get("room_agent_overrides") or []):
+            rid = entry.get("room_id")
+            agents = entry.get("agents") or []
+            def_agent = entry.get("default_agent") or (agents[0] if agents else None)
+            if rid and agents:
+                agents_csv = ",".join(sorted(set(agents)))
+                statements.append((
+                    """
+                    INSERT INTO room_agent_overrides
+                        (room_id, agents_csv, default_agent, updated_at, updated_by_hash)
+                    VALUES (?, ?, ?, ?, ?)
+                    ON CONFLICT(room_id) DO UPDATE SET
+                        agents_csv = excluded.agents_csv,
+                        default_agent = excluded.default_agent,
+                        updated_at = excluded.updated_at,
+                        updated_by_hash = excluded.updated_by_hash
+                    """,
+                    (rid, agents_csv, def_agent, now, by_hash),
+                ))
+
+        statements += [("DELETE FROM room_node_overrides WHERE room_id = ?", (r,)) for r in stale_node_rooms]
+        statements += [("DELETE FROM room_agent_overrides WHERE room_id = ?", (r,)) for r in stale_agent_rooms]
+
+        # open() uses autocommit, so without an explicit BEGIN each statement would commit
+        # alone; `with conn` commits the open transaction, or rolls it back on error.
+        with self._conn:  # type: ignore[union-attr]
+            self._conn.execute("BEGIN")  # type: ignore[union-attr]
+            for sql, params in statements:
+                self._conn.execute(sql, params)  # type: ignore[union-attr]
+
+        return stats
+
+    # ── Internal ───────────────────────────────────────────────────────────────
+
+    def _require_open(self) -> None:
+        if self._conn is None:  # open() has not run yet, or close() already did
+            raise RuntimeError("PolicyStore is not open — call open() first")
diff --git a/services/matrix-bridge-dagi/app/sticky_cache.py b/services/matrix-bridge-dagi/app/sticky_cache.py
new file mode 100644
index 00000000..36eedbce
--- /dev/null
+++ b/services/matrix-bridge-dagi/app/sticky_cache.py
@@ -0,0 +1,149 @@
+"""
+StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover.
+
+After a successful failover (primary → fallback), the bridge remembers the
+fallback node per room:agent pair for `ttl_s` seconds.  Subsequent messages
+for the same pair skip the primary entirely and go directly to the known-good
+fallback, preventing oscillation ("flapping") while the primary recovers.
+
+Key design
+----------
+  key  = "{room_id}:{agent_id}"
+  ttl  = FAILOVER_STICKY_TTL_S (default 300 s)
+
+Priority in routing (when source != explicit):
+  1. sticky cache  (temporary)
+  2. store override (desired long-term policy)
+  3. env room_node_map
+  4. env default
+
+Sticky expires naturally; recovery is automatic — no operator action needed.
+If the sticky node also fails, the entry is removed and normal failover logic
+takes over again.
+
+Thread safety
+-------------
+  Uses threading.RLock — safe to call from asyncio callbacks without to_thread.
+"""
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_TTL_S = 300.0
+
+
+@dataclass
+class _StickyEntry:  # one cached preference; the value type of StickyNodeCache._cache
+    node_id: str        # node handed back by StickyNodeCache.get() while the entry is live
+    expires_at: float   # time.monotonic() deadline; expired once monotonic() >= this value
+
+
+class StickyNodeCache:
+    """
+    In-memory sticky node preference cache.
+
+    Usage:
+        cache = StickyNodeCache(ttl_s=300)
+
+        # After successful failover:
+        cache.set("!room:srv:sofiia", "NODA2")
+
+        # Before routing the next message:
+        node = cache.get("!room:srv:sofiia")  # → "NODA2" or None if expired/missing
+
+        # If sticky node also fails:
+        cache.delete("!room:srv:sofiia")
+    """
+
+    def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
+        if ttl_s <= 0:
+            raise ValueError(f"ttl_s must be > 0, got {ttl_s}")
+        self._ttl_s = ttl_s
+        self._cache: Dict[str, _StickyEntry] = {}
+        self._lock = threading.RLock()
+
+    # ── Public API ────────────────────────────────────────────────────────────
+
+    def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None:
+        """Set sticky preference; overwrites existing entry."""
+        ttl = ttl_s if ttl_s is not None else self._ttl_s
+        with self._lock:
+            self._cache[key] = _StickyEntry(
+                node_id=node_id,
+                expires_at=time.monotonic() + ttl,
+            )
+            logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, ttl)
+
+    def get(self, key: str) -> Optional[str]:
+        """
+        Return sticky node_id if entry exists and not expired; else None.
+        Expired entries are lazily removed on access.
+        """
+        with self._lock:
+            entry = self._cache.get(key)
+            if entry is None:
+                return None
+            if time.monotonic() >= entry.expires_at:
+                del self._cache[key]
+                logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id)
+                return None
+            return entry.node_id
+
+    def delete(self, key: str) -> bool:
+        """Remove an entry. Returns True if it existed."""
+        with self._lock:
+            existed = key in self._cache
+            self._cache.pop(key, None)
+            if existed:
+                logger.debug("StickyCache.delete: key=%s", key)
+            return existed
+
+    def active_count(self) -> int:
+        """Count of non-expired entries (best-effort; no eviction)."""
+        now = time.monotonic()
+        with self._lock:
+            return sum(1 for e in self._cache.values() if e.expires_at > now)
+
+    def active_entries(self) -> List[Tuple[str, str, float]]:
+        """
+        Return (key, node_id, ttl_remaining_s) for all non-expired entries.
+        Useful for ops visibility in !status/!nodes.
+        """
+        now = time.monotonic()
+        with self._lock:
+            result = []
+            for k, e in self._cache.items():
+                remaining = e.expires_at - now
+                if remaining > 0:
+                    result.append((k, e.node_id, remaining))
+            return sorted(result, key=lambda x: x[0])
+
+    def cleanup(self) -> int:
+        """
+        Remove all expired entries.
+        Call periodically (e.g. in a background task) to reclaim memory.
+        Returns count of removed entries.
+        """
+        now = time.monotonic()
+        with self._lock:
+            expired_keys = [k for k, e in self._cache.items() if e.expires_at <= now]
+            for k in expired_keys:
+                del self._cache[k]
+            if expired_keys:
+                logger.debug("StickyCache.cleanup: removed %d expired entries", len(expired_keys))
+            return len(expired_keys)
+
+    @property
+    def ttl_s(self) -> float:
+        return self._ttl_s
+
+
+def make_sticky_key(room_id: str, agent_id: str) -> str:
+    """Canonical sticky cache key for a room+agent pair."""
+    return f"{room_id}:{agent_id}"
diff --git a/services/matrix-bridge-dagi/requirements.txt b/services/matrix-bridge-dagi/requirements.txt
index 208a74dc..a4f539e6 100644
--- a/services/matrix-bridge-dagi/requirements.txt
+++ b/services/matrix-bridge-dagi/requirements.txt
@@ -4,3 +4,4 @@ httpx>=0.25.0
 python-dotenv>=1.0.0
 prometheus-client>=0.20.0
 pyyaml>=6.0
+aiosqlite>=0.19.0