# microdao-daarion/services/matrix-bridge-dagi/app/ingress.py

"""
Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 + M3.1 + M3.3 + SessionScopeV2

Architecture (H2):
  Reader task  → asyncio.Queue(maxsize) → N Worker tasks
  ─────────────────────────────────────────────────────────
  Reader:
    sync_poll() → extract_room_messages()
    → rate_limit check (H1)
    → mark_seen / dedupe
    → queue.put_nowait() or DROP (audit matrix.queue_full + metric)

  Workers (N concurrent):
    queue.get() → measure wait latency (H3)
    → audit matrix.message.received
    → invoke Router  (timed, H3)
    → send_text()    (timed, H3)
    → audit matrix.agent.replied | matrix.error

Shutdown:
  1. stop_event set → reader exits loop
  2. queue.join() with drain_timeout → workers finish in-flight
  3. worker tasks cancelled

Queue entry: _QueueEntry(event, room_id, agent_id, enqueue_time, routing_reason, is_mixed)
"""

import asyncio
import hashlib
import json as _json
import logging
import os as _os
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

import httpx

from .control import (
    ControlConfig, ControlCommand,
    check_authorization, parse_command, is_control_message,
    not_implemented_reply, unknown_command_reply, unauthorized_reply, help_reply,
    start_usage_reply, runbook_started_reply, runbook_start_error_reply,
    next_usage_reply, next_manual_reply, next_auto_reply, next_error_reply,
    complete_usage_reply, complete_ok_reply, complete_error_reply,
    status_usage_reply, status_reply, status_error_reply,
    evidence_usage_reply, evidence_reply, evidence_error_reply,
    post_review_usage_reply, post_review_reply, post_review_error_reply,
    rate_limited_reply, sanitize_notes, MAX_NOTES_LEN,
    status_not_available_reply, nodes_reply,
    VERB_HELP, VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE, VERB_CONFIRM,
    is_dangerous_cmd, build_normalized_args,
    confirm_intent_reply, confirm_success_reply, confirm_expired_reply,
    NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST,
    parse_node_cmd, node_cmd_validate_room,
    node_cmd_reply_set, node_cmd_reply_unset_ok, node_cmd_reply_unset_not_found,
    node_cmd_reply_get, node_cmd_reply_list, node_cmd_reply_error,
    VERB_ROOM, ROOM_SUBCMD_AGENTS,
    ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
    ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET,
    parse_room_agents_cmd,
    room_agents_reply_set, room_agents_reply_add, room_agents_reply_remove,
    room_agents_reply_unset_ok, room_agents_reply_unset_not_found,
    room_agents_reply_get, room_agents_reply_list, room_agents_reply_error,
    VERB_POLICY, POLICY_EXPORTS_SUBDIR, validate_export_path,
    policy_import_intent_reply, format_import_diff as _format_import_diff,
    policy_export_reply, policy_import_dry_run_reply, policy_import_reply,
    policy_cmd_error,
    policy_prune_preview_reply, policy_prune_applied_reply,
    policy_restore_intent_reply, policy_restore_applied_reply,
    policy_history_reply, policy_change_detail_reply,
    SUBCOMMAND_START, SUBCOMMAND_NEXT, SUBCOMMAND_COMPLETE,
    SUBCOMMAND_STATUS, SUBCOMMAND_EVIDENCE, SUBCOMMAND_POST_REVIEW,
)
from .control_limiter import ControlRateLimiter
from .discovery import agents_reply, bridge_status_reply, is_discovery_message
from .event_store import EventStore
from .node_policy import (
    NodePolicy, NodeResolution,
    NODE_SOURCE_DEFAULT, NODE_SOURCE_EXPLICIT, NODE_SOURCE_ROOM_MAP,
    extract_node_kwarg, node_rejected_reply,
)
from .node_health import (
    NodeHealthTracker, NodeHealthConfig,
    NODE_STATE_HEALTHY, NODE_STATE_DEGRADED, NODE_STATE_DOWN,
    FAILOVER_REASON_TIMEOUT, FAILOVER_REASON_HTTP_5XX, FAILOVER_REASON_NETWORK,
)
from .sticky_cache import StickyNodeCache, make_sticky_key
from .confirm_store import ConfirmStore
from . import control_runner as _ctrl_runner
from .matrix_client import MatrixClient
from .mixed_routing import (
    MixedRoomConfig, MixedRoom, route_message, reply_prefix, build_override_config,
    REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING,
)
from .rate_limit import InMemoryRateLimiter
from .room_mapping import RoomMappingConfig, RoomMapping

logger = logging.getLogger(__name__)

# ── Constants ──────────────────────────────────────────────────────────────────

_MAX_RETRY_BACKOFF = 60.0     # upper bound (seconds) for the reader's doubling retry delay
_INIT_RETRY_BACKOFF = 2.0     # first reader retry delay (seconds); restored after every successful sync
_ROUTER_TIMEOUT_S = 45.0      # timeout passed to the Router infer POST in _invoke_router
_AUDIT_TIMEOUT_S = 5.0        # timeout passed to the audit POST in _write_audit
_REPLY_TEXT_MAX = 4000        # NOTE(review): presumably the max reply length; not used in this part of the file — confirm
_WORKER_GET_TIMEOUT_S = 1.0   # how long a worker waits on empty queue before re-checking


# ── Queue entry ────────────────────────────────────────────────────────────────

@dataclass
class _QueueEntry:
    """One Matrix event accepted by the reader and handed to the worker queue."""

    event: Dict[str, Any]         # Matrix event dict exactly as received by the enqueue step
    room_id: str                  # room the event was read from
    agent_id: str                 # agent the event is addressed to
    enqueue_time: float           # time.monotonic() at enqueue
    routing_reason: str = "direct"  # default is what regular 1:1 room entries carry; presumably overridden by mixed-room routing — confirm
    is_mixed: bool = False        # True for mixed-room entries (reply tagging, session isolation)


# ── Session Scope v2 ───────────────────────────────────────────────────────────
# Invariants:
#   1. Control room messages never reach the Router (no session_key needed there).
#   2. Matrix and Telegram never share a key namespace (prefix "matrix:").
#   3. Mixed rooms: each (room_id, agent_id) pair has its own key — no cross-agent leakage.
#   4. Logs/metrics receive sender_hash (sha256[:16]), never raw Matrix user_id.

# Scope values are embedded in session keys as matrix:{scope}:{room_key}:{agent_id}.
SCOPE_ROOM_AGENT = "room_agent"    # default: shared room context per agent (default of _build_session_key / _invoke_router)
SCOPE_OPS_RUNBOOK = "ops_runbook"  # future: control/ops room invocations
SCOPE_DM_USER = "dm_agent_user"    # future: per-user DM isolation


def _sender_hash(sender: str) -> str:
    """Return a PII-safe fingerprint of a Matrix user_id.

    The fingerprint is the first 16 hex characters of the SHA-256 digest of
    the UTF-8 encoded ``sender`` string.
    """
    hasher = hashlib.sha256()
    hasher.update(sender.encode())
    full_hex = hasher.hexdigest()
    return full_hex[:16]


def _build_session_key(room_id: str, agent_id: str, scope: str = SCOPE_ROOM_AGENT) -> str:
    """
    Build the canonical v2 session key: matrix:{scope}:{room_key}:{agent_id}

    ``room_key`` is ``room_id`` with every "!" removed and every ":" replaced
    by "_", so the room part contributes no extra ":" separators to the key.

    Example:
      room_id="!abc:example.org", agent_id="sofiia"
        → "matrix:room_agent:abc_example.org:sofiia"
    """
    # Single pass over the room id: delete "!" and map ":" to "_".
    room_key = room_id.translate({ord("!"): None, ord(":"): "_"})
    return "matrix:{}:{}:{}".format(scope, room_key, agent_id)


# ── Router invoke ──────────────────────────────────────────────────────────────

async def _invoke_router(
    http_client: httpx.AsyncClient,
    router_url: str,
    agent_id: str,
    node_id: str,
    prompt: str,
    session_id: str,
    sender_hash: str = "",
    scope: str = SCOPE_ROOM_AGENT,
    node_source: str = NODE_SOURCE_DEFAULT,
) -> str:
    """Call the Router's infer endpoint for ``agent_id`` and return the reply text.

    Sends ``POST {router_url}/v1/agents/{agent_id}/infer`` with the prompt,
    the session id and Matrix transport metadata, using ``_ROUTER_TIMEOUT_S``
    as the request timeout.

    Returns:
        The first truthy value among the response keys ``response``,
        ``text``, ``content`` and ``message`` (in that order), converted to
        ``str`` if needed and stripped; ``""`` when none of them is set.

    Raises:
        Whatever ``http_client.post`` raises, plus the error raised by
        ``raise_for_status()`` for non-success HTTP statuses.
    """
    endpoint = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer"
    metadata = {
        "transport": "matrix",
        "node_id": node_id,
        "node_source": node_source,  # M5.0: how the node was resolved
        "session_key": session_id,   # duplicated here so Router/Memory can index on it
        "sender_hash": sender_hash,  # PII-safe sender fingerprint
        "scope": scope,
    }
    body = {
        "prompt": prompt,
        "session_id": session_id,
        "user_id": "matrix_bridge",
        "metadata": metadata,
    }

    response = await http_client.post(endpoint, json=body, timeout=_ROUTER_TIMEOUT_S)
    response.raise_for_status()
    parsed = response.json()

    # The Router may answer under several keys; take the first non-empty one.
    reply = ""
    for candidate_key in ("response", "text", "content", "message"):
        candidate = parsed.get(candidate_key)
        if candidate:
            reply = candidate
            break

    if not isinstance(reply, str):
        reply = str(reply)
    return reply.strip()


# ── M6.2: File helpers (run in thread) ────────────────────────────────────────

def _write_json_file(path: str, data: Any) -> None:
    """Synchronously write ``data`` as JSON to ``path`` (UTF-8, 2-space indent).

    The document is serialized in memory *before* the file is opened for
    writing, so a value that cannot be serialized raises without truncating
    or partially overwriting a file that already exists at ``path``.

    Raises:
        TypeError, ValueError: ``data`` is not JSON-serializable.
        OSError: the file cannot be opened or written.
    """
    serialized = _json.dumps(data, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(serialized)


def _read_json_file(path: str) -> Any:
    """Synchronously load and return the UTF-8 JSON document stored at ``path``."""
    with open(path, encoding="utf-8") as fh:
        raw_text = fh.read()
    return _json.loads(raw_text)


# ── Audit write ────────────────────────────────────────────────────────────────

async def _write_audit(
    http_client: httpx.AsyncClient,
    console_url: str,
    internal_token: str,
    event: str,
    agent_id: str,
    node_id: str,
    room_id: str,
    event_id: str,
    status: str = "ok",
    error_code: Optional[str] = None,
    duration_ms: Optional[int] = None,
    data: Optional[Dict[str, Any]] = None,
) -> None:
    """Best-effort audit write to ``{console_url}/api/audit/internal``.

    Does nothing when ``console_url`` or ``internal_token`` is empty.
    Otherwise POSTs one audit record (authenticated with the
    ``X-Internal-Service-Token`` header, ``_AUDIT_TIMEOUT_S`` timeout).
    ``data`` is merged into the record's ``data`` object after the Matrix
    event and room ids, so caller-supplied keys win on collision.

    Any ``Exception`` — transport failure or a non-success HTTP status — is
    logged as a warning and swallowed, so auditing never breaks the
    message pipeline.
    """
    if not console_url or not internal_token:
        return
    try:
        resp = await http_client.post(
            f"{console_url.rstrip('/')}/api/audit/internal",
            json={
                "event": event,
                "operator_id": "matrix_bridge",
                "node_id": node_id,
                "agent_id": agent_id,
                "chat_id": room_id,
                "status": status,
                "error_code": error_code,
                "duration_ms": duration_ms,
                "data": {"matrix_event_id": event_id, "matrix_room_id": room_id, **(data or {})},
            },
            headers={"X-Internal-Service-Token": internal_token},
            timeout=_AUDIT_TIMEOUT_S,
        )
        # httpx does not raise for 4xx/5xx on its own; without this check a
        # rejected audit write (bad token, server error) would go unnoticed.
        resp.raise_for_status()
    except Exception as exc:
        logger.warning("Audit write failed (non-blocking): %s", exc)


# ── Ingress loop (reader + workers) ───────────────────────────────────────────

class MatrixIngressLoop:
    """
    Drives the full Matrix → Router → Matrix pipeline with backpressure.

    Reader task:   sync → extract → rate_check → dedupe → queue.put_nowait
    Worker tasks:  queue.get → invoke → send → audit

    Metric callbacks (all optional, called synchronously):
      on_message_received(room_id, agent_id)
      on_message_replied(room_id, agent_id, status)
      on_gateway_error(error_type)
      on_rate_limited(room_id, agent_id, limit_type)
      on_queue_dropped(room_id, agent_id)
      on_queue_size(current_size: int)
      on_invoke_latency(agent_id, duration_seconds, node_id="")
      on_send_latency(agent_id, duration_seconds)
      on_queue_wait(agent_id, wait_seconds)
      on_routed(agent_id, reason)              M2.2: successful routing
      on_route_rejected(room_id, reason)       M2.2: routing rejection
    """

    def __init__(
        self,
        matrix_homeserver_url: str,
        matrix_access_token: str,
        matrix_user_id: str,
        router_url: str,
        node_id: str,
        room_map: RoomMappingConfig,
        sofiia_console_url: str = "",
        sofiia_internal_token: str = "",
        rate_limiter: Optional[InMemoryRateLimiter] = None,
        queue_max_events: int = 100,
        worker_concurrency: int = 2,
        queue_drain_timeout_s: float = 5.0,
        mixed_room_config: Optional[MixedRoomConfig] = None,
        # M2.2: guard rails
        unknown_agent_behavior: str = "ignore",   # "ignore" | "reply_error"
        max_slash_len: int = 32,
        mixed_concurrency_cap: int = 1,           # 0 = unlimited
        # M3.0: control channel
        control_config: Optional[ControlConfig] = None,
        control_unauthorized_behavior: str = "ignore",   # "ignore" | "reply_error"
        # M3.1: runbook runner integration
        sofiia_control_token: str = "",
        # M3.4: control channel safety
        control_limiter: Optional["ControlRateLimiter"] = None,
        # M2.3: persistent event deduplication
        event_store: Optional["EventStore"] = None,
        # M4.0: agent discovery
        discovery_rpm: int = 20,
        # M5.0: node-aware routing
        node_policy: Optional["NodePolicy"] = None,
        # Callbacks
        on_message_received: Optional[Callable[[str, str], None]] = None,
        on_message_replied: Optional[Callable[[str, str, str], None]] = None,
        on_gateway_error: Optional[Callable[[str], None]] = None,
        on_rate_limited: Optional[Callable[[str, str, str], None]] = None,
        on_queue_dropped: Optional[Callable[[str, str], None]] = None,
        on_queue_size: Optional[Callable[[int], None]] = None,
        on_invoke_latency: Optional[Callable[..., None]] = None,   # (agent_id, duration_s, node_id="")
        on_send_latency: Optional[Callable[[str, float], None]] = None,
        on_queue_wait: Optional[Callable[[str, float], None]] = None,
        on_routed: Optional[Callable[[str, str], None]] = None,
        on_route_rejected: Optional[Callable[[str, str], None]] = None,
        on_control_command: Optional[Callable[[str, str, str], None]] = None,
        on_control_rate_limited: Optional[Callable[[str], None]] = None,
        on_dedupe_persistent_hit: Optional[Callable[[str, str], None]] = None,
        on_dedupe_persistent_insert: Optional[Callable[[], None]] = None,
        # M5.0: node routing callbacks
        on_node_selected: Optional[Callable[[str, str, str], None]] = None,  # (agent_id, node_id, source)
        on_node_rejected: Optional[Callable[[str], None]] = None,            # (rejected_node)
        # M5.1: per-node stats for !status reply
        node_stats_getter: Optional[Callable[[], Dict[str, Any]]] = None,
        # M6.0: persistent policy store for dynamic room-node overrides
        policy_store: Optional[Any] = None,   # app.policy_store.PolicyStore
        # M6.2: data directory for policy exports/imports
        bridge_data_dir: Optional[str] = None,
        # M8.0: node health tracker for soft-failover
        node_health_tracker: Optional[NodeHealthTracker] = None,
        on_failover: Optional[Callable[[str, str, str], None]] = None,  # (from_node, to_node, reason)
        # M8.1: sticky failover cache (anti-flap)
        sticky_cache: Optional[StickyNodeCache] = None,
        on_sticky_set: Optional[Callable[[str, str], None]] = None,  # (node_id, scope)
        # M8.2: HA state persistence config
        ha_health_snapshot_interval_s: int = 60,
        ha_health_max_age_s: int = 600,
        # M9.0: Two-step confirmation store for dangerous commands
        confirm_store: Optional[ConfirmStore] = None,
        # M10.0: Auto-backup retention policy (days; 0 = keep forever)
        policy_export_retention_days: int = 30,
        # M10.2: max rows to keep in policy_changes history (0 = unlimited)
        policy_history_limit: int = 100,
    ) -> None:
        """Store configuration, collaborators and callbacks on the instance.

        The constructor only records its arguments and initialises empty
        runtime state (sync token, queue handle, per-pair semaphores,
        discovery windows); the queue itself is created later by ``run()``.
        """
        from collections import defaultdict, deque

        # ── Endpoints and credentials ─────────────────────────────────────────
        self._hs_url = matrix_homeserver_url
        self._token = matrix_access_token
        self._user_id = matrix_user_id
        self._router_url = router_url
        self._node_id = node_id
        self._console_url = sofiia_console_url
        self._internal_token = sofiia_internal_token
        self._control_token = sofiia_control_token           # M3.1: runbook runner

        # ── Room routing (M1 / M2.x) ──────────────────────────────────────────
        self._room_map = room_map
        self._mixed_room_config = mixed_room_config
        self._unknown_agent_behavior = unknown_agent_behavior
        self._max_slash_len = max_slash_len
        self._mixed_concurrency_cap = mixed_concurrency_cap

        # ── Backpressure (H1 / H2) ────────────────────────────────────────────
        self._rate_limiter = rate_limiter
        self._queue_max = queue_max_events
        self._worker_count = worker_concurrency
        self._drain_timeout_s = queue_drain_timeout_s

        # ── Control channel (M3.0 / M3.4 / M9.0) ──────────────────────────────
        self._control_config = control_config
        self._control_unauthorized_behavior = control_unauthorized_behavior
        self._control_limiter = control_limiter
        self._confirm_store: Optional[ConfirmStore] = confirm_store

        # ── Persistent dedupe (M2.3) and discovery (M4.0) ─────────────────────
        self._event_store: Optional[EventStore] = event_store
        self._discovery_rpm = discovery_rpm
        # One deque per key, created on first access.
        self._discovery_windows: dict = defaultdict(deque)

        # ── Node routing, health and failover (M5.x / M8.x) ───────────────────
        self._node_policy: Optional[NodePolicy] = node_policy
        self._node_stats_getter = node_stats_getter
        self._node_health_tracker: Optional[NodeHealthTracker] = node_health_tracker
        self._sticky_cache: Optional[StickyNodeCache] = sticky_cache
        self._ha_health_snapshot_interval_s: int = ha_health_snapshot_interval_s
        self._ha_health_max_age_s: int = ha_health_max_age_s
        self._ha_sticky_loaded: int = 0       # count of sticky entries loaded on startup
        self._ha_health_loaded: bool = False  # whether health state was loaded on startup

        # ── Policy store, exports and history (M6.x / M10.x) ──────────────────
        self._policy_store = policy_store
        self._bridge_data_dir: Optional[str] = bridge_data_dir
        self._policy_last_export_at: Optional[int] = None
        self._policy_last_import_at: Optional[int] = None
        self._policy_export_retention_days: int = policy_export_retention_days
        self._policy_history_limit: int = policy_history_limit

        # ── Metric / notification callbacks ───────────────────────────────────
        self._on_message_received = on_message_received
        self._on_message_replied = on_message_replied
        self._on_gateway_error = on_gateway_error
        self._on_rate_limited = on_rate_limited
        self._on_queue_dropped = on_queue_dropped
        self._on_queue_size = on_queue_size
        self._on_invoke_latency = on_invoke_latency
        self._on_send_latency = on_send_latency
        self._on_queue_wait = on_queue_wait
        self._on_routed = on_routed
        self._on_route_rejected = on_route_rejected
        self._on_control_command = on_control_command
        self._on_control_rate_limited = on_control_rate_limited
        self._on_dedupe_persistent_hit = on_dedupe_persistent_hit
        self._on_dedupe_persistent_insert = on_dedupe_persistent_insert
        self._on_node_selected = on_node_selected
        self._on_node_rejected = on_node_rejected
        self._on_failover = on_failover
        self._on_sticky_set = on_sticky_set

        # ── Runtime state ─────────────────────────────────────────────────────
        # Semaphores are created lazily, keyed by "{room_id}:{agent_id}".
        self._concurrency_locks: Dict[str, asyncio.Semaphore] = {}
        self._next_batch: Optional[str] = None
        self._queue: Optional[asyncio.Queue] = None  # exposed for /health

    @property
    def next_batch(self) -> Optional[str]:
        """Sync token taken from the latest sync response; ``None`` before the first sync."""
        return self._next_batch

    @property
    def queue_size(self) -> int:
        """Current number of queued entries, or 0 when no queue is attached."""
        active_queue = self._queue
        if not active_queue:
            return 0
        return active_queue.qsize()

    @property
    def worker_count(self) -> int:
        """Configured number of worker tasks (the ``worker_concurrency`` constructor argument)."""
        return self._worker_count

    def get_status(self) -> Dict[str, Any]:
        """Assemble a status snapshot for health/ops queries.

        Queue figures and the HA persistence fields are always present; every
        other section is added only when its collaborator (node policy, stats
        getter, policy store, health tracker, sticky cache, confirm store) is
        configured. Policy-store failures are reported inside the snapshot or
        skipped rather than raised.
        """
        live_queue = self._queue
        report: Dict[str, Any] = {}
        report["queue_size"] = live_queue.qsize() if live_queue else 0
        report["queue_max"] = self._queue_max
        report["worker_count"] = self._worker_count

        policy = self._node_policy
        if policy is not None:
            report["node_policy"] = policy.as_info_dict()

        # M5.1: per-node routed/rejected counters
        stats_getter = self._node_stats_getter
        if stats_getter is not None:
            report["nodes"] = stats_getter()

        # M6.0: policy store info
        store = self._policy_store
        if store is not None:
            try:
                report["policy_store_ok"] = store.is_open
                report["policy_store_path"] = store.db_path
                report["policy_overrides_count"] = store.count_overrides()
                report["policy_agent_overrides_count"] = store.count_agent_overrides()  # M6.1
            except Exception as exc:  # noqa: BLE001
                report["policy_store_ok"] = False
                report["policy_store_error"] = str(exc)

        # M6.2: policy snapshot timestamps (only once an export/import happened)
        snapshot_stamps = (
            ("policy_last_export_at", self._policy_last_export_at),
            ("policy_last_import_at", self._policy_last_import_at),
        )
        for stamp_key, stamp in snapshot_stamps:
            if stamp is not None:
                report[stamp_key] = stamp

        if store is not None:
            # M6.2: policy DB mtime (best-effort)
            try:
                db_path = store.db_path
                if db_path and _os.path.exists(db_path):
                    report["policy_db_mtime"] = int(_os.path.getmtime(db_path))
            except Exception:  # noqa: BLE001
                pass
            # M10.2: policy change history count
            if store.is_open:
                try:
                    report["policy_changes_count"] = store.get_policy_changes_count()
                except Exception:  # noqa: BLE001
                    pass

        # M8.0: node health tracker state
        tracker = self._node_health_tracker
        if tracker is not None:
            allowed = None if policy is None else policy.allowed_nodes
            report["node_health"] = tracker.all_info(allowed)

        # M8.1: sticky failover cache info
        sticky = self._sticky_cache
        if sticky is not None:
            report["sticky_active_keys"] = sticky.active_count()
            report["sticky_ttl_s"] = sticky.ttl_s

        # M9.0: pending confirmations
        confirmations = self._confirm_store
        if confirmations is not None:
            report["confirm_pending"] = confirmations.pending_count()
            report["confirm_ttl_s"] = confirmations.ttl_s

        # M8.2: HA persistence info
        report["ha_sticky_loaded"] = self._ha_sticky_loaded
        report["ha_health_loaded"] = self._ha_health_loaded
        report["ha_health_snapshot_interval_s"] = self._ha_health_snapshot_interval_s
        return report

    @property
    def active_lock_count(self) -> int:
        """Count the (room, agent) semaphores whose ``locked()`` is currently true."""
        saturated = [sem for sem in self._concurrency_locks.values() if sem.locked()]
        return len(saturated)

    def _get_concurrency_lock(self, room_id: str, agent_id: str) -> asyncio.Semaphore:
        """Return the semaphore for a (room, agent) pair, creating it on first use."""
        pair_key = f"{room_id}:{agent_id}"
        try:
            return self._concurrency_locks[pair_key]
        except KeyError:
            pass
        # A non-positive cap means "unlimited": use a very large permit count instead.
        configured_cap = self._mixed_concurrency_cap
        permits = 2 ** 31 if configured_cap <= 0 else configured_cap
        semaphore = asyncio.Semaphore(permits)
        self._concurrency_locks[pair_key] = semaphore
        return semaphore

    # ── Public run ─────────────────────────────────────────────────────────────

    async def run(self, stop_event: asyncio.Event) -> None:
        """Run the reader/worker pipeline until the reader stops, then drain.

        Sequence:
          1. Create the bounded queue and expose it via ``self._queue``.
          2. Join mapped rooms (only those whose agent is allowed), mixed
             rooms and, when the control channel is enabled, control rooms.
             A failed join is logged as a warning and does not abort startup.
          3. Load persisted HA state, start the worker tasks and, when
             configured, the periodic node-health snapshot task.
          4. Run the reader until it returns.
          5. Wait up to ``queue_drain_timeout_s`` for ``queue.join()``.
          6. Cancel workers and the snapshot task and wait for the workers.

        Step 6 and the reset of ``self._queue`` run in ``finally`` blocks, so
        background tasks are not left running against closed HTTP/Matrix
        clients when this coroutine is cancelled or fails after the workers
        have been started.
        """
        mixed_rooms_count = self._mixed_room_config.total_rooms if self._mixed_room_config else 0
        logger.info(
            "Matrix ingress loop started | hs=%s node=%s mappings=%d mixed_rooms=%d "
            "queue_max=%d workers=%d",
            self._hs_url, self._node_id, self._room_map.total_mappings, mixed_rooms_count,
            self._queue_max, self._worker_count,
        )

        if self._room_map.total_mappings == 0 and mixed_rooms_count == 0:
            logger.warning("No room mappings — ingress loop is idle")

        queue: asyncio.Queue[Optional[_QueueEntry]] = asyncio.Queue(
            maxsize=self._queue_max
        )
        self._queue = queue

        try:
            async with MatrixClient(self._hs_url, self._token, self._user_id) as client:
                for mapping in self._room_map.mappings:
                    if mapping.agent_id in self._room_map.allowed_agents:
                        try:
                            await client.join_room(mapping.room_id)
                        except Exception as exc:
                            logger.warning("Could not join room %s: %s", mapping.room_id, exc)
                if self._mixed_room_config:
                    for room_id in self._mixed_room_config.rooms:
                        try:
                            await client.join_room(room_id)
                        except Exception as exc:
                            logger.warning("Could not join mixed room %s: %s", room_id, exc)
                if self._control_config and self._control_config.is_enabled:
                    for room_id in self._control_config.control_rooms:
                        try:
                            await client.join_room(room_id)
                        except Exception as exc:
                            logger.warning("Could not join control room %s: %s", room_id, exc)
                    logger.info(
                        "Control channel: %d rooms, %d operators",
                        len(self._control_config.control_rooms),
                        len(self._control_config.operator_allowlist),
                    )

                async with httpx.AsyncClient() as http_client:
                    # M8.2: Load persisted HA state before processing any messages
                    await self._load_ha_state()

                    # Start workers
                    worker_tasks = [
                        asyncio.create_task(
                            self._worker(queue, client, http_client),
                            name=f"matrix_worker_{i}",
                        )
                        for i in range(self._worker_count)
                    ]
                    _health_snapshot_task = None

                    try:
                        # M8.2: Start periodic node health snapshot task
                        if (
                            self._ha_health_snapshot_interval_s > 0
                            and self._policy_store is not None
                            and self._node_health_tracker is not None
                        ):
                            _health_snapshot_task = asyncio.create_task(
                                self._node_health_snapshot_loop(),
                                name="ha_health_snapshot",
                            )

                        # Run reader until stop_event
                        await self._reader(client, queue, http_client, stop_event)

                        # Drain: wait for all enqueued items to be processed
                        logger.info(
                            "Reader stopped. Draining queue (%d items, timeout=%.1fs)...",
                            queue.qsize(), self._drain_timeout_s,
                        )
                        try:
                            await asyncio.wait_for(queue.join(), timeout=self._drain_timeout_s)
                            logger.info("Queue drained successfully")
                        except asyncio.TimeoutError:
                            remaining = queue.qsize()
                            logger.warning(
                                "Drain timeout (%.1fs): %d items not processed",
                                self._drain_timeout_s, remaining,
                            )
                    finally:
                        # Always stop background tasks, including when run() is
                        # cancelled or raises, so none outlives the clients it uses.
                        for task in worker_tasks:
                            task.cancel()
                        # M8.2: Cancel health snapshot task if running
                        if _health_snapshot_task is not None and not _health_snapshot_task.done():
                            _health_snapshot_task.cancel()
                        results = await asyncio.gather(*worker_tasks, return_exceptions=True)
                        cancelled = sum(1 for r in results if isinstance(r, asyncio.CancelledError))
                        logger.info("Workers stopped (%d cancelled)", cancelled)
        finally:
            # /health must not keep reporting a queue that is no longer serviced.
            self._queue = None

        logger.info("Matrix ingress loop stopped")

    # ── Reader ─────────────────────────────────────────────────────────────────

    async def _reader(
        self,
        client: MatrixClient,
        queue: "asyncio.Queue[Optional[_QueueEntry]]",
        http_client: httpx.AsyncClient,
        stop_event: asyncio.Event,
    ) -> None:
        """Poll the homeserver and feed the queue until ``stop_event`` is set.

        Each iteration syncs from the stored token, records the new token and
        hands the response to ``_enqueue_from_sync``. Any other exception is
        logged, reported through ``on_gateway_error("sync_error")`` and
        followed by a wait that doubles from ``_INIT_RETRY_BACKOFF`` up to
        ``_MAX_RETRY_BACKOFF``; the wait ends early when ``stop_event`` is
        set. A cancellation raised while syncing or enqueueing ends the
        reader quietly.
        """
        retry_delay = _INIT_RETRY_BACKOFF
        while True:
            if stop_event.is_set():
                return
            try:
                response = await client.sync_poll(since=self._next_batch)
                self._next_batch = response.get("next_batch")
                # A successful sync resets the backoff before the batch is processed.
                retry_delay = _INIT_RETRY_BACKOFF
                await self._enqueue_from_sync(client, queue, http_client, response)
            except asyncio.CancelledError:
                return
            except Exception as err:
                logger.error("Reader error (retry in %.1fs): %s", retry_delay, err)
                report_error = self._on_gateway_error
                if report_error:
                    report_error("sync_error")
                # Sleep for the backoff period, but wake immediately on shutdown.
                try:
                    await asyncio.wait_for(stop_event.wait(), timeout=retry_delay)
                except asyncio.TimeoutError:
                    pass
                retry_delay = min(retry_delay * 2, _MAX_RETRY_BACKOFF)

    async def _enqueue_from_sync(
        self,
        client: MatrixClient,
        queue: "asyncio.Queue[Optional[_QueueEntry]]",
        http_client: httpx.AsyncClient,
        sync_resp: Dict[str, Any],
    ) -> None:
        """Dispatch every message of one sync response to its handler.

        Order of processing:
          1. Control rooms (when the control channel is enabled) go to
             ``_try_control`` and are never forwarded to agents.
          2. Regular rooms whose agent is allowed: discovery messages go to
             ``_handle_discovery``, everything else to ``_try_enqueue``.
          3. Mixed rooms: discovery messages go to ``_handle_discovery``,
             everything else to ``_try_enqueue_mixed``.
        """

        async def _consumed_by_discovery(event: Dict[str, Any], room_id: str) -> bool:
            # M4.0: agent discovery is answered before any routing takes place.
            body = event.get("content", {}).get("body", "").strip()
            if not is_discovery_message(body):
                return False
            await self._handle_discovery(client, http_client, event, room_id)
            return True

        # M3.0: control rooms first
        control_cfg = self._control_config
        if control_cfg and control_cfg.is_enabled:
            for control_room in control_cfg.control_rooms:
                for event in client.extract_room_messages(sync_resp, control_room):
                    await self._try_control(client, http_client, event, control_room)

        # Regular rooms: 1 room → 1 agent (M1 / M2.0)
        for mapping in self._room_map.mappings:
            if mapping.agent_id not in self._room_map.allowed_agents:
                continue
            for event in client.extract_room_messages(sync_resp, mapping.room_id):
                if await _consumed_by_discovery(event, mapping.room_id):
                    continue
                await self._try_enqueue(client, queue, http_client, event, mapping)

        # Mixed rooms: 1 room → N agents, routing per message (M2.1)
        mixed_cfg = self._mixed_room_config
        if not mixed_cfg:
            return
        for mixed_room in mixed_cfg.rooms:
            for event in client.extract_room_messages(sync_resp, mixed_room):
                if await _consumed_by_discovery(event, mixed_room):
                    continue
                await self._try_enqueue_mixed(client, queue, http_client, event, mixed_room)

    async def _try_enqueue(
        self,
        client: MatrixClient,
        queue: "asyncio.Queue[Optional[_QueueEntry]]",
        http_client: httpx.AsyncClient,
        event: Dict[str, Any],
        mapping: RoomMapping,
    ) -> None:
        """
        Admit one event from a mapped room into the worker queue.

        Steps, in order:
          1. ignore events whose body is empty after stripping
          2. H1 rate limit: a rejected event is audited and the method returns
             before mark_seen is called
          3. client.mark_seen(event_id)
          4. M2.3 persistent dedupe through the event store (when configured);
             a failing lookup is logged and treated as "not processed"
          5. H2 queue.put_nowait(); on QueueFull the event is dropped, the
             drop callback fires and an audit record is written
          6. the event store is updated only if the enqueue succeeded
        """
        event_id = event.get("event_id", "")
        sender = event.get("sender", "")
        body = event.get("content", {}).get("body", "").strip()
        room_id = mapping.room_id
        agent_id = mapping.agent_id

        if not body:
            return

        async def emit_audit(name: str, **fields: Any) -> None:
            # Every audit record written here carries the same identity fields.
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event=name,
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                **fields,
            )

        # H1: rate limit runs before mark_seen / dedupe.
        if self._rate_limiter is not None:
            allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
            if not allowed:
                logger.warning(
                    "Rate limited: room=%s sender=%s limit_type=%s event=%s",
                    room_id, sender, limit_type, event_id,
                )
                if self._on_rate_limited:
                    self._on_rate_limited(room_id, agent_id, limit_type or "unknown")
                await emit_audit(
                    "matrix.rate_limited",
                    status="error",
                    error_code=f"rate_limit_{limit_type}",
                    data={"sender": sender, "limit_type": limit_type},
                )
                return

        # In-memory dedupe: mark before the enqueue attempt.
        client.mark_seen(event_id)

        # M2.3: persistent dedupe (cross-restart protection).
        if self._event_store is not None:
            seen_before = False
            try:
                seen_before = await self._event_store.is_processed(room_id, event_id)
            except Exception as exc:
                # Degraded mode: a store failure must not block delivery.
                logger.warning("EventStore.is_processed error (degraded): %s", exc)

            if seen_before:
                logger.debug("Persistent dedupe hit: event=%s room=%s", event_id, room_id)
                if self._on_dedupe_persistent_hit:
                    self._on_dedupe_persistent_hit(room_id, agent_id)
                await emit_audit(
                    "matrix.dedupe.persistent_hit",
                    status="ok",
                    data={"sender": sender},
                )
                return

        # H2: non-blocking enqueue; a full queue means the event is dropped.
        entry = _QueueEntry(
            event=event,
            room_id=room_id,
            agent_id=agent_id,
            enqueue_time=time.monotonic(),
        )
        was_queued = False
        try:
            queue.put_nowait(entry)
            was_queued = True
            depth = queue.qsize()
            logger.debug("Enqueued event=%s qsize=%d", event_id, depth)
            if self._on_queue_size:
                self._on_queue_size(depth)
        except asyncio.QueueFull:
            logger.warning(
                "Queue full (max=%d): dropping event=%s room=%s agent=%s",
                self._queue_max, event_id, room_id, agent_id,
            )
            if self._on_queue_dropped:
                self._on_queue_dropped(room_id, agent_id)
            await emit_audit(
                "matrix.queue_full",
                status="error",
                error_code="queue_full",
                data={"queue_max": self._queue_max, "sender": sender},
            )

        # M2.3: persist the "processed" marker only for events that were queued.
        if not was_queued or self._event_store is None:
            return
        hashed_sender = _sender_hash(sender)
        try:
            await self._event_store.mark_processed(room_id, event_id, hashed_sender)
            if self._on_dedupe_persistent_insert:
                self._on_dedupe_persistent_insert()
        except Exception as exc:
            logger.warning("EventStore.mark_processed error (degraded): %s", exc)

    async def _try_enqueue_mixed(
        self,
        client: MatrixClient,
        queue: "asyncio.Queue[Optional[_QueueEntry]]",
        http_client: httpx.AsyncClient,
        event: Dict[str, Any],
        room_id: str,
    ) -> None:
        """
        Enqueue a message from a mixed room, routing to the appropriate agent.

        Steps, in order:
          1. ignore events whose body is empty after stripping
          2. M6.1: if the policy store is open and holds an agent override for
             this room, build an override-aware routing config; a lookup
             failure is logged and the static config is used
          3. route_message() picks the agent; a rejected route is audited and,
             when `_unknown_agent_behavior == "reply_error"` and the reason is
             REASON_REJECTED_UNKNOWN_AGENT, answered in the room with the
             agents of the effective routing config
          4. H1 rate limit, then client.mark_seen(), then M2.3 persistent dedupe
          5. H2 queue.put_nowait() of a patched event copy whose body is the
             routed `effective_text`; on QueueFull the event is dropped
          6. the event store is updated only if the enqueue succeeded
        """
        assert self._mixed_room_config is not None
        event_id = event.get("event_id", "")
        sender = event.get("sender", "")
        text = event.get("content", {}).get("body", "").strip()

        if not text:
            return

        # M6.1: look up dynamic agent override for this room
        _routing_config = self._mixed_room_config
        if self._policy_store is not None and self._policy_store.is_open:
            try:
                _agent_ov = await asyncio.to_thread(
                    self._policy_store.get_agent_override, room_id
                )
                if _agent_ov is not None:
                    _ov_agents, _ov_default = _agent_ov
                    # Fall back to the first override agent when no default is stored.
                    _effective_default = _ov_default or (_ov_agents[0] if _ov_agents else None)
                    if _ov_agents and _effective_default:
                        _routing_config = build_override_config(
                            self._mixed_room_config, room_id,
                            _ov_agents, _effective_default,
                        )
            except Exception as _exc:  # noqa: BLE001
                logger.warning("PolicyStore get_agent_override failed: %s", _exc)

        # Route message to determine target agent
        agent_id, routing_reason, effective_text = route_message(
            text, room_id, _routing_config, self._room_map.allowed_agents,
            max_slash_len=self._max_slash_len,
        )

        if agent_id is None:
            # M2.2: routing rejected — audit + metric + optional error reply
            logger.warning(
                "Mixed room %s: routing rejected reason=%s event=%s",
                room_id, routing_reason, event_id,
            )
            if self._on_route_rejected:
                self._on_route_rejected(room_id, routing_reason)
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.route.rejected",
                agent_id="unknown", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code=routing_reason,
                data={"routing_reason": routing_reason, "sender": sender, "text_len": len(text)},
            )
            # M2.2: optional user-facing error reply in room
            if self._unknown_agent_behavior == "reply_error" and routing_reason == REASON_REJECTED_UNKNOWN_AGENT:
                # List agents from the config that routing actually used, so an
                # active per-room override is reflected in the hint.
                available = _routing_config.agents_for_room(room_id)
                # Extract agent name from text (first slash token, if any);
                # `text` is already stripped and non-empty here.
                slash_token = text.split()[0].lstrip("/") if text.startswith("/") else ""
                label = f"`/{slash_token}`" if slash_token else "this command"
                error_msg = (
                    f"⚠️ Unknown agent {label}. "
                    f"Available in this room: {', '.join(available)}"
                )
                txn_id = MatrixClient.make_txn_id(room_id, event_id + "_reject")
                try:
                    await client.send_text(room_id, error_msg, txn_id)
                except Exception as exc:
                    # Best effort: the rejection is already audited above.
                    logger.warning("Could not send route-error reply: %s", exc)
            return

        # M2.2: successful route — fire metric callback
        if self._on_routed:
            self._on_routed(agent_id, routing_reason)

        # H1: Rate limit (uses final agent_id for metric tagging)
        if self._rate_limiter is not None:
            allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
            if not allowed:
                logger.warning(
                    "Rate limited (mixed): room=%s sender=%s agent=%s limit_type=%s",
                    room_id, sender, agent_id, limit_type,
                )
                if self._on_rate_limited:
                    self._on_rate_limited(room_id, agent_id, limit_type or "unknown")
                await _write_audit(
                    http_client, self._console_url, self._internal_token,
                    event="matrix.rate_limited",
                    agent_id=agent_id, node_id=self._node_id,
                    room_id=room_id, event_id=event_id,
                    status="error", error_code=f"rate_limit_{limit_type}",
                    data={"sender": sender, "limit_type": limit_type, "routing_reason": routing_reason},
                )
                return

        # In-memory dedupe: mark before the enqueue attempt.
        client.mark_seen(event_id)

        # M2.3: Persistent dedupe (cross-restart protection, mixed rooms)
        if self._event_store is not None:
            try:
                already = await self._event_store.is_processed(room_id, event_id)
            except Exception as exc:
                # Degraded mode: a store failure must not block delivery.
                logger.warning("EventStore.is_processed error mixed (degraded): %s", exc)
                already = False

            if already:
                logger.debug("Persistent dedupe hit (mixed): event=%s room=%s agent=%s", event_id, room_id, agent_id)
                if self._on_dedupe_persistent_hit:
                    self._on_dedupe_persistent_hit(room_id, agent_id)
                await _write_audit(
                    http_client, self._console_url, self._internal_token,
                    event="matrix.dedupe.persistent_hit",
                    agent_id=agent_id, node_id=self._node_id,
                    room_id=room_id, event_id=event_id,
                    status="ok",
                    data={"sender": sender, "routing_reason": routing_reason},
                )
                return

        # Store effective_text (stripped of routing token) in a patched event copy;
        # the caller's event dict and its content dict are left untouched.
        effective_event = dict(event)
        effective_event["content"] = dict(event.get("content", {}))
        effective_event["content"]["body"] = effective_text

        entry = _QueueEntry(
            event=effective_event,
            room_id=room_id,
            agent_id=agent_id,
            enqueue_time=time.monotonic(),
            routing_reason=routing_reason,
            is_mixed=True,
        )
        enqueued_mixed = False
        try:
            queue.put_nowait(entry)
            enqueued_mixed = True
            qsize = queue.qsize()
            logger.debug(
                "Enqueued (mixed): event=%s agent=%s reason=%s qsize=%d",
                event_id, agent_id, routing_reason, qsize,
            )
            if self._on_queue_size:
                self._on_queue_size(qsize)
        except asyncio.QueueFull:
            logger.warning(
                "Queue full (max=%d): dropping mixed event=%s room=%s agent=%s",
                self._queue_max, event_id, room_id, agent_id,
            )
            if self._on_queue_dropped:
                self._on_queue_dropped(room_id, agent_id)
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.queue_full",
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="queue_full",
                data={"queue_max": self._queue_max, "sender": sender},
            )

        # M2.3: Mark as processed ONLY after successful enqueue
        if enqueued_mixed and self._event_store is not None:
            sender_hash = _sender_hash(sender)
            try:
                await self._event_store.mark_processed(room_id, event_id, sender_hash)
                if self._on_dedupe_persistent_insert:
                    self._on_dedupe_persistent_insert()
            except Exception as exc:
                logger.warning("EventStore.mark_processed error mixed (degraded): %s", exc)

    # ── M4.0: Agent discovery ──────────────────────────────────────────────────

    def _check_discovery_rate(self, room_id: str) -> bool:
        """Sliding-window per-room rate check for discovery replies. Returns True if allowed."""
        if self._discovery_rpm <= 0:
            return True
        now = time.monotonic()
        window = self._discovery_windows[room_id]
        cutoff = now - 60.0
        while window and window[0] < cutoff:
            window.popleft()
        if len(window) >= self._discovery_rpm:
            return False
        window.append(now)
        return True

    async def _handle_discovery(
        self,
        client: MatrixClient,
        http_client: httpx.AsyncClient,
        event: Dict[str, Any],
        room_id: str,
    ) -> None:
        """
        Reply to a discovery message (`!agents`) with the agents of this room.

        No authorization check is performed in this method. The event is always
        marked as seen; when the per-room discovery rate limit is exceeded no
        reply is sent and nothing is persisted. Otherwise the reply is built
        from the room map and the mixed-room config (override-aware when the
        policy store has an agent override for the room), sent best-effort, and
        the event is recorded in the event store (no router enqueue).

        Every external step (override lookup, send, persist) is best-effort:
        failures are logged and never raised to the caller.

        `http_client` is accepted for signature parity with the other handlers
        and is not used here.
        """
        event_id = event.get("event_id", "")
        sender = event.get("sender", "")

        # The event is marked seen whether or not a reply is allowed.
        allowed = self._check_discovery_rate(room_id)
        client.mark_seen(event_id)
        if not allowed:
            logger.debug("Discovery rate limited: room=%s", room_id)
            return

        # M6.1: use store-based agent config if available for accurate discovery
        _disc_config = self._mixed_room_config
        if self._policy_store is not None and self._policy_store.is_open:
            try:
                _disc_ov = await asyncio.to_thread(
                    self._policy_store.get_agent_override, room_id
                )
                if _disc_ov is not None:
                    _d_agents, _d_default = _disc_ov
                    # Fall back to the first override agent when no default is stored.
                    _d_eff_default = _d_default or (_d_agents[0] if _d_agents else None)
                    if _d_agents and _d_eff_default and self._mixed_room_config is not None:
                        _disc_config = build_override_config(
                            self._mixed_room_config, room_id, _d_agents, _d_eff_default,
                        )
            except Exception as _exc:  # noqa: BLE001
                # Degraded: answer from the static config, but leave a trace.
                logger.warning(
                    "PolicyStore get_agent_override failed (discovery): %s", _exc
                )

        reply = agents_reply(room_id, self._room_map, _disc_config)
        txn_id = MatrixClient.make_txn_id(room_id, event_id + "_discovery")
        try:
            await client.send_text(room_id, reply, txn_id)
        except Exception as exc:
            logger.warning("Could not send discovery reply: %s", exc)

        # Persist dedupe so restart doesn't re-deliver this discovery.
        # A store failure is logged, not raised, matching the enqueue paths.
        if self._event_store is not None:
            sender_hash = _sender_hash(sender)
            try:
                await self._event_store.mark_processed(room_id, event_id, sender_hash)
            except Exception as exc:
                logger.warning(
                    "EventStore.mark_processed error discovery (degraded): %s", exc
                )

    # ── M6.1: Dynamic mixed room agent overrides via !room agents command ─────

    async def _handle_room_cmd(
        self,
        http_client: httpx.AsyncClient,
        sender: str,
        ctrl_room_id: str,
        event_id: str,
        cmd_subcommand: str,
        cmd_args: tuple,
        cmd_kwargs: Dict[str, str],
    ) -> str:
        """
        Handle `!room agents <set|add|remove|get|list|unset>`.

        Authorization is not checked in this method (presumably done by the
        caller — confirm). All policy-store calls run via `asyncio.to_thread`.

        Args:
            http_client: client used for audit writes.
            sender: Matrix user id of the operator; stored in audit data and
                passed to the policy store on writes.
            ctrl_room_id: room the command was issued in (audit `room_id`).
            event_id: id of the command event (audit `event_id`).
            cmd_subcommand: must equal ROOM_SUBCMD_AGENTS.
            cmd_args / cmd_kwargs: parsed by `parse_room_agents_cmd` into
                action, target room, agent list, single agent and default.

        Returns:
            The reply text to send back. Validation and store failures are
            reported as reply text; an audit record is written only after a
            successful action.
        """
        if self._policy_store is None or not self._policy_store.is_open:
            return "⚠️ Policy store not available."

        if cmd_subcommand != ROOM_SUBCMD_AGENTS:
            return room_agents_reply_error(
                f"Unknown subcommand: `{cmd_subcommand or '?'}`. Use `!room agents <action>`."
            )

        action, room_id, agents_list, single_agent, default_agent = parse_room_agents_cmd(
            cmd_subcommand, cmd_args, cmd_kwargs,
        )

        if action not in (ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
                          ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET):
            return room_agents_reply_error(f"Unknown action: `{action or '?'}`")

        # Validate allowed agents from global policy
        allowed_all = self._room_map.allowed_agents  # global allowed agents set

        async def audit_ok(event_name: str, extra: Dict[str, Any]) -> None:
            # Every successful action is audited against the control room,
            # with the operator as `sender` plus action-specific fields.
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event=event_name,
                agent_id="control", node_id=self._node_id,
                room_id=ctrl_room_id, event_id=event_id,
                status="ok", data={"sender": sender, **extra},
            )

        # ── list ──────────────────────────────────────────────────────────────
        if action == ROOM_ACTION_LIST:
            try:
                rows = await asyncio.to_thread(self._policy_store.list_agent_overrides, 10)
                total = await asyncio.to_thread(self._policy_store.count_agent_overrides)
            except Exception as exc:
                logger.warning("PolicyStore list_agent_overrides error: %s", exc)
                return "⚠️ Could not read policy store."
            await audit_ok("matrix.control.room.agents.list", {"total": total})
            return room_agents_reply_list(rows, total)

        # ── subcommands that require room_id ──────────────────────────────────
        if not room_id:
            return room_agents_reply_error("Missing `room=` argument.")
        if not node_cmd_validate_room(room_id):
            return room_agents_reply_error(
                f"Invalid room ID format: `{room_id}`\nExpected: `!localpart:server`"
            )

        # ── get ───────────────────────────────────────────────────────────────
        if action == ROOM_ACTION_GET:
            try:
                ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
            except Exception as exc:
                logger.warning("PolicyStore get_agent_override error: %s", exc)
                return "⚠️ Could not read policy store."
            ov_agents, ov_default = (ov if ov else (None, None))
            # Static (env) config for the same room, passed alongside the override.
            env_room = (
                self._mixed_room_config.rooms.get(room_id) if self._mixed_room_config else None
            )
            env_agents = list(env_room.agents) if env_room else None
            env_default = env_room.default_agent if env_room else None
            await audit_ok("matrix.control.room.agents.get", {"queried_room": room_id})
            return room_agents_reply_get(room_id, ov_agents, ov_default, env_agents, env_default)

        # ── unset ─────────────────────────────────────────────────────────────
        if action == ROOM_ACTION_UNSET:
            try:
                deleted = await asyncio.to_thread(self._policy_store.delete_agent_override, room_id)
            except Exception as exc:
                logger.warning("PolicyStore delete_agent_override error: %s", exc)
                return "⚠️ Could not write to policy store."
            await audit_ok(
                "matrix.control.room.agents.unset",
                {"target_room": room_id, "was_set": deleted},
            )
            return room_agents_reply_unset_ok(room_id) if deleted else room_agents_reply_unset_not_found(room_id)

        # ── add ───────────────────────────────────────────────────────────────
        if action == ROOM_ACTION_ADD:
            if not single_agent:
                return room_agents_reply_error("Missing `agent=` argument for `add`.")
            if single_agent not in allowed_all:
                allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
                return room_agents_reply_error(f"Agent `{single_agent}` not in allowed agents: {allowed_str}")
            try:
                new_agents, new_default = await asyncio.to_thread(
                    self._policy_store.add_agent_to_room, room_id, single_agent, sender
                )
            except Exception as exc:
                logger.warning("PolicyStore add_agent_to_room error: %s", exc)
                return "⚠️ Could not write to policy store."
            await audit_ok(
                "matrix.control.room.agents.add",
                {"target_room": room_id, "agent": single_agent},
            )
            return room_agents_reply_add(room_id, single_agent, new_agents, new_default)

        # ── remove ────────────────────────────────────────────────────────────
        if action == ROOM_ACTION_REMOVE:
            if not single_agent:
                return room_agents_reply_error("Missing `agent=` argument for `remove`.")
            try:
                removed, err = await asyncio.to_thread(
                    self._policy_store.remove_agent_from_room, room_id, single_agent, sender
                )
            except Exception as exc:
                logger.warning("PolicyStore remove_agent_from_room error: %s", exc)
                return "⚠️ Could not write to policy store."
            if not removed:
                return room_agents_reply_error(err or "Could not remove agent.")
            # Get updated state. The removal itself already succeeded, so a
            # failed re-read only degrades the reply; log it so the empty
            # listing in the reply can be explained.
            try:
                ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "PolicyStore get_agent_override error after remove "
                    "(reply may be incomplete): %s", exc,
                )
                ov = None
            remaining = ov[0] if ov else []
            new_default_r = ov[1] if ov else None
            await audit_ok(
                "matrix.control.room.agents.remove",
                {"target_room": room_id, "agent": single_agent},
            )
            return room_agents_reply_remove(room_id, single_agent, remaining, new_default_r)

        # ── set ───────────────────────────────────────────────────────────────
        if action == ROOM_ACTION_SET:
            if not agents_list:
                return room_agents_reply_error("Missing `agents=` argument for `set`.")
            invalid = [a for a in agents_list if a not in allowed_all]
            if invalid:
                allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
                return room_agents_reply_error(
                    f"Unknown agents: {', '.join(f'`{a}`' for a in invalid)}\nAllowed: {allowed_str}"
                )
            # Without an explicit default, the first listed agent becomes the default.
            effective_default = default_agent if default_agent else agents_list[0]
            if effective_default not in agents_list:
                return room_agents_reply_error(
                    f"Default agent `{effective_default}` not in provided agents list."
                )
            try:
                await asyncio.to_thread(
                    self._policy_store.set_agent_override,
                    room_id, agents_list, effective_default, sender,
                )
            except Exception as exc:
                logger.warning("PolicyStore set_agent_override error: %s", exc)
                return "⚠️ Could not write to policy store."
            await audit_ok(
                "matrix.control.room.agents.set",
                {
                    "target_room": room_id,
                    "agents": agents_list, "default": effective_default,
                },
            )
            return room_agents_reply_set(room_id, agents_list, effective_default)

        return room_agents_reply_error("Unhandled action.")

    # ── M6.2: Policy snapshot export / import ─────────────────────────────────

    async def _handle_policy_cmd(
        self,
        http_client: httpx.AsyncClient,
        sender: str,
        ctrl_room_id: str,
        event_id: str,
        subcommand: Optional[str],
        cmd_kwargs: Dict[str, str],
    ) -> str:
        """
        Handle `!policy export`, `!policy import`, and `!policy prune_exports`.
        Requires policy_store and bridge_data_dir to be configured.
        """
        if self._policy_store is None or not self._policy_store.is_open:
            return policy_cmd_error("Policy store is not available.")

        if not self._bridge_data_dir:
            return policy_cmd_error("BRIDGE_DATA_DIR is not configured.")

        exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)

        # ── export ────────────────────────────────────────────────────────────
        if subcommand == "export":
            try:
                snapshot = await asyncio.to_thread(self._policy_store.export_all)
                node_count  = len(snapshot.get("room_node_overrides",  []))
                agent_count = len(snapshot.get("room_agent_overrides", []))

                import datetime as _dt
                ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d-%H%M%S")
                filename = f"policy-{ts}.json"

                await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
                export_path = _os.path.join(exports_dir, filename)
                await asyncio.to_thread(
                    _write_json_file, export_path, snapshot
                )

                self._policy_last_export_at = int(time.time())
                await _write_audit(
                    http_client, self._console_url, self._internal_token,
                    event="matrix.control.policy.export",
                    agent_id="control", node_id=self._node_id,
                    room_id=ctrl_room_id, event_id=event_id,
                    status="ok", data={
                        "sender": sender, "file": filename,
                        "node_overrides": node_count, "agent_overrides": agent_count,
                    },
                )
                return policy_export_reply(export_path, node_count, agent_count)

            except Exception as exc:  # noqa: BLE001
                logger.exception("_handle_policy_cmd export error: %s", exc)
                return policy_cmd_error(f"Export failed: {exc}")

        # ── import ────────────────────────────────────────────────────────────
        if subcommand == "import":
            filename = cmd_kwargs.get("path", "").strip()
            if not filename:
                return policy_cmd_error("Missing `path=` argument.")

            safe_path = validate_export_path(exports_dir, filename)
            if safe_path is None:
                return policy_cmd_error(
                    f"Invalid path `{filename}`. Only simple filenames within the exports "
                    f"directory are allowed."
                )

            mode_raw = cmd_kwargs.get("mode", "merge").strip().lower()
            if mode_raw not in ("merge", "replace"):
                return policy_cmd_error("mode must be `merge` or `replace`.")

            dry_raw = cmd_kwargs.get("dry_run", "1").strip()
            dry_run = dry_raw not in ("0", "false", "no")

            try:
                raw_text = await asyncio.to_thread(_read_json_file, safe_path)
            except FileNotFoundError:
                return policy_cmd_error(f"File not found: `{filename}`")
            except Exception as exc:  # noqa: BLE001
                return policy_cmd_error(f"Cannot read file: {exc}")

            try:
                data = raw_text if isinstance(raw_text, dict) else {}
                stats = await asyncio.to_thread(
                    self._policy_store.import_snapshot,
                    data, mode_raw, dry_run, sender,
                )
            except ValueError as ve:
                return policy_cmd_error(str(ve))
            except Exception as exc:  # noqa: BLE001
                logger.exception("_handle_policy_cmd import error: %s", exc)
                return policy_cmd_error(f"Import failed: {exc}")

            if not dry_run:
                self._policy_last_import_at = int(time.time())
                # M10.2: record in policy change history
                _is_destr = (
                    stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
                ) > 0
                _ds = (
                    f"node: +{stats['node_added']} ~{stats['node_updated']} "
                    f"-{stats['node_deleted']}; "
                    f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
                    f"-{stats['agent_deleted']}"
                )
                try:
                    await asyncio.to_thread(
                        self._policy_store.record_policy_change,
                        "policy.import", mode_raw, filename,
                        _sender_hash(sender), _ds, _is_destr,
                        stats.get("node_added", 0), stats.get("node_updated", 0),
                        stats.get("node_deleted", 0),
                        stats.get("agent_added", 0), stats.get("agent_updated", 0),
                        stats.get("agent_deleted", 0),
                        self._policy_history_limit,
                    )
                except Exception as _exc:  # noqa: BLE001
                    logger.warning("Failed to record import history (non-fatal): %s", _exc)

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.policy.import",
                agent_id="control", node_id=self._node_id,
                room_id=ctrl_room_id, event_id=event_id,
                status="ok", data={
                    "sender": sender, "file": filename,
                    "mode": mode_raw, "dry_run": dry_run, "stats": stats,
                },
            )
            return (
                policy_import_dry_run_reply(stats, mode_raw)
                if dry_run
                else policy_import_reply(stats, mode_raw)
            )

        # ── restore (M10.1) ───────────────────────────────────────────────────
        if subcommand == "restore":
            filename = cmd_kwargs.get("path", "").strip()
            if not filename:
                return policy_cmd_error("Missing `path=` argument.")

            safe_path = validate_export_path(exports_dir, filename)
            if safe_path is None:
                return policy_cmd_error(
                    f"Invalid path `{filename}`. Only simple filenames within the exports "
                    "directory are allowed."
                )

            mode_raw = cmd_kwargs.get("mode", "replace").strip().lower()
            if mode_raw not in ("merge", "replace"):
                return policy_cmd_error("mode must be `merge` or `replace`.")

            try:
                raw_data = await asyncio.to_thread(_read_json_file, safe_path)
            except FileNotFoundError:
                return policy_cmd_error(f"File not found: `{filename}`")
            except Exception as exc:  # noqa: BLE001
                return policy_cmd_error(f"Cannot read file: {exc}")

            data = raw_data if isinstance(raw_data, dict) else {}
            try:
                stats = await asyncio.to_thread(
                    self._policy_store.import_snapshot,
                    data, mode_raw, False, sender,
                )
                self._policy_last_import_at = int(time.time())
            except ValueError as ve:
                return policy_cmd_error(str(ve))
            except Exception as exc:  # noqa: BLE001
                logger.exception("_handle_policy_cmd restore error: %s", exc)
                return policy_cmd_error(f"Restore failed: {exc}")

            # M10.2: record in policy change history
            _is_destr = (
                stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
            ) > 0
            _rds = (
                f"restore/{mode_raw}: "
                f"node: +{stats['node_added']} ~{stats['node_updated']} "
                f"-{stats['node_deleted']}; "
                f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
                f"-{stats['agent_deleted']}"
            )
            try:
                await asyncio.to_thread(
                    self._policy_store.record_policy_change,
                    "policy.restore", mode_raw, filename,
                    _sender_hash(sender), _rds, _is_destr,
                    stats.get("node_added", 0), stats.get("node_updated", 0),
                    stats.get("node_deleted", 0),
                    stats.get("agent_added", 0), stats.get("agent_updated", 0),
                    stats.get("agent_deleted", 0),
                    self._policy_history_limit,
                )
            except Exception as _exc:  # noqa: BLE001
                logger.warning("Failed to record restore history (non-fatal): %s", _exc)

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.policy.restore",
                agent_id="control", node_id=self._node_id,
                room_id=ctrl_room_id, event_id=event_id,
                status="ok",
                data={
                    "sender": sender, "file": filename,
                    "mode": mode_raw, "stats": stats,
                },
            )
            return policy_restore_applied_reply(stats, mode_raw)

        # ── prune_exports ─────────────────────────────────────────────────────
        if subcommand == "prune_exports":
            dry_raw = cmd_kwargs.get("dry_run", "1").strip()
            dry_run = dry_raw not in ("0", "false", "no")
            try:
                retention_days = int(
                    cmd_kwargs.get(
                        "retention_days", str(self._policy_export_retention_days)
                    )
                )
            except (ValueError, TypeError):
                return policy_cmd_error("`retention_days` must be a positive integer.")

            try:
                await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
                result = await asyncio.to_thread(
                    self._policy_store.prune_exports,
                    exports_dir, retention_days, dry_run,
                )
            except Exception as exc:  # noqa: BLE001
                logger.exception("_handle_policy_cmd prune error: %s", exc)
                return policy_cmd_error(f"Prune failed: {exc}")

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.policy.prune_exports",
                agent_id="control", node_id=self._node_id,
                room_id=ctrl_room_id, event_id=event_id,
                status="ok",
                data={
                    "sender": sender,
                    "dry_run": dry_run,
                    "retention_days": retention_days,
                    "files_to_delete": result.count,
                    "bytes_to_free": result.total_bytes,
                },
            )
            return (
                policy_prune_preview_reply(result, retention_days)
                if dry_run
                else policy_prune_applied_reply(result, retention_days)
            )

        # ── history (M10.2) ───────────────────────────────────────────────────
        if subcommand == "history":
            try:
                limit_raw = int(cmd_kwargs.get("limit", "10"))
            except (ValueError, TypeError):
                return policy_cmd_error("`limit` must be a positive integer.")
            safe_limit = max(1, min(limit_raw, 20))
            try:
                changes = await asyncio.to_thread(
                    self._policy_store.list_policy_changes, safe_limit,
                )
            except Exception as exc:  # noqa: BLE001
                logger.exception("_handle_policy_cmd history error: %s", exc)
                return policy_cmd_error(f"History fetch failed: {exc}")
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.policy.history",
                agent_id="control", node_id=self._node_id,
                room_id=ctrl_room_id, event_id=event_id,
                status="ok", data={"sender": sender, "limit": safe_limit, "count": len(changes)},
            )
            return policy_history_reply(changes)

        # ── change (M10.3) ────────────────────────────────────────────────────
        if subcommand == "change":
            try:
                change_id = int(cmd_kwargs.get("id", "0"))
            except (ValueError, TypeError):
                return policy_cmd_error("`id` must be a positive integer (DB change id).")
            if change_id <= 0:
                return policy_cmd_error("Missing or invalid `id=` argument. "
                                        "Use `!policy history` to get change ids.")
            try:
                change = await asyncio.to_thread(
                    self._policy_store.get_policy_change_by_id, change_id,
                )
            except Exception as exc:  # noqa: BLE001
                logger.exception("_handle_policy_cmd change detail error: %s", exc)
                return policy_cmd_error(f"DB error: {exc}")
            if change is None:
                return policy_cmd_error(
                    f"Change id={change_id} not found. "
                    "Use `!policy history` to see available ids."
                )
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.policy.change_detail",
                agent_id="control", node_id=self._node_id,
                room_id=ctrl_room_id, event_id=event_id,
                status="ok", data={"sender": sender, "change_id": change_id},
            )
            return policy_change_detail_reply(change)

        return policy_cmd_error(f"Unknown subcommand: `{subcommand!r}`.")

    # ── M5.1: Nodes overview for operators ────────────────────────────────────

    async def _handle_nodes(
        self,
        http_client: httpx.AsyncClient,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        Build the reply for `!nodes` in the control room.

        Gathers the node policy summary, the per-node stats (overlaid with
        health tracker data when a tracker is configured) and a capped view of
        the sticky cache, writes a `matrix.control.nodes` audit record and
        renders everything through `nodes_reply`.  Any exception is logged and
        replaced by a generic "not available" message.
        """
        try:
            policy = self._node_policy
            policy_info = {} if policy is None else policy.as_info_dict()

            stats_getter = self._node_stats_getter
            node_stats = {} if stats_getter is None else stats_getter()

            # M8.0: overlay health tracker state on the raw per-node stats
            tracker = self._node_health_tracker
            if tracker is not None:
                allowed = None if policy is None else policy.allowed_nodes
                for nid, health in tracker.all_info(allowed).items():
                    entry = node_stats.setdefault(nid, {})
                    entry["health"] = health.get("state", NODE_STATE_HEALTHY)
                    entry["ewma_latency_s"] = health.get("ewma_latency_s")
                    entry["consecutive_failures"] = health.get("consecutive_failures", 0)

            # M8.1: sticky cache summary, listing at most 5 entries
            sticky_info = None
            cache = self._sticky_cache
            if cache is not None:
                active = cache.active_entries()
                sticky_info = {
                    "active_keys": len(active),
                    "ttl_s": cache.ttl_s,
                }
                sticky_info["entries"] = [
                    {"key": skey, "node": snode, "remaining_s": round(left_s, 0)}
                    for skey, snode, left_s in active[:5]
                ]
                hidden = len(active) - 5
                if hidden > 0:
                    # number of entries left out of the listing
                    sticky_info["truncated"] = hidden

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.nodes",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="ok", data={"sender": sender},
            )
            return nodes_reply(policy_info, node_stats, sticky_info=sticky_info)
        except Exception as exc:
            logger.warning("_handle_nodes error: %s", exc)
            return "⚠️ Node info not available."

    # ── M8.2: HA state persistence helpers ────────────────────────────────────

    async def _load_ha_state(self) -> None:
        """
        Load persisted HA state (sticky entries + node health) from PolicyStore on startup.
        Non-fatal: any error is logged and bridge continues in in-memory-only mode.
        """
        if self._policy_store is None or not self._policy_store.is_open:
            return

        # Load sticky entries
        if self._sticky_cache is not None:
            try:
                entries = await asyncio.to_thread(self._policy_store.load_sticky_entries)
                now_unix = int(time.time())
                loaded = 0
                for key, node_id, expires_at_unix in entries:
                    remaining_s = expires_at_unix - now_unix
                    if remaining_s > 0:
                        self._sticky_cache.set(key, node_id, ttl_s=float(remaining_s))
                        loaded += 1
                self._ha_sticky_loaded = loaded
                logger.info("HA: loaded %d sticky entries from DB", loaded)
            except Exception as exc:  # noqa: BLE001
                logger.warning("HA: failed to load sticky entries (non-fatal): %s", exc)

        # Load node health snapshot
        if self._node_health_tracker is not None and self._ha_health_max_age_s > 0:
            try:
                snapshot = await asyncio.to_thread(
                    self._policy_store.load_node_health, self._ha_health_max_age_s
                )
                if snapshot:
                    for node_id, info in snapshot.items():
                        self._node_health_tracker.restore_node(
                            node_id,
                            ewma_latency_s=info.get("ewma_latency_s"),
                            consecutive_failures=int(info.get("consecutive_failures", 0)),
                        )
                    self._ha_health_loaded = True
                    logger.info(
                        "HA: loaded node health for %d nodes from DB", len(snapshot)
                    )
                else:
                    logger.info("HA: no fresh node health snapshot found in DB")
            except Exception as exc:  # noqa: BLE001
                logger.warning("HA: failed to load node health (non-fatal): %s", exc)

    async def _sticky_persist_set(self, key: str, node_id: str) -> None:
        """Persist a sticky entry to DB after setting it in-memory (M8.2)."""
        if self._policy_store is None or not self._policy_store.is_open:
            return
        if self._sticky_cache is None:
            return
        expires_at_unix = int(time.time()) + int(self._sticky_cache.ttl_s)
        try:
            await asyncio.to_thread(
                self._policy_store.upsert_sticky, key, node_id, expires_at_unix
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning("HA: failed to persist sticky key=%s (non-fatal): %s", key, exc)

    async def _sticky_persist_delete(self, key: str) -> None:
        """Remove a sticky entry from DB after deleting it in-memory (M8.2)."""
        if self._policy_store is None or not self._policy_store.is_open:
            return
        try:
            await asyncio.to_thread(self._policy_store.delete_sticky, key)
        except Exception as exc:  # noqa: BLE001
            logger.warning("HA: failed to delete sticky key=%s (non-fatal): %s", key, exc)

    async def _node_health_snapshot_loop(self) -> None:
        """
        Background task: periodically write node health state to DB (M8.2).
        Runs until policy_store becomes unavailable or interval is 0.
        """
        if self._ha_health_snapshot_interval_s <= 0:
            return
        logger.debug(
            "HA: health snapshot loop started (interval=%ds)", self._ha_health_snapshot_interval_s
        )
        while True:
            await asyncio.sleep(self._ha_health_snapshot_interval_s)
            if self._policy_store is None or not self._policy_store.is_open:
                break
            if self._node_health_tracker is None or self._node_policy is None:
                break
            try:
                for node_id in sorted(self._node_policy.allowed_nodes):
                    info = self._node_health_tracker.as_info_dict(node_id)
                    await asyncio.to_thread(
                        self._policy_store.upsert_node_health,
                        node_id,
                        info.get("ewma_latency_s"),
                        int(info.get("consecutive_failures", 0)),
                    )
                logger.debug(
                    "HA: health snapshot written for %d nodes",
                    len(self._node_policy.allowed_nodes),
                )
            except Exception as exc:  # noqa: BLE001
                logger.warning("HA: health snapshot write failed (non-fatal): %s", exc)

    # ── M9.0: Two-step confirmation for dangerous control commands ─────────────

    async def _handle_policy_import_intent(
        self,
        http_client: "httpx.AsyncClient",
        cmd: "ControlCommand",
        sender: str,
        room_id: str,
        event_id: str,
        action_summary: str,
        normalized: str,
    ) -> str:
        """
        M9.1: Intent step for !policy import with diff preview and hash binding.

        Nothing is applied here.  The handler validates the arguments, reads
        the export file, computes a diff preview, stores a hash-bound callback
        in the confirm store and returns a preview reply containing the nonce.
        The callback re-reads the file and refuses to apply when its hash no
        longer matches the previewed one.

        Args:
            http_client: Client used for the audit write.
            cmd: Parsed control command; `path` and `mode` are read from
                `cmd.kwargs` (`mode` defaults to "merge").
            sender: Operator identifier; hashed via `_sender_hash` for the
                confirm store and the audit payload.
            room_id: Room the pending confirmation is registered for.
            event_id: Event id recorded in the audit entry.
            action_summary: Command summary shown in the preview reply.
            normalized: Normalized argument string stored with the confirmation.

        Returns:
            The preview reply with the nonce, or a `policy_cmd_error` message
            when validation, reading the file or computing the preview fails.

        Raises:
            AssertionError: if no confirm store is configured.
        """
        assert self._confirm_store is not None

        # ── Validate args ──────────────────────────────────────────────────────
        if self._policy_store is None or not self._policy_store.is_open:
            return policy_cmd_error("Policy store not available.")
        if not self._bridge_data_dir:
            return policy_cmd_error("BRIDGE_DATA_DIR not configured.")

        filename = cmd.kwargs.get("path", "").strip()
        if not filename:
            return policy_cmd_error("Missing `path=` argument.")

        exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
        safe_path = validate_export_path(exports_dir, filename)
        if safe_path is None:
            return policy_cmd_error(
                f"Invalid path `{filename}`. Only simple filenames within the exports "
                "directory are allowed."
            )

        mode_raw = cmd.kwargs.get("mode", "merge").strip().lower()
        if mode_raw not in ("merge", "replace"):
            return policy_cmd_error("mode must be `merge` or `replace`.")

        # ── Read file + compute diff preview ───────────────────────────────────
        try:
            raw_data = await asyncio.to_thread(_read_json_file, safe_path)
        except FileNotFoundError:
            return policy_cmd_error(f"File not found: `{filename}`")
        except Exception as exc:  # noqa: BLE001
            return policy_cmd_error(f"Cannot read file: {exc}")

        if not isinstance(raw_data, dict):
            return policy_cmd_error("Invalid JSON format (expected object).")

        try:
            diff = await asyncio.to_thread(
                self._policy_store.compute_import_diff, raw_data, mode_raw,
            )
        except ValueError as ve:
            return policy_cmd_error(str(ve))
        except Exception as exc:  # noqa: BLE001
            return policy_cmd_error(f"Preview failed: {exc}")

        # ── Compute snapshot hash for confirm binding ─────────────────────────
        # The hash covers filename, mode and the canonical (sorted-keys) JSON
        # of the parsed content, so reformatting alone does not change it.
        _content_bytes = _json.dumps(
            raw_data, sort_keys=True, ensure_ascii=True
        ).encode("utf-8")
        snapshot_hash = hashlib.sha256(
            (filename + ":" + mode_raw + ":").encode("utf-8") + _content_bytes
        ).hexdigest()[:32]

        sender_hash = _sender_hash(sender)
        _captured_hash = snapshot_hash
        _captured_data = raw_data
        _captured_mode = mode_raw
        _captured_path = safe_path
        _captured_fname = filename
        _captured_sender = sender

        # ── Build hash-bound callback ──────────────────────────────────────────
        # Late-capture of nonce for autobackup filename (set after add() below)
        _nonce_holder: list = []

        async def _callback():
            """Apply the previewed import; returns `(reply, diff_summary)`, summary empty on failure."""
            # Re-read file and verify hash to detect tampering (anti-TOCTOU)
            try:
                fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
            except Exception as exc:  # noqa: BLE001
                return (
                    f"❌ Cannot re-read file `{_captured_fname}` at apply time: {exc}\n"
                    "Re-issue the command.",
                    "",
                )

            fresh_bytes = _json.dumps(
                fresh_data if isinstance(fresh_data, dict) else {},
                sort_keys=True, ensure_ascii=True,
            ).encode("utf-8")
            fresh_hash = hashlib.sha256(
                (_captured_fname + ":" + _captured_mode + ":").encode("utf-8") + fresh_bytes
            ).hexdigest()[:32]

            if fresh_hash != _captured_hash:
                logger.warning(
                    "Policy import confirm rejected: file changed since preview "
                    "(sender=%s file=%s)", _captured_sender, _captured_fname,
                )
                return (
                    f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
                    "Re-issue `!policy import ...` to get a new preview.",
                    "",
                )

            # M10.0: Auto-backup current policy before applying changes
            _autobackup_basename = ""
            _autobackup_hash = ""
            if self._policy_store is not None and self._bridge_data_dir:
                _exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
                _nonce_suffix = _nonce_holder[0] if _nonce_holder else "BACKUP"
                _sender_hash8 = _sender_hash(_captured_sender)[:8]
                try:
                    # Filesystem work is offloaded so the event loop is not
                    # blocked while the directory is created.
                    await asyncio.to_thread(_os.makedirs, _exports_dir, exist_ok=True)
                    _ab_path, _autobackup_hash = await asyncio.to_thread(
                        self._policy_store.write_autobackup,
                        _exports_dir, _sender_hash8, _nonce_suffix,
                    )
                    _autobackup_basename = _os.path.basename(_ab_path)
                    logger.info(
                        "Policy auto-backup written: %s hash=%s",
                        _autobackup_basename, _autobackup_hash,
                    )
                except Exception as exc:  # noqa: BLE001
                    # Best effort: a failed backup does not stop the import.
                    logger.warning("Policy auto-backup failed (non-fatal): %s", exc)

            # Apply the import using the captured (previewed) data
            try:
                stats = await asyncio.to_thread(
                    self._policy_store.import_snapshot,
                    _captured_data, _captured_mode, False, _captured_sender,
                )
                self._policy_last_import_at = int(time.time())

                # Missing counters count as 0.  Indexing `stats[...]` here
                # could raise KeyError after the snapshot was already applied
                # and would then be reported as a failed import.
                _counts = [
                    stats.get(_key, 0)
                    for _key in (
                        "node_added", "node_updated", "node_deleted",
                        "agent_added", "agent_updated", "agent_deleted",
                    )
                ]
                _n_add, _n_upd, _n_del, _a_add, _a_upd, _a_del = _counts
                diff_summary = (
                    f"node: +{_n_add} ~{_n_upd} -{_n_del}; "
                    f"agent: +{_a_add} ~{_a_upd} -{_a_del}"
                )
                if _autobackup_basename:
                    diff_summary += f"; autobackup={_autobackup_basename}"

                # M10.2: record in policy change history
                _is_destr = (_n_del + _a_del) > 0
                try:
                    await asyncio.to_thread(
                        self._policy_store.record_policy_change,
                        "policy.import", _captured_mode, _captured_fname,
                        _sender_hash(_captured_sender), diff_summary, _is_destr,
                        *_counts,
                        self._policy_history_limit,
                    )
                except Exception as _exc:  # noqa: BLE001
                    logger.warning("Failed to record policy change history: %s", _exc)

                reply = policy_import_reply(stats, _captured_mode)
                if _autobackup_basename:
                    reply += (
                        f"\n\n💾 Auto-backup saved: `{_autobackup_basename}` "
                        f"(hash `{_autobackup_hash}`)"
                    )
                return reply, diff_summary
            except Exception as exc:  # noqa: BLE001
                logger.exception(
                    "Policy import apply failed: sender=%s file=%s", _captured_sender, _captured_fname,
                )
                return f"❌ Import failed: {exc}", ""

        # ── Store pending confirmation ─────────────────────────────────────────
        nonce = self._confirm_store.add(
            sender_hash=sender_hash,
            verb="policy.import",
            normalized_args=normalized,
            action_summary=action_summary,
            room_id=room_id,
            callback=_callback,
        )
        # M10.0: make nonce available inside _callback for backup filename
        _nonce_holder.append(nonce)

        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.intent",
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
            data={
                "sender_hash": sender_hash,
                "verb": "policy.import",
                "normalized": normalized,
                "nonce": nonce,
                "expires_in_s": int(self._confirm_store.ttl_s),
                "snapshot_hash_prefix": snapshot_hash[:8],
                "diff": {
                    "node_added": diff.node_added,
                    "node_updated": diff.node_updated,
                    "node_deleted": diff.node_deleted,
                    "agent_added": diff.agent_added,
                    "agent_updated": diff.agent_updated,
                    "agent_deleted": diff.agent_deleted,
                    "sample_keys": diff.sample_keys,
                },
            },
        )

        logger.info(
            "Confirm policy import intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
            sender, mode_raw, filename, snapshot_hash[:8], nonce,
        )
        return policy_import_intent_reply(
            diff=diff,
            action_summary=action_summary,
            nonce=nonce,
            ttl_s=int(self._confirm_store.ttl_s),
        )

    async def _handle_policy_restore_intent(
        self,
        http_client: "httpx.AsyncClient",
        cmd: "ControlCommand",
        sender: str,
        room_id: str,
        event_id: str,
        *,
        action_summary: str,
        normalized: str,
    ) -> str:
        """
        Intent step for !policy restore (M10.1).

        Nothing is applied here.  Reads the snapshot file, computes a diff
        preview, binds a SHA-256 hash to the exact file content + mode, stores
        a callback in ConfirmStore, and returns a rollback preview reply with
        the nonce.  The callback re-reads the file and refuses to apply when
        its hash no longer matches the previewed one.

        Args:
            http_client: Client used for the audit write.
            cmd: Parsed control command; `path` and `mode` are read from
                `cmd.kwargs` (`mode` defaults to "replace").
            sender: Operator identifier; hashed via `_sender_hash` for the
                confirm store and the audit payload.
            room_id: Room the pending confirmation is registered for.
            event_id: Event id recorded in the audit entry.
            action_summary: Command summary shown in the preview reply.
            normalized: Normalized argument string stored with the confirmation.

        Returns:
            The preview reply with the nonce, or a `policy_cmd_error` message
            when validation, reading the file or computing the preview fails.

        Raises:
            AssertionError: if the confirm store, policy store or bridge data
                directory is unset.
        """
        assert self._confirm_store is not None
        assert self._policy_store is not None
        assert self._bridge_data_dir is not None

        filename = cmd.kwargs.get("path", "").strip()
        if not filename:
            return policy_cmd_error("Missing `path=` argument.")

        exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
        safe_path = validate_export_path(exports_dir, filename)
        if safe_path is None:
            return policy_cmd_error(
                f"Invalid path `{filename}`. Only simple filenames within the exports "
                "directory are allowed."
            )

        mode_raw = cmd.kwargs.get("mode", "replace").strip().lower()
        if mode_raw not in ("merge", "replace"):
            return policy_cmd_error("mode must be `merge` or `replace`.")

        # ── Read file + compute diff preview ──────────────────────────────────
        try:
            raw_data = await asyncio.to_thread(_read_json_file, safe_path)
        except FileNotFoundError:
            return policy_cmd_error(f"File not found: `{filename}`")
        except Exception as exc:  # noqa: BLE001
            return policy_cmd_error(f"Cannot read file: {exc}")

        if not isinstance(raw_data, dict):
            return policy_cmd_error("Invalid JSON format (expected object).")

        try:
            diff = await asyncio.to_thread(
                self._policy_store.compute_import_diff, raw_data, mode_raw,
            )
        except ValueError as ve:
            return policy_cmd_error(str(ve))
        except Exception as exc:  # noqa: BLE001
            return policy_cmd_error(f"Preview failed: {exc}")

        # ── Compute snapshot hash for confirm binding (anti-TOCTOU) ──────────
        # The ":restore:" tag is part of the hashed prefix alongside the
        # filename and mode; the content is hashed as canonical JSON.
        _content_bytes = _json.dumps(
            raw_data, sort_keys=True, ensure_ascii=True
        ).encode("utf-8")
        snapshot_hash = hashlib.sha256(
            (filename + ":" + mode_raw + ":restore:").encode("utf-8") + _content_bytes
        ).hexdigest()[:32]

        sender_hash = _sender_hash(sender)
        _captured_hash = snapshot_hash
        _captured_data = raw_data
        _captured_mode = mode_raw
        _captured_path = safe_path
        _captured_fname = filename
        _captured_sender = sender

        # Late-capture of nonce for autobackup filename
        _nonce_holder: list = []

        # ── Build hash-bound callback ─────────────────────────────────────────
        async def _callback():
            """Apply the previewed restore; returns `(reply, diff_summary)`, summary empty on failure."""
            # Re-read + verify hash (anti-TOCTOU)
            try:
                fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
            except Exception as exc:  # noqa: BLE001
                return (
                    f"❌ Cannot re-read `{_captured_fname}` at apply time: {exc}\n"
                    "Re-issue the command.",
                    "",
                )

            fresh_bytes = _json.dumps(
                fresh_data if isinstance(fresh_data, dict) else {},
                sort_keys=True, ensure_ascii=True,
            ).encode("utf-8")
            fresh_hash = hashlib.sha256(
                (_captured_fname + ":" + _captured_mode + ":restore:").encode("utf-8")
                + fresh_bytes
            ).hexdigest()[:32]

            if fresh_hash != _captured_hash:
                logger.warning(
                    "Policy restore confirm rejected: file changed since preview "
                    "(sender=%s file=%s)", _captured_sender, _captured_fname,
                )
                return (
                    f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
                    "Re-issue `!policy restore ...` to get a new preview.",
                    "",
                )

            # Auto-backup current state before overwriting
            _autobackup_basename = ""
            _autobackup_hash = ""
            if self._policy_store is not None and self._bridge_data_dir:
                _exp_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
                _nonce_suffix = _nonce_holder[0] if _nonce_holder else "RESTORE"
                _sender_hash8 = _sender_hash(_captured_sender)[:8]
                try:
                    # Filesystem work is offloaded so the event loop is not
                    # blocked while the directory is created.
                    await asyncio.to_thread(_os.makedirs, _exp_dir, exist_ok=True)
                    _ab_path, _autobackup_hash = await asyncio.to_thread(
                        self._policy_store.write_autobackup,
                        _exp_dir, _sender_hash8, _nonce_suffix,
                    )
                    _autobackup_basename = _os.path.basename(_ab_path)
                    logger.info(
                        "Pre-restore backup written: %s hash=%s",
                        _autobackup_basename, _autobackup_hash,
                    )
                except Exception as exc:  # noqa: BLE001
                    # Best effort: a failed backup does not stop the restore.
                    logger.warning("Pre-restore backup failed (non-fatal): %s", exc)

            # Apply restore
            try:
                stats = await asyncio.to_thread(
                    self._policy_store.import_snapshot,
                    _captured_data, _captured_mode, False, _captured_sender,
                )
                self._policy_last_import_at = int(time.time())

                # Missing counters count as 0.  Indexing `stats[...]` here
                # could raise KeyError after the snapshot was already applied
                # and would then be reported as a failed restore.
                _counts = [
                    stats.get(_key, 0)
                    for _key in (
                        "node_added", "node_updated", "node_deleted",
                        "agent_added", "agent_updated", "agent_deleted",
                    )
                ]
                _n_add, _n_upd, _n_del, _a_add, _a_upd, _a_del = _counts
                diff_summary = (
                    f"restore/{_captured_mode}: "
                    f"node: +{_n_add} ~{_n_upd} -{_n_del}; "
                    f"agent: +{_a_add} ~{_a_upd} -{_a_del}"
                )
                if _autobackup_basename:
                    diff_summary += f"; autobackup={_autobackup_basename}"

                # M10.2: record in policy change history
                _is_destr = (_n_del + _a_del) > 0
                try:
                    await asyncio.to_thread(
                        self._policy_store.record_policy_change,
                        "policy.restore", _captured_mode, _captured_fname,
                        _sender_hash(_captured_sender), diff_summary, _is_destr,
                        *_counts,
                        self._policy_history_limit,
                    )
                except Exception as _exc:  # noqa: BLE001
                    logger.warning("Failed to record restore history: %s", _exc)

                reply = policy_restore_applied_reply(
                    stats, _captured_mode, _autobackup_basename
                )
                return reply, diff_summary
            except Exception as exc:  # noqa: BLE001
                logger.exception(
                    "Policy restore apply failed: sender=%s file=%s",
                    _captured_sender, _captured_fname,
                )
                return f"❌ Restore failed: {exc}", ""

        # ── Store pending confirmation ────────────────────────────────────────
        nonce = self._confirm_store.add(
            sender_hash=sender_hash,
            verb="policy.restore",
            normalized_args=normalized,
            action_summary=action_summary,
            room_id=room_id,
            callback=_callback,
        )
        _nonce_holder.append(nonce)

        # NOTE(review): this audit entry is written at intent time (nothing
        # has been applied yet) but uses the event name
        # "matrix.control.policy.restore" — confirm that audit consumers
        # expect this name for the preview step.
        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.policy.restore",
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
            data={
                "sender_hash": sender_hash,
                "verb": "policy.restore",
                "normalized": normalized,
                "nonce": nonce,
                "expires_in_s": int(self._confirm_store.ttl_s),
                "mode": mode_raw,
                "snapshot_hash_prefix": snapshot_hash[:8],
                "diff": {
                    "node_added": diff.node_added,
                    "node_updated": diff.node_updated,
                    "node_deleted": diff.node_deleted,
                    "agent_added": diff.agent_added,
                    "agent_updated": diff.agent_updated,
                    "agent_deleted": diff.agent_deleted,
                    "sample_keys": diff.sample_keys,
                },
            },
        )

        logger.info(
            "Confirm policy restore intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
            sender, mode_raw, filename, snapshot_hash[:8], nonce,
        )
        return policy_restore_intent_reply(
            diff=diff,
            action_summary=action_summary,
            nonce=nonce,
            ttl_s=int(self._confirm_store.ttl_s),
        )

    async def _handle_dangerous_intent(
        self,
        http_client: "httpx.AsyncClient",
        cmd: "ControlCommand",
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        Intent leg of the two-step confirmation flow (M9.0).

        Nothing is applied here.  The command is parked in the confirm store
        together with a callback that replays the real handler, and the reply
        handed back to the operator carries the nonce to send via
        `!confirm <nonce>`.

        `!policy import` and `!policy restore` are delegated to their own
        intent handlers; restore is refused up front when the policy store or
        the bridge data directory is not configured.

        Returns the reply text for the control room.
        """
        assert self._confirm_store is not None

        who = _sender_hash(sender)
        normalized = build_normalized_args(cmd)
        summary = f"!{cmd.verb} {cmd.subcommand or ''} {normalized}".strip()

        if cmd.verb == VERB_POLICY:
            policy_sub = (cmd.subcommand or "").lower()

            # M9.1: import gets a richer preview (diff + hash binding).
            if policy_sub == "import":
                return await self._handle_policy_import_intent(
                    http_client, cmd, sender, room_id, event_id,
                    action_summary=summary, normalized=normalized,
                )

            # M10.1: restore is a rollback with diff preview + hash binding.
            if policy_sub == "restore":
                if self._policy_store is None or not self._bridge_data_dir:
                    return policy_cmd_error(
                        "Policy store or data directory not configured."
                    )
                return await self._handle_policy_restore_intent(
                    http_client, cmd, sender, room_id, event_id,
                    action_summary=summary, normalized=normalized,
                )

        # Everything the deferred call needs is captured here so the callback
        # does not depend on `cmd` once this method has returned.
        verb = cmd.verb
        subcmd = cmd.subcommand
        positional = cmd.args
        keyword = dict(cmd.kwargs)

        async def _replay():
            """Run the original handler; returns (reply_text, summary)."""
            if verb == VERB_NODE:
                # Rebuild args_text the same way the direct dispatch does.
                tokens = [subcmd] if subcmd else []
                tokens += list(positional)
                tokens += [f"{k}={v}" for k, v in keyword.items()]
                outcome = await self._handle_node_cmd(
                    http_client, sender, room_id, event_id, " ".join(tokens),
                )
                return outcome, summary
            if verb == VERB_ROOM:
                outcome = await self._handle_room_cmd(
                    http_client, sender, room_id, event_id,
                    subcmd, tuple(positional), keyword,
                )
                return outcome, summary
            if verb == VERB_POLICY:
                outcome = await self._handle_policy_cmd(
                    http_client, sender, room_id, event_id, subcmd, keyword,
                )
                return outcome, summary
            return f"❌ Unknown dangerous verb: {verb}", summary

        nonce = self._confirm_store.add(
            sender_hash=who,
            verb=f"{cmd.verb}.{cmd.subcommand or ''}",
            normalized_args=normalized,
            action_summary=summary,
            room_id=room_id,
            callback=_replay,
        )
        ttl_s = self._confirm_store.ttl_s

        intent_record = {
            "sender_hash": who,
            "verb": cmd.verb,
            "subcommand": cmd.subcommand or "",
            "normalized": normalized,
            "nonce": nonce,
            "expires_in_s": int(ttl_s),
        }
        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.intent",
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
            data=intent_record,
        )

        logger.info(
            "Confirm intent: sender=%s verb=%s/%s nonce=%s ttl=%.0fs",
            sender, cmd.verb, cmd.subcommand, nonce, ttl_s,
        )
        return confirm_intent_reply(summary, nonce, int(ttl_s))

    async def _handle_confirm_cmd(
        self,
        http_client: "httpx.AsyncClient",
        cmd: "ControlCommand",
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        Confirmation leg of the two-step confirm flow (M9.0).

        Looks up the pending entry for (nonce, sender hash), runs its stored
        callback and audits `matrix.control.confirmed` before the callback
        and `matrix.control.applied` after it succeeds.

        Returns the reply text: a usage hint when no code was supplied, the
        expired/invalid reply when the store has no matching entry, an
        "Apply failed" message when the callback raises, otherwise the
        success reply wrapping the callback's own reply text.
        """
        if self._confirm_store is None:
            return "❌ Confirmation store not active."

        # The code normally arrives as the token right after !confirm (parsed
        # as the subcommand); fall back to the first positional argument.
        code = (cmd.subcommand or "").strip().upper()
        if not code:
            if cmd.args:
                code = cmd.args[0].strip().upper()
        if not code:
            return "❌ Usage: `!confirm <code>` — provide the confirmation code."

        who = _sender_hash(sender)
        pending = self._confirm_store.pop(code, who)
        if pending is None:
            logger.info(
                "Confirm rejected: sender=%s nonce=%s (invalid/expired/wrong-sender)",
                sender, code,
            )
            return confirm_expired_reply()

        # Keyword arguments shared by both audit records written below.
        audit_common = dict(
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
        )

        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.confirmed",
            data={
                "sender_hash": who,
                "nonce": code,
                "verb": pending.verb,
                "action_summary": pending.action_summary,
            },
            **audit_common,
        )
        logger.info(
            "Confirm accepted: sender=%s nonce=%s verb=%s",
            sender, code, pending.verb,
        )

        try:
            outcome = await pending.callback()
            applied_reply, diff_summary = outcome
        except Exception as exc:
            logger.exception(
                "Confirm callback failed: sender=%s nonce=%s verb=%s",
                sender, code, pending.verb,
            )
            return f"❌ Apply failed: {exc}"

        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.applied",
            data={
                "sender_hash": who,
                "verb": pending.verb,
                "normalized": pending.normalized_args,
                "diff_summary": diff_summary,
            },
            **audit_common,
        )

        return confirm_success_reply(applied_reply)

    # ── M6.0: Dynamic room-node overrides via !node command ───────────────────

    async def _handle_node_cmd(
        self,
        http_client: httpx.AsyncClient,
        sender: str,
        ctrl_room_id: str,
        event_id: str,
        args_text: str,
    ) -> str:
        """
        Handle `!node <set|unset|get|list>` from an authorized operator.

        `args_text` is everything after the verb; it is parsed by
        parse_node_cmd into (subcommand, room_id, node_id).  Policy-store
        calls run in a worker thread via asyncio.to_thread, and every
        successful operation is audited against the control room
        (`ctrl_room_id`), not the room being configured.

        Returns the reply text for the control room.
        """
        store = self._policy_store
        if store is None or not store.is_open:
            return "⚠️ Policy store not available."

        action, target_room, target_node = parse_node_cmd(args_text)

        recognised = (
            NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST,
        )
        if action not in recognised:
            return node_cmd_reply_error(
                f"Unknown subcommand: `{action or '?'}`"
            )

        async def _audit(event_name: str, payload: Dict[str, Any]) -> None:
            """Write an 'ok' audit record for this control event."""
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event=event_name,
                agent_id="control", node_id=self._node_id,
                room_id=ctrl_room_id, event_id=event_id,
                status="ok", data=payload,
            )

        # ── list (the only subcommand that needs no room) ─────────────────────
        if action == NODE_SUBCMD_LIST:
            try:
                rows = await asyncio.to_thread(store.list_overrides, 10)
                total = await asyncio.to_thread(store.count_overrides)
            except Exception as exc:
                logger.warning("PolicyStore list_overrides error: %s", exc)
                return "⚠️ Could not read policy store."
            await _audit(
                "matrix.control.node.list",
                {"sender": sender, "total": total},
            )
            return node_cmd_reply_list(rows, total)

        # ── remaining subcommands all operate on a specific room ──────────────
        if not target_room:
            return node_cmd_reply_error("Missing `room=` argument.")
        if not node_cmd_validate_room(target_room):
            return node_cmd_reply_error(
                f"Invalid room ID format: `{target_room}`\n"
                "Expected: `!localpart:server`"
            )

        # ── get ───────────────────────────────────────────────────────────────
        if action == NODE_SUBCMD_GET:
            try:
                override = await asyncio.to_thread(store.get_override, target_room)
            except Exception as exc:
                logger.warning("PolicyStore get_override error: %s", exc)
                return "⚠️ Could not read policy store."
            # Static env mapping and default node are included for context.
            policy = self._node_policy
            env_node: Optional[str] = (
                policy.room_node_map.get(target_room) if policy is not None else None
            )
            default = policy.default_node if policy else self._node_id
            await _audit(
                "matrix.control.node.get",
                {"sender": sender, "queried_room": target_room},
            )
            return node_cmd_reply_get(target_room, override, env_node, default)

        # ── unset ─────────────────────────────────────────────────────────────
        if action == NODE_SUBCMD_UNSET:
            try:
                deleted = await asyncio.to_thread(store.delete_override, target_room)
            except Exception as exc:
                logger.warning("PolicyStore delete_override error: %s", exc)
                return "⚠️ Could not write to policy store."
            await _audit(
                "matrix.control.node.unset",
                {"sender": sender, "target_room": target_room, "was_set": deleted},
            )
            if deleted:
                return node_cmd_reply_unset_ok(target_room)
            return node_cmd_reply_unset_not_found(target_room)

        # ── set ───────────────────────────────────────────────────────────────
        if action != NODE_SUBCMD_SET:
            return node_cmd_reply_error("Unhandled subcommand.")

        if not target_node:
            return node_cmd_reply_error("Missing `node=` argument for `set`.")

        if self._node_policy:
            allowed = self._node_policy.allowed_nodes
        else:
            # Without a node policy only this bridge's own node is accepted.
            allowed = frozenset([self._node_id])
        if target_node not in allowed:
            allowed_list = ", ".join(f"`{n}`" for n in sorted(allowed))
            return node_cmd_reply_error(
                f"Node `{target_node}` is not in allowed list: {allowed_list}"
            )

        try:
            await asyncio.to_thread(store.set_override, target_room, target_node, sender)
        except Exception as exc:
            logger.warning("PolicyStore set_override error: %s", exc)
            return "⚠️ Could not write to policy store."
        await _audit(
            "matrix.control.node.set",
            {"sender": sender, "target_room": target_room, "set_node": target_node},
        )
        return node_cmd_reply_set(target_room, target_node)

    # ── M4.1: Bridge status for operators ─────────────────────────────────────

    async def _handle_bridge_status(
        self,
        http_client: httpx.AsyncClient,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        Assemble the bridge health snapshot served by `!status` in a control room.

        The snapshot combines static configuration counts, the queue figures
        reported by get_status(), and (when those components are configured)
        the health dicts of the control limiter and the event store.  The
        request is audited as `matrix.control.bridge_status`.

        Any exception along the way is logged and turned into the
        "status not available" reply instead of propagating.
        """
        try:
            report: Dict[str, Any] = {}
            report["node_id"] = self._node_id
            report["worker_count"] = self._worker_count
            report["room_count"] = len(self._room_map.mappings)

            mixed_cfg = self._mixed_room_config
            report["mixed_room_count"] = len(mixed_cfg.rooms) if mixed_cfg else 0

            ctrl_cfg = self._control_config
            if ctrl_cfg and ctrl_cfg.operator_allowlist:
                report["operators_count"] = len(ctrl_cfg.operator_allowlist)
            else:
                report["operators_count"] = 0

            # Queue figures come from the loop's own status view; "?" marks
            # a value the status dict does not carry.
            loop_status = self.get_status()
            for key in ("queue_size", "queue_max"):
                report[key] = loop_status.get(key, "?")

            limiter = self._control_limiter
            if limiter is not None:
                report["control_safety"] = limiter.as_health_dict()

            dedupe_store = self._event_store
            if dedupe_store is not None:
                report["persistent_dedupe"] = dedupe_store.as_health_dict()

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.bridge_status",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="ok", data={"sender": sender},
            )
            return bridge_status_reply(report)
        except Exception as exc:
            logger.warning("_handle_bridge_status error: %s", exc)
            return status_not_available_reply()

    # ── Control command handler ────────────────────────────────────────────────

    async def _try_control(
        self,
        client: MatrixClient,
        http_client: httpx.AsyncClient,
        event: Dict[str, Any],
        room_id: str,
    ) -> None:
        """
        Process a message from a control room.

        Pipeline: ignore non-commands → mark the event seen → authorization →
        room / operator rate limits → parse → per-command cooldown → audit →
        dispatch → send the reply.

        Non-command messages (empty text, text not recognised by
        is_control_message, or a missing / non-string `content.body`) are
        silently ignored.  Unauthorized attempts, rate-limit rejections and
        every parsed command are audited; text that parse_command cannot
        parse is only logged.

        Reply delivery is best-effort: a failing send_text is logged instead
        of raised, for rate-limit notices as well as for command replies.

        Args:
            client: Matrix client used to mark the event seen and to reply.
            http_client: HTTP client handed to the audit writer and handlers.
            event: Raw Matrix event dict (`event_id`, `sender`, `content`).
            room_id: ID of the control room the event arrived in.
        """
        assert self._control_config is not None
        event_id = event.get("event_id", "")
        sender = event.get("sender", "")

        # Event content comes from the wire; tolerate a missing or non-string
        # body instead of raising AttributeError on `.strip()`.
        content = event.get("content")
        body = content.get("body") if isinstance(content, dict) else None
        text = body.strip() if isinstance(body, str) else ""

        if not text or not is_control_message(text):
            return   # not a command, ignore

        client.mark_seen(event_id)

        # Authorization check
        authorized, rejection_reason = check_authorization(sender, room_id, self._control_config)

        if not authorized:
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.unauthorized",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code=rejection_reason,
                data={"sender": sender, "command_preview": text[:80]},
            )
            logger.warning(
                "Unauthorized control command: sender=%s room=%s reason=%s cmd=%r",
                sender, room_id, rejection_reason, text[:60],
            )
            if self._control_unauthorized_behavior == "reply_error":
                try:
                    txn_id = MatrixClient.make_txn_id(room_id, event_id + "_unauth")
                    await client.send_text(room_id, unauthorized_reply(rejection_reason), txn_id)
                except Exception as exc:
                    logger.warning("Could not send unauthorized reply: %s", exc)
            return

        async def _reject_rate_limited(
            scope: str,
            error_code: str,
            wait_s: float,
            txn_suffix: str,
            details: Dict[str, Any],
        ) -> None:
            """Audit a rate-limit rejection, bump the metric and notify the room."""
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.rate_limited",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code=error_code,
                data={"sender": sender, "scope": scope, **details},
            )
            if self._on_control_rate_limited:
                self._on_control_rate_limited(scope)
            # Guarded like every other reply in this method: a Matrix send
            # failure must not escape the control handler.
            try:
                notice_txn = MatrixClient.make_txn_id(room_id, event_id + txn_suffix)
                await client.send_text(room_id, rate_limited_reply(scope, wait_s), notice_txn)
            except Exception as exc:  # noqa: BLE001
                logger.warning("Could not send rate-limit reply: %s", exc)

        # M3.4: Rate limiting (after auth, before parse/dispatch)
        sender_hash_ctrl = ""
        if self._control_limiter is not None:
            # Computed once; reused by the cooldown check further down.
            sender_hash_ctrl = _sender_hash(sender)

            allowed_room, retry_room = self._control_limiter.check_room(room_id)
            if not allowed_room:
                logger.info(
                    "Control rate limited: scope=%s room=%s sender=%s",
                    "room", room_id, sender,
                )
                await _reject_rate_limited(
                    "room", "rate_limited_room", retry_room, "_rl",
                    {"retry_after_s": retry_room},
                )
                return

            allowed_op, retry_op = self._control_limiter.check_operator(sender_hash_ctrl)
            if not allowed_op:
                logger.info("Control rate limited: scope=%s sender=%s", "operator", sender)
                await _reject_rate_limited(
                    "operator", "rate_limited_operator", retry_op, "_rl",
                    {"retry_after_s": retry_op},
                )
                return

        # Parse command
        cmd = parse_command(text)
        if cmd is None:
            logger.warning("Control message from %s could not be parsed: %r", sender, text[:60])
            return

        # M3.4: Cooldown check (anti-double-click, per operator+verb+subcommand)
        if self._control_limiter is not None:
            allowed_cd, wait_cd = self._control_limiter.check_cooldown(
                sender_hash_ctrl, cmd.verb, cmd.subcommand or "",
            )
            if not allowed_cd:
                logger.info(
                    "Control cooldown: sender=%s verb=%s sub=%s wait=%.1fs",
                    sender, cmd.verb, cmd.subcommand, wait_cd,
                )
                await _reject_rate_limited(
                    "cooldown", "cooldown", wait_cd, "_cd",
                    {
                        "verb": cmd.verb, "subcommand": cmd.subcommand,
                        "wait_s": wait_cd,
                    },
                )
                return

        # Metric callback
        if self._on_control_command:
            self._on_control_command(sender, cmd.verb, cmd.subcommand)

        # Audit every authorized command
        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.command",
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
            data={
                "sender": sender,
                "verb": cmd.verb,
                "subcommand": cmd.subcommand,
                "args": list(cmd.args),
                "kwargs": dict(cmd.kwargs),
                "is_known": cmd.is_known,
            },
        )

        logger.info(
            "Control command: sender=%s verb=%s sub=%s args=%s",
            sender, cmd.verb, cmd.subcommand, cmd.args,
        )

        # Dispatch command
        txn_id = MatrixClient.make_txn_id(room_id, event_id + "_ctrl")

        # M9.0: Dangerous commands → two-step confirmation (intent leg)
        if (
            self._confirm_store is not None
            and is_dangerous_cmd(cmd)
            and cmd.verb != VERB_CONFIRM
        ):
            reply_text = await self._handle_dangerous_intent(
                http_client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_CONFIRM:
            # M9.0: !confirm <nonce> (second leg).  The handler itself
            # reports a disabled confirmation store, so no separate
            # fallback branch is needed here.
            reply_text = await self._handle_confirm_cmd(
                http_client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_HELP:
            reply_text = help_reply()
        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_START:
            reply_text = await self._handle_runbook_start(
                http_client, client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_NEXT:
            # M3.2: advance to next step
            reply_text = await self._handle_runbook_next(
                http_client, client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_COMPLETE:
            # M3.2: mark manual step complete
            reply_text = await self._handle_runbook_complete(
                http_client, client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_STATUS:
            # M3.3: show run status
            reply_text = await self._handle_runbook_status(
                http_client, client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_EVIDENCE:
            # M3.3: generate release evidence
            reply_text = await self._handle_runbook_evidence(
                http_client, client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_POST_REVIEW:
            # M3.3: generate post-release review
            reply_text = await self._handle_runbook_post_review(
                http_client, client, cmd, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_STATUS:
            # M4.1: bridge health snapshot for operators
            reply_text = await self._handle_bridge_status(
                http_client, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_NODES:
            # M5.1: node policy overview for operators
            reply_text = await self._handle_nodes(
                http_client, sender, room_id, event_id,
            )
        elif cmd.verb == VERB_NODE:
            # M6.0: dynamic room-node override commands
            # Reconstruct args_text from parsed command parts
            _node_args_parts = []
            if cmd.subcommand:
                _node_args_parts.append(cmd.subcommand)
            _node_args_parts.extend(cmd.args)
            _node_args_parts.extend(f"{k}={v}" for k, v in cmd.kwargs.items())
            _node_args_text = " ".join(_node_args_parts)
            reply_text = await self._handle_node_cmd(
                http_client, sender, room_id, event_id, _node_args_text,
            )
        elif cmd.verb == VERB_ROOM:
            # M6.1: dynamic mixed room agent overrides
            reply_text = await self._handle_room_cmd(
                http_client, sender, room_id, event_id,
                cmd.subcommand, cmd.args, cmd.kwargs,
            )
        elif cmd.verb == VERB_POLICY:
            # M6.2: policy snapshot export/import
            reply_text = await self._handle_policy_cmd(
                http_client, sender, room_id, event_id,
                cmd.subcommand, cmd.kwargs,
            )
        elif not cmd.is_known:
            reply_text = unknown_command_reply(cmd)
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.unknown_cmd",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="unknown_verb",
                data={"verb": cmd.verb, "sender": sender},
            )
        else:
            reply_text = not_implemented_reply(cmd)

        try:
            await client.send_text(room_id, reply_text, txn_id)
        except Exception as exc:
            logger.error("Could not send control reply: %s", exc)

    async def _handle_runbook_start(
        self,
        http_client: httpx.AsyncClient,
        client: "MatrixClient",
        cmd: ControlCommand,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        M3.1: Execute !runbook start <path> [node=NODA1].

        The first positional argument is the runbook path; the target node
        comes from the `node=` keyword and defaults to "NODA1".  The path is
        validated locally before the run is started through the control
        runner (sofiia-console POST /api/runbooks/internal/runs).

        Every outcome is audited as `matrix.control.runbook.start`: invalid
        path, runner error, or success.  Returns the reply text for the
        control room.  Only RunnerError is handled here; `client` is not used.
        """
        path = cmd.args[0].strip() if cmd.args else ""
        target_node = cmd.kwargs.get("node", "NODA1").strip()

        async def _audit(status: str, data: Dict[str, Any], **extra: Any) -> None:
            """Write one `matrix.control.runbook.start` audit record."""
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.runbook.start",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status=status, data=data, **extra,
            )

        # Reject bad paths locally so the console is never called with them.
        invalid = _ctrl_runner.validate_runbook_path(path)
        if invalid:
            logger.warning(
                "!runbook start invalid path: sender=%s path=%r error=%s",
                sender, path, invalid,
            )
            await _audit(
                "error",
                {"sender": sender, "runbook_path": path, "error": invalid},
                error_code="invalid_path",
            )
            return start_usage_reply()

        # Stays None unless the console call has succeeded.
        http_status: Optional[int] = None
        try:
            started = await _ctrl_runner.start_runbook_run(
                http_client=http_client,
                console_url=self._console_url,
                control_token=self._control_token,
                runbook_path=path,
                operator_id=sender,
                node_id=target_node,
            )
            run_id = started.get("run_id", "")
            total_steps = started.get("steps_total", 0)
            run_state = started.get("status", "running")
            http_status = 200

            logger.info(
                "Runbook started: run_id=%s path=%s node=%s steps=%d by sender=%s",
                run_id, path, target_node, total_steps, sender,
            )
            await _audit(
                "ok",
                {
                    "sender": sender,
                    "runbook_path": path,
                    "node_id": target_node,
                    "run_id": run_id,
                    "steps_total": total_steps,
                    "http_status": http_status,
                },
            )
            return runbook_started_reply(run_id, total_steps, run_state)

        except _ctrl_runner.RunnerError as exc:
            failure = str(exc)
            logger.error(
                "!runbook start failed: sender=%s path=%r node=%s error=%s",
                sender, path, target_node, failure,
            )
            await _audit(
                "error",
                {
                    "sender": sender,
                    "runbook_path": path,
                    "node_id": target_node,
                    "error": failure,
                    "http_status": http_status,
                },
                error_code="runner_error",
            )
            return runbook_start_error_reply(failure)

    async def _handle_runbook_next(
        self,
        http_client: httpx.AsyncClient,
        client: "MatrixClient",
        cmd: ControlCommand,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        M3.2: Execute !runbook next <run_id>.

        Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/next
        through the control runner and audits the outcome as
        `matrix.control.runbook.next`.

        Returns the reply text for the control room:
          * usage hint when no run_id was given;
          * rate-limited reply when the per-run limiter rejects the call
            (metric callback only, no audit record);
          * manual-step reply when the step type is "manual";
          * automatic-step reply for any other step type;
          * error reply when the runner raises RunnerError.

        A missing, null or non-numeric `result.duration_ms` in the console
        response is treated as "duration unknown" rather than raising: by
        that point the step has already run and been audited, so the
        operator must still get a reply.
        """
        run_id = cmd.args[0].strip() if cmd.args else ""
        if not run_id:
            return next_usage_reply()

        # M3.4: per-run rate limit for !runbook next
        if self._control_limiter is not None:
            allowed_run, retry_run = self._control_limiter.check_run_next(run_id)
            if not allowed_run:
                scope = "run"
                if self._on_control_rate_limited:
                    self._on_control_rate_limited(scope)
                return rate_limited_reply(scope, retry_run)

        # Stays None unless the console call has succeeded.
        http_status: Optional[int] = None
        try:
            result = await _ctrl_runner.next_runbook_step(
                http_client=http_client,
                console_url=self._console_url,
                control_token=self._control_token,
                run_id=run_id,
                operator_id=sender,
            )
            http_status = 200
            step_type = result.get("type", "unknown")
            step_index = result.get("step_index", 0)

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.runbook.next",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="ok",
                data={
                    "sender": sender,
                    "run_id": run_id,
                    "step_index": step_index,
                    "step_type": step_type,
                    "http_status": http_status,
                },
            )

            if step_type == "manual":
                return next_manual_reply(
                    run_id=run_id,
                    step_index=step_index,
                    steps_total=result.get("steps_total"),
                    title=result.get("title", ""),
                    instructions_md=result.get("instructions_md", ""),
                )

            # http_check / script
            step_result = result.get("result") or {}
            duration_ms: Optional[int] = None
            if isinstance(step_result, dict):
                try:
                    # `or 0` absorbs an explicit null; a zero duration is
                    # reported as None ("unknown"), as before.
                    duration_ms = int(step_result.get("duration_ms") or 0) or None
                except (TypeError, ValueError):
                    duration_ms = None
            return next_auto_reply(
                run_id=run_id,
                step_index=step_index,
                action_type=step_type,
                step_status=result.get("step_status", "ok"),
                duration_ms=duration_ms,
                completed=bool(result.get("completed", False)),
            )

        except _ctrl_runner.RunnerError as exc:
            reason = str(exc)
            logger.error(
                "!runbook next failed: sender=%s run_id=%r error=%s",
                sender, run_id, reason,
            )
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.runbook.next",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="runner_error",
                data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
            )
            return next_error_reply(run_id, reason)

    async def _handle_runbook_complete(
        self,
        http_client: httpx.AsyncClient,
        client: "MatrixClient",
        cmd: ControlCommand,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        M3.2: Execute !runbook complete <run_id> step=<n> status=ok|warn|fail|skipped [notes=...]

        Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/steps/{n}/complete.
        Audits matrix.control.runbook.complete (status "ok" or "error").

        Validation (any failure returns the usage reply without calling the console):
          - first positional arg is the run id and must be non-empty
          - ``step`` kwarg must be a non-negative decimal integer
          - ``status`` kwarg must be one of ok / warn / fail / skipped

        Notes come from the ``notes`` kwarg, or failing that from the remaining
        positional args joined with spaces, and are passed through sanitize_notes.

        Returns the reply text for the control room. A RunnerError is logged,
        audited and converted into an error reply rather than propagated.
        """
        run_id = cmd.args[0].strip() if cmd.args else ""
        if not run_id:
            return complete_usage_reply()

        # step kwarg required.
        # isdecimal() rather than isdigit(): isdigit() also accepts characters such
        # as "²" that int() cannot parse, which would raise ValueError below.
        step_raw = cmd.kwargs.get("step", "").strip()
        if not step_raw or not step_raw.isdecimal():
            return complete_usage_reply()
        step_index = int(step_raw)

        # status kwarg required
        status = cmd.kwargs.get("status", "").strip().lower()
        if status not in ("ok", "warn", "fail", "skipped"):
            return complete_usage_reply()

        # notes: kwarg or remaining positional args (joined with space)
        notes = cmd.kwargs.get("notes", "").strip()
        if not notes and len(cmd.args) > 1:
            notes = " ".join(cmd.args[1:])
        notes = sanitize_notes(notes)  # M3.4: strip control chars + truncate to MAX_NOTES_LEN

        # Recorded in the audit payload; remains None if the runner call raises.
        http_status: Optional[int] = None
        try:
            result = await _ctrl_runner.complete_runbook_step(
                http_client=http_client,
                console_url=self._console_url,
                control_token=self._control_token,
                run_id=run_id,
                step_index=step_index,
                status=status,
                notes=notes,
                operator_id=sender,
            )
            http_status = 200
            run_completed = bool(result.get("run_completed", False))

            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.runbook.complete",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="ok",
                data={
                    "sender": sender,
                    "run_id": run_id,
                    "step_index": step_index,
                    "status": status,
                    "run_completed": run_completed,
                    "http_status": http_status,
                },
            )
            return complete_ok_reply(run_id, step_index, status, run_completed)

        except _ctrl_runner.RunnerError as exc:
            reason = str(exc)
            logger.error(
                "!runbook complete failed: sender=%s run_id=%r step=%d error=%s",
                sender, run_id, step_index, reason,
            )
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.control.runbook.complete",
                agent_id="control", node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="runner_error",
                data={
                    "sender": sender,
                    "run_id": run_id,
                    "step_index": step_index,
                    "error": reason,
                    "http_status": http_status,
                },
            )
            return complete_error_reply(run_id, reason)

    async def _handle_runbook_status(
        self,
        http_client: httpx.AsyncClient,
        client: "MatrixClient",
        cmd: ControlCommand,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        M3.3: Handle ``!runbook status <run_id>``.

        Fetches the run via ``_ctrl_runner.get_runbook_run`` and returns the
        formatted status reply. Without a run id the usage reply is returned and
        nothing is fetched. Both outcomes are audited as
        ``matrix.control.runbook.status``; a ``RunnerError`` is logged and turned
        into an error reply instead of propagating.
        """
        run_id = ""
        if cmd.args:
            run_id = cmd.args[0].strip()
        if not run_id:
            return status_usage_reply()

        http_status: Optional[int] = None  # None until the runner call has succeeded
        try:
            run_info = await _ctrl_runner.get_runbook_run(
                http_client=http_client,
                console_url=self._console_url,
                control_token=self._control_token,
                run_id=run_id,
            )
            http_status = 200
            ok_data = {
                "sender": sender,
                "run_id": run_id,
                "run_status": run_info.get("status"),
                "http_status": http_status,
            }
            await _write_audit(
                http_client,
                self._console_url,
                self._internal_token,
                event="matrix.control.runbook.status",
                agent_id="control",
                node_id=self._node_id,
                room_id=room_id,
                event_id=event_id,
                status="ok",
                data=ok_data,
            )
            return status_reply(run_info)
        except _ctrl_runner.RunnerError as err:
            failure = str(err)
            logger.error(
                "!runbook status failed: sender=%s run_id=%r error=%s",
                sender, run_id, failure,
            )
            error_data = {
                "sender": sender,
                "run_id": run_id,
                "error": failure,
                "http_status": http_status,
            }
            await _write_audit(
                http_client,
                self._console_url,
                self._internal_token,
                event="matrix.control.runbook.status",
                agent_id="control",
                node_id=self._node_id,
                room_id=room_id,
                event_id=event_id,
                status="error",
                error_code="runner_error",
                data=error_data,
            )
            return status_error_reply(run_id, failure)

    async def _handle_runbook_evidence(
        self,
        http_client: httpx.AsyncClient,
        client: "MatrixClient",
        cmd: ControlCommand,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        M3.3: Handle ``!runbook evidence <run_id>``.

        Asks ``_ctrl_runner.generate_evidence`` to produce release evidence for
        the run and returns the formatted reply. A missing run id yields the
        usage reply. Success and failure are both audited as
        ``matrix.control.runbook.evidence``; a ``RunnerError`` is logged and
        reported back as an error reply.
        """
        run_id = ""
        if cmd.args:
            run_id = cmd.args[0].strip()
        if not run_id:
            return evidence_usage_reply()

        http_status: Optional[int] = None  # None until the runner call has succeeded
        try:
            evidence = await _ctrl_runner.generate_evidence(
                http_client=http_client,
                console_url=self._console_url,
                control_token=self._control_token,
                run_id=run_id,
            )
            http_status = 200
            ok_data = {
                "sender": sender,
                "run_id": run_id,
                "evidence_path": evidence.get("evidence_path"),
                "bytes": evidence.get("bytes"),
                "http_status": http_status,
            }
            await _write_audit(
                http_client,
                self._console_url,
                self._internal_token,
                event="matrix.control.runbook.evidence",
                agent_id="control",
                node_id=self._node_id,
                room_id=room_id,
                event_id=event_id,
                status="ok",
                data=ok_data,
            )
            return evidence_reply(evidence)
        except _ctrl_runner.RunnerError as err:
            failure = str(err)
            logger.error(
                "!runbook evidence failed: sender=%s run_id=%r error=%s",
                sender, run_id, failure,
            )
            error_data = {
                "sender": sender,
                "run_id": run_id,
                "error": failure,
                "http_status": http_status,
            }
            await _write_audit(
                http_client,
                self._console_url,
                self._internal_token,
                event="matrix.control.runbook.evidence",
                agent_id="control",
                node_id=self._node_id,
                room_id=room_id,
                event_id=event_id,
                status="error",
                error_code="runner_error",
                data=error_data,
            )
            return evidence_error_reply(run_id, failure)

    async def _handle_runbook_post_review(
        self,
        http_client: httpx.AsyncClient,
        client: "MatrixClient",
        cmd: ControlCommand,
        sender: str,
        room_id: str,
        event_id: str,
    ) -> str:
        """
        M3.3: Handle ``!runbook post_review <run_id>``.

        Asks ``_ctrl_runner.generate_post_review`` to produce the post-release
        review for the run and returns the formatted reply. A missing run id
        yields the usage reply. Success and failure are both audited as
        ``matrix.control.runbook.post_review``; a ``RunnerError`` is logged and
        reported back as an error reply.
        """
        run_id = ""
        if cmd.args:
            run_id = cmd.args[0].strip()
        if not run_id:
            return post_review_usage_reply()

        http_status: Optional[int] = None  # None until the runner call has succeeded
        try:
            review = await _ctrl_runner.generate_post_review(
                http_client=http_client,
                console_url=self._console_url,
                control_token=self._control_token,
                run_id=run_id,
            )
            http_status = 200
            ok_data = {
                "sender": sender,
                "run_id": run_id,
                "path": review.get("path"),
                "bytes": review.get("bytes"),
                "http_status": http_status,
            }
            await _write_audit(
                http_client,
                self._console_url,
                self._internal_token,
                event="matrix.control.runbook.post_review",
                agent_id="control",
                node_id=self._node_id,
                room_id=room_id,
                event_id=event_id,
                status="ok",
                data=ok_data,
            )
            return post_review_reply(review)
        except _ctrl_runner.RunnerError as err:
            failure = str(err)
            logger.error(
                "!runbook post_review failed: sender=%s run_id=%r error=%s",
                sender, run_id, failure,
            )
            error_data = {
                "sender": sender,
                "run_id": run_id,
                "error": failure,
                "http_status": http_status,
            }
            await _write_audit(
                http_client,
                self._console_url,
                self._internal_token,
                event="matrix.control.runbook.post_review",
                agent_id="control",
                node_id=self._node_id,
                room_id=room_id,
                event_id=event_id,
                status="error",
                error_code="runner_error",
                data=error_data,
            )
            return post_review_error_reply(run_id, failure)

    # ── Worker ─────────────────────────────────────────────────────────────────

    async def _worker(
        self,
        queue: "asyncio.Queue[Optional[_QueueEntry]]",
        client: MatrixClient,
        http_client: httpx.AsyncClient,
    ) -> None:
        """
        Consume queue entries until cancelled.

        Each entry is handed to ``_process_entry``. Any ``Exception`` it raises is
        logged (with traceback) and swallowed so one bad message does not stop
        the worker. ``queue.task_done()`` is always called for a dequeued entry,
        and the current queue size is reported via ``_on_queue_size`` when set.
        """
        while True:
            entry = await queue.get()   # blocks until item available; raises CancelledError on cancel
            try:
                await self._process_entry(client, http_client, entry)
            except Exception as exc:
                # exc_info keeps the traceback; the message alone is rarely enough
                # to locate a failure raised deep inside entry processing.
                logger.error("Worker unhandled error: %s", exc, exc_info=True)
            finally:
                # Always balance queue.get() so queue.join() can complete.
                queue.task_done()
                if self._on_queue_size:
                    self._on_queue_size(queue.qsize())

    # ── Process (invoke + send + audit) ───────────────────────────────────────

    async def _process_entry(
        self,
        client: MatrixClient,
        http_client: httpx.AsyncClient,
        entry: _QueueEntry,
    ) -> None:
        """
        Process one dequeued message: audit receipt, resolve the target node,
        then invoke the Router and send the reply via ``_invoke_and_send``.

        Steps, in order:
          1. Report queue wait latency (``_on_queue_wait``) and audit
             ``matrix.message.received``.
          2. Build the session key and sender hash (Session Scope v2).
          3. Resolve the node: optional ``node=X`` kwarg (mixed rooms only),
             optional policy-store override, then ``_node_policy.resolve``.
             A rejected node kwarg triggers a rejection reply and a
             ``matrix.route.node_rejected`` audit, but processing continues
             with the resolved node.
          4. Audit ``matrix.route.node_selected``.
          5. Call ``_invoke_and_send``, holding the per-room/agent concurrency
             lock when the room is mixed and the cap is enabled.

        Args:
            client: Matrix client used to send replies.
            http_client: HTTP client passed to audit writes and the Router invoke.
            entry: Queue entry carrying the event, room, agent and routing metadata.
        """
        event = entry.event
        event_id = event.get("event_id", "")
        sender = event.get("sender", "")
        text = event.get("content", {}).get("body", "").strip()
        room_id = entry.room_id
        agent_id = entry.agent_id

        # H3: Queue wait latency
        wait_s = time.monotonic() - entry.enqueue_time
        if self._on_queue_wait:
            self._on_queue_wait(agent_id, wait_s)

        routing_reason = entry.routing_reason
        is_mixed = entry.is_mixed

        logger.info(
            "Processing: room=%s agent=%s event=%s len=%d wait=%.3fs mixed=%s reason=%s",
            room_id, agent_id, event_id, len(text), wait_s, is_mixed, routing_reason,
        )

        if self._on_message_received:
            self._on_message_received(room_id, agent_id)

        # Only the text length is audited here, not the message body.
        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.message.received",
            agent_id=agent_id, node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
            data={
                "sender": sender,
                "text_len": len(text),
                "queue_wait_ms": int(wait_s * 1000),
                "routing_reason": routing_reason,
                "is_mixed": is_mixed,
            },
        )

        # Session Scope v2: canonical key + PII-safe sender hash.
        # Scope is always room_agent for user-initiated messages.
        # Control room messages never reach _process_entry (handled by _try_control).
        scope = SCOPE_ROOM_AGENT
        session_id = _build_session_key(room_id, agent_id, scope=scope)
        sender_hash = _sender_hash(sender)

        logger.debug(
            "Session scope v2: session_key=%s scope=%s sender_hash=%s",
            session_id, scope, sender_hash,
        )

        # M5.0: Node-aware routing
        # Extract node=X kwarg from body (mixed rooms only, to avoid breaking direct rooms)
        explicit_node: Optional[str] = None
        effective_text = text
        if is_mixed and self._node_policy is not None:
            explicit_node, effective_text = extract_node_kwarg(text)

        # M6.0: look up dynamic policy store override for this room
        store_override: Optional[str] = None
        if self._policy_store is not None and self._policy_store.is_open:
            try:
                # Run the store lookup in a worker thread via asyncio.to_thread.
                store_override = await asyncio.to_thread(
                    self._policy_store.get_override, room_id
                )
            except Exception as exc:  # noqa: BLE001
                # Best-effort: a failed lookup just means no override is applied.
                logger.warning("PolicyStore get_override failed: %s", exc)

        # Without a node policy, fall back to this bridge's own node id.
        node_res = (
            self._node_policy.resolve(room_id, explicit_node, store_override=store_override)
            if self._node_policy is not None
            else NodeResolution(node_id=self._node_id, source=NODE_SOURCE_DEFAULT)
        )

        if node_res.rejected_node:
            logger.info(
                "Node kwarg rejected: requested=%s allowed=%s room=%s agent=%s",
                node_res.rejected_node, self._node_policy.allowed_nodes if self._node_policy else {}, room_id, agent_id,
            )
            if self._on_node_rejected:
                self._on_node_rejected(node_res.rejected_node)
            # Distinct txn id suffix so the rejection notice does not collide with
            # the txn id used for the agent reply to the same event.
            txn_rej = MatrixClient.make_txn_id(room_id, event_id + "_node_rej")
            allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset()
            reply_rej = node_rejected_reply(node_res.rejected_node, allowed)
            try:
                await client.send_text(room_id, reply_rej, txn_rej)
            except Exception as exc:
                # The rejection notice is informational; failure to send it is not fatal.
                logger.warning("Could not send node rejection reply: %s", exc)
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.route.node_rejected",
                agent_id=agent_id, node_id=self._node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="node_rejected",
                data={"requested_node": node_res.rejected_node, "resolved_node": node_res.node_id},
            )
            # Continue with fallback node (do not drop the message)

        if self._on_node_selected:
            self._on_node_selected(agent_id, node_res.node_id, node_res.source)

        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.route.node_selected",
            agent_id=agent_id, node_id=node_res.node_id,
            room_id=room_id, event_id=event_id,
            status="ok",
            data={"node_id": node_res.node_id, "source": node_res.source},
        )

        # M2.2: per-room-agent concurrency cap (only for mixed rooms; single-agent rooms unaffected)
        _lock = self._get_concurrency_lock(room_id, agent_id) if is_mixed and self._mixed_concurrency_cap > 0 else None
        if _lock is not None:
            await _lock.acquire()
        try:
            await self._invoke_and_send(
                client, http_client, entry, session_id, wait_s, is_mixed, routing_reason,
                sender_hash=sender_hash, scope=scope,
                effective_node_id=node_res.node_id, node_source=node_res.source,
                effective_text=effective_text,
            )
        finally:
            # Release even if invoke/send raised, so later messages are not blocked.
            if _lock is not None:
                _lock.release()

    async def _invoke_and_send(
        self,
        client: MatrixClient,
        http_client: httpx.AsyncClient,
        entry: _QueueEntry,
        session_id: str,
        wait_s: float,
        is_mixed: bool,
        routing_reason: str,
        sender_hash: str = "",
        scope: str = SCOPE_ROOM_AGENT,
        # M5.0: resolved node
        effective_node_id: Optional[str] = None,
        node_source: str = NODE_SOURCE_DEFAULT,
        effective_text: Optional[str] = None,   # text with node=X kwarg stripped
    ) -> None:
        """
        Inner: invoke Router + send reply (separated for concurrency lock wrapping).

        Routing order:
          * If a sticky node is cached for this room/agent (and routing is not
            explicit), invoke it directly and skip the primary. If it fails with
            a failover-eligible error the sticky entry is cleared and no reply
            is sent for this message.
          * Otherwise invoke the primary node. On a failover-eligible error
            (connect error, timeout, HTTP 5xx) with non-explicit routing, try
            one fallback node; on success, fire the failover callback, audit
            ``matrix.node.failover`` and set the sticky entry.

        If no non-empty reply was obtained the method returns without sending.
        Otherwise the reply is prefixed (mixed rooms), truncated so the final
        message fits in ``_REPLY_TEXT_MAX``, sent, and audited as
        ``matrix.agent.replied``; a send failure is audited as ``matrix.error``.

        Args:
            client: Matrix client used to send the reply.
            http_client: HTTP client for the Router invoke and audit writes.
            entry: Queue entry with the event, room id and agent id.
            session_id: Session key passed to the Router.
            wait_s: Queue wait in seconds (recorded in the reply audit).
            is_mixed: Whether the room is a mixed (multi-agent) room.
            routing_reason: Routing reason recorded in the reply audit.
            sender_hash: Sender hash passed to the Router.
            scope: Session scope passed to the Router.
            effective_node_id: Resolved node; defaults to this bridge's node id.
            node_source: How the node was chosen; explicit routing disables
                sticky lookup and failover.
            effective_text: Prompt text to use instead of the event body.
        """
        event = entry.event
        event_id = event.get("event_id", "")
        # Use effective_text if provided (node kwarg stripped), otherwise original body
        text = effective_text if effective_text is not None else event.get("content", {}).get("body", "").strip()
        room_id = entry.room_id
        agent_id = entry.agent_id
        node_id = effective_node_id if effective_node_id is not None else self._node_id

        # H3 + M8.0 + M8.1: Invoke with latency tracking, soft-failover, and sticky routing
        reply_text: Optional[str] = None
        invoke_ok = False
        invoke_duration_s = 0.0
        used_node_id = node_id   # may change on failover

        # M8.1: check sticky cache (skip primary if sticky is set for this room:agent)
        sticky_key = make_sticky_key(room_id, agent_id)
        sticky_node: Optional[str] = None
        if node_source != NODE_SOURCE_EXPLICIT and self._sticky_cache is not None:
            sticky_node = self._sticky_cache.get(sticky_key)

        async def _do_invoke(target_node: str, target_source: str) -> Optional[str]:
            """
            Single invoke attempt; returns reply text or None on failure.

            Re-raises failover-eligible errors (HTTP 5xx, connect error, timeout)
            after logging and auditing them. HTTP 4xx and unexpected errors are
            logged, audited and reported as None.
            """
            nonlocal invoke_duration_s
            _t = time.monotonic()
            try:
                result = await _invoke_router(
                    http_client, self._router_url,
                    agent_id=agent_id, node_id=target_node,
                    prompt=text, session_id=session_id,
                    sender_hash=sender_hash, scope=scope,
                    node_source=target_source,
                )
                invoke_duration_s = time.monotonic() - _t
                if self._node_health_tracker is not None:
                    self._node_health_tracker.record_ok(target_node, invoke_duration_s)
                if self._on_invoke_latency:
                    self._on_invoke_latency(agent_id, invoke_duration_s, target_node)
                logger.info(
                    "Invoke ok: agent=%s node=%s event=%s reply_len=%d duration=%dms",
                    agent_id, target_node, event_id, len(result or ""), int(invoke_duration_s * 1000),
                )
                return result
            except httpx.HTTPStatusError as exc:
                invoke_duration_s = time.monotonic() - _t
                _reason = FAILOVER_REASON_HTTP_5XX if exc.response.status_code >= 500 else "http_4xx"
                logger.error(
                    "Router HTTP %d agent=%s node=%s event=%s duration=%dms",
                    exc.response.status_code, agent_id, target_node, event_id,
                    int(invoke_duration_s * 1000),
                )
                # Only 5xx counts against node health; 4xx is a request problem.
                if self._node_health_tracker and exc.response.status_code >= 500:
                    self._node_health_tracker.record_error(target_node, _reason)
                if self._on_gateway_error:
                    self._on_gateway_error(f"http_{exc.response.status_code}")
                await _write_audit(
                    http_client, self._console_url, self._internal_token,
                    event="matrix.error", agent_id=agent_id, node_id=target_node,
                    room_id=room_id, event_id=event_id,
                    status="error", error_code=f"router_http_{exc.response.status_code}",
                    duration_ms=int(invoke_duration_s * 1000),
                )
                if exc.response.status_code >= 500:
                    raise  # eligible for failover
                return None  # 4xx: not a node issue, don't failover

            except (httpx.ConnectError, httpx.TimeoutException) as exc:
                invoke_duration_s = time.monotonic() - _t
                _reason = (
                    FAILOVER_REASON_TIMEOUT
                    if isinstance(exc, httpx.TimeoutException)
                    else FAILOVER_REASON_NETWORK
                )
                logger.error(
                    "Router network error agent=%s node=%s event=%s: %s",
                    agent_id, target_node, event_id, exc,
                )
                if self._node_health_tracker:
                    self._node_health_tracker.record_error(target_node, _reason)
                if self._on_gateway_error:
                    self._on_gateway_error("network_error")
                await _write_audit(
                    http_client, self._console_url, self._internal_token,
                    event="matrix.error", agent_id=agent_id, node_id=target_node,
                    room_id=room_id, event_id=event_id,
                    status="error", error_code="router_network_error",
                    duration_ms=int(invoke_duration_s * 1000),
                )
                raise  # eligible for failover

            except Exception as exc:
                invoke_duration_s = time.monotonic() - _t
                logger.error(
                    "Unexpected invoke error agent=%s node=%s event=%s: %s",
                    agent_id, target_node, event_id, exc,
                )
                if self._node_health_tracker:
                    self._node_health_tracker.record_error(target_node, "unexpected")
                if self._on_gateway_error:
                    self._on_gateway_error("unexpected")
                await _write_audit(
                    http_client, self._console_url, self._internal_token,
                    event="matrix.error", agent_id=agent_id, node_id=target_node,
                    room_id=room_id, event_id=event_id,
                    status="error", error_code="router_unexpected",
                    duration_ms=int(invoke_duration_s * 1000),
                )
                return None  # unexpected errors: no failover (could be code bug)

        if sticky_node is not None:
            # M8.1: sticky path — route directly to known-good fallback, skip primary
            logger.info(
                "Sticky: routing %s→%s (skipping primary=%s) agent=%s event=%s",
                sticky_key, sticky_node, node_id, agent_id, event_id,
            )
            try:
                reply_text = await _do_invoke(sticky_node, NODE_SOURCE_DEFAULT)
                invoke_ok = reply_text is not None
                used_node_id = sticky_node
            except Exception:  # noqa: BLE001
                # Sticky node also failed — clear sticky and leave reply_text=None
                self._sticky_cache.delete(sticky_key)  # type: ignore[union-attr]
                logger.warning(
                    "Sticky node %s failed for %s — cleared (agent=%s event=%s)",
                    sticky_node, sticky_key, agent_id, event_id,
                )
                # M8.2: remove from DB as well
                await self._sticky_persist_delete(sticky_key)

        else:
            # Normal path: try primary; attempt failover on eligible errors
            try:
                reply_text = await _do_invoke(node_id, node_source)
                invoke_ok = reply_text is not None
                used_node_id = node_id

            except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError):
                # Primary failed with a failover-eligible error.
                # Failover only for non-explicit routing (explicit = user chose node).
                if node_source == NODE_SOURCE_EXPLICIT:
                    logger.info(
                        "Node %s failed for explicit routing — no failover (agent=%s event=%s)",
                        node_id, agent_id, event_id,
                    )
                    # reply_text stays None; error already audited
                else:
                    # Attempt failover
                    fallback_node: Optional[str] = None
                    if self._node_health_tracker is not None and self._node_policy is not None:
                        fallback_node = self._node_health_tracker.pick_fallback(
                            node_id, self._node_policy.allowed_nodes
                        )
                    elif self._node_policy is not None:
                        # No tracker — pick any other allowed node deterministically
                        others = sorted(
                            n for n in self._node_policy.allowed_nodes if n != node_id
                        )
                        fallback_node = others[0] if others else None

                    if fallback_node:
                        logger.warning(
                            "Failover: %s → %s agent=%s event=%s",
                            node_id, fallback_node, agent_id, event_id,
                        )
                        try:
                            reply_text = await _do_invoke(fallback_node, NODE_SOURCE_DEFAULT)
                        except Exception:  # noqa: BLE001
                            # Fallback failed too; already logged and audited inside _do_invoke.
                            reply_text = None
                        invoke_ok = reply_text is not None
                        used_node_id = fallback_node

                        if invoke_ok:
                            # Bookkeeping is best-effort: a failure here must not
                            # prevent the reply from being sent, but it is logged
                            # instead of being silently swallowed.
                            try:
                                # Fire failover callback and audit
                                if self._on_failover:
                                    self._on_failover(node_id, fallback_node, "invoke_error")
                                await _write_audit(
                                    http_client, self._console_url, self._internal_token,
                                    event="matrix.node.failover",
                                    agent_id=agent_id, node_id=fallback_node,
                                    room_id=room_id, event_id=event_id,
                                    status="ok", data={
                                        "from_node": node_id,
                                        "to_node": fallback_node,
                                        "original_source": node_source,
                                    },
                                )
                                # M8.1: set sticky — future messages skip primary
                                if self._sticky_cache is not None:
                                    self._sticky_cache.set(sticky_key, fallback_node)
                                    # Separate name: `scope` is the session scope
                                    # captured by _do_invoke and must not be rebound.
                                    sticky_scope = "mixed" if is_mixed else "direct"
                                    if self._on_sticky_set:
                                        self._on_sticky_set(fallback_node, sticky_scope)
                                    logger.info(
                                        "Sticky set: %s → %s scope=%s ttl=%.0fs",
                                        sticky_key, fallback_node, sticky_scope,
                                        self._sticky_cache.ttl_s,
                                    )
                                    # M8.2: persist sticky to DB
                                    await self._sticky_persist_set(sticky_key, fallback_node)
                            except Exception as exc:  # noqa: BLE001
                                logger.warning(
                                    "Failover bookkeeping failed: %s → %s agent=%s event=%s: %s",
                                    node_id, fallback_node, agent_id, event_id, exc,
                                )

        if not invoke_ok or not reply_text:
            if invoke_ok:
                logger.warning("Empty reply from router agent=%s event=%s", agent_id, event_id)
            return

        # H3: Send with latency
        # M2.1: prefix reply with agent identity in mixed rooms ("Sofiia: ...")
        prefix = reply_prefix(agent_id, is_mixed)
        # Reserve room for the prefix so prefix + reply fits in _REPLY_TEXT_MAX.
        raw_reply = reply_text[:_REPLY_TEXT_MAX - len(prefix)]
        send_text = prefix + raw_reply
        txn_id = MatrixClient.make_txn_id(room_id, event_id)
        send_t0 = time.monotonic()

        try:
            await client.send_text(room_id, send_text, txn_id)
            send_duration_s = time.monotonic() - send_t0
            if self._on_send_latency:
                self._on_send_latency(agent_id, send_duration_s)
            if self._on_message_replied:
                self._on_message_replied(room_id, agent_id, "ok")
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.agent.replied", agent_id=agent_id, node_id=used_node_id,
                room_id=room_id, event_id=event_id, status="ok",
                duration_ms=int(send_duration_s * 1000),
                data={
                    "reply_len": len(send_text),
                    # Compare against what was actually sent: the cut-off is
                    # _REPLY_TEXT_MAX minus the prefix length, not _REPLY_TEXT_MAX.
                    "truncated": len(raw_reply) < len(reply_text),
                    "router_duration_ms": int(invoke_duration_s * 1000),
                    "queue_wait_ms": int(wait_s * 1000),
                    "routing_reason": routing_reason,
                    "is_mixed": is_mixed,
                    "node_source": node_source,
                    "failover": used_node_id != node_id,  # M8.0: failover flag
                },
            )
            logger.info(
                "Reply sent: agent=%s event=%s reply_len=%d send_ms=%d",
                agent_id, event_id, len(send_text), int(send_duration_s * 1000),
            )

        except Exception as exc:
            send_duration_s = time.monotonic() - send_t0
            logger.error("Send failed agent=%s event=%s: %s", agent_id, event_id, exc)
            if self._on_message_replied:
                self._on_message_replied(room_id, agent_id, "error")
            if self._on_gateway_error:
                self._on_gateway_error("matrix_send_error")
            # Attribute the failure to the node that produced the reply, matching
            # the node_id used by the matrix.agent.replied audit.
            await _write_audit(
                http_client, self._console_url, self._internal_token,
                event="matrix.error", agent_id=agent_id, node_id=used_node_id,
                room_id=room_id, event_id=event_id,
                status="error", error_code="matrix_send_failed",
                duration_ms=int(send_duration_s * 1000),
            )