""" Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 + M3.1 + M3.3 + SessionScopeV2 Architecture (H2): Reader task → asyncio.Queue(maxsize) → N Worker tasks ───────────────────────────────────────────────────────── Reader: sync_poll() → extract_room_messages() → rate_limit check (H1) → mark_seen / dedupe → queue.put_nowait() or DROP (audit matrix.queue_full + metric) Workers (N concurrent): queue.get() → measure wait latency (H3) → audit matrix.message.received → invoke Router (timed, H3) → send_text() (timed, H3) → audit matrix.agent.replied | matrix.error Shutdown: 1. stop_event set → reader exits loop 2. queue.join() with drain_timeout → workers finish in-flight 3. worker tasks cancelled Queue entry: _QueueEntry(event, room_id, agent_id, enqueue_time, routing_reason, is_mixed) """ import asyncio import hashlib import json as _json import logging import os as _os import time from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional import httpx from .control import ( ControlConfig, ControlCommand, check_authorization, parse_command, is_control_message, not_implemented_reply, unknown_command_reply, unauthorized_reply, help_reply, start_usage_reply, runbook_started_reply, runbook_start_error_reply, next_usage_reply, next_manual_reply, next_auto_reply, next_error_reply, complete_usage_reply, complete_ok_reply, complete_error_reply, status_usage_reply, status_reply, status_error_reply, evidence_usage_reply, evidence_reply, evidence_error_reply, post_review_usage_reply, post_review_reply, post_review_error_reply, rate_limited_reply, sanitize_notes, MAX_NOTES_LEN, status_not_available_reply, nodes_reply, VERB_HELP, VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE, VERB_CONFIRM, is_dangerous_cmd, build_normalized_args, confirm_intent_reply, confirm_success_reply, confirm_expired_reply, NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST, parse_node_cmd, node_cmd_validate_room, node_cmd_reply_set, node_cmd_reply_unset_ok, node_cmd_reply_unset_not_found, node_cmd_reply_get, node_cmd_reply_list, node_cmd_reply_error, VERB_ROOM, ROOM_SUBCMD_AGENTS, ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE, ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET, parse_room_agents_cmd, room_agents_reply_set, room_agents_reply_add, room_agents_reply_remove, room_agents_reply_unset_ok, room_agents_reply_unset_not_found, room_agents_reply_get, room_agents_reply_list, room_agents_reply_error, VERB_POLICY, POLICY_EXPORTS_SUBDIR, validate_export_path, policy_import_intent_reply, format_import_diff as _format_import_diff, policy_export_reply, policy_import_dry_run_reply, policy_import_reply, policy_cmd_error, policy_prune_preview_reply, policy_prune_applied_reply, policy_restore_intent_reply, policy_restore_applied_reply, policy_history_reply, policy_change_detail_reply, SUBCOMMAND_START, SUBCOMMAND_NEXT, SUBCOMMAND_COMPLETE, SUBCOMMAND_STATUS, SUBCOMMAND_EVIDENCE, SUBCOMMAND_POST_REVIEW, ) from .control_limiter import ControlRateLimiter from .discovery import agents_reply, bridge_status_reply, is_discovery_message from .event_store import EventStore from .node_policy import ( NodePolicy, NodeResolution, NODE_SOURCE_DEFAULT, NODE_SOURCE_EXPLICIT, NODE_SOURCE_ROOM_MAP, extract_node_kwarg, node_rejected_reply, ) from .node_health import ( NodeHealthTracker, NodeHealthConfig, NODE_STATE_HEALTHY, NODE_STATE_DEGRADED, NODE_STATE_DOWN, FAILOVER_REASON_TIMEOUT, FAILOVER_REASON_HTTP_5XX, FAILOVER_REASON_NETWORK, ) from .sticky_cache import StickyNodeCache, make_sticky_key from .confirm_store import ConfirmStore from . import control_runner as _ctrl_runner from .matrix_client import MatrixClient from .mixed_routing import ( MixedRoomConfig, MixedRoom, route_message, reply_prefix, build_override_config, REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING, ) from .rate_limit import InMemoryRateLimiter from .room_mapping import RoomMappingConfig, RoomMapping logger = logging.getLogger(__name__) # ── Constants ────────────────────────────────────────────────────────────────── _MAX_RETRY_BACKOFF = 60.0 _INIT_RETRY_BACKOFF = 2.0 _ROUTER_TIMEOUT_S = 45.0 _AUDIT_TIMEOUT_S = 5.0 _REPLY_TEXT_MAX = 4000 _WORKER_GET_TIMEOUT_S = 1.0 # how long a worker waits on empty queue before re-checking # ── Queue entry ──────────────────────────────────────────────────────────────── @dataclass class _QueueEntry: event: Dict[str, Any] room_id: str agent_id: str enqueue_time: float # time.monotonic() at enqueue routing_reason: str = "direct" is_mixed: bool = False # True for mixed-room entries (reply tagging, session isolation) # ── Session Scope v2 ─────────────────────────────────────────────────────────── # Invariants: # 1. Control room messages never reach the Router (no session_key needed there). # 2. Matrix and Telegram never share a key namespace (prefix "matrix:"). # 3. Mixed rooms: each (room_id, agent_id) pair has its own key — no cross-agent leakage. # 4. Logs/metrics receive sender_hash (sha256[:16]), never raw Matrix user_id. SCOPE_ROOM_AGENT = "room_agent" # default: shared room context per agent SCOPE_OPS_RUNBOOK = "ops_runbook" # future: control/ops room invocations SCOPE_DM_USER = "dm_agent_user" # future: per-user DM isolation def _sender_hash(sender: str) -> str: """PII-safe 16-hex hash of a Matrix user_id (e.g. @alice:server → 'a3f9...').""" return hashlib.sha256(sender.encode()).hexdigest()[:16] def _build_session_key(room_id: str, agent_id: str, scope: str = SCOPE_ROOM_AGENT) -> str: """ Canonical session key v2: matrix:{scope}:{room_key}:{agent_id} Examples: matrix:room_agent:roomXserver_yourdomain:sofiia matrix:ops_runbook:opsroomXserver:sofiia """ room_key = room_id.replace("!", "").replace(":", "_") return f"matrix:{scope}:{room_key}:{agent_id}" # ── Router invoke ────────────────────────────────────────────────────────────── async def _invoke_router( http_client: httpx.AsyncClient, router_url: str, agent_id: str, node_id: str, prompt: str, session_id: str, sender_hash: str = "", scope: str = SCOPE_ROOM_AGENT, node_source: str = NODE_SOURCE_DEFAULT, ) -> str: """POST /v1/agents/{agent_id}/infer → response text. Raises httpx.HTTPError on failure.""" url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer" payload = { "prompt": prompt, "session_id": session_id, "user_id": "matrix_bridge", "metadata": { "transport": "matrix", "node_id": node_id, "node_source": node_source, # M5.0: how node was resolved "session_key": session_id, # explicit for Router/Memory to index on "sender_hash": sender_hash, # PII-safe sender fingerprint "scope": scope, }, } resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S) resp.raise_for_status() data = resp.json() text = ( data.get("response") or data.get("text") or data.get("content") or data.get("message") or "" ) return (text if isinstance(text, str) else str(text)).strip() # ── M6.2: File helpers (run in thread) ──────────────────────────────────────── def _write_json_file(path: str, data: Any) -> None: """Synchronously write data as JSON to path (UTF-8, 2-space indent).""" with open(path, "w", encoding="utf-8") as fh: _json.dump(data, fh, ensure_ascii=False, indent=2) def _read_json_file(path: str) -> Any: """Synchronously read and parse a JSON file.""" with open(path, encoding="utf-8") as fh: return _json.load(fh) # ── Audit write ──────────────────────────────────────────────────────────────── async def _write_audit( http_client: httpx.AsyncClient, console_url: str, internal_token: str, event: str, agent_id: str, node_id: str, room_id: str, event_id: str, status: str = "ok", error_code: Optional[str] = None, duration_ms: Optional[int] = None, data: Optional[Dict[str, Any]] = None, ) -> None: """Fire-and-forget. Never raises.""" if not console_url or not internal_token: return try: await http_client.post( f"{console_url.rstrip('/')}/api/audit/internal", json={ "event": event, "operator_id": "matrix_bridge", "node_id": node_id, "agent_id": agent_id, "chat_id": room_id, "status": status, "error_code": error_code, "duration_ms": duration_ms, "data": {"matrix_event_id": event_id, "matrix_room_id": room_id, **(data or {})}, }, headers={"X-Internal-Service-Token": internal_token}, timeout=_AUDIT_TIMEOUT_S, ) except Exception as exc: logger.warning("Audit write failed (non-blocking): %s", exc) # ── Ingress loop (reader + workers) ─────────────────────────────────────────── class MatrixIngressLoop: """ Drives the full Matrix → Router → Matrix pipeline with backpressure. Reader task: sync → extract → rate_check → dedupe → queue.put_nowait Worker tasks: queue.get → invoke → send → audit Metric callbacks (all optional, called synchronously): on_message_received(room_id, agent_id) on_message_replied(room_id, agent_id, status) on_gateway_error(error_type) on_rate_limited(room_id, agent_id, limit_type) on_queue_dropped(room_id, agent_id) on_queue_size(current_size: int) on_invoke_latency(agent_id, duration_seconds) on_send_latency(agent_id, duration_seconds) on_queue_wait(agent_id, wait_seconds) on_routed(agent_id, reason) M2.2: successful routing on_route_rejected(room_id, reason) M2.2: routing rejection """ def __init__( self, matrix_homeserver_url: str, matrix_access_token: str, matrix_user_id: str, router_url: str, node_id: str, room_map: RoomMappingConfig, sofiia_console_url: str = "", sofiia_internal_token: str = "", rate_limiter: Optional[InMemoryRateLimiter] = None, queue_max_events: int = 100, worker_concurrency: int = 2, queue_drain_timeout_s: float = 5.0, mixed_room_config: Optional[MixedRoomConfig] = None, # M2.2: guard rails unknown_agent_behavior: str = "ignore", # "ignore" | "reply_error" max_slash_len: int = 32, mixed_concurrency_cap: int = 1, # 0 = unlimited # M3.0: control channel control_config: Optional[ControlConfig] = None, control_unauthorized_behavior: str = "ignore", # "ignore" | "reply_error" # M3.1: runbook runner integration sofiia_control_token: str = "", # M3.4: control channel safety control_limiter: Optional["ControlRateLimiter"] = None, # M2.3: persistent event deduplication event_store: Optional["EventStore"] = None, # M4.0: agent discovery discovery_rpm: int = 20, # M5.0: node-aware routing node_policy: Optional["NodePolicy"] = None, # Callbacks on_message_received: Optional[Callable[[str, str], None]] = None, on_message_replied: Optional[Callable[[str, str, str], None]] = None, on_gateway_error: Optional[Callable[[str], None]] = None, on_rate_limited: Optional[Callable[[str, str, str], None]] = None, on_queue_dropped: Optional[Callable[[str, str], None]] = None, on_queue_size: Optional[Callable[[int], None]] = None, on_invoke_latency: Optional[Callable[..., None]] = None, # (agent_id, duration_s, node_id="") on_send_latency: Optional[Callable[[str, float], None]] = None, on_queue_wait: Optional[Callable[[str, float], None]] = None, on_routed: Optional[Callable[[str, str], None]] = None, on_route_rejected: Optional[Callable[[str, str], None]] = None, on_control_command: Optional[Callable[[str, str, str], None]] = None, on_control_rate_limited: Optional[Callable[[str], None]] = None, on_dedupe_persistent_hit: Optional[Callable[[str, str], None]] = None, on_dedupe_persistent_insert: Optional[Callable[[], None]] = None, # M5.0: node routing callbacks on_node_selected: Optional[Callable[[str, str, str], None]] = None, # (agent_id, node_id, source) on_node_rejected: Optional[Callable[[str], None]] = None, # (rejected_node) # M5.1: per-node stats for !status reply node_stats_getter: Optional[Callable[[], Dict[str, Any]]] = None, # M6.0: persistent policy store for dynamic room-node overrides policy_store: Optional[Any] = None, # app.policy_store.PolicyStore # M6.2: data directory for policy exports/imports bridge_data_dir: Optional[str] = None, # M8.0: node health tracker for soft-failover node_health_tracker: Optional[NodeHealthTracker] = None, on_failover: Optional[Callable[[str, str, str], None]] = None, # (from_node, to_node, reason) # M8.1: sticky failover cache (anti-flap) sticky_cache: Optional[StickyNodeCache] = None, on_sticky_set: Optional[Callable[[str, str], None]] = None, # (node_id, scope) # M8.2: HA state persistence config ha_health_snapshot_interval_s: int = 60, ha_health_max_age_s: int = 600, # M9.0: Two-step confirmation store for dangerous commands confirm_store: Optional[ConfirmStore] = None, # M10.0: Auto-backup retention policy (days; 0 = keep forever) policy_export_retention_days: int = 30, # M10.2: max rows to keep in policy_changes history (0 = unlimited) policy_history_limit: int = 100, ) -> None: self._hs_url = matrix_homeserver_url self._token = matrix_access_token self._user_id = matrix_user_id self._router_url = router_url self._node_id = node_id self._room_map = room_map self._console_url = sofiia_console_url self._internal_token = sofiia_internal_token self._rate_limiter = rate_limiter self._queue_max = queue_max_events self._worker_count = worker_concurrency self._drain_timeout_s = queue_drain_timeout_s # Callbacks self._on_message_received = on_message_received self._on_message_replied = on_message_replied self._on_gateway_error = on_gateway_error self._on_rate_limited = on_rate_limited self._on_queue_dropped = on_queue_dropped self._on_queue_size = on_queue_size self._on_invoke_latency = on_invoke_latency self._on_send_latency = on_send_latency self._on_queue_wait = on_queue_wait self._mixed_room_config = mixed_room_config self._control_config = control_config self._control_unauthorized_behavior = control_unauthorized_behavior self._control_token = sofiia_control_token self._unknown_agent_behavior = unknown_agent_behavior self._max_slash_len = max_slash_len self._mixed_concurrency_cap = mixed_concurrency_cap self._on_routed = on_routed self._on_route_rejected = on_route_rejected self._on_control_command = on_control_command self._on_control_rate_limited = on_control_rate_limited # M3.4: control channel safety self._control_limiter = control_limiter # M2.3: persistent event deduplication self._event_store: Optional[EventStore] = event_store self._on_dedupe_persistent_hit = on_dedupe_persistent_hit self._on_dedupe_persistent_insert = on_dedupe_persistent_insert # M5.0: node routing callbacks self._on_node_selected = on_node_selected self._on_node_rejected = on_node_rejected # M5.1: per-node stats getter for !status reply self._node_stats_getter = node_stats_getter # M6.0: persistent policy store self._policy_store = policy_store # M6.2: policy exports directory self._bridge_data_dir: Optional[str] = bridge_data_dir self._policy_last_export_at: Optional[int] = None self._policy_last_import_at: Optional[int] = None # M10.0: auto-backup retention self._policy_export_retention_days: int = policy_export_retention_days # M10.2: history table row limit self._policy_history_limit: int = policy_history_limit # M8.0: node health + soft-failover self._node_health_tracker: Optional[NodeHealthTracker] = node_health_tracker self._on_failover = on_failover # M8.1: sticky failover cache self._sticky_cache: Optional[StickyNodeCache] = sticky_cache self._on_sticky_set = on_sticky_set # M9.0: two-step confirmation store self._confirm_store: Optional[ConfirmStore] = confirm_store # M8.2: HA state persistence self._ha_health_snapshot_interval_s: int = ha_health_snapshot_interval_s self._ha_health_max_age_s: int = ha_health_max_age_s self._ha_sticky_loaded: int = 0 # count of sticky entries loaded on startup self._ha_health_loaded: bool = False # whether health state was loaded on startup # M4.0: agent discovery — simple per-room sliding window (reuses InMemoryRateLimiter logic) self._discovery_rpm = discovery_rpm # M5.0: node-aware routing policy self._node_policy: Optional[NodePolicy] = node_policy from collections import defaultdict, deque self._discovery_windows: dict = defaultdict(deque) # Lazily populated semaphores keyed by "{room_id}:{agent_id}" self._concurrency_locks: Dict[str, asyncio.Semaphore] = {} self._next_batch: Optional[str] = None self._queue: Optional[asyncio.Queue] = None # exposed for /health @property def next_batch(self) -> Optional[str]: return self._next_batch @property def queue_size(self) -> int: return self._queue.qsize() if self._queue else 0 @property def worker_count(self) -> int: return self._worker_count def get_status(self) -> Dict[str, Any]: """Return a simple bridge status dict for health/ops queries.""" status: Dict[str, Any] = { "queue_size": self._queue.qsize() if self._queue else 0, "queue_max": self._queue_max, "worker_count": self._worker_count, } if self._node_policy is not None: status["node_policy"] = self._node_policy.as_info_dict() # M5.1: per-node routed/rejected counters if self._node_stats_getter is not None: status["nodes"] = self._node_stats_getter() # M6.0: policy store info if self._policy_store is not None: try: status["policy_store_ok"] = self._policy_store.is_open status["policy_store_path"] = self._policy_store.db_path status["policy_overrides_count"] = self._policy_store.count_overrides() status["policy_agent_overrides_count"] = self._policy_store.count_agent_overrides() # M6.1 except Exception as exc: # noqa: BLE001 status["policy_store_ok"] = False status["policy_store_error"] = str(exc) # M6.2: policy snapshot timestamps if self._policy_last_export_at is not None: status["policy_last_export_at"] = self._policy_last_export_at if self._policy_last_import_at is not None: status["policy_last_import_at"] = self._policy_last_import_at # M6.2: policy DB mtime (best-effort) if self._policy_store is not None: try: db_path = self._policy_store.db_path if db_path and _os.path.exists(db_path): status["policy_db_mtime"] = int(_os.path.getmtime(db_path)) except Exception: # noqa: BLE001 pass # M10.2: policy change history count if self._policy_store is not None and self._policy_store.is_open: try: status["policy_changes_count"] = self._policy_store.get_policy_changes_count() except Exception: # noqa: BLE001 pass # M8.0: node health tracker state if self._node_health_tracker is not None: allowed = ( self._node_policy.allowed_nodes if self._node_policy is not None else None ) status["node_health"] = self._node_health_tracker.all_info(allowed) # M8.1: sticky failover cache info if self._sticky_cache is not None: status["sticky_active_keys"] = self._sticky_cache.active_count() status["sticky_ttl_s"] = self._sticky_cache.ttl_s # M9.0: pending confirmations if self._confirm_store is not None: status["confirm_pending"] = self._confirm_store.pending_count() status["confirm_ttl_s"] = self._confirm_store.ttl_s # M8.2: HA persistence info status["ha_sticky_loaded"] = self._ha_sticky_loaded status["ha_health_loaded"] = self._ha_health_loaded status["ha_health_snapshot_interval_s"] = self._ha_health_snapshot_interval_s return status @property def active_lock_count(self) -> int: """Number of room-agent pairs currently holding a concurrency lock.""" return sum(1 for lock in self._concurrency_locks.values() if lock.locked()) def _get_concurrency_lock(self, room_id: str, agent_id: str) -> asyncio.Semaphore: """Lazily create and return the semaphore for a (room, agent) pair.""" key = f"{room_id}:{agent_id}" if key not in self._concurrency_locks: cap = self._mixed_concurrency_cap if self._mixed_concurrency_cap > 0 else 2 ** 31 self._concurrency_locks[key] = asyncio.Semaphore(cap) return self._concurrency_locks[key] # ── Public run ───────────────────────────────────────────────────────────── async def run(self, stop_event: asyncio.Event) -> None: mixed_rooms_count = self._mixed_room_config.total_rooms if self._mixed_room_config else 0 logger.info( "Matrix ingress loop started | hs=%s node=%s mappings=%d mixed_rooms=%d " "queue_max=%d workers=%d", self._hs_url, self._node_id, self._room_map.total_mappings, mixed_rooms_count, self._queue_max, self._worker_count, ) if self._room_map.total_mappings == 0 and mixed_rooms_count == 0: logger.warning("No room mappings — ingress loop is idle") queue: asyncio.Queue[Optional[_QueueEntry]] = asyncio.Queue( maxsize=self._queue_max ) self._queue = queue async with MatrixClient(self._hs_url, self._token, self._user_id) as client: for mapping in self._room_map.mappings: if mapping.agent_id in self._room_map.allowed_agents: try: await client.join_room(mapping.room_id) except Exception as exc: logger.warning("Could not join room %s: %s", mapping.room_id, exc) if self._mixed_room_config: for room_id in self._mixed_room_config.rooms: try: await client.join_room(room_id) except Exception as exc: logger.warning("Could not join mixed room %s: %s", room_id, exc) if self._control_config and self._control_config.is_enabled: for room_id in self._control_config.control_rooms: try: await client.join_room(room_id) except Exception as exc: logger.warning("Could not join control room %s: %s", room_id, exc) logger.info( "Control channel: %d rooms, %d operators", len(self._control_config.control_rooms), len(self._control_config.operator_allowlist), ) async with httpx.AsyncClient() as http_client: # M8.2: Load persisted HA state before processing any messages await self._load_ha_state() # Start workers worker_tasks = [ asyncio.create_task( self._worker(queue, client, http_client), name=f"matrix_worker_{i}", ) for i in range(self._worker_count) ] # M8.2: Start periodic node health snapshot task _health_snapshot_task = None if ( self._ha_health_snapshot_interval_s > 0 and self._policy_store is not None and self._node_health_tracker is not None ): _health_snapshot_task = asyncio.create_task( self._node_health_snapshot_loop(), name="ha_health_snapshot", ) # Run reader until stop_event await self._reader(client, queue, http_client, stop_event) # Drain: wait for all enqueued items to be processed logger.info( "Reader stopped. Draining queue (%d items, timeout=%.1fs)...", queue.qsize(), self._drain_timeout_s, ) try: await asyncio.wait_for(queue.join(), timeout=self._drain_timeout_s) logger.info("Queue drained successfully") except asyncio.TimeoutError: remaining = queue.qsize() logger.warning( "Drain timeout (%.1fs): %d items not processed", self._drain_timeout_s, remaining, ) # Cancel workers for task in worker_tasks: task.cancel() # M8.2: Cancel health snapshot task if running if _health_snapshot_task is not None and not _health_snapshot_task.done(): _health_snapshot_task.cancel() results = await asyncio.gather(*worker_tasks, return_exceptions=True) cancelled = sum(1 for r in results if isinstance(r, asyncio.CancelledError)) logger.info("Workers stopped (%d cancelled)", cancelled) self._queue = None logger.info("Matrix ingress loop stopped") # ── Reader ───────────────────────────────────────────────────────────────── async def _reader( self, client: MatrixClient, queue: "asyncio.Queue[Optional[_QueueEntry]]", http_client: httpx.AsyncClient, stop_event: asyncio.Event, ) -> None: backoff = _INIT_RETRY_BACKOFF while not stop_event.is_set(): try: sync_resp = await client.sync_poll(since=self._next_batch) self._next_batch = sync_resp.get("next_batch") backoff = _INIT_RETRY_BACKOFF await self._enqueue_from_sync(client, queue, http_client, sync_resp) except asyncio.CancelledError: break except Exception as exc: logger.error("Reader error (retry in %.1fs): %s", backoff, exc) if self._on_gateway_error: self._on_gateway_error("sync_error") try: await asyncio.wait_for(stop_event.wait(), timeout=backoff) except asyncio.TimeoutError: pass backoff = min(backoff * 2, _MAX_RETRY_BACKOFF) async def _enqueue_from_sync( self, client: MatrixClient, queue: "asyncio.Queue[Optional[_QueueEntry]]", http_client: httpx.AsyncClient, sync_resp: Dict[str, Any], ) -> None: # M3.0: Control rooms — handled first, not forwarded to agents if self._control_config and self._control_config.is_enabled: for room_id in self._control_config.control_rooms: messages = client.extract_room_messages(sync_resp, room_id) for event in messages: await self._try_control(client, http_client, event, room_id) # Regular rooms: 1 room → 1 agent (M1 / M2.0) for mapping in self._room_map.mappings: if mapping.agent_id not in self._room_map.allowed_agents: continue messages = client.extract_room_messages(sync_resp, mapping.room_id) for event in messages: text = event.get("content", {}).get("body", "").strip() # M4.0: agent discovery before routing if is_discovery_message(text): await self._handle_discovery(client, http_client, event, mapping.room_id) continue await self._try_enqueue(client, queue, http_client, event, mapping) # Mixed rooms: 1 room → N agents, routing per message (M2.1) if self._mixed_room_config: for room_id in self._mixed_room_config.rooms: messages = client.extract_room_messages(sync_resp, room_id) for event in messages: text = event.get("content", {}).get("body", "").strip() # M4.0: agent discovery before routing if is_discovery_message(text): await self._handle_discovery(client, http_client, event, room_id) continue await self._try_enqueue_mixed(client, queue, http_client, event, room_id) async def _try_enqueue( self, client: MatrixClient, queue: "asyncio.Queue[Optional[_QueueEntry]]", http_client: httpx.AsyncClient, event: Dict[str, Any], mapping: RoomMapping, ) -> None: event_id = event.get("event_id", "") sender = event.get("sender", "") text = event.get("content", {}).get("body", "").strip() room_id = mapping.room_id agent_id = mapping.agent_id if not text: return # H1: Rate limit (before mark_seen — don't charge quota on drop) if self._rate_limiter is not None: allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender) if not allowed: logger.warning( "Rate limited: room=%s sender=%s limit_type=%s event=%s", room_id, sender, limit_type, event_id, ) if self._on_rate_limited: self._on_rate_limited(room_id, agent_id, limit_type or "unknown") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.rate_limited", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code=f"rate_limit_{limit_type}", data={"sender": sender, "limit_type": limit_type}, ) return # Dedupe — mark before enqueue (prevents double-enqueue on retry) client.mark_seen(event_id) # M2.3: Persistent dedupe (cross-restart protection) if self._event_store is not None: try: already = await self._event_store.is_processed(room_id, event_id) except Exception as exc: logger.warning("EventStore.is_processed error (degraded): %s", exc) already = False if already: logger.debug("Persistent dedupe hit: event=%s room=%s", event_id, room_id) if self._on_dedupe_persistent_hit: self._on_dedupe_persistent_hit(room_id, agent_id) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.dedupe.persistent_hit", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={"sender": sender}, ) return # H2: Enqueue or drop entry = _QueueEntry( event=event, room_id=room_id, agent_id=agent_id, enqueue_time=time.monotonic(), ) enqueued = False try: queue.put_nowait(entry) enqueued = True qsize = queue.qsize() logger.debug("Enqueued event=%s qsize=%d", event_id, qsize) if self._on_queue_size: self._on_queue_size(qsize) except asyncio.QueueFull: logger.warning( "Queue full (max=%d): dropping event=%s room=%s agent=%s", self._queue_max, event_id, room_id, agent_id, ) if self._on_queue_dropped: self._on_queue_dropped(room_id, agent_id) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.queue_full", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="queue_full", data={"queue_max": self._queue_max, "sender": sender}, ) # M2.3: Mark as processed ONLY after successful enqueue if enqueued and self._event_store is not None: sender_hash = _sender_hash(sender) try: await self._event_store.mark_processed(room_id, event_id, sender_hash) if self._on_dedupe_persistent_insert: self._on_dedupe_persistent_insert() except Exception as exc: logger.warning("EventStore.mark_processed error (degraded): %s", exc) async def _try_enqueue_mixed( self, client: MatrixClient, queue: "asyncio.Queue[Optional[_QueueEntry]]", http_client: httpx.AsyncClient, event: Dict[str, Any], room_id: str, ) -> None: """Enqueue a message from a mixed room, routing to the appropriate agent.""" assert self._mixed_room_config is not None event_id = event.get("event_id", "") sender = event.get("sender", "") text = event.get("content", {}).get("body", "").strip() if not text: return # M6.1: look up dynamic agent override for this room _routing_config = self._mixed_room_config if self._policy_store is not None and self._policy_store.is_open: try: _agent_ov = await asyncio.to_thread( self._policy_store.get_agent_override, room_id ) if _agent_ov is not None: _ov_agents, _ov_default = _agent_ov _effective_default = _ov_default or (_ov_agents[0] if _ov_agents else None) if _ov_agents and _effective_default: _routing_config = build_override_config( self._mixed_room_config, room_id, _ov_agents, _effective_default, ) except Exception as _exc: # noqa: BLE001 logger.warning("PolicyStore get_agent_override failed: %s", _exc) # Route message to determine target agent agent_id, routing_reason, effective_text = route_message( text, room_id, _routing_config, self._room_map.allowed_agents, max_slash_len=self._max_slash_len, ) if agent_id is None: # M2.2: routing rejected — audit + metric + optional error reply logger.warning( "Mixed room %s: routing rejected reason=%s event=%s", room_id, routing_reason, event_id, ) if self._on_route_rejected: self._on_route_rejected(room_id, routing_reason) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.route.rejected", agent_id="unknown", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code=routing_reason, data={"routing_reason": routing_reason, "sender": sender, "text_len": len(text)}, ) # M2.2: optional user-facing error reply in room if self._unknown_agent_behavior == "reply_error" and routing_reason == REASON_REJECTED_UNKNOWN_AGENT: available = self._mixed_room_config.agents_for_room(room_id) # Extract agent name from text (first slash token, if any) slash_token = text.strip().split()[0].lstrip("/") if text.strip().startswith("/") else "" label = f"`/{slash_token}`" if slash_token else "this command" error_msg = ( f"⚠️ Unknown agent {label}. " f"Available in this room: {', '.join(available)}" ) txn_id = MatrixClient.make_txn_id(room_id, event_id + "_reject") try: await client.send_text(room_id, error_msg, txn_id) except Exception as exc: logger.warning("Could not send route-error reply: %s", exc) return # M2.2: successful route — fire metric callback if self._on_routed: self._on_routed(agent_id, routing_reason) # H1: Rate limit (uses final agent_id for metric tagging) if self._rate_limiter is not None: allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender) if not allowed: logger.warning( "Rate limited (mixed): room=%s sender=%s agent=%s limit_type=%s", room_id, sender, agent_id, limit_type, ) if self._on_rate_limited: self._on_rate_limited(room_id, agent_id, limit_type or "unknown") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.rate_limited", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code=f"rate_limit_{limit_type}", data={"sender": sender, "limit_type": limit_type, "routing_reason": routing_reason}, ) return client.mark_seen(event_id) # M2.3: Persistent dedupe (cross-restart protection, mixed rooms) if self._event_store is not None: try: already = await self._event_store.is_processed(room_id, event_id) except Exception as exc: logger.warning("EventStore.is_processed error mixed (degraded): %s", exc) already = False if already: logger.debug("Persistent dedupe hit (mixed): event=%s room=%s agent=%s", event_id, room_id, agent_id) if self._on_dedupe_persistent_hit: self._on_dedupe_persistent_hit(room_id, agent_id) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.dedupe.persistent_hit", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={"sender": sender, "routing_reason": routing_reason}, ) return # Store effective_text (stripped of routing token) in a patched event copy effective_event = dict(event) effective_event["content"] = dict(event.get("content", {})) effective_event["content"]["body"] = effective_text entry = _QueueEntry( event=effective_event, room_id=room_id, agent_id=agent_id, enqueue_time=time.monotonic(), routing_reason=routing_reason, is_mixed=True, ) enqueued_mixed = False try: queue.put_nowait(entry) enqueued_mixed = True qsize = queue.qsize() logger.debug( "Enqueued (mixed): event=%s agent=%s reason=%s qsize=%d", event_id, agent_id, routing_reason, qsize, ) if self._on_queue_size: self._on_queue_size(qsize) except asyncio.QueueFull: logger.warning( "Queue full (max=%d): dropping mixed event=%s room=%s agent=%s", self._queue_max, event_id, room_id, agent_id, ) if self._on_queue_dropped: self._on_queue_dropped(room_id, agent_id) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.queue_full", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="queue_full", data={"queue_max": self._queue_max, "sender": sender}, ) # M2.3: Mark as processed ONLY after successful enqueue if enqueued_mixed and self._event_store is not None: sender_hash = _sender_hash(sender) try: await self._event_store.mark_processed(room_id, event_id, sender_hash) if self._on_dedupe_persistent_insert: self._on_dedupe_persistent_insert() except Exception as exc: logger.warning("EventStore.mark_processed error mixed (degraded): %s", exc) # ── M4.0: Agent discovery ────────────────────────────────────────────────── def _check_discovery_rate(self, room_id: str) -> bool: """Sliding-window per-room rate check for discovery replies. Returns True if allowed.""" if self._discovery_rpm <= 0: return True now = time.monotonic() window = self._discovery_windows[room_id] cutoff = now - 60.0 while window and window[0] < cutoff: window.popleft() if len(window) >= self._discovery_rpm: return False window.append(now) return True async def _handle_discovery( self, client: MatrixClient, http_client: httpx.AsyncClient, event: Dict[str, Any], room_id: str, ) -> None: """ Reply to !agents in any user room (no auth required). Rate-limited per room. Marks event as seen + persisted (no router enqueue). """ event_id = event.get("event_id", "") sender = event.get("sender", "") # Rate limit for discovery replies if not self._check_discovery_rate(room_id): logger.debug("Discovery rate limited: room=%s", room_id) client.mark_seen(event_id) return client.mark_seen(event_id) # M6.1: use store-based agent config if available for accurate discovery _disc_config = self._mixed_room_config if self._policy_store is not None and self._policy_store.is_open: try: _disc_ov = await asyncio.to_thread( self._policy_store.get_agent_override, room_id ) if _disc_ov is not None: _d_agents, _d_default = _disc_ov _d_eff_default = _d_default or (_d_agents[0] if _d_agents else None) if _d_agents and _d_eff_default and self._mixed_room_config is not None: _disc_config = build_override_config( self._mixed_room_config, room_id, _d_agents, _d_eff_default, ) except Exception: # noqa: BLE001 pass reply = agents_reply(room_id, self._room_map, _disc_config) txn_id = MatrixClient.make_txn_id(room_id, event_id + "_discovery") try: await client.send_text(room_id, reply, txn_id) except Exception as exc: logger.warning("Could not send discovery reply: %s", exc) # Persist dedupe so restart doesn't re-deliver this discovery if self._event_store is not None: sender_hash = _sender_hash(sender) await self._event_store.mark_processed(room_id, event_id, sender_hash) # ── M6.1: Dynamic mixed room agent overrides via !room agents command ───── async def _handle_room_cmd( self, http_client: httpx.AsyncClient, sender: str, ctrl_room_id: str, event_id: str, cmd_subcommand: str, cmd_args: tuple, cmd_kwargs: Dict[str, str], ) -> str: """Handle `!room agents ` from authorized operator.""" if self._policy_store is None or not self._policy_store.is_open: return "⚠️ Policy store not available." if cmd_subcommand != ROOM_SUBCMD_AGENTS: return room_agents_reply_error( f"Unknown subcommand: `{cmd_subcommand or '?'}`. Use `!room agents `." ) action, room_id, agents_list, single_agent, default_agent = parse_room_agents_cmd( cmd_subcommand, cmd_args, cmd_kwargs, ) if action not in (ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE, ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET): return room_agents_reply_error(f"Unknown action: `{action or '?'}`") # Validate allowed agents from global policy allowed_all = self._room_map.allowed_agents # global allowed agents set # ── list ────────────────────────────────────────────────────────────── if action == ROOM_ACTION_LIST: try: rows = await asyncio.to_thread(self._policy_store.list_agent_overrides, 10) total = await asyncio.to_thread(self._policy_store.count_agent_overrides) except Exception as exc: logger.warning("PolicyStore list_agent_overrides error: %s", exc) return "⚠️ Could not read policy store." await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.room.agents.list", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "total": total}, ) return room_agents_reply_list(rows, total) # ── subcommands that require room_id ────────────────────────────────── if not room_id: return room_agents_reply_error("Missing `room=` argument.") if not node_cmd_validate_room(room_id): return room_agents_reply_error( f"Invalid room ID format: `{room_id}`\nExpected: `!localpart:server`" ) # ── get ─────────────────────────────────────────────────────────────── if action == ROOM_ACTION_GET: try: ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id) except Exception as exc: logger.warning("PolicyStore get_agent_override error: %s", exc) return "⚠️ Could not read policy store." ov_agents, ov_default = (ov if ov else (None, None)) env_room = ( self._mixed_room_config.rooms.get(room_id) if self._mixed_room_config else None ) env_agents = list(env_room.agents) if env_room else None env_default = env_room.default_agent if env_room else None await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.room.agents.get", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "queried_room": room_id}, ) return room_agents_reply_get(room_id, ov_agents, ov_default, env_agents, env_default) # ── unset ───────────────────────────────────────────────────────────── if action == ROOM_ACTION_UNSET: try: deleted = await asyncio.to_thread(self._policy_store.delete_agent_override, room_id) except Exception as exc: logger.warning("PolicyStore delete_agent_override error: %s", exc) return "⚠️ Could not write to policy store." await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.room.agents.unset", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted}, ) return room_agents_reply_unset_ok(room_id) if deleted else room_agents_reply_unset_not_found(room_id) # ── add ─────────────────────────────────────────────────────────────── if action == ROOM_ACTION_ADD: if not single_agent: return room_agents_reply_error("Missing `agent=` argument for `add`.") if single_agent not in allowed_all: allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all)) return room_agents_reply_error(f"Agent `{single_agent}` not in allowed agents: {allowed_str}") try: new_agents, new_default = await asyncio.to_thread( self._policy_store.add_agent_to_room, room_id, single_agent, sender ) except Exception as exc: logger.warning("PolicyStore add_agent_to_room error: %s", exc) return "⚠️ Could not write to policy store." await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.room.agents.add", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent}, ) return room_agents_reply_add(room_id, single_agent, new_agents, new_default) # ── remove ──────────────────────────────────────────────────────────── if action == ROOM_ACTION_REMOVE: if not single_agent: return room_agents_reply_error("Missing `agent=` argument for `remove`.") try: removed, err = await asyncio.to_thread( self._policy_store.remove_agent_from_room, room_id, single_agent, sender ) except Exception as exc: logger.warning("PolicyStore remove_agent_from_room error: %s", exc) return "⚠️ Could not write to policy store." if not removed: return room_agents_reply_error(err or "Could not remove agent.") # Get updated state try: ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id) except Exception: # noqa: BLE001 ov = None remaining = ov[0] if ov else [] new_default_r = ov[1] if ov else None await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.room.agents.remove", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent}, ) return room_agents_reply_remove(room_id, single_agent, remaining, new_default_r) # ── set ─────────────────────────────────────────────────────────────── if action == ROOM_ACTION_SET: if not agents_list: return room_agents_reply_error("Missing `agents=` argument for `set`.") invalid = [a for a in agents_list if a not in allowed_all] if invalid: allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all)) return room_agents_reply_error( f"Unknown agents: {', '.join(f'`{a}`' for a in invalid)}\nAllowed: {allowed_str}" ) effective_default = default_agent if default_agent else agents_list[0] if effective_default not in agents_list: return room_agents_reply_error( f"Default agent `{effective_default}` not in provided agents list." ) try: await asyncio.to_thread( self._policy_store.set_agent_override, room_id, agents_list, effective_default, sender, ) except Exception as exc: logger.warning("PolicyStore set_agent_override error: %s", exc) return "⚠️ Could not write to policy store." await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.room.agents.set", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={ "sender": sender, "target_room": room_id, "agents": agents_list, "default": effective_default, }, ) return room_agents_reply_set(room_id, agents_list, effective_default) return room_agents_reply_error("Unhandled action.") # ── M6.2: Policy snapshot export / import ───────────────────────────────── async def _handle_policy_cmd( self, http_client: httpx.AsyncClient, sender: str, ctrl_room_id: str, event_id: str, subcommand: Optional[str], cmd_kwargs: Dict[str, str], ) -> str: """ Handle `!policy export`, `!policy import`, and `!policy prune_exports`. Requires policy_store and bridge_data_dir to be configured. """ if self._policy_store is None or not self._policy_store.is_open: return policy_cmd_error("Policy store is not available.") if not self._bridge_data_dir: return policy_cmd_error("BRIDGE_DATA_DIR is not configured.") exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) # ── export ──────────────────────────────────────────────────────────── if subcommand == "export": try: snapshot = await asyncio.to_thread(self._policy_store.export_all) node_count = len(snapshot.get("room_node_overrides", [])) agent_count = len(snapshot.get("room_agent_overrides", [])) import datetime as _dt ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d-%H%M%S") filename = f"policy-{ts}.json" await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True) export_path = _os.path.join(exports_dir, filename) await asyncio.to_thread( _write_json_file, export_path, snapshot ) self._policy_last_export_at = int(time.time()) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.policy.export", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={ "sender": sender, "file": filename, "node_overrides": node_count, "agent_overrides": agent_count, }, ) return policy_export_reply(export_path, node_count, agent_count) except Exception as exc: # noqa: BLE001 logger.exception("_handle_policy_cmd export error: %s", exc) return policy_cmd_error(f"Export failed: {exc}") # ── import ──────────────────────────────────────────────────────────── if subcommand == "import": filename = cmd_kwargs.get("path", "").strip() if not filename: return policy_cmd_error("Missing `path=` argument.") safe_path = validate_export_path(exports_dir, filename) if safe_path is None: return policy_cmd_error( f"Invalid path `{filename}`. Only simple filenames within the exports " f"directory are allowed." ) mode_raw = cmd_kwargs.get("mode", "merge").strip().lower() if mode_raw not in ("merge", "replace"): return policy_cmd_error("mode must be `merge` or `replace`.") dry_raw = cmd_kwargs.get("dry_run", "1").strip() dry_run = dry_raw not in ("0", "false", "no") try: raw_text = await asyncio.to_thread(_read_json_file, safe_path) except FileNotFoundError: return policy_cmd_error(f"File not found: `{filename}`") except Exception as exc: # noqa: BLE001 return policy_cmd_error(f"Cannot read file: {exc}") try: data = raw_text if isinstance(raw_text, dict) else {} stats = await asyncio.to_thread( self._policy_store.import_snapshot, data, mode_raw, dry_run, sender, ) except ValueError as ve: return policy_cmd_error(str(ve)) except Exception as exc: # noqa: BLE001 logger.exception("_handle_policy_cmd import error: %s", exc) return policy_cmd_error(f"Import failed: {exc}") if not dry_run: self._policy_last_import_at = int(time.time()) # M10.2: record in policy change history _is_destr = ( stats.get("node_deleted", 0) + stats.get("agent_deleted", 0) ) > 0 _ds = ( f"node: +{stats['node_added']} ~{stats['node_updated']} " f"-{stats['node_deleted']}; " f"agent: +{stats['agent_added']} ~{stats['agent_updated']} " f"-{stats['agent_deleted']}" ) try: await asyncio.to_thread( self._policy_store.record_policy_change, "policy.import", mode_raw, filename, _sender_hash(sender), _ds, _is_destr, stats.get("node_added", 0), stats.get("node_updated", 0), stats.get("node_deleted", 0), stats.get("agent_added", 0), stats.get("agent_updated", 0), stats.get("agent_deleted", 0), self._policy_history_limit, ) except Exception as _exc: # noqa: BLE001 logger.warning("Failed to record import history (non-fatal): %s", _exc) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.policy.import", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={ "sender": sender, "file": filename, "mode": mode_raw, "dry_run": dry_run, "stats": stats, }, ) return ( policy_import_dry_run_reply(stats, mode_raw) if dry_run else policy_import_reply(stats, mode_raw) ) # ── restore (M10.1) ─────────────────────────────────────────────────── if subcommand == "restore": filename = cmd_kwargs.get("path", "").strip() if not filename: return policy_cmd_error("Missing `path=` argument.") safe_path = validate_export_path(exports_dir, filename) if safe_path is None: return policy_cmd_error( f"Invalid path `{filename}`. Only simple filenames within the exports " "directory are allowed." ) mode_raw = cmd_kwargs.get("mode", "replace").strip().lower() if mode_raw not in ("merge", "replace"): return policy_cmd_error("mode must be `merge` or `replace`.") try: raw_data = await asyncio.to_thread(_read_json_file, safe_path) except FileNotFoundError: return policy_cmd_error(f"File not found: `{filename}`") except Exception as exc: # noqa: BLE001 return policy_cmd_error(f"Cannot read file: {exc}") data = raw_data if isinstance(raw_data, dict) else {} try: stats = await asyncio.to_thread( self._policy_store.import_snapshot, data, mode_raw, False, sender, ) self._policy_last_import_at = int(time.time()) except ValueError as ve: return policy_cmd_error(str(ve)) except Exception as exc: # noqa: BLE001 logger.exception("_handle_policy_cmd restore error: %s", exc) return policy_cmd_error(f"Restore failed: {exc}") # M10.2: record in policy change history _is_destr = ( stats.get("node_deleted", 0) + stats.get("agent_deleted", 0) ) > 0 _rds = ( f"restore/{mode_raw}: " f"node: +{stats['node_added']} ~{stats['node_updated']} " f"-{stats['node_deleted']}; " f"agent: +{stats['agent_added']} ~{stats['agent_updated']} " f"-{stats['agent_deleted']}" ) try: await asyncio.to_thread( self._policy_store.record_policy_change, "policy.restore", mode_raw, filename, _sender_hash(sender), _rds, _is_destr, stats.get("node_added", 0), stats.get("node_updated", 0), stats.get("node_deleted", 0), stats.get("agent_added", 0), stats.get("agent_updated", 0), stats.get("agent_deleted", 0), self._policy_history_limit, ) except Exception as _exc: # noqa: BLE001 logger.warning("Failed to record restore history (non-fatal): %s", _exc) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.policy.restore", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={ "sender": sender, "file": filename, "mode": mode_raw, "stats": stats, }, ) return policy_restore_applied_reply(stats, mode_raw) # ── prune_exports ───────────────────────────────────────────────────── if subcommand == "prune_exports": dry_raw = cmd_kwargs.get("dry_run", "1").strip() dry_run = dry_raw not in ("0", "false", "no") try: retention_days = int( cmd_kwargs.get( "retention_days", str(self._policy_export_retention_days) ) ) except (ValueError, TypeError): return policy_cmd_error("`retention_days` must be a positive integer.") try: await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True) result = await asyncio.to_thread( self._policy_store.prune_exports, exports_dir, retention_days, dry_run, ) except Exception as exc: # noqa: BLE001 logger.exception("_handle_policy_cmd prune error: %s", exc) return policy_cmd_error(f"Prune failed: {exc}") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.policy.prune_exports", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={ "sender": sender, "dry_run": dry_run, "retention_days": retention_days, "files_to_delete": result.count, "bytes_to_free": result.total_bytes, }, ) return ( policy_prune_preview_reply(result, retention_days) if dry_run else policy_prune_applied_reply(result, retention_days) ) # ── history (M10.2) ─────────────────────────────────────────────────── if subcommand == "history": try: limit_raw = int(cmd_kwargs.get("limit", "10")) except (ValueError, TypeError): return policy_cmd_error("`limit` must be a positive integer.") safe_limit = max(1, min(limit_raw, 20)) try: changes = await asyncio.to_thread( self._policy_store.list_policy_changes, safe_limit, ) except Exception as exc: # noqa: BLE001 logger.exception("_handle_policy_cmd history error: %s", exc) return policy_cmd_error(f"History fetch failed: {exc}") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.policy.history", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "limit": safe_limit, "count": len(changes)}, ) return policy_history_reply(changes) # ── change (M10.3) ──────────────────────────────────────────────────── if subcommand == "change": try: change_id = int(cmd_kwargs.get("id", "0")) except (ValueError, TypeError): return policy_cmd_error("`id` must be a positive integer (DB change id).") if change_id <= 0: return policy_cmd_error("Missing or invalid `id=` argument. " "Use `!policy history` to get change ids.") try: change = await asyncio.to_thread( self._policy_store.get_policy_change_by_id, change_id, ) except Exception as exc: # noqa: BLE001 logger.exception("_handle_policy_cmd change detail error: %s", exc) return policy_cmd_error(f"DB error: {exc}") if change is None: return policy_cmd_error( f"Change id={change_id} not found. " "Use `!policy history` to see available ids." ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.policy.change_detail", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "change_id": change_id}, ) return policy_change_detail_reply(change) return policy_cmd_error(f"Unknown subcommand: `{subcommand!r}`.") # ── M5.1: Nodes overview for operators ──────────────────────────────────── async def _handle_nodes( self, http_client: httpx.AsyncClient, sender: str, room_id: str, event_id: str, ) -> str: """Return node policy overview for `!nodes` in control room.""" try: policy_info = ( self._node_policy.as_info_dict() if self._node_policy is not None else {} ) node_stats = ( self._node_stats_getter() if self._node_stats_getter is not None else {} ) # M8.0: merge health state into node_stats if self._node_health_tracker is not None: allowed = ( self._node_policy.allowed_nodes if self._node_policy is not None else None ) health_all = self._node_health_tracker.all_info(allowed) for nid, info in health_all.items(): if nid not in node_stats: node_stats[nid] = {} node_stats[nid]["health"] = info.get("state", NODE_STATE_HEALTHY) node_stats[nid]["ewma_latency_s"] = info.get("ewma_latency_s") node_stats[nid]["consecutive_failures"] = info.get("consecutive_failures", 0) # M8.1: include sticky cache info sticky_info = None if self._sticky_cache is not None: sticky_entries = self._sticky_cache.active_entries() sticky_info = { "active_keys": len(sticky_entries), "ttl_s": self._sticky_cache.ttl_s, "entries": [ {"key": k, "node": n, "remaining_s": round(r, 0)} for k, n, r in sticky_entries[:5] # show at most 5 ], } if len(sticky_entries) > 5: sticky_info["truncated"] = len(sticky_entries) - 5 await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.nodes", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={"sender": sender}, ) return nodes_reply(policy_info, node_stats, sticky_info=sticky_info) except Exception as exc: logger.warning("_handle_nodes error: %s", exc) return "⚠️ Node info not available." # ── M8.2: HA state persistence helpers ──────────────────────────────────── async def _load_ha_state(self) -> None: """ Load persisted HA state (sticky entries + node health) from PolicyStore on startup. Non-fatal: any error is logged and bridge continues in in-memory-only mode. """ if self._policy_store is None or not self._policy_store.is_open: return # Load sticky entries if self._sticky_cache is not None: try: entries = await asyncio.to_thread(self._policy_store.load_sticky_entries) now_unix = int(time.time()) loaded = 0 for key, node_id, expires_at_unix in entries: remaining_s = expires_at_unix - now_unix if remaining_s > 0: self._sticky_cache.set(key, node_id, ttl_s=float(remaining_s)) loaded += 1 self._ha_sticky_loaded = loaded logger.info("HA: loaded %d sticky entries from DB", loaded) except Exception as exc: # noqa: BLE001 logger.warning("HA: failed to load sticky entries (non-fatal): %s", exc) # Load node health snapshot if self._node_health_tracker is not None and self._ha_health_max_age_s > 0: try: snapshot = await asyncio.to_thread( self._policy_store.load_node_health, self._ha_health_max_age_s ) if snapshot: for node_id, info in snapshot.items(): self._node_health_tracker.restore_node( node_id, ewma_latency_s=info.get("ewma_latency_s"), consecutive_failures=int(info.get("consecutive_failures", 0)), ) self._ha_health_loaded = True logger.info( "HA: loaded node health for %d nodes from DB", len(snapshot) ) else: logger.info("HA: no fresh node health snapshot found in DB") except Exception as exc: # noqa: BLE001 logger.warning("HA: failed to load node health (non-fatal): %s", exc) async def _sticky_persist_set(self, key: str, node_id: str) -> None: """Persist a sticky entry to DB after setting it in-memory (M8.2).""" if self._policy_store is None or not self._policy_store.is_open: return if self._sticky_cache is None: return expires_at_unix = int(time.time()) + int(self._sticky_cache.ttl_s) try: await asyncio.to_thread( self._policy_store.upsert_sticky, key, node_id, expires_at_unix ) except Exception as exc: # noqa: BLE001 logger.warning("HA: failed to persist sticky key=%s (non-fatal): %s", key, exc) async def _sticky_persist_delete(self, key: str) -> None: """Remove a sticky entry from DB after deleting it in-memory (M8.2).""" if self._policy_store is None or not self._policy_store.is_open: return try: await asyncio.to_thread(self._policy_store.delete_sticky, key) except Exception as exc: # noqa: BLE001 logger.warning("HA: failed to delete sticky key=%s (non-fatal): %s", key, exc) async def _node_health_snapshot_loop(self) -> None: """ Background task: periodically write node health state to DB (M8.2). Runs until policy_store becomes unavailable or interval is 0. """ if self._ha_health_snapshot_interval_s <= 0: return logger.debug( "HA: health snapshot loop started (interval=%ds)", self._ha_health_snapshot_interval_s ) while True: await asyncio.sleep(self._ha_health_snapshot_interval_s) if self._policy_store is None or not self._policy_store.is_open: break if self._node_health_tracker is None or self._node_policy is None: break try: for node_id in sorted(self._node_policy.allowed_nodes): info = self._node_health_tracker.as_info_dict(node_id) await asyncio.to_thread( self._policy_store.upsert_node_health, node_id, info.get("ewma_latency_s"), int(info.get("consecutive_failures", 0)), ) logger.debug( "HA: health snapshot written for %d nodes", len(self._node_policy.allowed_nodes), ) except Exception as exc: # noqa: BLE001 logger.warning("HA: health snapshot write failed (non-fatal): %s", exc) # ── M9.0: Two-step confirmation for dangerous control commands ───────────── async def _handle_policy_import_intent( self, http_client: "httpx.AsyncClient", cmd: "ControlCommand", sender: str, room_id: str, event_id: str, action_summary: str, normalized: str, ) -> str: """ M9.1: Intent step for !policy import with diff preview and hash binding. Reads the file, computes a diff preview, stores a hash-bound callback, and returns a formatted preview reply containing the nonce. The confirm callback verifies the file hasn't changed before applying. """ assert self._confirm_store is not None # ── Validate args ────────────────────────────────────────────────────── if self._policy_store is None or not self._policy_store.is_open: return policy_cmd_error("Policy store not available.") if not self._bridge_data_dir: return policy_cmd_error("BRIDGE_DATA_DIR not configured.") filename = cmd.kwargs.get("path", "").strip() if not filename: return policy_cmd_error("Missing `path=` argument.") exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) safe_path = validate_export_path(exports_dir, filename) if safe_path is None: return policy_cmd_error( f"Invalid path `{filename}`. Only simple filenames within the exports " "directory are allowed." ) mode_raw = cmd.kwargs.get("mode", "merge").strip().lower() if mode_raw not in ("merge", "replace"): return policy_cmd_error("mode must be `merge` or `replace`.") # ── Read file + compute diff preview ─────────────────────────────────── try: raw_data = await asyncio.to_thread(_read_json_file, safe_path) except FileNotFoundError: return policy_cmd_error(f"File not found: `{filename}`") except Exception as exc: # noqa: BLE001 return policy_cmd_error(f"Cannot read file: {exc}") if not isinstance(raw_data, dict): return policy_cmd_error("Invalid JSON format (expected object).") try: diff = await asyncio.to_thread( self._policy_store.compute_import_diff, raw_data, mode_raw, ) except ValueError as ve: return policy_cmd_error(str(ve)) except Exception as exc: # noqa: BLE001 return policy_cmd_error(f"Preview failed: {exc}") # ── Compute snapshot hash for confirm binding ───────────────────────── _content_bytes = _json.dumps( raw_data, sort_keys=True, ensure_ascii=True ).encode("utf-8") snapshot_hash = hashlib.sha256( (filename + ":" + mode_raw + ":").encode("utf-8") + _content_bytes ).hexdigest()[:32] sender_hash = _sender_hash(sender) _captured_hash = snapshot_hash _captured_data = raw_data _captured_mode = mode_raw _captured_path = safe_path _captured_fname = filename _captured_sender = sender # ── Build hash-bound callback ────────────────────────────────────────── # Late-capture of nonce for autobackup filename (set after add() below) _nonce_holder: list = [] async def _callback(): # Re-read file and verify hash to detect tampering (anti-TOCTOU) try: fresh_data = await asyncio.to_thread(_read_json_file, _captured_path) except Exception as exc: # noqa: BLE001 return ( f"❌ Cannot re-read file `{_captured_fname}` at apply time: {exc}\n" "Re-issue the command.", "", ) fresh_bytes = _json.dumps( fresh_data if isinstance(fresh_data, dict) else {}, sort_keys=True, ensure_ascii=True, ).encode("utf-8") fresh_hash = hashlib.sha256( (_captured_fname + ":" + _captured_mode + ":").encode("utf-8") + fresh_bytes ).hexdigest()[:32] if fresh_hash != _captured_hash: logger.warning( "Policy import confirm rejected: file changed since preview " "(sender=%s file=%s)", _captured_sender, _captured_fname, ) return ( f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n" "Re-issue `!policy import ...` to get a new preview.", "", ) # M10.0: Auto-backup current policy before applying changes _autobackup_basename = "" _autobackup_hash = "" if self._policy_store is not None and self._bridge_data_dir: _exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) _nonce_suffix = _nonce_holder[0] if _nonce_holder else "BACKUP" _sender_hash8 = _sender_hash(_captured_sender)[:8] try: _os.makedirs(_exports_dir, exist_ok=True) _ab_path, _autobackup_hash = await asyncio.to_thread( self._policy_store.write_autobackup, _exports_dir, _sender_hash8, _nonce_suffix, ) _autobackup_basename = _os.path.basename(_ab_path) logger.info( "Policy auto-backup written: %s hash=%s", _autobackup_basename, _autobackup_hash, ) except Exception as exc: # noqa: BLE001 logger.warning("Policy auto-backup failed (non-fatal): %s", exc) # Apply the import using the captured (previewed) data try: stats = await asyncio.to_thread( self._policy_store.import_snapshot, _captured_data, _captured_mode, False, _captured_sender, ) self._policy_last_import_at = int(time.time()) diff_summary = ( f"node: +{stats['node_added']} ~{stats['node_updated']} " f"-{stats['node_deleted']}; " f"agent: +{stats['agent_added']} ~{stats['agent_updated']} " f"-{stats['agent_deleted']}" ) if _autobackup_basename: diff_summary += f"; autobackup={_autobackup_basename}" # M10.2: record in policy change history _is_destr = ( stats.get("node_deleted", 0) + stats.get("agent_deleted", 0) ) > 0 try: await asyncio.to_thread( self._policy_store.record_policy_change, "policy.import", _captured_mode, _captured_fname, _sender_hash(_captured_sender), diff_summary, _is_destr, stats.get("node_added", 0), stats.get("node_updated", 0), stats.get("node_deleted", 0), stats.get("agent_added", 0), stats.get("agent_updated", 0), stats.get("agent_deleted", 0), self._policy_history_limit, ) except Exception as _exc: # noqa: BLE001 logger.warning("Failed to record policy change history: %s", _exc) reply = policy_import_reply(stats, _captured_mode) if _autobackup_basename: reply += ( f"\n\n💾 Auto-backup saved: `{_autobackup_basename}` " f"(hash `{_autobackup_hash}`)" ) return reply, diff_summary except Exception as exc: # noqa: BLE001 logger.exception( "Policy import apply failed: sender=%s file=%s", _captured_sender, _captured_fname, ) return f"❌ Import failed: {exc}", "" # ── Store pending confirmation ───────────────────────────────────────── nonce = self._confirm_store.add( sender_hash=sender_hash, verb="policy.import", normalized_args=normalized, action_summary=action_summary, room_id=room_id, callback=_callback, ) # M10.0: make nonce available inside _callback for backup filename _nonce_holder.append(nonce) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.intent", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender_hash": sender_hash, "verb": "policy.import", "normalized": normalized, "nonce": nonce, "expires_in_s": int(self._confirm_store.ttl_s), "snapshot_hash_prefix": snapshot_hash[:8], "diff": { "node_added": diff.node_added, "node_updated": diff.node_updated, "node_deleted": diff.node_deleted, "agent_added": diff.agent_added, "agent_updated": diff.agent_updated, "agent_deleted": diff.agent_deleted, "sample_keys": diff.sample_keys, }, }, ) logger.info( "Confirm policy import intent: sender=%s mode=%s file=%s hash=%s nonce=%s", sender, mode_raw, filename, snapshot_hash[:8], nonce, ) return policy_import_intent_reply( diff=diff, action_summary=action_summary, nonce=nonce, ttl_s=int(self._confirm_store.ttl_s), ) async def _handle_policy_restore_intent( self, http_client: "httpx.AsyncClient", cmd: "ControlCommand", sender: str, room_id: str, event_id: str, *, action_summary: str, normalized: str, ) -> str: """ Intent step for !policy restore (M10.1). Reads the snapshot file, computes a diff preview, binds a SHA-256 hash to the exact file content + mode, stores a callback in ConfirmStore, and returns a rollback preview reply with the nonce. """ assert self._confirm_store is not None assert self._policy_store is not None assert self._bridge_data_dir is not None filename = cmd.kwargs.get("path", "").strip() if not filename: return policy_cmd_error("Missing `path=` argument.") exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) safe_path = validate_export_path(exports_dir, filename) if safe_path is None: return policy_cmd_error( f"Invalid path `{filename}`. Only simple filenames within the exports " "directory are allowed." ) mode_raw = cmd.kwargs.get("mode", "replace").strip().lower() if mode_raw not in ("merge", "replace"): return policy_cmd_error("mode must be `merge` or `replace`.") # ── Read file + compute diff preview ────────────────────────────────── try: raw_data = await asyncio.to_thread(_read_json_file, safe_path) except FileNotFoundError: return policy_cmd_error(f"File not found: `{filename}`") except Exception as exc: # noqa: BLE001 return policy_cmd_error(f"Cannot read file: {exc}") if not isinstance(raw_data, dict): return policy_cmd_error("Invalid JSON format (expected object).") try: diff = await asyncio.to_thread( self._policy_store.compute_import_diff, raw_data, mode_raw, ) except ValueError as ve: return policy_cmd_error(str(ve)) except Exception as exc: # noqa: BLE001 return policy_cmd_error(f"Preview failed: {exc}") # ── Compute snapshot hash for confirm binding (anti-TOCTOU) ────────── _content_bytes = _json.dumps( raw_data, sort_keys=True, ensure_ascii=True ).encode("utf-8") snapshot_hash = hashlib.sha256( (filename + ":" + mode_raw + ":restore:").encode("utf-8") + _content_bytes ).hexdigest()[:32] sender_hash = _sender_hash(sender) _captured_hash = snapshot_hash _captured_data = raw_data _captured_mode = mode_raw _captured_path = safe_path _captured_fname = filename _captured_sender = sender # Late-capture of nonce for autobackup filename _nonce_holder: list = [] # ── Build hash-bound callback ───────────────────────────────────────── async def _callback(): # Re-read + verify hash (anti-TOCTOU) try: fresh_data = await asyncio.to_thread(_read_json_file, _captured_path) except Exception as exc: # noqa: BLE001 return ( f"❌ Cannot re-read `{_captured_fname}` at apply time: {exc}\n" "Re-issue the command.", "", ) fresh_bytes = _json.dumps( fresh_data if isinstance(fresh_data, dict) else {}, sort_keys=True, ensure_ascii=True, ).encode("utf-8") fresh_hash = hashlib.sha256( (_captured_fname + ":" + _captured_mode + ":restore:").encode("utf-8") + fresh_bytes ).hexdigest()[:32] if fresh_hash != _captured_hash: logger.warning( "Policy restore confirm rejected: file changed since preview " "(sender=%s file=%s)", _captured_sender, _captured_fname, ) return ( f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n" "Re-issue `!policy restore ...` to get a new preview.", "", ) # Auto-backup current state before overwriting _autobackup_basename = "" _autobackup_hash = "" if self._policy_store is not None and self._bridge_data_dir: _exp_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) _nonce_suffix = _nonce_holder[0] if _nonce_holder else "RESTORE" _sender_hash8 = _sender_hash(_captured_sender)[:8] try: _os.makedirs(_exp_dir, exist_ok=True) _ab_path, _autobackup_hash = await asyncio.to_thread( self._policy_store.write_autobackup, _exp_dir, _sender_hash8, _nonce_suffix, ) _autobackup_basename = _os.path.basename(_ab_path) logger.info( "Pre-restore backup written: %s hash=%s", _autobackup_basename, _autobackup_hash, ) except Exception as exc: # noqa: BLE001 logger.warning("Pre-restore backup failed (non-fatal): %s", exc) # Apply restore try: stats = await asyncio.to_thread( self._policy_store.import_snapshot, _captured_data, _captured_mode, False, _captured_sender, ) self._policy_last_import_at = int(time.time()) diff_summary = ( f"restore/{_captured_mode}: " f"node: +{stats['node_added']} ~{stats['node_updated']} " f"-{stats['node_deleted']}; " f"agent: +{stats['agent_added']} ~{stats['agent_updated']} " f"-{stats['agent_deleted']}" ) if _autobackup_basename: diff_summary += f"; autobackup={_autobackup_basename}" # M10.2: record in policy change history _is_destr = ( stats.get("node_deleted", 0) + stats.get("agent_deleted", 0) ) > 0 try: await asyncio.to_thread( self._policy_store.record_policy_change, "policy.restore", _captured_mode, _captured_fname, _sender_hash(_captured_sender), diff_summary, _is_destr, stats.get("node_added", 0), stats.get("node_updated", 0), stats.get("node_deleted", 0), stats.get("agent_added", 0), stats.get("agent_updated", 0), stats.get("agent_deleted", 0), self._policy_history_limit, ) except Exception as _exc: # noqa: BLE001 logger.warning("Failed to record restore history: %s", _exc) reply = policy_restore_applied_reply( stats, _captured_mode, _autobackup_basename ) return reply, diff_summary except Exception as exc: # noqa: BLE001 logger.exception( "Policy restore apply failed: sender=%s file=%s", _captured_sender, _captured_fname, ) return f"❌ Restore failed: {exc}", "" # ── Store pending confirmation ──────────────────────────────────────── nonce = self._confirm_store.add( sender_hash=sender_hash, verb="policy.restore", normalized_args=normalized, action_summary=action_summary, room_id=room_id, callback=_callback, ) _nonce_holder.append(nonce) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.policy.restore", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender_hash": sender_hash, "verb": "policy.restore", "normalized": normalized, "nonce": nonce, "expires_in_s": int(self._confirm_store.ttl_s), "mode": mode_raw, "snapshot_hash_prefix": snapshot_hash[:8], "diff": { "node_added": diff.node_added, "node_updated": diff.node_updated, "node_deleted": diff.node_deleted, "agent_added": diff.agent_added, "agent_updated": diff.agent_updated, "agent_deleted": diff.agent_deleted, "sample_keys": diff.sample_keys, }, }, ) logger.info( "Confirm policy restore intent: sender=%s mode=%s file=%s hash=%s nonce=%s", sender, mode_raw, filename, snapshot_hash[:8], nonce, ) return policy_restore_intent_reply( diff=diff, action_summary=action_summary, nonce=nonce, ttl_s=int(self._confirm_store.ttl_s), ) async def _handle_dangerous_intent( self, http_client: "httpx.AsyncClient", cmd: "ControlCommand", sender: str, room_id: str, event_id: str, ) -> str: """ First leg of the two-step confirm flow (M9.0). Does NOT apply the command. Stores a pending confirmation with a callback that will execute the original handler, and returns a reply containing the nonce that the operator must send via !confirm . """ assert self._confirm_store is not None sender_hash = _sender_hash(sender) normalized = build_normalized_args(cmd) action_summary = ( f"!{cmd.verb} {cmd.subcommand or ''} {normalized}".strip() ) # M9.1: policy import gets a richer preview with diff + hash binding if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "import": return await self._handle_policy_import_intent( http_client, cmd, sender, room_id, event_id, action_summary=action_summary, normalized=normalized, ) # M10.1: policy restore — rollback with diff preview + hash binding if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "restore": if self._policy_store is None or not self._bridge_data_dir: return policy_cmd_error( "Policy store or data directory not configured." ) return await self._handle_policy_restore_intent( http_client, cmd, sender, room_id, event_id, action_summary=action_summary, normalized=normalized, ) # Build the callback: calls the actual handler when confirmed. # We capture all args by closure so the callback is self-contained. _verb = cmd.verb _subcmd = cmd.subcommand _args = cmd.args _kw = dict(cmd.kwargs) async def _callback(): if _verb == VERB_NODE: # Reconstruct args_text (same as _try_control does) _parts = [] if _subcmd: _parts.append(_subcmd) _parts.extend(_args) _parts.extend(f"{k}={v}" for k, v in _kw.items()) reply = await self._handle_node_cmd( http_client, sender, room_id, event_id, " ".join(_parts), ) elif _verb == VERB_ROOM: reply = await self._handle_room_cmd( http_client, sender, room_id, event_id, _subcmd, tuple(_args), _kw, ) elif _verb == VERB_POLICY: reply = await self._handle_policy_cmd( http_client, sender, room_id, event_id, _subcmd, _kw, ) else: reply = f"❌ Unknown dangerous verb: {_verb}" return reply, action_summary nonce = self._confirm_store.add( sender_hash=sender_hash, verb=f"{cmd.verb}.{cmd.subcommand or ''}", normalized_args=normalized, action_summary=action_summary, room_id=room_id, callback=_callback, ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.intent", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender_hash": sender_hash, "verb": cmd.verb, "subcommand": cmd.subcommand or "", "normalized": normalized, "nonce": nonce, "expires_in_s": int(self._confirm_store.ttl_s), }, ) logger.info( "Confirm intent: sender=%s verb=%s/%s nonce=%s ttl=%.0fs", sender, cmd.verb, cmd.subcommand, nonce, self._confirm_store.ttl_s, ) return confirm_intent_reply(action_summary, nonce, int(self._confirm_store.ttl_s)) async def _handle_confirm_cmd( self, http_client: "httpx.AsyncClient", cmd: "ControlCommand", sender: str, room_id: str, event_id: str, ) -> str: """ Second leg of the two-step confirm flow (M9.0). Validates the nonce and sender, executes the stored callback, and emits confirmed + applied audit events. """ if self._confirm_store is None: return "❌ Confirmation store not active." # Nonce may come as subcommand (token right after !confirm) nonce = (cmd.subcommand or "").strip().upper() if not nonce and cmd.args: nonce = cmd.args[0].strip().upper() if not nonce: return "❌ Usage: `!confirm ` — provide the confirmation code." sender_hash = _sender_hash(sender) entry = self._confirm_store.pop(nonce, sender_hash) if entry is None: logger.info( "Confirm rejected: sender=%s nonce=%s (invalid/expired/wrong-sender)", sender, nonce, ) return confirm_expired_reply() await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.confirmed", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender_hash": sender_hash, "nonce": nonce, "verb": entry.verb, "action_summary": entry.action_summary, }, ) logger.info( "Confirm accepted: sender=%s nonce=%s verb=%s", sender, nonce, entry.verb, ) try: reply_text, diff_summary = await entry.callback() except Exception as exc: logger.exception( "Confirm callback failed: sender=%s nonce=%s verb=%s", sender, nonce, entry.verb, ) return f"❌ Apply failed: {exc}" await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.applied", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender_hash": sender_hash, "verb": entry.verb, "normalized": entry.normalized_args, "diff_summary": diff_summary, }, ) return confirm_success_reply(reply_text) # ── M6.0: Dynamic room-node overrides via !node command ─────────────────── async def _handle_node_cmd( self, http_client: httpx.AsyncClient, sender: str, ctrl_room_id: str, event_id: str, args_text: str, ) -> str: """Handle `!node ` from an authorized operator.""" if self._policy_store is None or not self._policy_store.is_open: return "⚠️ Policy store not available." subcmd, room_id, node_id = parse_node_cmd(args_text) if subcmd not in (NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST): return node_cmd_reply_error( f"Unknown subcommand: `{subcmd or '?'}`" ) # ── list ────────────────────────────────────────────────────────────── if subcmd == NODE_SUBCMD_LIST: try: rows = await asyncio.to_thread(self._policy_store.list_overrides, 10) total = await asyncio.to_thread(self._policy_store.count_overrides) except Exception as exc: logger.warning("PolicyStore list_overrides error: %s", exc) return "⚠️ Could not read policy store." await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.node.list", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "total": total}, ) return node_cmd_reply_list(rows, total) # ── subcommands that require room_id ────────────────────────────────── if not room_id: return node_cmd_reply_error("Missing `room=` argument.") if not node_cmd_validate_room(room_id): return node_cmd_reply_error( f"Invalid room ID format: `{room_id}`\n" "Expected: `!localpart:server`" ) # ── get ─────────────────────────────────────────────────────────────── if subcmd == NODE_SUBCMD_GET: try: override = await asyncio.to_thread(self._policy_store.get_override, room_id) except Exception as exc: logger.warning("PolicyStore get_override error: %s", exc) return "⚠️ Could not read policy store." # env map lookup for context env_node: Optional[str] = None if self._node_policy is not None: env_node = self._node_policy.room_node_map.get(room_id) default = self._node_policy.default_node if self._node_policy else self._node_id await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.node.get", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "queried_room": room_id}, ) return node_cmd_reply_get(room_id, override, env_node, default) # ── unset ───────────────────────────────────────────────────────────── if subcmd == NODE_SUBCMD_UNSET: try: deleted = await asyncio.to_thread(self._policy_store.delete_override, room_id) except Exception as exc: logger.warning("PolicyStore delete_override error: %s", exc) return "⚠️ Could not write to policy store." await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.node.unset", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted}, ) return node_cmd_reply_unset_ok(room_id) if deleted else node_cmd_reply_unset_not_found(room_id) # ── set ─────────────────────────────────────────────────────────────── if subcmd == NODE_SUBCMD_SET: if not node_id: return node_cmd_reply_error("Missing `node=` argument for `set`.") allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset([self._node_id]) if node_id not in allowed: allowed_list = ", ".join(f"`{n}`" for n in sorted(allowed)) return node_cmd_reply_error( f"Node `{node_id}` is not in allowed list: {allowed_list}" ) try: await asyncio.to_thread(self._policy_store.set_override, room_id, node_id, sender) except Exception as exc: logger.warning("PolicyStore set_override error: %s", exc) return "⚠️ Could not write to policy store." await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.node.set", agent_id="control", node_id=self._node_id, room_id=ctrl_room_id, event_id=event_id, status="ok", data={"sender": sender, "target_room": room_id, "set_node": node_id}, ) return node_cmd_reply_set(room_id, node_id) return node_cmd_reply_error("Unhandled subcommand.") # ── M4.1: Bridge status for operators ───────────────────────────────────── async def _handle_bridge_status( self, http_client: httpx.AsyncClient, sender: str, room_id: str, event_id: str, ) -> str: """Build and return a bridge health snapshot for `!status` in control room.""" try: snapshot: Dict[str, Any] = { "node_id": self._node_id, "worker_count": self._worker_count, "room_count": len(self._room_map.mappings), "mixed_room_count": ( len(self._mixed_room_config.rooms) if self._mixed_room_config else 0 ), "operators_count": ( len(self._control_config.operator_allowlist) if self._control_config and self._control_config.operator_allowlist else 0 ), } # Queue info (exposed by MatrixIngressLoop via get_status) status = self.get_status() snapshot["queue_size"] = status.get("queue_size", "?") snapshot["queue_max"] = status.get("queue_max", "?") # Control safety if self._control_limiter is not None: snapshot["control_safety"] = self._control_limiter.as_health_dict() # Persistent dedupe if self._event_store is not None: snapshot["persistent_dedupe"] = self._event_store.as_health_dict() await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.bridge_status", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={"sender": sender}, ) return bridge_status_reply(snapshot) except Exception as exc: logger.warning("_handle_bridge_status error: %s", exc) return status_not_available_reply() # ── Control command handler ──────────────────────────────────────────────── async def _try_control( self, client: MatrixClient, http_client: httpx.AsyncClient, event: Dict[str, Any], room_id: str, ) -> None: """ Process a message from a control room. Non-command messages (not starting with '!') are silently ignored. All command attempts are audited regardless of authorization. """ assert self._control_config is not None event_id = event.get("event_id", "") sender = event.get("sender", "") text = event.get("content", {}).get("body", "").strip() if not text or not is_control_message(text): return # not a command, ignore client.mark_seen(event_id) # Authorization check authorized, rejection_reason = check_authorization(sender, room_id, self._control_config) if not authorized: await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.unauthorized", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code=rejection_reason, data={"sender": sender, "command_preview": text[:80]}, ) logger.warning( "Unauthorized control command: sender=%s room=%s reason=%s cmd=%r", sender, room_id, rejection_reason, text[:60], ) if self._control_unauthorized_behavior == "reply_error": try: txn_id = MatrixClient.make_txn_id(room_id, event_id + "_unauth") await client.send_text(room_id, unauthorized_reply(rejection_reason), txn_id) except Exception as exc: logger.warning("Could not send unauthorized reply: %s", exc) return # M3.4: Rate limiting + cooldown (after auth, before parse/dispatch) if self._control_limiter is not None: sender_hash_ctrl = _sender_hash(sender) allowed_room, retry_room = self._control_limiter.check_room(room_id) if not allowed_room: scope = "room" logger.info("Control rate limited: scope=%s room=%s sender=%s", scope, room_id, sender) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.rate_limited", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="rate_limited_room", data={"sender": sender, "scope": scope, "retry_after_s": retry_room}, ) if self._on_control_rate_limited: self._on_control_rate_limited(scope) txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl") await client.send_text(room_id, rate_limited_reply(scope, retry_room), txn_id) return allowed_op, retry_op = self._control_limiter.check_operator(sender_hash_ctrl) if not allowed_op: scope = "operator" logger.info("Control rate limited: scope=%s sender=%s", scope, sender) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.rate_limited", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="rate_limited_operator", data={"sender": sender, "scope": scope, "retry_after_s": retry_op}, ) if self._on_control_rate_limited: self._on_control_rate_limited(scope) txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl") await client.send_text(room_id, rate_limited_reply(scope, retry_op), txn_id) return # Parse command cmd = parse_command(text) if cmd is None: logger.warning("Control message from %s could not be parsed: %r", sender, text[:60]) return # M3.4: Cooldown check (anti-double-click, per operator+verb+subcommand) if self._control_limiter is not None: sender_hash_ctrl = _sender_hash(sender) allowed_cd, wait_cd = self._control_limiter.check_cooldown( sender_hash_ctrl, cmd.verb, cmd.subcommand or "", ) if not allowed_cd: scope = "cooldown" logger.info( "Control cooldown: sender=%s verb=%s sub=%s wait=%.1fs", sender, cmd.verb, cmd.subcommand, wait_cd, ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.rate_limited", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="cooldown", data={ "sender": sender, "scope": scope, "verb": cmd.verb, "subcommand": cmd.subcommand, "wait_s": wait_cd, }, ) if self._on_control_rate_limited: self._on_control_rate_limited(scope) txn_id = MatrixClient.make_txn_id(room_id, event_id + "_cd") await client.send_text(room_id, rate_limited_reply(scope, wait_cd), txn_id) return # Metric callback if self._on_control_command: self._on_control_command(sender, cmd.verb, cmd.subcommand) # Audit every authorized command await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.command", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "verb": cmd.verb, "subcommand": cmd.subcommand, "args": list(cmd.args), "kwargs": dict(cmd.kwargs), "is_known": cmd.is_known, }, ) logger.info( "Control command: sender=%s verb=%s sub=%s args=%s", sender, cmd.verb, cmd.subcommand, cmd.args, ) # Dispatch command txn_id = MatrixClient.make_txn_id(room_id, event_id + "_ctrl") # M9.0: Dangerous commands → two-step confirmation (intent leg) if ( self._confirm_store is not None and is_dangerous_cmd(cmd) and cmd.verb != VERB_CONFIRM ): reply_text = await self._handle_dangerous_intent( http_client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_CONFIRM: # M9.0: !confirm (second leg) reply_text = await self._handle_confirm_cmd( http_client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_HELP: reply_text = help_reply() elif cmd.verb == VERB_CONFIRM: # Fallback if confirm_store is None (disabled) — inform the operator reply_text = "❌ Confirmation system is disabled." elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_START: reply_text = await self._handle_runbook_start( http_client, client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_NEXT: # M3.2: advance to next step reply_text = await self._handle_runbook_next( http_client, client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_COMPLETE: # M3.2: mark manual step complete reply_text = await self._handle_runbook_complete( http_client, client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_STATUS: # M3.3: show run status reply_text = await self._handle_runbook_status( http_client, client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_EVIDENCE: # M3.3: generate release evidence reply_text = await self._handle_runbook_evidence( http_client, client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_POST_REVIEW: # M3.3: generate post-release review reply_text = await self._handle_runbook_post_review( http_client, client, cmd, sender, room_id, event_id, ) elif cmd.verb == VERB_STATUS: # M4.1: bridge health snapshot for operators reply_text = await self._handle_bridge_status( http_client, sender, room_id, event_id, ) elif cmd.verb == VERB_NODES: # M5.1: node policy overview for operators reply_text = await self._handle_nodes( http_client, sender, room_id, event_id, ) elif cmd.verb == VERB_NODE: # M6.0: dynamic room-node override commands # Reconstruct args_text from parsed command parts _node_args_parts = [] if cmd.subcommand: _node_args_parts.append(cmd.subcommand) _node_args_parts.extend(cmd.args) _node_args_parts.extend(f"{k}={v}" for k, v in cmd.kwargs.items()) _node_args_text = " ".join(_node_args_parts) reply_text = await self._handle_node_cmd( http_client, sender, room_id, event_id, _node_args_text, ) elif cmd.verb == VERB_ROOM: # M6.1: dynamic mixed room agent overrides reply_text = await self._handle_room_cmd( http_client, sender, room_id, event_id, cmd.subcommand, cmd.args, cmd.kwargs, ) elif cmd.verb == VERB_POLICY: # M6.2: policy snapshot export/import reply_text = await self._handle_policy_cmd( http_client, sender, room_id, event_id, cmd.subcommand, cmd.kwargs, ) elif not cmd.is_known: reply_text = unknown_command_reply(cmd) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.unknown_cmd", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="unknown_verb", data={"verb": cmd.verb, "sender": sender}, ) else: reply_text = not_implemented_reply(cmd) try: await client.send_text(room_id, reply_text, txn_id) except Exception as exc: logger.error("Could not send control reply: %s", exc) async def _handle_runbook_start( self, http_client: httpx.AsyncClient, client: "MatrixClient", cmd: ControlCommand, sender: str, room_id: str, event_id: str, ) -> str: """ M3.1: Execute !runbook start [node=NODA1]. Calls sofiia-console POST /api/runbooks/internal/runs. Returns reply text (success or failure) for delivery to the control room. Audits matrix.control.runbook.start regardless of outcome. """ # Extract positional runbook_path runbook_path = cmd.args[0].strip() if cmd.args else "" node_id = cmd.kwargs.get("node", "NODA1").strip() # Validate path before calling the console path_error = _ctrl_runner.validate_runbook_path(runbook_path) if path_error: logger.warning( "!runbook start invalid path: sender=%s path=%r error=%s", sender, runbook_path, path_error, ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.start", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="invalid_path", data={"sender": sender, "runbook_path": runbook_path, "error": path_error}, ) return start_usage_reply() # Call sofiia-console internal API run_id: Optional[str] = None http_status: Optional[int] = None try: result = await _ctrl_runner.start_runbook_run( http_client=http_client, console_url=self._console_url, control_token=self._control_token, runbook_path=runbook_path, operator_id=sender, node_id=node_id, ) run_id = result.get("run_id", "") steps_total = result.get("steps_total", 0) status = result.get("status", "running") http_status = 200 logger.info( "Runbook started: run_id=%s path=%s node=%s steps=%d by sender=%s", run_id, runbook_path, node_id, steps_total, sender, ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.start", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "runbook_path": runbook_path, "node_id": node_id, "run_id": run_id, "steps_total": steps_total, "http_status": http_status, }, ) return runbook_started_reply(run_id, steps_total, status) except _ctrl_runner.RunnerError as exc: reason = str(exc) logger.error( "!runbook start failed: sender=%s path=%r node=%s error=%s", sender, runbook_path, node_id, reason, ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.start", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="runner_error", data={ "sender": sender, "runbook_path": runbook_path, "node_id": node_id, "error": reason, "http_status": http_status, }, ) return runbook_start_error_reply(reason) async def _handle_runbook_next( self, http_client: httpx.AsyncClient, client: "MatrixClient", cmd: ControlCommand, sender: str, room_id: str, event_id: str, ) -> str: """ M3.2: Execute !runbook next . Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/next. Returns reply text for the control room. Audits matrix.control.runbook.next. """ run_id = cmd.args[0].strip() if cmd.args else "" if not run_id: return next_usage_reply() # M3.4: per-run rate limit for !runbook next if self._control_limiter is not None: allowed_run, retry_run = self._control_limiter.check_run_next(run_id) if not allowed_run: scope = "run" if self._on_control_rate_limited: self._on_control_rate_limited(scope) return rate_limited_reply(scope, retry_run) http_status: Optional[int] = None try: result = await _ctrl_runner.next_runbook_step( http_client=http_client, console_url=self._console_url, control_token=self._control_token, run_id=run_id, operator_id=sender, ) http_status = 200 step_type = result.get("type", "unknown") step_index = result.get("step_index", 0) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.next", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "run_id": run_id, "step_index": step_index, "step_type": step_type, "http_status": http_status, }, ) if step_type == "manual": return next_manual_reply( run_id=run_id, step_index=step_index, steps_total=result.get("steps_total"), title=result.get("title", ""), instructions_md=result.get("instructions_md", ""), ) else: # http_check / script result_dict = result.get("result") or {} duration_ms = int(result_dict.get("duration_ms", 0)) if isinstance(result_dict, dict) else 0 return next_auto_reply( run_id=run_id, step_index=step_index, action_type=step_type, step_status=result.get("step_status", "ok"), duration_ms=duration_ms or None, completed=bool(result.get("completed", False)), ) except _ctrl_runner.RunnerError as exc: reason = str(exc) logger.error( "!runbook next failed: sender=%s run_id=%r error=%s", sender, run_id, reason, ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.next", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="runner_error", data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, ) return next_error_reply(run_id, reason) async def _handle_runbook_complete( self, http_client: httpx.AsyncClient, client: "MatrixClient", cmd: ControlCommand, sender: str, room_id: str, event_id: str, ) -> str: """ M3.2: Execute !runbook complete step= status=ok|warn|fail [notes=...] Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/steps/{n}/complete. Audits matrix.control.runbook.complete. """ run_id = cmd.args[0].strip() if cmd.args else "" if not run_id: return complete_usage_reply() # step kwarg required step_raw = cmd.kwargs.get("step", "").strip() if not step_raw or not step_raw.isdigit(): return complete_usage_reply() step_index = int(step_raw) # status kwarg required status = cmd.kwargs.get("status", "").strip().lower() if status not in ("ok", "warn", "fail", "skipped"): return complete_usage_reply() # notes: kwarg or remaining positional args (joined with space) notes = cmd.kwargs.get("notes", "").strip() if not notes and len(cmd.args) > 1: notes = " ".join(cmd.args[1:]) notes = sanitize_notes(notes) # M3.4: strip control chars + truncate to MAX_NOTES_LEN http_status: Optional[int] = None try: result = await _ctrl_runner.complete_runbook_step( http_client=http_client, console_url=self._console_url, control_token=self._control_token, run_id=run_id, step_index=step_index, status=status, notes=notes, operator_id=sender, ) http_status = 200 run_completed = bool(result.get("run_completed", False)) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.complete", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "run_id": run_id, "step_index": step_index, "status": status, "run_completed": run_completed, "http_status": http_status, }, ) return complete_ok_reply(run_id, step_index, status, run_completed) except _ctrl_runner.RunnerError as exc: reason = str(exc) logger.error( "!runbook complete failed: sender=%s run_id=%r step=%d error=%s", sender, run_id, step_index, reason, ) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.complete", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="runner_error", data={ "sender": sender, "run_id": run_id, "step_index": step_index, "error": reason, "http_status": http_status, }, ) return complete_error_reply(run_id, reason) async def _handle_runbook_status( self, http_client: httpx.AsyncClient, client: "MatrixClient", cmd: ControlCommand, sender: str, room_id: str, event_id: str, ) -> str: """M3.3: !runbook status — GET run info + format status.""" run_id = cmd.args[0].strip() if cmd.args else "" if not run_id: return status_usage_reply() http_status: Optional[int] = None try: result = await _ctrl_runner.get_runbook_run( http_client=http_client, console_url=self._console_url, control_token=self._control_token, run_id=run_id, ) http_status = 200 await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.status", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "run_id": run_id, "run_status": result.get("status"), "http_status": http_status, }, ) return status_reply(result) except _ctrl_runner.RunnerError as exc: reason = str(exc) logger.error("!runbook status failed: sender=%s run_id=%r error=%s", sender, run_id, reason) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.status", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="runner_error", data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, ) return status_error_reply(run_id, reason) async def _handle_runbook_evidence( self, http_client: httpx.AsyncClient, client: "MatrixClient", cmd: ControlCommand, sender: str, room_id: str, event_id: str, ) -> str: """M3.3: !runbook evidence — generate release evidence.""" run_id = cmd.args[0].strip() if cmd.args else "" if not run_id: return evidence_usage_reply() http_status: Optional[int] = None try: result = await _ctrl_runner.generate_evidence( http_client=http_client, console_url=self._console_url, control_token=self._control_token, run_id=run_id, ) http_status = 200 await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.evidence", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "run_id": run_id, "evidence_path": result.get("evidence_path"), "bytes": result.get("bytes"), "http_status": http_status, }, ) return evidence_reply(result) except _ctrl_runner.RunnerError as exc: reason = str(exc) logger.error("!runbook evidence failed: sender=%s run_id=%r error=%s", sender, run_id, reason) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.evidence", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="runner_error", data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, ) return evidence_error_reply(run_id, reason) async def _handle_runbook_post_review( self, http_client: httpx.AsyncClient, client: "MatrixClient", cmd: ControlCommand, sender: str, room_id: str, event_id: str, ) -> str: """M3.3: !runbook post_review — generate post-release review.""" run_id = cmd.args[0].strip() if cmd.args else "" if not run_id: return post_review_usage_reply() http_status: Optional[int] = None try: result = await _ctrl_runner.generate_post_review( http_client=http_client, console_url=self._console_url, control_token=self._control_token, run_id=run_id, ) http_status = 200 await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.post_review", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "run_id": run_id, "path": result.get("path"), "bytes": result.get("bytes"), "http_status": http_status, }, ) return post_review_reply(result) except _ctrl_runner.RunnerError as exc: reason = str(exc) logger.error("!runbook post_review failed: sender=%s run_id=%r error=%s", sender, run_id, reason) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.control.runbook.post_review", agent_id="control", node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="runner_error", data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, ) return post_review_error_reply(run_id, reason) # ── Worker ───────────────────────────────────────────────────────────────── async def _worker( self, queue: "asyncio.Queue[Optional[_QueueEntry]]", client: MatrixClient, http_client: httpx.AsyncClient, ) -> None: """Consume queue entries until cancelled.""" while True: entry = await queue.get() # blocks until item available; raises CancelledError on cancel try: await self._process_entry(client, http_client, entry) except Exception as exc: logger.error("Worker unhandled error: %s", exc) finally: queue.task_done() if self._on_queue_size: self._on_queue_size(queue.qsize()) # ── Process (invoke + send + audit) ─────────────────────────────────────── async def _process_entry( self, client: MatrixClient, http_client: httpx.AsyncClient, entry: _QueueEntry, ) -> None: event = entry.event event_id = event.get("event_id", "") sender = event.get("sender", "") text = event.get("content", {}).get("body", "").strip() room_id = entry.room_id agent_id = entry.agent_id # H3: Queue wait latency wait_s = time.monotonic() - entry.enqueue_time if self._on_queue_wait: self._on_queue_wait(agent_id, wait_s) routing_reason = entry.routing_reason is_mixed = entry.is_mixed logger.info( "Processing: room=%s agent=%s event=%s len=%d wait=%.3fs mixed=%s reason=%s", room_id, agent_id, event_id, len(text), wait_s, is_mixed, routing_reason, ) if self._on_message_received: self._on_message_received(room_id, agent_id) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.message.received", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="ok", data={ "sender": sender, "text_len": len(text), "queue_wait_ms": int(wait_s * 1000), "routing_reason": routing_reason, "is_mixed": is_mixed, }, ) # Session Scope v2: canonical key + PII-safe sender hash. # Scope is always room_agent for user-initiated messages. # Control room messages never reach _process_entry (handled by _try_control). scope = SCOPE_ROOM_AGENT session_id = _build_session_key(room_id, agent_id, scope=scope) sender_hash = _sender_hash(sender) logger.debug( "Session scope v2: session_key=%s scope=%s sender_hash=%s", session_id, scope, sender_hash, ) # M5.0: Node-aware routing # Extract node=X kwarg from body (mixed rooms only, to avoid breaking direct rooms) explicit_node: Optional[str] = None effective_text = text if is_mixed and self._node_policy is not None: explicit_node, effective_text = extract_node_kwarg(text) # M6.0: look up dynamic policy store override for this room store_override: Optional[str] = None if self._policy_store is not None and self._policy_store.is_open: try: store_override = await asyncio.to_thread( self._policy_store.get_override, room_id ) except Exception as exc: # noqa: BLE001 logger.warning("PolicyStore get_override failed: %s", exc) node_res = ( self._node_policy.resolve(room_id, explicit_node, store_override=store_override) if self._node_policy is not None else NodeResolution(node_id=self._node_id, source=NODE_SOURCE_DEFAULT) ) if node_res.rejected_node: logger.info( "Node kwarg rejected: requested=%s allowed=%s room=%s agent=%s", node_res.rejected_node, self._node_policy.allowed_nodes if self._node_policy else {}, room_id, agent_id, ) if self._on_node_rejected: self._on_node_rejected(node_res.rejected_node) txn_rej = MatrixClient.make_txn_id(room_id, event_id + "_node_rej") allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset() reply_rej = node_rejected_reply(node_res.rejected_node, allowed) try: await client.send_text(room_id, reply_rej, txn_rej) except Exception as exc: logger.warning("Could not send node rejection reply: %s", exc) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.route.node_rejected", agent_id=agent_id, node_id=self._node_id, room_id=room_id, event_id=event_id, status="error", error_code="node_rejected", data={"requested_node": node_res.rejected_node, "resolved_node": node_res.node_id}, ) # Continue with fallback node (do not drop the message) if self._on_node_selected: self._on_node_selected(agent_id, node_res.node_id, node_res.source) await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.route.node_selected", agent_id=agent_id, node_id=node_res.node_id, room_id=room_id, event_id=event_id, status="ok", data={"node_id": node_res.node_id, "source": node_res.source}, ) # M2.2: per-room-agent concurrency cap (only for mixed rooms; single-agent rooms unaffected) _lock = self._get_concurrency_lock(room_id, agent_id) if is_mixed and self._mixed_concurrency_cap > 0 else None if _lock is not None: await _lock.acquire() try: await self._invoke_and_send( client, http_client, entry, session_id, wait_s, is_mixed, routing_reason, sender_hash=sender_hash, scope=scope, effective_node_id=node_res.node_id, node_source=node_res.source, effective_text=effective_text, ) finally: if _lock is not None: _lock.release() async def _invoke_and_send( self, client: MatrixClient, http_client: httpx.AsyncClient, entry: _QueueEntry, session_id: str, wait_s: float, is_mixed: bool, routing_reason: str, sender_hash: str = "", scope: str = SCOPE_ROOM_AGENT, # M5.0: resolved node effective_node_id: Optional[str] = None, node_source: str = NODE_SOURCE_DEFAULT, effective_text: Optional[str] = None, # text with node=X kwarg stripped ) -> None: """Inner: invoke Router + send reply (separated for concurrency lock wrapping).""" event = entry.event event_id = event.get("event_id", "") # Use effective_text if provided (node kwarg stripped), otherwise original body text = effective_text if effective_text is not None else event.get("content", {}).get("body", "").strip() room_id = entry.room_id agent_id = entry.agent_id node_id = effective_node_id if effective_node_id is not None else self._node_id # H3 + M8.0 + M8.1: Invoke with latency tracking, soft-failover, and sticky routing t0 = time.monotonic() reply_text: Optional[str] = None invoke_ok = False invoke_duration_s = 0.0 used_node_id = node_id # may change on failover # M8.1: check sticky cache (skip primary if sticky is set for this room:agent) sticky_key = make_sticky_key(room_id, agent_id) sticky_node: Optional[str] = None if node_source != NODE_SOURCE_EXPLICIT and self._sticky_cache is not None: sticky_node = self._sticky_cache.get(sticky_key) async def _do_invoke(target_node: str, target_source: str) -> Optional[str]: """Single invoke attempt; returns reply text or None on failure.""" nonlocal invoke_duration_s _t = time.monotonic() try: result = await _invoke_router( http_client, self._router_url, agent_id=agent_id, node_id=target_node, prompt=text, session_id=session_id, sender_hash=sender_hash, scope=scope, node_source=target_source, ) invoke_duration_s = time.monotonic() - _t if self._node_health_tracker is not None: self._node_health_tracker.record_ok(target_node, invoke_duration_s) if self._on_invoke_latency: self._on_invoke_latency(agent_id, invoke_duration_s, target_node) logger.info( "Invoke ok: agent=%s node=%s event=%s reply_len=%d duration=%dms", agent_id, target_node, event_id, len(result or ""), int(invoke_duration_s * 1000), ) return result except httpx.HTTPStatusError as exc: invoke_duration_s = time.monotonic() - _t _reason = FAILOVER_REASON_HTTP_5XX if exc.response.status_code >= 500 else "http_4xx" logger.error( "Router HTTP %d agent=%s node=%s event=%s duration=%dms", exc.response.status_code, agent_id, target_node, event_id, int(invoke_duration_s * 1000), ) if self._node_health_tracker and exc.response.status_code >= 500: self._node_health_tracker.record_error(target_node, _reason) if self._on_gateway_error: self._on_gateway_error(f"http_{exc.response.status_code}") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.error", agent_id=agent_id, node_id=target_node, room_id=room_id, event_id=event_id, status="error", error_code=f"router_http_{exc.response.status_code}", duration_ms=int(invoke_duration_s * 1000), ) if exc.response.status_code >= 500: raise # eligible for failover return None # 4xx: not a node issue, don't failover except (httpx.ConnectError, httpx.TimeoutException) as exc: invoke_duration_s = time.monotonic() - _t _reason = ( FAILOVER_REASON_TIMEOUT if isinstance(exc, httpx.TimeoutException) else FAILOVER_REASON_NETWORK ) logger.error( "Router network error agent=%s node=%s event=%s: %s", agent_id, target_node, event_id, exc, ) if self._node_health_tracker: self._node_health_tracker.record_error(target_node, _reason) if self._on_gateway_error: self._on_gateway_error("network_error") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.error", agent_id=agent_id, node_id=target_node, room_id=room_id, event_id=event_id, status="error", error_code="router_network_error", duration_ms=int(invoke_duration_s * 1000), ) raise # eligible for failover except Exception as exc: invoke_duration_s = time.monotonic() - _t logger.error( "Unexpected invoke error agent=%s node=%s event=%s: %s", agent_id, target_node, event_id, exc, ) if self._node_health_tracker: self._node_health_tracker.record_error(target_node, "unexpected") if self._on_gateway_error: self._on_gateway_error("unexpected") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.error", agent_id=agent_id, node_id=target_node, room_id=room_id, event_id=event_id, status="error", error_code="router_unexpected", duration_ms=int(invoke_duration_s * 1000), ) return None # unexpected errors: no failover (could be code bug) if sticky_node is not None: # M8.1: sticky path — route directly to known-good fallback, skip primary logger.info( "Sticky: routing %s→%s (skipping primary=%s) agent=%s event=%s", sticky_key, sticky_node, node_id, agent_id, event_id, ) try: reply_text = await _do_invoke(sticky_node, NODE_SOURCE_DEFAULT) invoke_ok = reply_text is not None used_node_id = sticky_node except Exception: # noqa: BLE001 # Sticky node also failed — clear sticky and leave reply_text=None self._sticky_cache.delete(sticky_key) # type: ignore[union-attr] logger.warning( "Sticky node %s failed for %s — cleared (agent=%s event=%s)", sticky_node, sticky_key, agent_id, event_id, ) # M8.2: remove from DB as well await self._sticky_persist_delete(sticky_key) else: # Normal path: try primary; attempt failover on eligible errors try: reply_text = await _do_invoke(node_id, node_source) invoke_ok = reply_text is not None used_node_id = node_id except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError): # Primary failed with a failover-eligible error. # Failover only for non-explicit routing (explicit = user chose node). if node_source == NODE_SOURCE_EXPLICIT: logger.info( "Node %s failed for explicit routing — no failover (agent=%s event=%s)", node_id, agent_id, event_id, ) # reply_text stays None; error already audited else: # Attempt failover fallback_node: Optional[str] = None if self._node_health_tracker is not None and self._node_policy is not None: fallback_node = self._node_health_tracker.pick_fallback( node_id, self._node_policy.allowed_nodes ) elif self._node_policy is not None: # No tracker — pick any other allowed node deterministically others = sorted( n for n in self._node_policy.allowed_nodes if n != node_id ) fallback_node = others[0] if others else None if fallback_node: logger.warning( "Failover: %s → %s agent=%s event=%s", node_id, fallback_node, agent_id, event_id, ) try: reply_text = await _do_invoke(fallback_node, NODE_SOURCE_DEFAULT) invoke_ok = reply_text is not None used_node_id = fallback_node if invoke_ok: # Fire failover callback and audit if self._on_failover: self._on_failover(node_id, fallback_node, "invoke_error") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.node.failover", agent_id=agent_id, node_id=fallback_node, room_id=room_id, event_id=event_id, status="ok", data={ "from_node": node_id, "to_node": fallback_node, "original_source": node_source, }, ) # M8.1: set sticky — future messages skip primary if self._sticky_cache is not None: self._sticky_cache.set(sticky_key, fallback_node) scope = "mixed" if is_mixed else "direct" if self._on_sticky_set: self._on_sticky_set(fallback_node, scope) logger.info( "Sticky set: %s → %s scope=%s ttl=%.0fs", sticky_key, fallback_node, scope, self._sticky_cache.ttl_s, ) # M8.2: persist sticky to DB await self._sticky_persist_set(sticky_key, fallback_node) except Exception: # noqa: BLE001 pass # errors already audited inside _do_invoke if not invoke_ok or not reply_text: if invoke_ok: logger.warning("Empty reply from router agent=%s event=%s", agent_id, event_id) return # H3: Send with latency # M2.1: prefix reply with agent identity in mixed rooms ("Sofiia: ...") prefix = reply_prefix(agent_id, is_mixed) raw_reply = reply_text[:_REPLY_TEXT_MAX - len(prefix)] send_text = prefix + raw_reply txn_id = MatrixClient.make_txn_id(room_id, event_id) send_t0 = time.monotonic() try: await client.send_text(room_id, send_text, txn_id) send_duration_s = time.monotonic() - send_t0 if self._on_send_latency: self._on_send_latency(agent_id, send_duration_s) if self._on_message_replied: self._on_message_replied(room_id, agent_id, "ok") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.agent.replied", agent_id=agent_id, node_id=used_node_id, room_id=room_id, event_id=event_id, status="ok", duration_ms=int(send_duration_s * 1000), data={ "reply_len": len(send_text), "truncated": len(reply_text) > _REPLY_TEXT_MAX, "router_duration_ms": int(invoke_duration_s * 1000), "queue_wait_ms": int(wait_s * 1000), "routing_reason": routing_reason, "is_mixed": is_mixed, "node_source": node_source, "failover": used_node_id != node_id, # M8.0: failover flag }, ) logger.info( "Reply sent: agent=%s event=%s reply_len=%d send_ms=%d", agent_id, event_id, len(send_text), int(send_duration_s * 1000), ) except Exception as exc: send_duration_s = time.monotonic() - send_t0 logger.error("Send failed agent=%s event=%s: %s", agent_id, event_id, exc) if self._on_message_replied: self._on_message_replied(room_id, agent_id, "error") if self._on_gateway_error: self._on_gateway_error("matrix_send_error") await _write_audit( http_client, self._console_url, self._internal_token, event="matrix.error", agent_id=agent_id, node_id=node_id, room_id=room_id, event_id=event_id, status="error", error_code="matrix_send_failed", duration_ms=int(send_duration_s * 1000), )