Files
microdao-daarion/services/matrix-bridge-dagi/app/ingress.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

3576 lines
161 KiB
Python

"""
Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 + M3.1 + M3.3 + SessionScopeV2
Architecture (H2):
Reader task → asyncio.Queue(maxsize) → N Worker tasks
─────────────────────────────────────────────────────────
Reader:
sync_poll() → extract_room_messages()
→ rate_limit check (H1)
→ mark_seen / dedupe
→ queue.put_nowait() or DROP (audit matrix.queue_full + metric)
Workers (N concurrent):
queue.get() → measure wait latency (H3)
→ audit matrix.message.received
→ invoke Router (timed, H3)
→ send_text() (timed, H3)
→ audit matrix.agent.replied | matrix.error
Shutdown:
1. stop_event set → reader exits loop
2. queue.join() with drain_timeout → workers finish in-flight
3. worker tasks cancelled
Queue entry: _QueueEntry(event, room_id, agent_id, enqueue_time, routing_reason, is_mixed)
"""
import asyncio
import hashlib
import json as _json
import logging
import os as _os
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
import httpx
from .control import (
ControlConfig, ControlCommand,
check_authorization, parse_command, is_control_message,
not_implemented_reply, unknown_command_reply, unauthorized_reply, help_reply,
start_usage_reply, runbook_started_reply, runbook_start_error_reply,
next_usage_reply, next_manual_reply, next_auto_reply, next_error_reply,
complete_usage_reply, complete_ok_reply, complete_error_reply,
status_usage_reply, status_reply, status_error_reply,
evidence_usage_reply, evidence_reply, evidence_error_reply,
post_review_usage_reply, post_review_reply, post_review_error_reply,
rate_limited_reply, sanitize_notes, MAX_NOTES_LEN,
status_not_available_reply, nodes_reply,
VERB_HELP, VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE, VERB_CONFIRM,
is_dangerous_cmd, build_normalized_args,
confirm_intent_reply, confirm_success_reply, confirm_expired_reply,
NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST,
parse_node_cmd, node_cmd_validate_room,
node_cmd_reply_set, node_cmd_reply_unset_ok, node_cmd_reply_unset_not_found,
node_cmd_reply_get, node_cmd_reply_list, node_cmd_reply_error,
VERB_ROOM, ROOM_SUBCMD_AGENTS,
ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET,
parse_room_agents_cmd,
room_agents_reply_set, room_agents_reply_add, room_agents_reply_remove,
room_agents_reply_unset_ok, room_agents_reply_unset_not_found,
room_agents_reply_get, room_agents_reply_list, room_agents_reply_error,
VERB_POLICY, POLICY_EXPORTS_SUBDIR, validate_export_path,
policy_import_intent_reply, format_import_diff as _format_import_diff,
policy_export_reply, policy_import_dry_run_reply, policy_import_reply,
policy_cmd_error,
policy_prune_preview_reply, policy_prune_applied_reply,
policy_restore_intent_reply, policy_restore_applied_reply,
policy_history_reply, policy_change_detail_reply,
SUBCOMMAND_START, SUBCOMMAND_NEXT, SUBCOMMAND_COMPLETE,
SUBCOMMAND_STATUS, SUBCOMMAND_EVIDENCE, SUBCOMMAND_POST_REVIEW,
)
from .control_limiter import ControlRateLimiter
from .discovery import agents_reply, bridge_status_reply, is_discovery_message
from .event_store import EventStore
from .node_policy import (
NodePolicy, NodeResolution,
NODE_SOURCE_DEFAULT, NODE_SOURCE_EXPLICIT, NODE_SOURCE_ROOM_MAP,
extract_node_kwarg, node_rejected_reply,
)
from .node_health import (
NodeHealthTracker, NodeHealthConfig,
NODE_STATE_HEALTHY, NODE_STATE_DEGRADED, NODE_STATE_DOWN,
FAILOVER_REASON_TIMEOUT, FAILOVER_REASON_HTTP_5XX, FAILOVER_REASON_NETWORK,
)
from .sticky_cache import StickyNodeCache, make_sticky_key
from .confirm_store import ConfirmStore
from . import control_runner as _ctrl_runner
from .matrix_client import MatrixClient
from .mixed_routing import (
MixedRoomConfig, MixedRoom, route_message, reply_prefix, build_override_config,
REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING,
)
from .rate_limit import InMemoryRateLimiter
from .room_mapping import RoomMappingConfig, RoomMapping
logger = logging.getLogger(__name__)
# ── Constants ──────────────────────────────────────────────────────────────────
_MAX_RETRY_BACKOFF = 60.0
_INIT_RETRY_BACKOFF = 2.0
_ROUTER_TIMEOUT_S = 45.0
_AUDIT_TIMEOUT_S = 5.0
_REPLY_TEXT_MAX = 4000
_WORKER_GET_TIMEOUT_S = 1.0 # how long a worker waits on empty queue before re-checking
# ── Queue entry ────────────────────────────────────────────────────────────────
@dataclass
class _QueueEntry:
event: Dict[str, Any]
room_id: str
agent_id: str
enqueue_time: float # time.monotonic() at enqueue
routing_reason: str = "direct"
is_mixed: bool = False # True for mixed-room entries (reply tagging, session isolation)
# ── Session Scope v2 ───────────────────────────────────────────────────────────
# Invariants:
# 1. Control room messages never reach the Router (no session_key needed there).
# 2. Matrix and Telegram never share a key namespace (prefix "matrix:").
# 3. Mixed rooms: each (room_id, agent_id) pair has its own key — no cross-agent leakage.
# 4. Logs/metrics receive sender_hash (sha256[:16]), never raw Matrix user_id.
SCOPE_ROOM_AGENT = "room_agent" # default: shared room context per agent
SCOPE_OPS_RUNBOOK = "ops_runbook" # future: control/ops room invocations
SCOPE_DM_USER = "dm_agent_user" # future: per-user DM isolation
def _sender_hash(sender: str) -> str:
"""PII-safe 16-hex hash of a Matrix user_id (e.g. @alice:server → 'a3f9...')."""
return hashlib.sha256(sender.encode()).hexdigest()[:16]
def _build_session_key(room_id: str, agent_id: str, scope: str = SCOPE_ROOM_AGENT) -> str:
"""
Canonical session key v2: matrix:{scope}:{room_key}:{agent_id}
Examples:
matrix:room_agent:roomXserver_yourdomain:sofiia
matrix:ops_runbook:opsroomXserver:sofiia
"""
room_key = room_id.replace("!", "").replace(":", "_")
return f"matrix:{scope}:{room_key}:{agent_id}"
# ── Router invoke ──────────────────────────────────────────────────────────────
async def _invoke_router(
http_client: httpx.AsyncClient,
router_url: str,
agent_id: str,
node_id: str,
prompt: str,
session_id: str,
sender_hash: str = "",
scope: str = SCOPE_ROOM_AGENT,
node_source: str = NODE_SOURCE_DEFAULT,
) -> str:
"""POST /v1/agents/{agent_id}/infer → response text. Raises httpx.HTTPError on failure."""
url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer"
payload = {
"prompt": prompt,
"session_id": session_id,
"user_id": "matrix_bridge",
"metadata": {
"transport": "matrix",
"node_id": node_id,
"node_source": node_source, # M5.0: how node was resolved
"session_key": session_id, # explicit for Router/Memory to index on
"sender_hash": sender_hash, # PII-safe sender fingerprint
"scope": scope,
},
}
resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S)
resp.raise_for_status()
data = resp.json()
text = (
data.get("response")
or data.get("text")
or data.get("content")
or data.get("message")
or ""
)
return (text if isinstance(text, str) else str(text)).strip()
# ── M6.2: File helpers (run in thread) ────────────────────────────────────────
def _write_json_file(path: str, data: Any) -> None:
"""Synchronously write data as JSON to path (UTF-8, 2-space indent)."""
with open(path, "w", encoding="utf-8") as fh:
_json.dump(data, fh, ensure_ascii=False, indent=2)
def _read_json_file(path: str) -> Any:
"""Synchronously read and parse a JSON file."""
with open(path, encoding="utf-8") as fh:
return _json.load(fh)
# ── Audit write ────────────────────────────────────────────────────────────────
async def _write_audit(
http_client: httpx.AsyncClient,
console_url: str,
internal_token: str,
event: str,
agent_id: str,
node_id: str,
room_id: str,
event_id: str,
status: str = "ok",
error_code: Optional[str] = None,
duration_ms: Optional[int] = None,
data: Optional[Dict[str, Any]] = None,
) -> None:
"""Fire-and-forget. Never raises."""
if not console_url or not internal_token:
return
try:
await http_client.post(
f"{console_url.rstrip('/')}/api/audit/internal",
json={
"event": event,
"operator_id": "matrix_bridge",
"node_id": node_id,
"agent_id": agent_id,
"chat_id": room_id,
"status": status,
"error_code": error_code,
"duration_ms": duration_ms,
"data": {"matrix_event_id": event_id, "matrix_room_id": room_id, **(data or {})},
},
headers={"X-Internal-Service-Token": internal_token},
timeout=_AUDIT_TIMEOUT_S,
)
except Exception as exc:
logger.warning("Audit write failed (non-blocking): %s", exc)
# ── Ingress loop (reader + workers) ───────────────────────────────────────────
class MatrixIngressLoop:
"""
Drives the full Matrix → Router → Matrix pipeline with backpressure.
Reader task: sync → extract → rate_check → dedupe → queue.put_nowait
Worker tasks: queue.get → invoke → send → audit
Metric callbacks (all optional, called synchronously):
on_message_received(room_id, agent_id)
on_message_replied(room_id, agent_id, status)
on_gateway_error(error_type)
on_rate_limited(room_id, agent_id, limit_type)
on_queue_dropped(room_id, agent_id)
on_queue_size(current_size: int)
on_invoke_latency(agent_id, duration_seconds)
on_send_latency(agent_id, duration_seconds)
on_queue_wait(agent_id, wait_seconds)
on_routed(agent_id, reason) M2.2: successful routing
on_route_rejected(room_id, reason) M2.2: routing rejection
"""
def __init__(
self,
matrix_homeserver_url: str,
matrix_access_token: str,
matrix_user_id: str,
router_url: str,
node_id: str,
room_map: RoomMappingConfig,
sofiia_console_url: str = "",
sofiia_internal_token: str = "",
rate_limiter: Optional[InMemoryRateLimiter] = None,
queue_max_events: int = 100,
worker_concurrency: int = 2,
queue_drain_timeout_s: float = 5.0,
mixed_room_config: Optional[MixedRoomConfig] = None,
# M2.2: guard rails
unknown_agent_behavior: str = "ignore", # "ignore" | "reply_error"
max_slash_len: int = 32,
mixed_concurrency_cap: int = 1, # 0 = unlimited
# M3.0: control channel
control_config: Optional[ControlConfig] = None,
control_unauthorized_behavior: str = "ignore", # "ignore" | "reply_error"
# M3.1: runbook runner integration
sofiia_control_token: str = "",
# M3.4: control channel safety
control_limiter: Optional["ControlRateLimiter"] = None,
# M2.3: persistent event deduplication
event_store: Optional["EventStore"] = None,
# M4.0: agent discovery
discovery_rpm: int = 20,
# M5.0: node-aware routing
node_policy: Optional["NodePolicy"] = None,
# Callbacks
on_message_received: Optional[Callable[[str, str], None]] = None,
on_message_replied: Optional[Callable[[str, str, str], None]] = None,
on_gateway_error: Optional[Callable[[str], None]] = None,
on_rate_limited: Optional[Callable[[str, str, str], None]] = None,
on_queue_dropped: Optional[Callable[[str, str], None]] = None,
on_queue_size: Optional[Callable[[int], None]] = None,
on_invoke_latency: Optional[Callable[..., None]] = None, # (agent_id, duration_s, node_id="")
on_send_latency: Optional[Callable[[str, float], None]] = None,
on_queue_wait: Optional[Callable[[str, float], None]] = None,
on_routed: Optional[Callable[[str, str], None]] = None,
on_route_rejected: Optional[Callable[[str, str], None]] = None,
on_control_command: Optional[Callable[[str, str, str], None]] = None,
on_control_rate_limited: Optional[Callable[[str], None]] = None,
on_dedupe_persistent_hit: Optional[Callable[[str, str], None]] = None,
on_dedupe_persistent_insert: Optional[Callable[[], None]] = None,
# M5.0: node routing callbacks
on_node_selected: Optional[Callable[[str, str, str], None]] = None, # (agent_id, node_id, source)
on_node_rejected: Optional[Callable[[str], None]] = None, # (rejected_node)
# M5.1: per-node stats for !status reply
node_stats_getter: Optional[Callable[[], Dict[str, Any]]] = None,
# M6.0: persistent policy store for dynamic room-node overrides
policy_store: Optional[Any] = None, # app.policy_store.PolicyStore
# M6.2: data directory for policy exports/imports
bridge_data_dir: Optional[str] = None,
# M8.0: node health tracker for soft-failover
node_health_tracker: Optional[NodeHealthTracker] = None,
on_failover: Optional[Callable[[str, str, str], None]] = None, # (from_node, to_node, reason)
# M8.1: sticky failover cache (anti-flap)
sticky_cache: Optional[StickyNodeCache] = None,
on_sticky_set: Optional[Callable[[str, str], None]] = None, # (node_id, scope)
# M8.2: HA state persistence config
ha_health_snapshot_interval_s: int = 60,
ha_health_max_age_s: int = 600,
# M9.0: Two-step confirmation store for dangerous commands
confirm_store: Optional[ConfirmStore] = None,
# M10.0: Auto-backup retention policy (days; 0 = keep forever)
policy_export_retention_days: int = 30,
# M10.2: max rows to keep in policy_changes history (0 = unlimited)
policy_history_limit: int = 100,
) -> None:
self._hs_url = matrix_homeserver_url
self._token = matrix_access_token
self._user_id = matrix_user_id
self._router_url = router_url
self._node_id = node_id
self._room_map = room_map
self._console_url = sofiia_console_url
self._internal_token = sofiia_internal_token
self._rate_limiter = rate_limiter
self._queue_max = queue_max_events
self._worker_count = worker_concurrency
self._drain_timeout_s = queue_drain_timeout_s
# Callbacks
self._on_message_received = on_message_received
self._on_message_replied = on_message_replied
self._on_gateway_error = on_gateway_error
self._on_rate_limited = on_rate_limited
self._on_queue_dropped = on_queue_dropped
self._on_queue_size = on_queue_size
self._on_invoke_latency = on_invoke_latency
self._on_send_latency = on_send_latency
self._on_queue_wait = on_queue_wait
self._mixed_room_config = mixed_room_config
self._control_config = control_config
self._control_unauthorized_behavior = control_unauthorized_behavior
self._control_token = sofiia_control_token
self._unknown_agent_behavior = unknown_agent_behavior
self._max_slash_len = max_slash_len
self._mixed_concurrency_cap = mixed_concurrency_cap
self._on_routed = on_routed
self._on_route_rejected = on_route_rejected
self._on_control_command = on_control_command
self._on_control_rate_limited = on_control_rate_limited
# M3.4: control channel safety
self._control_limiter = control_limiter
# M2.3: persistent event deduplication
self._event_store: Optional[EventStore] = event_store
self._on_dedupe_persistent_hit = on_dedupe_persistent_hit
self._on_dedupe_persistent_insert = on_dedupe_persistent_insert
# M5.0: node routing callbacks
self._on_node_selected = on_node_selected
self._on_node_rejected = on_node_rejected
# M5.1: per-node stats getter for !status reply
self._node_stats_getter = node_stats_getter
# M6.0: persistent policy store
self._policy_store = policy_store
# M6.2: policy exports directory
self._bridge_data_dir: Optional[str] = bridge_data_dir
self._policy_last_export_at: Optional[int] = None
self._policy_last_import_at: Optional[int] = None
# M10.0: auto-backup retention
self._policy_export_retention_days: int = policy_export_retention_days
# M10.2: history table row limit
self._policy_history_limit: int = policy_history_limit
# M8.0: node health + soft-failover
self._node_health_tracker: Optional[NodeHealthTracker] = node_health_tracker
self._on_failover = on_failover
# M8.1: sticky failover cache
self._sticky_cache: Optional[StickyNodeCache] = sticky_cache
self._on_sticky_set = on_sticky_set
# M9.0: two-step confirmation store
self._confirm_store: Optional[ConfirmStore] = confirm_store
# M8.2: HA state persistence
self._ha_health_snapshot_interval_s: int = ha_health_snapshot_interval_s
self._ha_health_max_age_s: int = ha_health_max_age_s
self._ha_sticky_loaded: int = 0 # count of sticky entries loaded on startup
self._ha_health_loaded: bool = False # whether health state was loaded on startup
# M4.0: agent discovery — simple per-room sliding window (reuses InMemoryRateLimiter logic)
self._discovery_rpm = discovery_rpm
# M5.0: node-aware routing policy
self._node_policy: Optional[NodePolicy] = node_policy
from collections import defaultdict, deque
self._discovery_windows: dict = defaultdict(deque)
# Lazily populated semaphores keyed by "{room_id}:{agent_id}"
self._concurrency_locks: Dict[str, asyncio.Semaphore] = {}
self._next_batch: Optional[str] = None
self._queue: Optional[asyncio.Queue] = None # exposed for /health
@property
def next_batch(self) -> Optional[str]:
return self._next_batch
@property
def queue_size(self) -> int:
return self._queue.qsize() if self._queue else 0
@property
def worker_count(self) -> int:
return self._worker_count
def get_status(self) -> Dict[str, Any]:
"""Return a simple bridge status dict for health/ops queries."""
status: Dict[str, Any] = {
"queue_size": self._queue.qsize() if self._queue else 0,
"queue_max": self._queue_max,
"worker_count": self._worker_count,
}
if self._node_policy is not None:
status["node_policy"] = self._node_policy.as_info_dict()
# M5.1: per-node routed/rejected counters
if self._node_stats_getter is not None:
status["nodes"] = self._node_stats_getter()
# M6.0: policy store info
if self._policy_store is not None:
try:
status["policy_store_ok"] = self._policy_store.is_open
status["policy_store_path"] = self._policy_store.db_path
status["policy_overrides_count"] = self._policy_store.count_overrides()
status["policy_agent_overrides_count"] = self._policy_store.count_agent_overrides() # M6.1
except Exception as exc: # noqa: BLE001
status["policy_store_ok"] = False
status["policy_store_error"] = str(exc)
# M6.2: policy snapshot timestamps
if self._policy_last_export_at is not None:
status["policy_last_export_at"] = self._policy_last_export_at
if self._policy_last_import_at is not None:
status["policy_last_import_at"] = self._policy_last_import_at
# M6.2: policy DB mtime (best-effort)
if self._policy_store is not None:
try:
db_path = self._policy_store.db_path
if db_path and _os.path.exists(db_path):
status["policy_db_mtime"] = int(_os.path.getmtime(db_path))
except Exception: # noqa: BLE001
pass
# M10.2: policy change history count
if self._policy_store is not None and self._policy_store.is_open:
try:
status["policy_changes_count"] = self._policy_store.get_policy_changes_count()
except Exception: # noqa: BLE001
pass
# M8.0: node health tracker state
if self._node_health_tracker is not None:
allowed = (
self._node_policy.allowed_nodes
if self._node_policy is not None
else None
)
status["node_health"] = self._node_health_tracker.all_info(allowed)
# M8.1: sticky failover cache info
if self._sticky_cache is not None:
status["sticky_active_keys"] = self._sticky_cache.active_count()
status["sticky_ttl_s"] = self._sticky_cache.ttl_s
# M9.0: pending confirmations
if self._confirm_store is not None:
status["confirm_pending"] = self._confirm_store.pending_count()
status["confirm_ttl_s"] = self._confirm_store.ttl_s
# M8.2: HA persistence info
status["ha_sticky_loaded"] = self._ha_sticky_loaded
status["ha_health_loaded"] = self._ha_health_loaded
status["ha_health_snapshot_interval_s"] = self._ha_health_snapshot_interval_s
return status
@property
def active_lock_count(self) -> int:
"""Number of room-agent pairs currently holding a concurrency lock."""
return sum(1 for lock in self._concurrency_locks.values() if lock.locked())
def _get_concurrency_lock(self, room_id: str, agent_id: str) -> asyncio.Semaphore:
"""Lazily create and return the semaphore for a (room, agent) pair."""
key = f"{room_id}:{agent_id}"
if key not in self._concurrency_locks:
cap = self._mixed_concurrency_cap if self._mixed_concurrency_cap > 0 else 2 ** 31
self._concurrency_locks[key] = asyncio.Semaphore(cap)
return self._concurrency_locks[key]
# ── Public run ─────────────────────────────────────────────────────────────
async def run(self, stop_event: asyncio.Event) -> None:
mixed_rooms_count = self._mixed_room_config.total_rooms if self._mixed_room_config else 0
logger.info(
"Matrix ingress loop started | hs=%s node=%s mappings=%d mixed_rooms=%d "
"queue_max=%d workers=%d",
self._hs_url, self._node_id, self._room_map.total_mappings, mixed_rooms_count,
self._queue_max, self._worker_count,
)
if self._room_map.total_mappings == 0 and mixed_rooms_count == 0:
logger.warning("No room mappings — ingress loop is idle")
queue: asyncio.Queue[Optional[_QueueEntry]] = asyncio.Queue(
maxsize=self._queue_max
)
self._queue = queue
async with MatrixClient(self._hs_url, self._token, self._user_id) as client:
for mapping in self._room_map.mappings:
if mapping.agent_id in self._room_map.allowed_agents:
try:
await client.join_room(mapping.room_id)
except Exception as exc:
logger.warning("Could not join room %s: %s", mapping.room_id, exc)
if self._mixed_room_config:
for room_id in self._mixed_room_config.rooms:
try:
await client.join_room(room_id)
except Exception as exc:
logger.warning("Could not join mixed room %s: %s", room_id, exc)
if self._control_config and self._control_config.is_enabled:
for room_id in self._control_config.control_rooms:
try:
await client.join_room(room_id)
except Exception as exc:
logger.warning("Could not join control room %s: %s", room_id, exc)
logger.info(
"Control channel: %d rooms, %d operators",
len(self._control_config.control_rooms),
len(self._control_config.operator_allowlist),
)
async with httpx.AsyncClient() as http_client:
# M8.2: Load persisted HA state before processing any messages
await self._load_ha_state()
# Start workers
worker_tasks = [
asyncio.create_task(
self._worker(queue, client, http_client),
name=f"matrix_worker_{i}",
)
for i in range(self._worker_count)
]
# M8.2: Start periodic node health snapshot task
_health_snapshot_task = None
if (
self._ha_health_snapshot_interval_s > 0
and self._policy_store is not None
and self._node_health_tracker is not None
):
_health_snapshot_task = asyncio.create_task(
self._node_health_snapshot_loop(),
name="ha_health_snapshot",
)
# Run reader until stop_event
await self._reader(client, queue, http_client, stop_event)
# Drain: wait for all enqueued items to be processed
logger.info(
"Reader stopped. Draining queue (%d items, timeout=%.1fs)...",
queue.qsize(), self._drain_timeout_s,
)
try:
await asyncio.wait_for(queue.join(), timeout=self._drain_timeout_s)
logger.info("Queue drained successfully")
except asyncio.TimeoutError:
remaining = queue.qsize()
logger.warning(
"Drain timeout (%.1fs): %d items not processed",
self._drain_timeout_s, remaining,
)
# Cancel workers
for task in worker_tasks:
task.cancel()
# M8.2: Cancel health snapshot task if running
if _health_snapshot_task is not None and not _health_snapshot_task.done():
_health_snapshot_task.cancel()
results = await asyncio.gather(*worker_tasks, return_exceptions=True)
cancelled = sum(1 for r in results if isinstance(r, asyncio.CancelledError))
logger.info("Workers stopped (%d cancelled)", cancelled)
self._queue = None
logger.info("Matrix ingress loop stopped")
# ── Reader ─────────────────────────────────────────────────────────────────
async def _reader(
self,
client: MatrixClient,
queue: "asyncio.Queue[Optional[_QueueEntry]]",
http_client: httpx.AsyncClient,
stop_event: asyncio.Event,
) -> None:
backoff = _INIT_RETRY_BACKOFF
while not stop_event.is_set():
try:
sync_resp = await client.sync_poll(since=self._next_batch)
self._next_batch = sync_resp.get("next_batch")
backoff = _INIT_RETRY_BACKOFF
await self._enqueue_from_sync(client, queue, http_client, sync_resp)
except asyncio.CancelledError:
break
except Exception as exc:
logger.error("Reader error (retry in %.1fs): %s", backoff, exc)
if self._on_gateway_error:
self._on_gateway_error("sync_error")
try:
await asyncio.wait_for(stop_event.wait(), timeout=backoff)
except asyncio.TimeoutError:
pass
backoff = min(backoff * 2, _MAX_RETRY_BACKOFF)
async def _enqueue_from_sync(
self,
client: MatrixClient,
queue: "asyncio.Queue[Optional[_QueueEntry]]",
http_client: httpx.AsyncClient,
sync_resp: Dict[str, Any],
) -> None:
# M3.0: Control rooms — handled first, not forwarded to agents
if self._control_config and self._control_config.is_enabled:
for room_id in self._control_config.control_rooms:
messages = client.extract_room_messages(sync_resp, room_id)
for event in messages:
await self._try_control(client, http_client, event, room_id)
# Regular rooms: 1 room → 1 agent (M1 / M2.0)
for mapping in self._room_map.mappings:
if mapping.agent_id not in self._room_map.allowed_agents:
continue
messages = client.extract_room_messages(sync_resp, mapping.room_id)
for event in messages:
text = event.get("content", {}).get("body", "").strip()
# M4.0: agent discovery before routing
if is_discovery_message(text):
await self._handle_discovery(client, http_client, event, mapping.room_id)
continue
await self._try_enqueue(client, queue, http_client, event, mapping)
# Mixed rooms: 1 room → N agents, routing per message (M2.1)
if self._mixed_room_config:
for room_id in self._mixed_room_config.rooms:
messages = client.extract_room_messages(sync_resp, room_id)
for event in messages:
text = event.get("content", {}).get("body", "").strip()
# M4.0: agent discovery before routing
if is_discovery_message(text):
await self._handle_discovery(client, http_client, event, room_id)
continue
await self._try_enqueue_mixed(client, queue, http_client, event, room_id)
async def _try_enqueue(
self,
client: MatrixClient,
queue: "asyncio.Queue[Optional[_QueueEntry]]",
http_client: httpx.AsyncClient,
event: Dict[str, Any],
mapping: RoomMapping,
) -> None:
event_id = event.get("event_id", "")
sender = event.get("sender", "")
text = event.get("content", {}).get("body", "").strip()
room_id = mapping.room_id
agent_id = mapping.agent_id
if not text:
return
# H1: Rate limit (before mark_seen — don't charge quota on drop)
if self._rate_limiter is not None:
allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
if not allowed:
logger.warning(
"Rate limited: room=%s sender=%s limit_type=%s event=%s",
room_id, sender, limit_type, event_id,
)
if self._on_rate_limited:
self._on_rate_limited(room_id, agent_id, limit_type or "unknown")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.rate_limited",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code=f"rate_limit_{limit_type}",
data={"sender": sender, "limit_type": limit_type},
)
return
# Dedupe — mark before enqueue (prevents double-enqueue on retry)
client.mark_seen(event_id)
# M2.3: Persistent dedupe (cross-restart protection)
if self._event_store is not None:
try:
already = await self._event_store.is_processed(room_id, event_id)
except Exception as exc:
logger.warning("EventStore.is_processed error (degraded): %s", exc)
already = False
if already:
logger.debug("Persistent dedupe hit: event=%s room=%s", event_id, room_id)
if self._on_dedupe_persistent_hit:
self._on_dedupe_persistent_hit(room_id, agent_id)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.dedupe.persistent_hit",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={"sender": sender},
)
return
# H2: Enqueue or drop
entry = _QueueEntry(
event=event,
room_id=room_id,
agent_id=agent_id,
enqueue_time=time.monotonic(),
)
enqueued = False
try:
queue.put_nowait(entry)
enqueued = True
qsize = queue.qsize()
logger.debug("Enqueued event=%s qsize=%d", event_id, qsize)
if self._on_queue_size:
self._on_queue_size(qsize)
except asyncio.QueueFull:
logger.warning(
"Queue full (max=%d): dropping event=%s room=%s agent=%s",
self._queue_max, event_id, room_id, agent_id,
)
if self._on_queue_dropped:
self._on_queue_dropped(room_id, agent_id)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.queue_full",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="queue_full",
data={"queue_max": self._queue_max, "sender": sender},
)
# M2.3: Mark as processed ONLY after successful enqueue
if enqueued and self._event_store is not None:
sender_hash = _sender_hash(sender)
try:
await self._event_store.mark_processed(room_id, event_id, sender_hash)
if self._on_dedupe_persistent_insert:
self._on_dedupe_persistent_insert()
except Exception as exc:
logger.warning("EventStore.mark_processed error (degraded): %s", exc)
async def _try_enqueue_mixed(
self,
client: MatrixClient,
queue: "asyncio.Queue[Optional[_QueueEntry]]",
http_client: httpx.AsyncClient,
event: Dict[str, Any],
room_id: str,
) -> None:
"""Enqueue a message from a mixed room, routing to the appropriate agent."""
assert self._mixed_room_config is not None
event_id = event.get("event_id", "")
sender = event.get("sender", "")
text = event.get("content", {}).get("body", "").strip()
if not text:
return
# M6.1: look up dynamic agent override for this room
_routing_config = self._mixed_room_config
if self._policy_store is not None and self._policy_store.is_open:
try:
_agent_ov = await asyncio.to_thread(
self._policy_store.get_agent_override, room_id
)
if _agent_ov is not None:
_ov_agents, _ov_default = _agent_ov
_effective_default = _ov_default or (_ov_agents[0] if _ov_agents else None)
if _ov_agents and _effective_default:
_routing_config = build_override_config(
self._mixed_room_config, room_id,
_ov_agents, _effective_default,
)
except Exception as _exc: # noqa: BLE001
logger.warning("PolicyStore get_agent_override failed: %s", _exc)
# Route message to determine target agent
agent_id, routing_reason, effective_text = route_message(
text, room_id, _routing_config, self._room_map.allowed_agents,
max_slash_len=self._max_slash_len,
)
if agent_id is None:
# M2.2: routing rejected — audit + metric + optional error reply
logger.warning(
"Mixed room %s: routing rejected reason=%s event=%s",
room_id, routing_reason, event_id,
)
if self._on_route_rejected:
self._on_route_rejected(room_id, routing_reason)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.route.rejected",
agent_id="unknown", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code=routing_reason,
data={"routing_reason": routing_reason, "sender": sender, "text_len": len(text)},
)
# M2.2: optional user-facing error reply in room
if self._unknown_agent_behavior == "reply_error" and routing_reason == REASON_REJECTED_UNKNOWN_AGENT:
available = self._mixed_room_config.agents_for_room(room_id)
# Extract agent name from text (first slash token, if any)
slash_token = text.strip().split()[0].lstrip("/") if text.strip().startswith("/") else ""
label = f"`/{slash_token}`" if slash_token else "this command"
error_msg = (
f"⚠️ Unknown agent {label}. "
f"Available in this room: {', '.join(available)}"
)
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_reject")
try:
await client.send_text(room_id, error_msg, txn_id)
except Exception as exc:
logger.warning("Could not send route-error reply: %s", exc)
return
# M2.2: successful route — fire metric callback
if self._on_routed:
self._on_routed(agent_id, routing_reason)
# H1: Rate limit (uses final agent_id for metric tagging)
if self._rate_limiter is not None:
allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
if not allowed:
logger.warning(
"Rate limited (mixed): room=%s sender=%s agent=%s limit_type=%s",
room_id, sender, agent_id, limit_type,
)
if self._on_rate_limited:
self._on_rate_limited(room_id, agent_id, limit_type or "unknown")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.rate_limited",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code=f"rate_limit_{limit_type}",
data={"sender": sender, "limit_type": limit_type, "routing_reason": routing_reason},
)
return
client.mark_seen(event_id)
# M2.3: Persistent dedupe (cross-restart protection, mixed rooms)
if self._event_store is not None:
try:
already = await self._event_store.is_processed(room_id, event_id)
except Exception as exc:
logger.warning("EventStore.is_processed error mixed (degraded): %s", exc)
already = False
if already:
logger.debug("Persistent dedupe hit (mixed): event=%s room=%s agent=%s", event_id, room_id, agent_id)
if self._on_dedupe_persistent_hit:
self._on_dedupe_persistent_hit(room_id, agent_id)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.dedupe.persistent_hit",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={"sender": sender, "routing_reason": routing_reason},
)
return
# Store effective_text (stripped of routing token) in a patched event copy
effective_event = dict(event)
effective_event["content"] = dict(event.get("content", {}))
effective_event["content"]["body"] = effective_text
entry = _QueueEntry(
event=effective_event,
room_id=room_id,
agent_id=agent_id,
enqueue_time=time.monotonic(),
routing_reason=routing_reason,
is_mixed=True,
)
enqueued_mixed = False
try:
queue.put_nowait(entry)
enqueued_mixed = True
qsize = queue.qsize()
logger.debug(
"Enqueued (mixed): event=%s agent=%s reason=%s qsize=%d",
event_id, agent_id, routing_reason, qsize,
)
if self._on_queue_size:
self._on_queue_size(qsize)
except asyncio.QueueFull:
logger.warning(
"Queue full (max=%d): dropping mixed event=%s room=%s agent=%s",
self._queue_max, event_id, room_id, agent_id,
)
if self._on_queue_dropped:
self._on_queue_dropped(room_id, agent_id)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.queue_full",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="queue_full",
data={"queue_max": self._queue_max, "sender": sender},
)
# M2.3: Mark as processed ONLY after successful enqueue
if enqueued_mixed and self._event_store is not None:
sender_hash = _sender_hash(sender)
try:
await self._event_store.mark_processed(room_id, event_id, sender_hash)
if self._on_dedupe_persistent_insert:
self._on_dedupe_persistent_insert()
except Exception as exc:
logger.warning("EventStore.mark_processed error mixed (degraded): %s", exc)
# ── M4.0: Agent discovery ──────────────────────────────────────────────────
def _check_discovery_rate(self, room_id: str) -> bool:
"""Sliding-window per-room rate check for discovery replies. Returns True if allowed."""
if self._discovery_rpm <= 0:
return True
now = time.monotonic()
window = self._discovery_windows[room_id]
cutoff = now - 60.0
while window and window[0] < cutoff:
window.popleft()
if len(window) >= self._discovery_rpm:
return False
window.append(now)
return True
async def _handle_discovery(
self,
client: MatrixClient,
http_client: httpx.AsyncClient,
event: Dict[str, Any],
room_id: str,
) -> None:
"""
Reply to !agents in any user room (no auth required).
Rate-limited per room. Marks event as seen + persisted (no router enqueue).
"""
event_id = event.get("event_id", "")
sender = event.get("sender", "")
# Rate limit for discovery replies
if not self._check_discovery_rate(room_id):
logger.debug("Discovery rate limited: room=%s", room_id)
client.mark_seen(event_id)
return
client.mark_seen(event_id)
# M6.1: use store-based agent config if available for accurate discovery
_disc_config = self._mixed_room_config
if self._policy_store is not None and self._policy_store.is_open:
try:
_disc_ov = await asyncio.to_thread(
self._policy_store.get_agent_override, room_id
)
if _disc_ov is not None:
_d_agents, _d_default = _disc_ov
_d_eff_default = _d_default or (_d_agents[0] if _d_agents else None)
if _d_agents and _d_eff_default and self._mixed_room_config is not None:
_disc_config = build_override_config(
self._mixed_room_config, room_id, _d_agents, _d_eff_default,
)
except Exception: # noqa: BLE001
pass
reply = agents_reply(room_id, self._room_map, _disc_config)
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_discovery")
try:
await client.send_text(room_id, reply, txn_id)
except Exception as exc:
logger.warning("Could not send discovery reply: %s", exc)
# Persist dedupe so restart doesn't re-deliver this discovery
if self._event_store is not None:
sender_hash = _sender_hash(sender)
await self._event_store.mark_processed(room_id, event_id, sender_hash)
# ── M6.1: Dynamic mixed room agent overrides via !room agents command ─────
async def _handle_room_cmd(
self,
http_client: httpx.AsyncClient,
sender: str,
ctrl_room_id: str,
event_id: str,
cmd_subcommand: str,
cmd_args: tuple,
cmd_kwargs: Dict[str, str],
) -> str:
"""Handle `!room agents <set|add|remove|get|list|unset>` from authorized operator."""
if self._policy_store is None or not self._policy_store.is_open:
return "⚠️ Policy store not available."
if cmd_subcommand != ROOM_SUBCMD_AGENTS:
return room_agents_reply_error(
f"Unknown subcommand: `{cmd_subcommand or '?'}`. Use `!room agents <action>`."
)
action, room_id, agents_list, single_agent, default_agent = parse_room_agents_cmd(
cmd_subcommand, cmd_args, cmd_kwargs,
)
if action not in (ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET):
return room_agents_reply_error(f"Unknown action: `{action or '?'}`")
# Validate allowed agents from global policy
allowed_all = self._room_map.allowed_agents # global allowed agents set
# ── list ──────────────────────────────────────────────────────────────
if action == ROOM_ACTION_LIST:
try:
rows = await asyncio.to_thread(self._policy_store.list_agent_overrides, 10)
total = await asyncio.to_thread(self._policy_store.count_agent_overrides)
except Exception as exc:
logger.warning("PolicyStore list_agent_overrides error: %s", exc)
return "⚠️ Could not read policy store."
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.room.agents.list",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "total": total},
)
return room_agents_reply_list(rows, total)
# ── subcommands that require room_id ──────────────────────────────────
if not room_id:
return room_agents_reply_error("Missing `room=` argument.")
if not node_cmd_validate_room(room_id):
return room_agents_reply_error(
f"Invalid room ID format: `{room_id}`\nExpected: `!localpart:server`"
)
# ── get ───────────────────────────────────────────────────────────────
if action == ROOM_ACTION_GET:
try:
ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
except Exception as exc:
logger.warning("PolicyStore get_agent_override error: %s", exc)
return "⚠️ Could not read policy store."
ov_agents, ov_default = (ov if ov else (None, None))
env_room = (
self._mixed_room_config.rooms.get(room_id) if self._mixed_room_config else None
)
env_agents = list(env_room.agents) if env_room else None
env_default = env_room.default_agent if env_room else None
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.room.agents.get",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "queried_room": room_id},
)
return room_agents_reply_get(room_id, ov_agents, ov_default, env_agents, env_default)
# ── unset ─────────────────────────────────────────────────────────────
if action == ROOM_ACTION_UNSET:
try:
deleted = await asyncio.to_thread(self._policy_store.delete_agent_override, room_id)
except Exception as exc:
logger.warning("PolicyStore delete_agent_override error: %s", exc)
return "⚠️ Could not write to policy store."
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.room.agents.unset",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted},
)
return room_agents_reply_unset_ok(room_id) if deleted else room_agents_reply_unset_not_found(room_id)
# ── add ───────────────────────────────────────────────────────────────
if action == ROOM_ACTION_ADD:
if not single_agent:
return room_agents_reply_error("Missing `agent=` argument for `add`.")
if single_agent not in allowed_all:
allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
return room_agents_reply_error(f"Agent `{single_agent}` not in allowed agents: {allowed_str}")
try:
new_agents, new_default = await asyncio.to_thread(
self._policy_store.add_agent_to_room, room_id, single_agent, sender
)
except Exception as exc:
logger.warning("PolicyStore add_agent_to_room error: %s", exc)
return "⚠️ Could not write to policy store."
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.room.agents.add",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent},
)
return room_agents_reply_add(room_id, single_agent, new_agents, new_default)
# ── remove ────────────────────────────────────────────────────────────
if action == ROOM_ACTION_REMOVE:
if not single_agent:
return room_agents_reply_error("Missing `agent=` argument for `remove`.")
try:
removed, err = await asyncio.to_thread(
self._policy_store.remove_agent_from_room, room_id, single_agent, sender
)
except Exception as exc:
logger.warning("PolicyStore remove_agent_from_room error: %s", exc)
return "⚠️ Could not write to policy store."
if not removed:
return room_agents_reply_error(err or "Could not remove agent.")
# Get updated state
try:
ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
except Exception: # noqa: BLE001
ov = None
remaining = ov[0] if ov else []
new_default_r = ov[1] if ov else None
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.room.agents.remove",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent},
)
return room_agents_reply_remove(room_id, single_agent, remaining, new_default_r)
# ── set ───────────────────────────────────────────────────────────────
if action == ROOM_ACTION_SET:
if not agents_list:
return room_agents_reply_error("Missing `agents=` argument for `set`.")
invalid = [a for a in agents_list if a not in allowed_all]
if invalid:
allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
return room_agents_reply_error(
f"Unknown agents: {', '.join(f'`{a}`' for a in invalid)}\nAllowed: {allowed_str}"
)
effective_default = default_agent if default_agent else agents_list[0]
if effective_default not in agents_list:
return room_agents_reply_error(
f"Default agent `{effective_default}` not in provided agents list."
)
try:
await asyncio.to_thread(
self._policy_store.set_agent_override,
room_id, agents_list, effective_default, sender,
)
except Exception as exc:
logger.warning("PolicyStore set_agent_override error: %s", exc)
return "⚠️ Could not write to policy store."
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.room.agents.set",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={
"sender": sender, "target_room": room_id,
"agents": agents_list, "default": effective_default,
},
)
return room_agents_reply_set(room_id, agents_list, effective_default)
return room_agents_reply_error("Unhandled action.")
# ── M6.2: Policy snapshot export / import ─────────────────────────────────
async def _handle_policy_cmd(
self,
http_client: httpx.AsyncClient,
sender: str,
ctrl_room_id: str,
event_id: str,
subcommand: Optional[str],
cmd_kwargs: Dict[str, str],
) -> str:
"""
Handle `!policy export`, `!policy import`, and `!policy prune_exports`.
Requires policy_store and bridge_data_dir to be configured.
"""
if self._policy_store is None or not self._policy_store.is_open:
return policy_cmd_error("Policy store is not available.")
if not self._bridge_data_dir:
return policy_cmd_error("BRIDGE_DATA_DIR is not configured.")
exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
# ── export ────────────────────────────────────────────────────────────
if subcommand == "export":
try:
snapshot = await asyncio.to_thread(self._policy_store.export_all)
node_count = len(snapshot.get("room_node_overrides", []))
agent_count = len(snapshot.get("room_agent_overrides", []))
import datetime as _dt
ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d-%H%M%S")
filename = f"policy-{ts}.json"
await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
export_path = _os.path.join(exports_dir, filename)
await asyncio.to_thread(
_write_json_file, export_path, snapshot
)
self._policy_last_export_at = int(time.time())
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.policy.export",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={
"sender": sender, "file": filename,
"node_overrides": node_count, "agent_overrides": agent_count,
},
)
return policy_export_reply(export_path, node_count, agent_count)
except Exception as exc: # noqa: BLE001
logger.exception("_handle_policy_cmd export error: %s", exc)
return policy_cmd_error(f"Export failed: {exc}")
# ── import ────────────────────────────────────────────────────────────
if subcommand == "import":
filename = cmd_kwargs.get("path", "").strip()
if not filename:
return policy_cmd_error("Missing `path=` argument.")
safe_path = validate_export_path(exports_dir, filename)
if safe_path is None:
return policy_cmd_error(
f"Invalid path `{filename}`. Only simple filenames within the exports "
f"directory are allowed."
)
mode_raw = cmd_kwargs.get("mode", "merge").strip().lower()
if mode_raw not in ("merge", "replace"):
return policy_cmd_error("mode must be `merge` or `replace`.")
dry_raw = cmd_kwargs.get("dry_run", "1").strip()
dry_run = dry_raw not in ("0", "false", "no")
try:
raw_text = await asyncio.to_thread(_read_json_file, safe_path)
except FileNotFoundError:
return policy_cmd_error(f"File not found: `{filename}`")
except Exception as exc: # noqa: BLE001
return policy_cmd_error(f"Cannot read file: {exc}")
try:
data = raw_text if isinstance(raw_text, dict) else {}
stats = await asyncio.to_thread(
self._policy_store.import_snapshot,
data, mode_raw, dry_run, sender,
)
except ValueError as ve:
return policy_cmd_error(str(ve))
except Exception as exc: # noqa: BLE001
logger.exception("_handle_policy_cmd import error: %s", exc)
return policy_cmd_error(f"Import failed: {exc}")
if not dry_run:
self._policy_last_import_at = int(time.time())
# M10.2: record in policy change history
_is_destr = (
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
) > 0
_ds = (
f"node: +{stats['node_added']} ~{stats['node_updated']} "
f"-{stats['node_deleted']}; "
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
f"-{stats['agent_deleted']}"
)
try:
await asyncio.to_thread(
self._policy_store.record_policy_change,
"policy.import", mode_raw, filename,
_sender_hash(sender), _ds, _is_destr,
stats.get("node_added", 0), stats.get("node_updated", 0),
stats.get("node_deleted", 0),
stats.get("agent_added", 0), stats.get("agent_updated", 0),
stats.get("agent_deleted", 0),
self._policy_history_limit,
)
except Exception as _exc: # noqa: BLE001
logger.warning("Failed to record import history (non-fatal): %s", _exc)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.policy.import",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={
"sender": sender, "file": filename,
"mode": mode_raw, "dry_run": dry_run, "stats": stats,
},
)
return (
policy_import_dry_run_reply(stats, mode_raw)
if dry_run
else policy_import_reply(stats, mode_raw)
)
# ── restore (M10.1) ───────────────────────────────────────────────────
if subcommand == "restore":
filename = cmd_kwargs.get("path", "").strip()
if not filename:
return policy_cmd_error("Missing `path=` argument.")
safe_path = validate_export_path(exports_dir, filename)
if safe_path is None:
return policy_cmd_error(
f"Invalid path `{filename}`. Only simple filenames within the exports "
"directory are allowed."
)
mode_raw = cmd_kwargs.get("mode", "replace").strip().lower()
if mode_raw not in ("merge", "replace"):
return policy_cmd_error("mode must be `merge` or `replace`.")
try:
raw_data = await asyncio.to_thread(_read_json_file, safe_path)
except FileNotFoundError:
return policy_cmd_error(f"File not found: `{filename}`")
except Exception as exc: # noqa: BLE001
return policy_cmd_error(f"Cannot read file: {exc}")
data = raw_data if isinstance(raw_data, dict) else {}
try:
stats = await asyncio.to_thread(
self._policy_store.import_snapshot,
data, mode_raw, False, sender,
)
self._policy_last_import_at = int(time.time())
except ValueError as ve:
return policy_cmd_error(str(ve))
except Exception as exc: # noqa: BLE001
logger.exception("_handle_policy_cmd restore error: %s", exc)
return policy_cmd_error(f"Restore failed: {exc}")
# M10.2: record in policy change history
_is_destr = (
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
) > 0
_rds = (
f"restore/{mode_raw}: "
f"node: +{stats['node_added']} ~{stats['node_updated']} "
f"-{stats['node_deleted']}; "
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
f"-{stats['agent_deleted']}"
)
try:
await asyncio.to_thread(
self._policy_store.record_policy_change,
"policy.restore", mode_raw, filename,
_sender_hash(sender), _rds, _is_destr,
stats.get("node_added", 0), stats.get("node_updated", 0),
stats.get("node_deleted", 0),
stats.get("agent_added", 0), stats.get("agent_updated", 0),
stats.get("agent_deleted", 0),
self._policy_history_limit,
)
except Exception as _exc: # noqa: BLE001
logger.warning("Failed to record restore history (non-fatal): %s", _exc)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.policy.restore",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok",
data={
"sender": sender, "file": filename,
"mode": mode_raw, "stats": stats,
},
)
return policy_restore_applied_reply(stats, mode_raw)
# ── prune_exports ─────────────────────────────────────────────────────
if subcommand == "prune_exports":
dry_raw = cmd_kwargs.get("dry_run", "1").strip()
dry_run = dry_raw not in ("0", "false", "no")
try:
retention_days = int(
cmd_kwargs.get(
"retention_days", str(self._policy_export_retention_days)
)
)
except (ValueError, TypeError):
return policy_cmd_error("`retention_days` must be a positive integer.")
try:
await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
result = await asyncio.to_thread(
self._policy_store.prune_exports,
exports_dir, retention_days, dry_run,
)
except Exception as exc: # noqa: BLE001
logger.exception("_handle_policy_cmd prune error: %s", exc)
return policy_cmd_error(f"Prune failed: {exc}")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.policy.prune_exports",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok",
data={
"sender": sender,
"dry_run": dry_run,
"retention_days": retention_days,
"files_to_delete": result.count,
"bytes_to_free": result.total_bytes,
},
)
return (
policy_prune_preview_reply(result, retention_days)
if dry_run
else policy_prune_applied_reply(result, retention_days)
)
# ── history (M10.2) ───────────────────────────────────────────────────
if subcommand == "history":
try:
limit_raw = int(cmd_kwargs.get("limit", "10"))
except (ValueError, TypeError):
return policy_cmd_error("`limit` must be a positive integer.")
safe_limit = max(1, min(limit_raw, 20))
try:
changes = await asyncio.to_thread(
self._policy_store.list_policy_changes, safe_limit,
)
except Exception as exc: # noqa: BLE001
logger.exception("_handle_policy_cmd history error: %s", exc)
return policy_cmd_error(f"History fetch failed: {exc}")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.policy.history",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "limit": safe_limit, "count": len(changes)},
)
return policy_history_reply(changes)
# ── change (M10.3) ────────────────────────────────────────────────────
if subcommand == "change":
try:
change_id = int(cmd_kwargs.get("id", "0"))
except (ValueError, TypeError):
return policy_cmd_error("`id` must be a positive integer (DB change id).")
if change_id <= 0:
return policy_cmd_error("Missing or invalid `id=` argument. "
"Use `!policy history` to get change ids.")
try:
change = await asyncio.to_thread(
self._policy_store.get_policy_change_by_id, change_id,
)
except Exception as exc: # noqa: BLE001
logger.exception("_handle_policy_cmd change detail error: %s", exc)
return policy_cmd_error(f"DB error: {exc}")
if change is None:
return policy_cmd_error(
f"Change id={change_id} not found. "
"Use `!policy history` to see available ids."
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.policy.change_detail",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "change_id": change_id},
)
return policy_change_detail_reply(change)
return policy_cmd_error(f"Unknown subcommand: `{subcommand!r}`.")
# ── M5.1: Nodes overview for operators ────────────────────────────────────
async def _handle_nodes(
self,
http_client: httpx.AsyncClient,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""Return node policy overview for `!nodes` in control room."""
try:
policy_info = (
self._node_policy.as_info_dict()
if self._node_policy is not None
else {}
)
node_stats = (
self._node_stats_getter()
if self._node_stats_getter is not None
else {}
)
# M8.0: merge health state into node_stats
if self._node_health_tracker is not None:
allowed = (
self._node_policy.allowed_nodes
if self._node_policy is not None
else None
)
health_all = self._node_health_tracker.all_info(allowed)
for nid, info in health_all.items():
if nid not in node_stats:
node_stats[nid] = {}
node_stats[nid]["health"] = info.get("state", NODE_STATE_HEALTHY)
node_stats[nid]["ewma_latency_s"] = info.get("ewma_latency_s")
node_stats[nid]["consecutive_failures"] = info.get("consecutive_failures", 0)
# M8.1: include sticky cache info
sticky_info = None
if self._sticky_cache is not None:
sticky_entries = self._sticky_cache.active_entries()
sticky_info = {
"active_keys": len(sticky_entries),
"ttl_s": self._sticky_cache.ttl_s,
"entries": [
{"key": k, "node": n, "remaining_s": round(r, 0)}
for k, n, r in sticky_entries[:5] # show at most 5
],
}
if len(sticky_entries) > 5:
sticky_info["truncated"] = len(sticky_entries) - 5
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.nodes",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok", data={"sender": sender},
)
return nodes_reply(policy_info, node_stats, sticky_info=sticky_info)
except Exception as exc:
logger.warning("_handle_nodes error: %s", exc)
return "⚠️ Node info not available."
# ── M8.2: HA state persistence helpers ────────────────────────────────────
async def _load_ha_state(self) -> None:
"""
Load persisted HA state (sticky entries + node health) from PolicyStore on startup.
Non-fatal: any error is logged and bridge continues in in-memory-only mode.
"""
if self._policy_store is None or not self._policy_store.is_open:
return
# Load sticky entries
if self._sticky_cache is not None:
try:
entries = await asyncio.to_thread(self._policy_store.load_sticky_entries)
now_unix = int(time.time())
loaded = 0
for key, node_id, expires_at_unix in entries:
remaining_s = expires_at_unix - now_unix
if remaining_s > 0:
self._sticky_cache.set(key, node_id, ttl_s=float(remaining_s))
loaded += 1
self._ha_sticky_loaded = loaded
logger.info("HA: loaded %d sticky entries from DB", loaded)
except Exception as exc: # noqa: BLE001
logger.warning("HA: failed to load sticky entries (non-fatal): %s", exc)
# Load node health snapshot
if self._node_health_tracker is not None and self._ha_health_max_age_s > 0:
try:
snapshot = await asyncio.to_thread(
self._policy_store.load_node_health, self._ha_health_max_age_s
)
if snapshot:
for node_id, info in snapshot.items():
self._node_health_tracker.restore_node(
node_id,
ewma_latency_s=info.get("ewma_latency_s"),
consecutive_failures=int(info.get("consecutive_failures", 0)),
)
self._ha_health_loaded = True
logger.info(
"HA: loaded node health for %d nodes from DB", len(snapshot)
)
else:
logger.info("HA: no fresh node health snapshot found in DB")
except Exception as exc: # noqa: BLE001
logger.warning("HA: failed to load node health (non-fatal): %s", exc)
async def _sticky_persist_set(self, key: str, node_id: str) -> None:
"""Persist a sticky entry to DB after setting it in-memory (M8.2)."""
if self._policy_store is None or not self._policy_store.is_open:
return
if self._sticky_cache is None:
return
expires_at_unix = int(time.time()) + int(self._sticky_cache.ttl_s)
try:
await asyncio.to_thread(
self._policy_store.upsert_sticky, key, node_id, expires_at_unix
)
except Exception as exc: # noqa: BLE001
logger.warning("HA: failed to persist sticky key=%s (non-fatal): %s", key, exc)
async def _sticky_persist_delete(self, key: str) -> None:
"""Remove a sticky entry from DB after deleting it in-memory (M8.2)."""
if self._policy_store is None or not self._policy_store.is_open:
return
try:
await asyncio.to_thread(self._policy_store.delete_sticky, key)
except Exception as exc: # noqa: BLE001
logger.warning("HA: failed to delete sticky key=%s (non-fatal): %s", key, exc)
async def _node_health_snapshot_loop(self) -> None:
"""
Background task: periodically write node health state to DB (M8.2).
Runs until policy_store becomes unavailable or interval is 0.
"""
if self._ha_health_snapshot_interval_s <= 0:
return
logger.debug(
"HA: health snapshot loop started (interval=%ds)", self._ha_health_snapshot_interval_s
)
while True:
await asyncio.sleep(self._ha_health_snapshot_interval_s)
if self._policy_store is None or not self._policy_store.is_open:
break
if self._node_health_tracker is None or self._node_policy is None:
break
try:
for node_id in sorted(self._node_policy.allowed_nodes):
info = self._node_health_tracker.as_info_dict(node_id)
await asyncio.to_thread(
self._policy_store.upsert_node_health,
node_id,
info.get("ewma_latency_s"),
int(info.get("consecutive_failures", 0)),
)
logger.debug(
"HA: health snapshot written for %d nodes",
len(self._node_policy.allowed_nodes),
)
except Exception as exc: # noqa: BLE001
logger.warning("HA: health snapshot write failed (non-fatal): %s", exc)
# ── M9.0: Two-step confirmation for dangerous control commands ─────────────
async def _handle_policy_import_intent(
self,
http_client: "httpx.AsyncClient",
cmd: "ControlCommand",
sender: str,
room_id: str,
event_id: str,
action_summary: str,
normalized: str,
) -> str:
"""
M9.1: Intent step for !policy import with diff preview and hash binding.
Reads the file, computes a diff preview, stores a hash-bound callback,
and returns a formatted preview reply containing the nonce.
The confirm callback verifies the file hasn't changed before applying.
"""
assert self._confirm_store is not None
# ── Validate args ──────────────────────────────────────────────────────
if self._policy_store is None or not self._policy_store.is_open:
return policy_cmd_error("Policy store not available.")
if not self._bridge_data_dir:
return policy_cmd_error("BRIDGE_DATA_DIR not configured.")
filename = cmd.kwargs.get("path", "").strip()
if not filename:
return policy_cmd_error("Missing `path=` argument.")
exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
safe_path = validate_export_path(exports_dir, filename)
if safe_path is None:
return policy_cmd_error(
f"Invalid path `{filename}`. Only simple filenames within the exports "
"directory are allowed."
)
mode_raw = cmd.kwargs.get("mode", "merge").strip().lower()
if mode_raw not in ("merge", "replace"):
return policy_cmd_error("mode must be `merge` or `replace`.")
# ── Read file + compute diff preview ───────────────────────────────────
try:
raw_data = await asyncio.to_thread(_read_json_file, safe_path)
except FileNotFoundError:
return policy_cmd_error(f"File not found: `{filename}`")
except Exception as exc: # noqa: BLE001
return policy_cmd_error(f"Cannot read file: {exc}")
if not isinstance(raw_data, dict):
return policy_cmd_error("Invalid JSON format (expected object).")
try:
diff = await asyncio.to_thread(
self._policy_store.compute_import_diff, raw_data, mode_raw,
)
except ValueError as ve:
return policy_cmd_error(str(ve))
except Exception as exc: # noqa: BLE001
return policy_cmd_error(f"Preview failed: {exc}")
# ── Compute snapshot hash for confirm binding ─────────────────────────
_content_bytes = _json.dumps(
raw_data, sort_keys=True, ensure_ascii=True
).encode("utf-8")
snapshot_hash = hashlib.sha256(
(filename + ":" + mode_raw + ":").encode("utf-8") + _content_bytes
).hexdigest()[:32]
sender_hash = _sender_hash(sender)
_captured_hash = snapshot_hash
_captured_data = raw_data
_captured_mode = mode_raw
_captured_path = safe_path
_captured_fname = filename
_captured_sender = sender
# ── Build hash-bound callback ──────────────────────────────────────────
# Late-capture of nonce for autobackup filename (set after add() below)
_nonce_holder: list = []
async def _callback():
# Re-read file and verify hash to detect tampering (anti-TOCTOU)
try:
fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
except Exception as exc: # noqa: BLE001
return (
f"❌ Cannot re-read file `{_captured_fname}` at apply time: {exc}\n"
"Re-issue the command.",
"",
)
fresh_bytes = _json.dumps(
fresh_data if isinstance(fresh_data, dict) else {},
sort_keys=True, ensure_ascii=True,
).encode("utf-8")
fresh_hash = hashlib.sha256(
(_captured_fname + ":" + _captured_mode + ":").encode("utf-8") + fresh_bytes
).hexdigest()[:32]
if fresh_hash != _captured_hash:
logger.warning(
"Policy import confirm rejected: file changed since preview "
"(sender=%s file=%s)", _captured_sender, _captured_fname,
)
return (
f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
"Re-issue `!policy import ...` to get a new preview.",
"",
)
# M10.0: Auto-backup current policy before applying changes
_autobackup_basename = ""
_autobackup_hash = ""
if self._policy_store is not None and self._bridge_data_dir:
_exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
_nonce_suffix = _nonce_holder[0] if _nonce_holder else "BACKUP"
_sender_hash8 = _sender_hash(_captured_sender)[:8]
try:
_os.makedirs(_exports_dir, exist_ok=True)
_ab_path, _autobackup_hash = await asyncio.to_thread(
self._policy_store.write_autobackup,
_exports_dir, _sender_hash8, _nonce_suffix,
)
_autobackup_basename = _os.path.basename(_ab_path)
logger.info(
"Policy auto-backup written: %s hash=%s",
_autobackup_basename, _autobackup_hash,
)
except Exception as exc: # noqa: BLE001
logger.warning("Policy auto-backup failed (non-fatal): %s", exc)
# Apply the import using the captured (previewed) data
try:
stats = await asyncio.to_thread(
self._policy_store.import_snapshot,
_captured_data, _captured_mode, False, _captured_sender,
)
self._policy_last_import_at = int(time.time())
diff_summary = (
f"node: +{stats['node_added']} ~{stats['node_updated']} "
f"-{stats['node_deleted']}; "
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
f"-{stats['agent_deleted']}"
)
if _autobackup_basename:
diff_summary += f"; autobackup={_autobackup_basename}"
# M10.2: record in policy change history
_is_destr = (
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
) > 0
try:
await asyncio.to_thread(
self._policy_store.record_policy_change,
"policy.import", _captured_mode, _captured_fname,
_sender_hash(_captured_sender), diff_summary, _is_destr,
stats.get("node_added", 0), stats.get("node_updated", 0),
stats.get("node_deleted", 0),
stats.get("agent_added", 0), stats.get("agent_updated", 0),
stats.get("agent_deleted", 0),
self._policy_history_limit,
)
except Exception as _exc: # noqa: BLE001
logger.warning("Failed to record policy change history: %s", _exc)
reply = policy_import_reply(stats, _captured_mode)
if _autobackup_basename:
reply += (
f"\n\n💾 Auto-backup saved: `{_autobackup_basename}` "
f"(hash `{_autobackup_hash}`)"
)
return reply, diff_summary
except Exception as exc: # noqa: BLE001
logger.exception(
"Policy import apply failed: sender=%s file=%s", _captured_sender, _captured_fname,
)
return f"❌ Import failed: {exc}", ""
# ── Store pending confirmation ─────────────────────────────────────────
nonce = self._confirm_store.add(
sender_hash=sender_hash,
verb="policy.import",
normalized_args=normalized,
action_summary=action_summary,
room_id=room_id,
callback=_callback,
)
# M10.0: make nonce available inside _callback for backup filename
_nonce_holder.append(nonce)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.intent",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender_hash": sender_hash,
"verb": "policy.import",
"normalized": normalized,
"nonce": nonce,
"expires_in_s": int(self._confirm_store.ttl_s),
"snapshot_hash_prefix": snapshot_hash[:8],
"diff": {
"node_added": diff.node_added,
"node_updated": diff.node_updated,
"node_deleted": diff.node_deleted,
"agent_added": diff.agent_added,
"agent_updated": diff.agent_updated,
"agent_deleted": diff.agent_deleted,
"sample_keys": diff.sample_keys,
},
},
)
logger.info(
"Confirm policy import intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
sender, mode_raw, filename, snapshot_hash[:8], nonce,
)
return policy_import_intent_reply(
diff=diff,
action_summary=action_summary,
nonce=nonce,
ttl_s=int(self._confirm_store.ttl_s),
)
async def _handle_policy_restore_intent(
self,
http_client: "httpx.AsyncClient",
cmd: "ControlCommand",
sender: str,
room_id: str,
event_id: str,
*,
action_summary: str,
normalized: str,
) -> str:
"""
Intent step for !policy restore (M10.1).
Reads the snapshot file, computes a diff preview, binds a SHA-256 hash
to the exact file content + mode, stores a callback in ConfirmStore, and
returns a rollback preview reply with the nonce.
"""
assert self._confirm_store is not None
assert self._policy_store is not None
assert self._bridge_data_dir is not None
filename = cmd.kwargs.get("path", "").strip()
if not filename:
return policy_cmd_error("Missing `path=` argument.")
exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
safe_path = validate_export_path(exports_dir, filename)
if safe_path is None:
return policy_cmd_error(
f"Invalid path `{filename}`. Only simple filenames within the exports "
"directory are allowed."
)
mode_raw = cmd.kwargs.get("mode", "replace").strip().lower()
if mode_raw not in ("merge", "replace"):
return policy_cmd_error("mode must be `merge` or `replace`.")
# ── Read file + compute diff preview ──────────────────────────────────
try:
raw_data = await asyncio.to_thread(_read_json_file, safe_path)
except FileNotFoundError:
return policy_cmd_error(f"File not found: `{filename}`")
except Exception as exc: # noqa: BLE001
return policy_cmd_error(f"Cannot read file: {exc}")
if not isinstance(raw_data, dict):
return policy_cmd_error("Invalid JSON format (expected object).")
try:
diff = await asyncio.to_thread(
self._policy_store.compute_import_diff, raw_data, mode_raw,
)
except ValueError as ve:
return policy_cmd_error(str(ve))
except Exception as exc: # noqa: BLE001
return policy_cmd_error(f"Preview failed: {exc}")
# ── Compute snapshot hash for confirm binding (anti-TOCTOU) ──────────
_content_bytes = _json.dumps(
raw_data, sort_keys=True, ensure_ascii=True
).encode("utf-8")
snapshot_hash = hashlib.sha256(
(filename + ":" + mode_raw + ":restore:").encode("utf-8") + _content_bytes
).hexdigest()[:32]
sender_hash = _sender_hash(sender)
_captured_hash = snapshot_hash
_captured_data = raw_data
_captured_mode = mode_raw
_captured_path = safe_path
_captured_fname = filename
_captured_sender = sender
# Late-capture of nonce for autobackup filename
_nonce_holder: list = []
# ── Build hash-bound callback ─────────────────────────────────────────
async def _callback():
# Re-read + verify hash (anti-TOCTOU)
try:
fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
except Exception as exc: # noqa: BLE001
return (
f"❌ Cannot re-read `{_captured_fname}` at apply time: {exc}\n"
"Re-issue the command.",
"",
)
fresh_bytes = _json.dumps(
fresh_data if isinstance(fresh_data, dict) else {},
sort_keys=True, ensure_ascii=True,
).encode("utf-8")
fresh_hash = hashlib.sha256(
(_captured_fname + ":" + _captured_mode + ":restore:").encode("utf-8")
+ fresh_bytes
).hexdigest()[:32]
if fresh_hash != _captured_hash:
logger.warning(
"Policy restore confirm rejected: file changed since preview "
"(sender=%s file=%s)", _captured_sender, _captured_fname,
)
return (
f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
"Re-issue `!policy restore ...` to get a new preview.",
"",
)
# Auto-backup current state before overwriting
_autobackup_basename = ""
_autobackup_hash = ""
if self._policy_store is not None and self._bridge_data_dir:
_exp_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
_nonce_suffix = _nonce_holder[0] if _nonce_holder else "RESTORE"
_sender_hash8 = _sender_hash(_captured_sender)[:8]
try:
_os.makedirs(_exp_dir, exist_ok=True)
_ab_path, _autobackup_hash = await asyncio.to_thread(
self._policy_store.write_autobackup,
_exp_dir, _sender_hash8, _nonce_suffix,
)
_autobackup_basename = _os.path.basename(_ab_path)
logger.info(
"Pre-restore backup written: %s hash=%s",
_autobackup_basename, _autobackup_hash,
)
except Exception as exc: # noqa: BLE001
logger.warning("Pre-restore backup failed (non-fatal): %s", exc)
# Apply restore
try:
stats = await asyncio.to_thread(
self._policy_store.import_snapshot,
_captured_data, _captured_mode, False, _captured_sender,
)
self._policy_last_import_at = int(time.time())
diff_summary = (
f"restore/{_captured_mode}: "
f"node: +{stats['node_added']} ~{stats['node_updated']} "
f"-{stats['node_deleted']}; "
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
f"-{stats['agent_deleted']}"
)
if _autobackup_basename:
diff_summary += f"; autobackup={_autobackup_basename}"
# M10.2: record in policy change history
_is_destr = (
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
) > 0
try:
await asyncio.to_thread(
self._policy_store.record_policy_change,
"policy.restore", _captured_mode, _captured_fname,
_sender_hash(_captured_sender), diff_summary, _is_destr,
stats.get("node_added", 0), stats.get("node_updated", 0),
stats.get("node_deleted", 0),
stats.get("agent_added", 0), stats.get("agent_updated", 0),
stats.get("agent_deleted", 0),
self._policy_history_limit,
)
except Exception as _exc: # noqa: BLE001
logger.warning("Failed to record restore history: %s", _exc)
reply = policy_restore_applied_reply(
stats, _captured_mode, _autobackup_basename
)
return reply, diff_summary
except Exception as exc: # noqa: BLE001
logger.exception(
"Policy restore apply failed: sender=%s file=%s",
_captured_sender, _captured_fname,
)
return f"❌ Restore failed: {exc}", ""
# ── Store pending confirmation ────────────────────────────────────────
nonce = self._confirm_store.add(
sender_hash=sender_hash,
verb="policy.restore",
normalized_args=normalized,
action_summary=action_summary,
room_id=room_id,
callback=_callback,
)
_nonce_holder.append(nonce)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.policy.restore",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender_hash": sender_hash,
"verb": "policy.restore",
"normalized": normalized,
"nonce": nonce,
"expires_in_s": int(self._confirm_store.ttl_s),
"mode": mode_raw,
"snapshot_hash_prefix": snapshot_hash[:8],
"diff": {
"node_added": diff.node_added,
"node_updated": diff.node_updated,
"node_deleted": diff.node_deleted,
"agent_added": diff.agent_added,
"agent_updated": diff.agent_updated,
"agent_deleted": diff.agent_deleted,
"sample_keys": diff.sample_keys,
},
},
)
logger.info(
"Confirm policy restore intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
sender, mode_raw, filename, snapshot_hash[:8], nonce,
)
return policy_restore_intent_reply(
diff=diff,
action_summary=action_summary,
nonce=nonce,
ttl_s=int(self._confirm_store.ttl_s),
)
async def _handle_dangerous_intent(
self,
http_client: "httpx.AsyncClient",
cmd: "ControlCommand",
sender: str,
room_id: str,
event_id: str,
) -> str:
"""
First leg of the two-step confirm flow (M9.0).
Does NOT apply the command. Stores a pending confirmation with a
callback that will execute the original handler, and returns a reply
containing the nonce that the operator must send via !confirm <nonce>.
"""
assert self._confirm_store is not None
sender_hash = _sender_hash(sender)
normalized = build_normalized_args(cmd)
action_summary = (
f"!{cmd.verb} {cmd.subcommand or ''} {normalized}".strip()
)
# M9.1: policy import gets a richer preview with diff + hash binding
if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "import":
return await self._handle_policy_import_intent(
http_client, cmd, sender, room_id, event_id,
action_summary=action_summary, normalized=normalized,
)
# M10.1: policy restore — rollback with diff preview + hash binding
if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "restore":
if self._policy_store is None or not self._bridge_data_dir:
return policy_cmd_error(
"Policy store or data directory not configured."
)
return await self._handle_policy_restore_intent(
http_client, cmd, sender, room_id, event_id,
action_summary=action_summary, normalized=normalized,
)
# Build the callback: calls the actual handler when confirmed.
# We capture all args by closure so the callback is self-contained.
_verb = cmd.verb
_subcmd = cmd.subcommand
_args = cmd.args
_kw = dict(cmd.kwargs)
async def _callback():
if _verb == VERB_NODE:
# Reconstruct args_text (same as _try_control does)
_parts = []
if _subcmd:
_parts.append(_subcmd)
_parts.extend(_args)
_parts.extend(f"{k}={v}" for k, v in _kw.items())
reply = await self._handle_node_cmd(
http_client, sender, room_id, event_id, " ".join(_parts),
)
elif _verb == VERB_ROOM:
reply = await self._handle_room_cmd(
http_client, sender, room_id, event_id,
_subcmd, tuple(_args), _kw,
)
elif _verb == VERB_POLICY:
reply = await self._handle_policy_cmd(
http_client, sender, room_id, event_id, _subcmd, _kw,
)
else:
reply = f"❌ Unknown dangerous verb: {_verb}"
return reply, action_summary
nonce = self._confirm_store.add(
sender_hash=sender_hash,
verb=f"{cmd.verb}.{cmd.subcommand or ''}",
normalized_args=normalized,
action_summary=action_summary,
room_id=room_id,
callback=_callback,
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.intent",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender_hash": sender_hash,
"verb": cmd.verb,
"subcommand": cmd.subcommand or "",
"normalized": normalized,
"nonce": nonce,
"expires_in_s": int(self._confirm_store.ttl_s),
},
)
logger.info(
"Confirm intent: sender=%s verb=%s/%s nonce=%s ttl=%.0fs",
sender, cmd.verb, cmd.subcommand, nonce, self._confirm_store.ttl_s,
)
return confirm_intent_reply(action_summary, nonce, int(self._confirm_store.ttl_s))
async def _handle_confirm_cmd(
self,
http_client: "httpx.AsyncClient",
cmd: "ControlCommand",
sender: str,
room_id: str,
event_id: str,
) -> str:
"""
Second leg of the two-step confirm flow (M9.0).
Validates the nonce and sender, executes the stored callback, and
emits confirmed + applied audit events.
"""
if self._confirm_store is None:
return "❌ Confirmation store not active."
# Nonce may come as subcommand (token right after !confirm)
nonce = (cmd.subcommand or "").strip().upper()
if not nonce and cmd.args:
nonce = cmd.args[0].strip().upper()
if not nonce:
return "❌ Usage: `!confirm <code>` — provide the confirmation code."
sender_hash = _sender_hash(sender)
entry = self._confirm_store.pop(nonce, sender_hash)
if entry is None:
logger.info(
"Confirm rejected: sender=%s nonce=%s (invalid/expired/wrong-sender)",
sender, nonce,
)
return confirm_expired_reply()
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.confirmed",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender_hash": sender_hash,
"nonce": nonce,
"verb": entry.verb,
"action_summary": entry.action_summary,
},
)
logger.info(
"Confirm accepted: sender=%s nonce=%s verb=%s",
sender, nonce, entry.verb,
)
try:
reply_text, diff_summary = await entry.callback()
except Exception as exc:
logger.exception(
"Confirm callback failed: sender=%s nonce=%s verb=%s",
sender, nonce, entry.verb,
)
return f"❌ Apply failed: {exc}"
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.applied",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender_hash": sender_hash,
"verb": entry.verb,
"normalized": entry.normalized_args,
"diff_summary": diff_summary,
},
)
return confirm_success_reply(reply_text)
# ── M6.0: Dynamic room-node overrides via !node command ───────────────────
async def _handle_node_cmd(
self,
http_client: httpx.AsyncClient,
sender: str,
ctrl_room_id: str,
event_id: str,
args_text: str,
) -> str:
"""Handle `!node <set|unset|get|list>` from an authorized operator."""
if self._policy_store is None or not self._policy_store.is_open:
return "⚠️ Policy store not available."
subcmd, room_id, node_id = parse_node_cmd(args_text)
if subcmd not in (NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST):
return node_cmd_reply_error(
f"Unknown subcommand: `{subcmd or '?'}`"
)
# ── list ──────────────────────────────────────────────────────────────
if subcmd == NODE_SUBCMD_LIST:
try:
rows = await asyncio.to_thread(self._policy_store.list_overrides, 10)
total = await asyncio.to_thread(self._policy_store.count_overrides)
except Exception as exc:
logger.warning("PolicyStore list_overrides error: %s", exc)
return "⚠️ Could not read policy store."
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.node.list",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "total": total},
)
return node_cmd_reply_list(rows, total)
# ── subcommands that require room_id ──────────────────────────────────
if not room_id:
return node_cmd_reply_error("Missing `room=` argument.")
if not node_cmd_validate_room(room_id):
return node_cmd_reply_error(
f"Invalid room ID format: `{room_id}`\n"
"Expected: `!localpart:server`"
)
# ── get ───────────────────────────────────────────────────────────────
if subcmd == NODE_SUBCMD_GET:
try:
override = await asyncio.to_thread(self._policy_store.get_override, room_id)
except Exception as exc:
logger.warning("PolicyStore get_override error: %s", exc)
return "⚠️ Could not read policy store."
# env map lookup for context
env_node: Optional[str] = None
if self._node_policy is not None:
env_node = self._node_policy.room_node_map.get(room_id)
default = self._node_policy.default_node if self._node_policy else self._node_id
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.node.get",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "queried_room": room_id},
)
return node_cmd_reply_get(room_id, override, env_node, default)
# ── unset ─────────────────────────────────────────────────────────────
if subcmd == NODE_SUBCMD_UNSET:
try:
deleted = await asyncio.to_thread(self._policy_store.delete_override, room_id)
except Exception as exc:
logger.warning("PolicyStore delete_override error: %s", exc)
return "⚠️ Could not write to policy store."
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.node.unset",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted},
)
return node_cmd_reply_unset_ok(room_id) if deleted else node_cmd_reply_unset_not_found(room_id)
# ── set ───────────────────────────────────────────────────────────────
if subcmd == NODE_SUBCMD_SET:
if not node_id:
return node_cmd_reply_error("Missing `node=` argument for `set`.")
allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset([self._node_id])
if node_id not in allowed:
allowed_list = ", ".join(f"`{n}`" for n in sorted(allowed))
return node_cmd_reply_error(
f"Node `{node_id}` is not in allowed list: {allowed_list}"
)
try:
await asyncio.to_thread(self._policy_store.set_override, room_id, node_id, sender)
except Exception as exc:
logger.warning("PolicyStore set_override error: %s", exc)
return "⚠️ Could not write to policy store."
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.node.set",
agent_id="control", node_id=self._node_id,
room_id=ctrl_room_id, event_id=event_id,
status="ok", data={"sender": sender, "target_room": room_id, "set_node": node_id},
)
return node_cmd_reply_set(room_id, node_id)
return node_cmd_reply_error("Unhandled subcommand.")
# ── M4.1: Bridge status for operators ─────────────────────────────────────
async def _handle_bridge_status(
self,
http_client: httpx.AsyncClient,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""Build and return a bridge health snapshot for `!status` in control room."""
try:
snapshot: Dict[str, Any] = {
"node_id": self._node_id,
"worker_count": self._worker_count,
"room_count": len(self._room_map.mappings),
"mixed_room_count": (
len(self._mixed_room_config.rooms)
if self._mixed_room_config
else 0
),
"operators_count": (
len(self._control_config.operator_allowlist)
if self._control_config and self._control_config.operator_allowlist
else 0
),
}
# Queue info (exposed by MatrixIngressLoop via get_status)
status = self.get_status()
snapshot["queue_size"] = status.get("queue_size", "?")
snapshot["queue_max"] = status.get("queue_max", "?")
# Control safety
if self._control_limiter is not None:
snapshot["control_safety"] = self._control_limiter.as_health_dict()
# Persistent dedupe
if self._event_store is not None:
snapshot["persistent_dedupe"] = self._event_store.as_health_dict()
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.bridge_status",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok", data={"sender": sender},
)
return bridge_status_reply(snapshot)
except Exception as exc:
logger.warning("_handle_bridge_status error: %s", exc)
return status_not_available_reply()
# ── Control command handler ────────────────────────────────────────────────
async def _try_control(
self,
client: MatrixClient,
http_client: httpx.AsyncClient,
event: Dict[str, Any],
room_id: str,
) -> None:
"""
Process a message from a control room.
Non-command messages (not starting with '!') are silently ignored.
All command attempts are audited regardless of authorization.
"""
assert self._control_config is not None
event_id = event.get("event_id", "")
sender = event.get("sender", "")
text = event.get("content", {}).get("body", "").strip()
if not text or not is_control_message(text):
return # not a command, ignore
client.mark_seen(event_id)
# Authorization check
authorized, rejection_reason = check_authorization(sender, room_id, self._control_config)
if not authorized:
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.unauthorized",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code=rejection_reason,
data={"sender": sender, "command_preview": text[:80]},
)
logger.warning(
"Unauthorized control command: sender=%s room=%s reason=%s cmd=%r",
sender, room_id, rejection_reason, text[:60],
)
if self._control_unauthorized_behavior == "reply_error":
try:
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_unauth")
await client.send_text(room_id, unauthorized_reply(rejection_reason), txn_id)
except Exception as exc:
logger.warning("Could not send unauthorized reply: %s", exc)
return
# M3.4: Rate limiting + cooldown (after auth, before parse/dispatch)
if self._control_limiter is not None:
sender_hash_ctrl = _sender_hash(sender)
allowed_room, retry_room = self._control_limiter.check_room(room_id)
if not allowed_room:
scope = "room"
logger.info("Control rate limited: scope=%s room=%s sender=%s", scope, room_id, sender)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.rate_limited",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="rate_limited_room",
data={"sender": sender, "scope": scope, "retry_after_s": retry_room},
)
if self._on_control_rate_limited:
self._on_control_rate_limited(scope)
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl")
await client.send_text(room_id, rate_limited_reply(scope, retry_room), txn_id)
return
allowed_op, retry_op = self._control_limiter.check_operator(sender_hash_ctrl)
if not allowed_op:
scope = "operator"
logger.info("Control rate limited: scope=%s sender=%s", scope, sender)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.rate_limited",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="rate_limited_operator",
data={"sender": sender, "scope": scope, "retry_after_s": retry_op},
)
if self._on_control_rate_limited:
self._on_control_rate_limited(scope)
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl")
await client.send_text(room_id, rate_limited_reply(scope, retry_op), txn_id)
return
# Parse command
cmd = parse_command(text)
if cmd is None:
logger.warning("Control message from %s could not be parsed: %r", sender, text[:60])
return
# M3.4: Cooldown check (anti-double-click, per operator+verb+subcommand)
if self._control_limiter is not None:
sender_hash_ctrl = _sender_hash(sender)
allowed_cd, wait_cd = self._control_limiter.check_cooldown(
sender_hash_ctrl, cmd.verb, cmd.subcommand or "",
)
if not allowed_cd:
scope = "cooldown"
logger.info(
"Control cooldown: sender=%s verb=%s sub=%s wait=%.1fs",
sender, cmd.verb, cmd.subcommand, wait_cd,
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.rate_limited",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="cooldown",
data={
"sender": sender, "scope": scope,
"verb": cmd.verb, "subcommand": cmd.subcommand,
"wait_s": wait_cd,
},
)
if self._on_control_rate_limited:
self._on_control_rate_limited(scope)
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_cd")
await client.send_text(room_id, rate_limited_reply(scope, wait_cd), txn_id)
return
# Metric callback
if self._on_control_command:
self._on_control_command(sender, cmd.verb, cmd.subcommand)
# Audit every authorized command
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.command",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender,
"verb": cmd.verb,
"subcommand": cmd.subcommand,
"args": list(cmd.args),
"kwargs": dict(cmd.kwargs),
"is_known": cmd.is_known,
},
)
logger.info(
"Control command: sender=%s verb=%s sub=%s args=%s",
sender, cmd.verb, cmd.subcommand, cmd.args,
)
# Dispatch command
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_ctrl")
# M9.0: Dangerous commands → two-step confirmation (intent leg)
if (
self._confirm_store is not None
and is_dangerous_cmd(cmd)
and cmd.verb != VERB_CONFIRM
):
reply_text = await self._handle_dangerous_intent(
http_client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_CONFIRM:
# M9.0: !confirm <nonce> (second leg)
reply_text = await self._handle_confirm_cmd(
http_client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_HELP:
reply_text = help_reply()
elif cmd.verb == VERB_CONFIRM:
# Fallback if confirm_store is None (disabled) — inform the operator
reply_text = "❌ Confirmation system is disabled."
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_START:
reply_text = await self._handle_runbook_start(
http_client, client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_NEXT:
# M3.2: advance to next step
reply_text = await self._handle_runbook_next(
http_client, client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_COMPLETE:
# M3.2: mark manual step complete
reply_text = await self._handle_runbook_complete(
http_client, client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_STATUS:
# M3.3: show run status
reply_text = await self._handle_runbook_status(
http_client, client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_EVIDENCE:
# M3.3: generate release evidence
reply_text = await self._handle_runbook_evidence(
http_client, client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_POST_REVIEW:
# M3.3: generate post-release review
reply_text = await self._handle_runbook_post_review(
http_client, client, cmd, sender, room_id, event_id,
)
elif cmd.verb == VERB_STATUS:
# M4.1: bridge health snapshot for operators
reply_text = await self._handle_bridge_status(
http_client, sender, room_id, event_id,
)
elif cmd.verb == VERB_NODES:
# M5.1: node policy overview for operators
reply_text = await self._handle_nodes(
http_client, sender, room_id, event_id,
)
elif cmd.verb == VERB_NODE:
# M6.0: dynamic room-node override commands
# Reconstruct args_text from parsed command parts
_node_args_parts = []
if cmd.subcommand:
_node_args_parts.append(cmd.subcommand)
_node_args_parts.extend(cmd.args)
_node_args_parts.extend(f"{k}={v}" for k, v in cmd.kwargs.items())
_node_args_text = " ".join(_node_args_parts)
reply_text = await self._handle_node_cmd(
http_client, sender, room_id, event_id, _node_args_text,
)
elif cmd.verb == VERB_ROOM:
# M6.1: dynamic mixed room agent overrides
reply_text = await self._handle_room_cmd(
http_client, sender, room_id, event_id,
cmd.subcommand, cmd.args, cmd.kwargs,
)
elif cmd.verb == VERB_POLICY:
# M6.2: policy snapshot export/import
reply_text = await self._handle_policy_cmd(
http_client, sender, room_id, event_id,
cmd.subcommand, cmd.kwargs,
)
elif not cmd.is_known:
reply_text = unknown_command_reply(cmd)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.unknown_cmd",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="unknown_verb",
data={"verb": cmd.verb, "sender": sender},
)
else:
reply_text = not_implemented_reply(cmd)
try:
await client.send_text(room_id, reply_text, txn_id)
except Exception as exc:
logger.error("Could not send control reply: %s", exc)
async def _handle_runbook_start(
self,
http_client: httpx.AsyncClient,
client: "MatrixClient",
cmd: ControlCommand,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""
M3.1: Execute !runbook start <path> [node=NODA1].
Calls sofiia-console POST /api/runbooks/internal/runs.
Returns reply text (success or failure) for delivery to the control room.
Audits matrix.control.runbook.start regardless of outcome.
"""
# Extract positional runbook_path
runbook_path = cmd.args[0].strip() if cmd.args else ""
node_id = cmd.kwargs.get("node", "NODA1").strip()
# Validate path before calling the console
path_error = _ctrl_runner.validate_runbook_path(runbook_path)
if path_error:
logger.warning(
"!runbook start invalid path: sender=%s path=%r error=%s",
sender, runbook_path, path_error,
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.start",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="invalid_path",
data={"sender": sender, "runbook_path": runbook_path, "error": path_error},
)
return start_usage_reply()
# Call sofiia-console internal API
run_id: Optional[str] = None
http_status: Optional[int] = None
try:
result = await _ctrl_runner.start_runbook_run(
http_client=http_client,
console_url=self._console_url,
control_token=self._control_token,
runbook_path=runbook_path,
operator_id=sender,
node_id=node_id,
)
run_id = result.get("run_id", "")
steps_total = result.get("steps_total", 0)
status = result.get("status", "running")
http_status = 200
logger.info(
"Runbook started: run_id=%s path=%s node=%s steps=%d by sender=%s",
run_id, runbook_path, node_id, steps_total, sender,
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.start",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender,
"runbook_path": runbook_path,
"node_id": node_id,
"run_id": run_id,
"steps_total": steps_total,
"http_status": http_status,
},
)
return runbook_started_reply(run_id, steps_total, status)
except _ctrl_runner.RunnerError as exc:
reason = str(exc)
logger.error(
"!runbook start failed: sender=%s path=%r node=%s error=%s",
sender, runbook_path, node_id, reason,
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.start",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="runner_error",
data={
"sender": sender,
"runbook_path": runbook_path,
"node_id": node_id,
"error": reason,
"http_status": http_status,
},
)
return runbook_start_error_reply(reason)
async def _handle_runbook_next(
self,
http_client: httpx.AsyncClient,
client: "MatrixClient",
cmd: ControlCommand,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""
M3.2: Execute !runbook next <run_id>.
Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/next.
Returns reply text for the control room.
Audits matrix.control.runbook.next.
"""
run_id = cmd.args[0].strip() if cmd.args else ""
if not run_id:
return next_usage_reply()
# M3.4: per-run rate limit for !runbook next
if self._control_limiter is not None:
allowed_run, retry_run = self._control_limiter.check_run_next(run_id)
if not allowed_run:
scope = "run"
if self._on_control_rate_limited:
self._on_control_rate_limited(scope)
return rate_limited_reply(scope, retry_run)
http_status: Optional[int] = None
try:
result = await _ctrl_runner.next_runbook_step(
http_client=http_client,
console_url=self._console_url,
control_token=self._control_token,
run_id=run_id,
operator_id=sender,
)
http_status = 200
step_type = result.get("type", "unknown")
step_index = result.get("step_index", 0)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.next",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender,
"run_id": run_id,
"step_index": step_index,
"step_type": step_type,
"http_status": http_status,
},
)
if step_type == "manual":
return next_manual_reply(
run_id=run_id,
step_index=step_index,
steps_total=result.get("steps_total"),
title=result.get("title", ""),
instructions_md=result.get("instructions_md", ""),
)
else:
# http_check / script
result_dict = result.get("result") or {}
duration_ms = int(result_dict.get("duration_ms", 0)) if isinstance(result_dict, dict) else 0
return next_auto_reply(
run_id=run_id,
step_index=step_index,
action_type=step_type,
step_status=result.get("step_status", "ok"),
duration_ms=duration_ms or None,
completed=bool(result.get("completed", False)),
)
except _ctrl_runner.RunnerError as exc:
reason = str(exc)
logger.error(
"!runbook next failed: sender=%s run_id=%r error=%s",
sender, run_id, reason,
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.next",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="runner_error",
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
)
return next_error_reply(run_id, reason)
async def _handle_runbook_complete(
self,
http_client: httpx.AsyncClient,
client: "MatrixClient",
cmd: ControlCommand,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""
M3.2: Execute !runbook complete <run_id> step=<n> status=ok|warn|fail [notes=...]
Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/steps/{n}/complete.
Audits matrix.control.runbook.complete.
"""
run_id = cmd.args[0].strip() if cmd.args else ""
if not run_id:
return complete_usage_reply()
# step kwarg required
step_raw = cmd.kwargs.get("step", "").strip()
if not step_raw or not step_raw.isdigit():
return complete_usage_reply()
step_index = int(step_raw)
# status kwarg required
status = cmd.kwargs.get("status", "").strip().lower()
if status not in ("ok", "warn", "fail", "skipped"):
return complete_usage_reply()
# notes: kwarg or remaining positional args (joined with space)
notes = cmd.kwargs.get("notes", "").strip()
if not notes and len(cmd.args) > 1:
notes = " ".join(cmd.args[1:])
notes = sanitize_notes(notes) # M3.4: strip control chars + truncate to MAX_NOTES_LEN
http_status: Optional[int] = None
try:
result = await _ctrl_runner.complete_runbook_step(
http_client=http_client,
console_url=self._console_url,
control_token=self._control_token,
run_id=run_id,
step_index=step_index,
status=status,
notes=notes,
operator_id=sender,
)
http_status = 200
run_completed = bool(result.get("run_completed", False))
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.complete",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender,
"run_id": run_id,
"step_index": step_index,
"status": status,
"run_completed": run_completed,
"http_status": http_status,
},
)
return complete_ok_reply(run_id, step_index, status, run_completed)
except _ctrl_runner.RunnerError as exc:
reason = str(exc)
logger.error(
"!runbook complete failed: sender=%s run_id=%r step=%d error=%s",
sender, run_id, step_index, reason,
)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.complete",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="runner_error",
data={
"sender": sender,
"run_id": run_id,
"step_index": step_index,
"error": reason,
"http_status": http_status,
},
)
return complete_error_reply(run_id, reason)
async def _handle_runbook_status(
self,
http_client: httpx.AsyncClient,
client: "MatrixClient",
cmd: ControlCommand,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""M3.3: !runbook status <run_id> — GET run info + format status."""
run_id = cmd.args[0].strip() if cmd.args else ""
if not run_id:
return status_usage_reply()
http_status: Optional[int] = None
try:
result = await _ctrl_runner.get_runbook_run(
http_client=http_client,
console_url=self._console_url,
control_token=self._control_token,
run_id=run_id,
)
http_status = 200
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.status",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender, "run_id": run_id,
"run_status": result.get("status"),
"http_status": http_status,
},
)
return status_reply(result)
except _ctrl_runner.RunnerError as exc:
reason = str(exc)
logger.error("!runbook status failed: sender=%s run_id=%r error=%s", sender, run_id, reason)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.status",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="runner_error",
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
)
return status_error_reply(run_id, reason)
async def _handle_runbook_evidence(
self,
http_client: httpx.AsyncClient,
client: "MatrixClient",
cmd: ControlCommand,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""M3.3: !runbook evidence <run_id> — generate release evidence."""
run_id = cmd.args[0].strip() if cmd.args else ""
if not run_id:
return evidence_usage_reply()
http_status: Optional[int] = None
try:
result = await _ctrl_runner.generate_evidence(
http_client=http_client,
console_url=self._console_url,
control_token=self._control_token,
run_id=run_id,
)
http_status = 200
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.evidence",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender, "run_id": run_id,
"evidence_path": result.get("evidence_path"),
"bytes": result.get("bytes"),
"http_status": http_status,
},
)
return evidence_reply(result)
except _ctrl_runner.RunnerError as exc:
reason = str(exc)
logger.error("!runbook evidence failed: sender=%s run_id=%r error=%s", sender, run_id, reason)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.evidence",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="runner_error",
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
)
return evidence_error_reply(run_id, reason)
async def _handle_runbook_post_review(
self,
http_client: httpx.AsyncClient,
client: "MatrixClient",
cmd: ControlCommand,
sender: str,
room_id: str,
event_id: str,
) -> str:
"""M3.3: !runbook post_review <run_id> — generate post-release review."""
run_id = cmd.args[0].strip() if cmd.args else ""
if not run_id:
return post_review_usage_reply()
http_status: Optional[int] = None
try:
result = await _ctrl_runner.generate_post_review(
http_client=http_client,
console_url=self._console_url,
control_token=self._control_token,
run_id=run_id,
)
http_status = 200
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.post_review",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender, "run_id": run_id,
"path": result.get("path"),
"bytes": result.get("bytes"),
"http_status": http_status,
},
)
return post_review_reply(result)
except _ctrl_runner.RunnerError as exc:
reason = str(exc)
logger.error("!runbook post_review failed: sender=%s run_id=%r error=%s", sender, run_id, reason)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.control.runbook.post_review",
agent_id="control", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="runner_error",
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
)
return post_review_error_reply(run_id, reason)
# ── Worker ─────────────────────────────────────────────────────────────────
async def _worker(
self,
queue: "asyncio.Queue[Optional[_QueueEntry]]",
client: MatrixClient,
http_client: httpx.AsyncClient,
) -> None:
"""Consume queue entries until cancelled."""
while True:
entry = await queue.get() # blocks until item available; raises CancelledError on cancel
try:
await self._process_entry(client, http_client, entry)
except Exception as exc:
logger.error("Worker unhandled error: %s", exc)
finally:
queue.task_done()
if self._on_queue_size:
self._on_queue_size(queue.qsize())
# ── Process (invoke + send + audit) ───────────────────────────────────────
async def _process_entry(
self,
client: MatrixClient,
http_client: httpx.AsyncClient,
entry: _QueueEntry,
) -> None:
event = entry.event
event_id = event.get("event_id", "")
sender = event.get("sender", "")
text = event.get("content", {}).get("body", "").strip()
room_id = entry.room_id
agent_id = entry.agent_id
# H3: Queue wait latency
wait_s = time.monotonic() - entry.enqueue_time
if self._on_queue_wait:
self._on_queue_wait(agent_id, wait_s)
routing_reason = entry.routing_reason
is_mixed = entry.is_mixed
logger.info(
"Processing: room=%s agent=%s event=%s len=%d wait=%.3fs mixed=%s reason=%s",
room_id, agent_id, event_id, len(text), wait_s, is_mixed, routing_reason,
)
if self._on_message_received:
self._on_message_received(room_id, agent_id)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.message.received",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={
"sender": sender,
"text_len": len(text),
"queue_wait_ms": int(wait_s * 1000),
"routing_reason": routing_reason,
"is_mixed": is_mixed,
},
)
# Session Scope v2: canonical key + PII-safe sender hash.
# Scope is always room_agent for user-initiated messages.
# Control room messages never reach _process_entry (handled by _try_control).
scope = SCOPE_ROOM_AGENT
session_id = _build_session_key(room_id, agent_id, scope=scope)
sender_hash = _sender_hash(sender)
logger.debug(
"Session scope v2: session_key=%s scope=%s sender_hash=%s",
session_id, scope, sender_hash,
)
# M5.0: Node-aware routing
# Extract node=X kwarg from body (mixed rooms only, to avoid breaking direct rooms)
explicit_node: Optional[str] = None
effective_text = text
if is_mixed and self._node_policy is not None:
explicit_node, effective_text = extract_node_kwarg(text)
# M6.0: look up dynamic policy store override for this room
store_override: Optional[str] = None
if self._policy_store is not None and self._policy_store.is_open:
try:
store_override = await asyncio.to_thread(
self._policy_store.get_override, room_id
)
except Exception as exc: # noqa: BLE001
logger.warning("PolicyStore get_override failed: %s", exc)
node_res = (
self._node_policy.resolve(room_id, explicit_node, store_override=store_override)
if self._node_policy is not None
else NodeResolution(node_id=self._node_id, source=NODE_SOURCE_DEFAULT)
)
if node_res.rejected_node:
logger.info(
"Node kwarg rejected: requested=%s allowed=%s room=%s agent=%s",
node_res.rejected_node, self._node_policy.allowed_nodes if self._node_policy else {}, room_id, agent_id,
)
if self._on_node_rejected:
self._on_node_rejected(node_res.rejected_node)
txn_rej = MatrixClient.make_txn_id(room_id, event_id + "_node_rej")
allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset()
reply_rej = node_rejected_reply(node_res.rejected_node, allowed)
try:
await client.send_text(room_id, reply_rej, txn_rej)
except Exception as exc:
logger.warning("Could not send node rejection reply: %s", exc)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.route.node_rejected",
agent_id=agent_id, node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="node_rejected",
data={"requested_node": node_res.rejected_node, "resolved_node": node_res.node_id},
)
# Continue with fallback node (do not drop the message)
if self._on_node_selected:
self._on_node_selected(agent_id, node_res.node_id, node_res.source)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.route.node_selected",
agent_id=agent_id, node_id=node_res.node_id,
room_id=room_id, event_id=event_id,
status="ok",
data={"node_id": node_res.node_id, "source": node_res.source},
)
# M2.2: per-room-agent concurrency cap (only for mixed rooms; single-agent rooms unaffected)
_lock = self._get_concurrency_lock(room_id, agent_id) if is_mixed and self._mixed_concurrency_cap > 0 else None
if _lock is not None:
await _lock.acquire()
try:
await self._invoke_and_send(
client, http_client, entry, session_id, wait_s, is_mixed, routing_reason,
sender_hash=sender_hash, scope=scope,
effective_node_id=node_res.node_id, node_source=node_res.source,
effective_text=effective_text,
)
finally:
if _lock is not None:
_lock.release()
async def _invoke_and_send(
self,
client: MatrixClient,
http_client: httpx.AsyncClient,
entry: _QueueEntry,
session_id: str,
wait_s: float,
is_mixed: bool,
routing_reason: str,
sender_hash: str = "",
scope: str = SCOPE_ROOM_AGENT,
# M5.0: resolved node
effective_node_id: Optional[str] = None,
node_source: str = NODE_SOURCE_DEFAULT,
effective_text: Optional[str] = None, # text with node=X kwarg stripped
) -> None:
"""Inner: invoke Router + send reply (separated for concurrency lock wrapping)."""
event = entry.event
event_id = event.get("event_id", "")
# Use effective_text if provided (node kwarg stripped), otherwise original body
text = effective_text if effective_text is not None else event.get("content", {}).get("body", "").strip()
room_id = entry.room_id
agent_id = entry.agent_id
node_id = effective_node_id if effective_node_id is not None else self._node_id
# H3 + M8.0 + M8.1: Invoke with latency tracking, soft-failover, and sticky routing
t0 = time.monotonic()
reply_text: Optional[str] = None
invoke_ok = False
invoke_duration_s = 0.0
used_node_id = node_id # may change on failover
# M8.1: check sticky cache (skip primary if sticky is set for this room:agent)
sticky_key = make_sticky_key(room_id, agent_id)
sticky_node: Optional[str] = None
if node_source != NODE_SOURCE_EXPLICIT and self._sticky_cache is not None:
sticky_node = self._sticky_cache.get(sticky_key)
async def _do_invoke(target_node: str, target_source: str) -> Optional[str]:
"""Single invoke attempt; returns reply text or None on failure."""
nonlocal invoke_duration_s
_t = time.monotonic()
try:
result = await _invoke_router(
http_client, self._router_url,
agent_id=agent_id, node_id=target_node,
prompt=text, session_id=session_id,
sender_hash=sender_hash, scope=scope,
node_source=target_source,
)
invoke_duration_s = time.monotonic() - _t
if self._node_health_tracker is not None:
self._node_health_tracker.record_ok(target_node, invoke_duration_s)
if self._on_invoke_latency:
self._on_invoke_latency(agent_id, invoke_duration_s, target_node)
logger.info(
"Invoke ok: agent=%s node=%s event=%s reply_len=%d duration=%dms",
agent_id, target_node, event_id, len(result or ""), int(invoke_duration_s * 1000),
)
return result
except httpx.HTTPStatusError as exc:
invoke_duration_s = time.monotonic() - _t
_reason = FAILOVER_REASON_HTTP_5XX if exc.response.status_code >= 500 else "http_4xx"
logger.error(
"Router HTTP %d agent=%s node=%s event=%s duration=%dms",
exc.response.status_code, agent_id, target_node, event_id,
int(invoke_duration_s * 1000),
)
if self._node_health_tracker and exc.response.status_code >= 500:
self._node_health_tracker.record_error(target_node, _reason)
if self._on_gateway_error:
self._on_gateway_error(f"http_{exc.response.status_code}")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.error", agent_id=agent_id, node_id=target_node,
room_id=room_id, event_id=event_id,
status="error", error_code=f"router_http_{exc.response.status_code}",
duration_ms=int(invoke_duration_s * 1000),
)
if exc.response.status_code >= 500:
raise # eligible for failover
return None # 4xx: not a node issue, don't failover
except (httpx.ConnectError, httpx.TimeoutException) as exc:
invoke_duration_s = time.monotonic() - _t
_reason = (
FAILOVER_REASON_TIMEOUT
if isinstance(exc, httpx.TimeoutException)
else FAILOVER_REASON_NETWORK
)
logger.error(
"Router network error agent=%s node=%s event=%s: %s",
agent_id, target_node, event_id, exc,
)
if self._node_health_tracker:
self._node_health_tracker.record_error(target_node, _reason)
if self._on_gateway_error:
self._on_gateway_error("network_error")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.error", agent_id=agent_id, node_id=target_node,
room_id=room_id, event_id=event_id,
status="error", error_code="router_network_error",
duration_ms=int(invoke_duration_s * 1000),
)
raise # eligible for failover
except Exception as exc:
invoke_duration_s = time.monotonic() - _t
logger.error(
"Unexpected invoke error agent=%s node=%s event=%s: %s",
agent_id, target_node, event_id, exc,
)
if self._node_health_tracker:
self._node_health_tracker.record_error(target_node, "unexpected")
if self._on_gateway_error:
self._on_gateway_error("unexpected")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.error", agent_id=agent_id, node_id=target_node,
room_id=room_id, event_id=event_id,
status="error", error_code="router_unexpected",
duration_ms=int(invoke_duration_s * 1000),
)
return None # unexpected errors: no failover (could be code bug)
if sticky_node is not None:
# M8.1: sticky path — route directly to known-good fallback, skip primary
logger.info(
"Sticky: routing %s%s (skipping primary=%s) agent=%s event=%s",
sticky_key, sticky_node, node_id, agent_id, event_id,
)
try:
reply_text = await _do_invoke(sticky_node, NODE_SOURCE_DEFAULT)
invoke_ok = reply_text is not None
used_node_id = sticky_node
except Exception: # noqa: BLE001
# Sticky node also failed — clear sticky and leave reply_text=None
self._sticky_cache.delete(sticky_key) # type: ignore[union-attr]
logger.warning(
"Sticky node %s failed for %s — cleared (agent=%s event=%s)",
sticky_node, sticky_key, agent_id, event_id,
)
# M8.2: remove from DB as well
await self._sticky_persist_delete(sticky_key)
else:
# Normal path: try primary; attempt failover on eligible errors
try:
reply_text = await _do_invoke(node_id, node_source)
invoke_ok = reply_text is not None
used_node_id = node_id
except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError):
# Primary failed with a failover-eligible error.
# Failover only for non-explicit routing (explicit = user chose node).
if node_source == NODE_SOURCE_EXPLICIT:
logger.info(
"Node %s failed for explicit routing — no failover (agent=%s event=%s)",
node_id, agent_id, event_id,
)
# reply_text stays None; error already audited
else:
# Attempt failover
fallback_node: Optional[str] = None
if self._node_health_tracker is not None and self._node_policy is not None:
fallback_node = self._node_health_tracker.pick_fallback(
node_id, self._node_policy.allowed_nodes
)
elif self._node_policy is not None:
# No tracker — pick any other allowed node deterministically
others = sorted(
n for n in self._node_policy.allowed_nodes if n != node_id
)
fallback_node = others[0] if others else None
if fallback_node:
logger.warning(
"Failover: %s%s agent=%s event=%s",
node_id, fallback_node, agent_id, event_id,
)
try:
reply_text = await _do_invoke(fallback_node, NODE_SOURCE_DEFAULT)
invoke_ok = reply_text is not None
used_node_id = fallback_node
if invoke_ok:
# Fire failover callback and audit
if self._on_failover:
self._on_failover(node_id, fallback_node, "invoke_error")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.node.failover",
agent_id=agent_id, node_id=fallback_node,
room_id=room_id, event_id=event_id,
status="ok", data={
"from_node": node_id,
"to_node": fallback_node,
"original_source": node_source,
},
)
# M8.1: set sticky — future messages skip primary
if self._sticky_cache is not None:
self._sticky_cache.set(sticky_key, fallback_node)
scope = "mixed" if is_mixed else "direct"
if self._on_sticky_set:
self._on_sticky_set(fallback_node, scope)
logger.info(
"Sticky set: %s%s scope=%s ttl=%.0fs",
sticky_key, fallback_node, scope,
self._sticky_cache.ttl_s,
)
# M8.2: persist sticky to DB
await self._sticky_persist_set(sticky_key, fallback_node)
except Exception: # noqa: BLE001
pass # errors already audited inside _do_invoke
if not invoke_ok or not reply_text:
if invoke_ok:
logger.warning("Empty reply from router agent=%s event=%s", agent_id, event_id)
return
# H3: Send with latency
# M2.1: prefix reply with agent identity in mixed rooms ("Sofiia: ...")
prefix = reply_prefix(agent_id, is_mixed)
raw_reply = reply_text[:_REPLY_TEXT_MAX - len(prefix)]
send_text = prefix + raw_reply
txn_id = MatrixClient.make_txn_id(room_id, event_id)
send_t0 = time.monotonic()
try:
await client.send_text(room_id, send_text, txn_id)
send_duration_s = time.monotonic() - send_t0
if self._on_send_latency:
self._on_send_latency(agent_id, send_duration_s)
if self._on_message_replied:
self._on_message_replied(room_id, agent_id, "ok")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.agent.replied", agent_id=agent_id, node_id=used_node_id,
room_id=room_id, event_id=event_id, status="ok",
duration_ms=int(send_duration_s * 1000),
data={
"reply_len": len(send_text),
"truncated": len(reply_text) > _REPLY_TEXT_MAX,
"router_duration_ms": int(invoke_duration_s * 1000),
"queue_wait_ms": int(wait_s * 1000),
"routing_reason": routing_reason,
"is_mixed": is_mixed,
"node_source": node_source,
"failover": used_node_id != node_id, # M8.0: failover flag
},
)
logger.info(
"Reply sent: agent=%s event=%s reply_len=%d send_ms=%d",
agent_id, event_id, len(send_text), int(send_duration_s * 1000),
)
except Exception as exc:
send_duration_s = time.monotonic() - send_t0
logger.error("Send failed agent=%s event=%s: %s", agent_id, event_id, exc)
if self._on_message_replied:
self._on_message_replied(room_id, agent_id, "error")
if self._on_gateway_error:
self._on_gateway_error("matrix_send_error")
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.error", agent_id=agent_id, node_id=node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="matrix_send_failed",
duration_ms=int(send_duration_s * 1000),
)