Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
3576 lines
161 KiB
Python
3576 lines
161 KiB
Python
"""
|
|
Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 + M3.1 + M3.3 + SessionScopeV2
|
|
|
|
Architecture (H2):
|
|
Reader task → asyncio.Queue(maxsize) → N Worker tasks
|
|
─────────────────────────────────────────────────────────
|
|
Reader:
|
|
sync_poll() → extract_room_messages()
|
|
→ rate_limit check (H1)
|
|
→ mark_seen / dedupe
|
|
→ queue.put_nowait() or DROP (audit matrix.queue_full + metric)
|
|
|
|
Workers (N concurrent):
|
|
queue.get() → measure wait latency (H3)
|
|
→ audit matrix.message.received
|
|
→ invoke Router (timed, H3)
|
|
→ send_text() (timed, H3)
|
|
→ audit matrix.agent.replied | matrix.error
|
|
|
|
Shutdown:
|
|
1. stop_event set → reader exits loop
|
|
2. queue.join() with drain_timeout → workers finish in-flight
|
|
3. worker tasks cancelled
|
|
|
|
Queue entry: _QueueEntry(event, room_id, agent_id, enqueue_time, routing_reason, is_mixed)
|
|
"""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json as _json
|
|
import logging
|
|
import os as _os
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Callable, Dict, List, Optional
|
|
|
|
import httpx
|
|
|
|
from .control import (
|
|
ControlConfig, ControlCommand,
|
|
check_authorization, parse_command, is_control_message,
|
|
not_implemented_reply, unknown_command_reply, unauthorized_reply, help_reply,
|
|
start_usage_reply, runbook_started_reply, runbook_start_error_reply,
|
|
next_usage_reply, next_manual_reply, next_auto_reply, next_error_reply,
|
|
complete_usage_reply, complete_ok_reply, complete_error_reply,
|
|
status_usage_reply, status_reply, status_error_reply,
|
|
evidence_usage_reply, evidence_reply, evidence_error_reply,
|
|
post_review_usage_reply, post_review_reply, post_review_error_reply,
|
|
rate_limited_reply, sanitize_notes, MAX_NOTES_LEN,
|
|
status_not_available_reply, nodes_reply,
|
|
VERB_HELP, VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE, VERB_CONFIRM,
|
|
is_dangerous_cmd, build_normalized_args,
|
|
confirm_intent_reply, confirm_success_reply, confirm_expired_reply,
|
|
NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST,
|
|
parse_node_cmd, node_cmd_validate_room,
|
|
node_cmd_reply_set, node_cmd_reply_unset_ok, node_cmd_reply_unset_not_found,
|
|
node_cmd_reply_get, node_cmd_reply_list, node_cmd_reply_error,
|
|
VERB_ROOM, ROOM_SUBCMD_AGENTS,
|
|
ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
|
|
ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET,
|
|
parse_room_agents_cmd,
|
|
room_agents_reply_set, room_agents_reply_add, room_agents_reply_remove,
|
|
room_agents_reply_unset_ok, room_agents_reply_unset_not_found,
|
|
room_agents_reply_get, room_agents_reply_list, room_agents_reply_error,
|
|
VERB_POLICY, POLICY_EXPORTS_SUBDIR, validate_export_path,
|
|
policy_import_intent_reply, format_import_diff as _format_import_diff,
|
|
policy_export_reply, policy_import_dry_run_reply, policy_import_reply,
|
|
policy_cmd_error,
|
|
policy_prune_preview_reply, policy_prune_applied_reply,
|
|
policy_restore_intent_reply, policy_restore_applied_reply,
|
|
policy_history_reply, policy_change_detail_reply,
|
|
SUBCOMMAND_START, SUBCOMMAND_NEXT, SUBCOMMAND_COMPLETE,
|
|
SUBCOMMAND_STATUS, SUBCOMMAND_EVIDENCE, SUBCOMMAND_POST_REVIEW,
|
|
)
|
|
from .control_limiter import ControlRateLimiter
|
|
from .discovery import agents_reply, bridge_status_reply, is_discovery_message
|
|
from .event_store import EventStore
|
|
from .node_policy import (
|
|
NodePolicy, NodeResolution,
|
|
NODE_SOURCE_DEFAULT, NODE_SOURCE_EXPLICIT, NODE_SOURCE_ROOM_MAP,
|
|
extract_node_kwarg, node_rejected_reply,
|
|
)
|
|
from .node_health import (
|
|
NodeHealthTracker, NodeHealthConfig,
|
|
NODE_STATE_HEALTHY, NODE_STATE_DEGRADED, NODE_STATE_DOWN,
|
|
FAILOVER_REASON_TIMEOUT, FAILOVER_REASON_HTTP_5XX, FAILOVER_REASON_NETWORK,
|
|
)
|
|
from .sticky_cache import StickyNodeCache, make_sticky_key
|
|
from .confirm_store import ConfirmStore
|
|
from . import control_runner as _ctrl_runner
|
|
from .matrix_client import MatrixClient
|
|
from .mixed_routing import (
|
|
MixedRoomConfig, MixedRoom, route_message, reply_prefix, build_override_config,
|
|
REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING,
|
|
)
|
|
from .rate_limit import InMemoryRateLimiter
|
|
from .room_mapping import RoomMappingConfig, RoomMapping
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Constants ──────────────────────────────────────────────────────────────────
|
|
|
|
_MAX_RETRY_BACKOFF = 60.0
|
|
_INIT_RETRY_BACKOFF = 2.0
|
|
_ROUTER_TIMEOUT_S = 45.0
|
|
_AUDIT_TIMEOUT_S = 5.0
|
|
_REPLY_TEXT_MAX = 4000
|
|
_WORKER_GET_TIMEOUT_S = 1.0 # how long a worker waits on empty queue before re-checking
|
|
|
|
|
|
# ── Queue entry ────────────────────────────────────────────────────────────────
|
|
|
|
@dataclass
|
|
class _QueueEntry:
|
|
event: Dict[str, Any]
|
|
room_id: str
|
|
agent_id: str
|
|
enqueue_time: float # time.monotonic() at enqueue
|
|
routing_reason: str = "direct"
|
|
is_mixed: bool = False # True for mixed-room entries (reply tagging, session isolation)
|
|
|
|
|
|
# ── Session Scope v2 ───────────────────────────────────────────────────────────
|
|
# Invariants:
|
|
# 1. Control room messages never reach the Router (no session_key needed there).
|
|
# 2. Matrix and Telegram never share a key namespace (prefix "matrix:").
|
|
# 3. Mixed rooms: each (room_id, agent_id) pair has its own key — no cross-agent leakage.
|
|
# 4. Logs/metrics receive sender_hash (sha256[:16]), never raw Matrix user_id.
|
|
|
|
SCOPE_ROOM_AGENT = "room_agent" # default: shared room context per agent
|
|
SCOPE_OPS_RUNBOOK = "ops_runbook" # future: control/ops room invocations
|
|
SCOPE_DM_USER = "dm_agent_user" # future: per-user DM isolation
|
|
|
|
|
|
def _sender_hash(sender: str) -> str:
|
|
"""PII-safe 16-hex hash of a Matrix user_id (e.g. @alice:server → 'a3f9...')."""
|
|
return hashlib.sha256(sender.encode()).hexdigest()[:16]
|
|
|
|
|
|
def _build_session_key(room_id: str, agent_id: str, scope: str = SCOPE_ROOM_AGENT) -> str:
|
|
"""
|
|
Canonical session key v2: matrix:{scope}:{room_key}:{agent_id}
|
|
|
|
Examples:
|
|
matrix:room_agent:roomXserver_yourdomain:sofiia
|
|
matrix:ops_runbook:opsroomXserver:sofiia
|
|
"""
|
|
room_key = room_id.replace("!", "").replace(":", "_")
|
|
return f"matrix:{scope}:{room_key}:{agent_id}"
|
|
|
|
|
|
# ── Router invoke ──────────────────────────────────────────────────────────────
|
|
|
|
async def _invoke_router(
|
|
http_client: httpx.AsyncClient,
|
|
router_url: str,
|
|
agent_id: str,
|
|
node_id: str,
|
|
prompt: str,
|
|
session_id: str,
|
|
sender_hash: str = "",
|
|
scope: str = SCOPE_ROOM_AGENT,
|
|
node_source: str = NODE_SOURCE_DEFAULT,
|
|
) -> str:
|
|
"""POST /v1/agents/{agent_id}/infer → response text. Raises httpx.HTTPError on failure."""
|
|
url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer"
|
|
payload = {
|
|
"prompt": prompt,
|
|
"session_id": session_id,
|
|
"user_id": "matrix_bridge",
|
|
"metadata": {
|
|
"transport": "matrix",
|
|
"node_id": node_id,
|
|
"node_source": node_source, # M5.0: how node was resolved
|
|
"session_key": session_id, # explicit for Router/Memory to index on
|
|
"sender_hash": sender_hash, # PII-safe sender fingerprint
|
|
"scope": scope,
|
|
},
|
|
}
|
|
resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
text = (
|
|
data.get("response")
|
|
or data.get("text")
|
|
or data.get("content")
|
|
or data.get("message")
|
|
or ""
|
|
)
|
|
return (text if isinstance(text, str) else str(text)).strip()
|
|
|
|
|
|
# ── M6.2: File helpers (run in thread) ────────────────────────────────────────
|
|
|
|
def _write_json_file(path: str, data: Any) -> None:
|
|
"""Synchronously write data as JSON to path (UTF-8, 2-space indent)."""
|
|
with open(path, "w", encoding="utf-8") as fh:
|
|
_json.dump(data, fh, ensure_ascii=False, indent=2)
|
|
|
|
|
|
def _read_json_file(path: str) -> Any:
|
|
"""Synchronously read and parse a JSON file."""
|
|
with open(path, encoding="utf-8") as fh:
|
|
return _json.load(fh)
|
|
|
|
|
|
# ── Audit write ────────────────────────────────────────────────────────────────
|
|
|
|
async def _write_audit(
|
|
http_client: httpx.AsyncClient,
|
|
console_url: str,
|
|
internal_token: str,
|
|
event: str,
|
|
agent_id: str,
|
|
node_id: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
status: str = "ok",
|
|
error_code: Optional[str] = None,
|
|
duration_ms: Optional[int] = None,
|
|
data: Optional[Dict[str, Any]] = None,
|
|
) -> None:
|
|
"""Fire-and-forget. Never raises."""
|
|
if not console_url or not internal_token:
|
|
return
|
|
try:
|
|
await http_client.post(
|
|
f"{console_url.rstrip('/')}/api/audit/internal",
|
|
json={
|
|
"event": event,
|
|
"operator_id": "matrix_bridge",
|
|
"node_id": node_id,
|
|
"agent_id": agent_id,
|
|
"chat_id": room_id,
|
|
"status": status,
|
|
"error_code": error_code,
|
|
"duration_ms": duration_ms,
|
|
"data": {"matrix_event_id": event_id, "matrix_room_id": room_id, **(data or {})},
|
|
},
|
|
headers={"X-Internal-Service-Token": internal_token},
|
|
timeout=_AUDIT_TIMEOUT_S,
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("Audit write failed (non-blocking): %s", exc)
|
|
|
|
|
|
# ── Ingress loop (reader + workers) ───────────────────────────────────────────
|
|
|
|
class MatrixIngressLoop:
|
|
"""
|
|
Drives the full Matrix → Router → Matrix pipeline with backpressure.
|
|
|
|
Reader task: sync → extract → rate_check → dedupe → queue.put_nowait
|
|
Worker tasks: queue.get → invoke → send → audit
|
|
|
|
Metric callbacks (all optional, called synchronously):
|
|
on_message_received(room_id, agent_id)
|
|
on_message_replied(room_id, agent_id, status)
|
|
on_gateway_error(error_type)
|
|
on_rate_limited(room_id, agent_id, limit_type)
|
|
on_queue_dropped(room_id, agent_id)
|
|
on_queue_size(current_size: int)
|
|
on_invoke_latency(agent_id, duration_seconds)
|
|
on_send_latency(agent_id, duration_seconds)
|
|
on_queue_wait(agent_id, wait_seconds)
|
|
on_routed(agent_id, reason) M2.2: successful routing
|
|
on_route_rejected(room_id, reason) M2.2: routing rejection
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
matrix_homeserver_url: str,
|
|
matrix_access_token: str,
|
|
matrix_user_id: str,
|
|
router_url: str,
|
|
node_id: str,
|
|
room_map: RoomMappingConfig,
|
|
sofiia_console_url: str = "",
|
|
sofiia_internal_token: str = "",
|
|
rate_limiter: Optional[InMemoryRateLimiter] = None,
|
|
queue_max_events: int = 100,
|
|
worker_concurrency: int = 2,
|
|
queue_drain_timeout_s: float = 5.0,
|
|
mixed_room_config: Optional[MixedRoomConfig] = None,
|
|
# M2.2: guard rails
|
|
unknown_agent_behavior: str = "ignore", # "ignore" | "reply_error"
|
|
max_slash_len: int = 32,
|
|
mixed_concurrency_cap: int = 1, # 0 = unlimited
|
|
# M3.0: control channel
|
|
control_config: Optional[ControlConfig] = None,
|
|
control_unauthorized_behavior: str = "ignore", # "ignore" | "reply_error"
|
|
# M3.1: runbook runner integration
|
|
sofiia_control_token: str = "",
|
|
# M3.4: control channel safety
|
|
control_limiter: Optional["ControlRateLimiter"] = None,
|
|
# M2.3: persistent event deduplication
|
|
event_store: Optional["EventStore"] = None,
|
|
# M4.0: agent discovery
|
|
discovery_rpm: int = 20,
|
|
# M5.0: node-aware routing
|
|
node_policy: Optional["NodePolicy"] = None,
|
|
# Callbacks
|
|
on_message_received: Optional[Callable[[str, str], None]] = None,
|
|
on_message_replied: Optional[Callable[[str, str, str], None]] = None,
|
|
on_gateway_error: Optional[Callable[[str], None]] = None,
|
|
on_rate_limited: Optional[Callable[[str, str, str], None]] = None,
|
|
on_queue_dropped: Optional[Callable[[str, str], None]] = None,
|
|
on_queue_size: Optional[Callable[[int], None]] = None,
|
|
on_invoke_latency: Optional[Callable[..., None]] = None, # (agent_id, duration_s, node_id="")
|
|
on_send_latency: Optional[Callable[[str, float], None]] = None,
|
|
on_queue_wait: Optional[Callable[[str, float], None]] = None,
|
|
on_routed: Optional[Callable[[str, str], None]] = None,
|
|
on_route_rejected: Optional[Callable[[str, str], None]] = None,
|
|
on_control_command: Optional[Callable[[str, str, str], None]] = None,
|
|
on_control_rate_limited: Optional[Callable[[str], None]] = None,
|
|
on_dedupe_persistent_hit: Optional[Callable[[str, str], None]] = None,
|
|
on_dedupe_persistent_insert: Optional[Callable[[], None]] = None,
|
|
# M5.0: node routing callbacks
|
|
on_node_selected: Optional[Callable[[str, str, str], None]] = None, # (agent_id, node_id, source)
|
|
on_node_rejected: Optional[Callable[[str], None]] = None, # (rejected_node)
|
|
# M5.1: per-node stats for !status reply
|
|
node_stats_getter: Optional[Callable[[], Dict[str, Any]]] = None,
|
|
# M6.0: persistent policy store for dynamic room-node overrides
|
|
policy_store: Optional[Any] = None, # app.policy_store.PolicyStore
|
|
# M6.2: data directory for policy exports/imports
|
|
bridge_data_dir: Optional[str] = None,
|
|
# M8.0: node health tracker for soft-failover
|
|
node_health_tracker: Optional[NodeHealthTracker] = None,
|
|
on_failover: Optional[Callable[[str, str, str], None]] = None, # (from_node, to_node, reason)
|
|
# M8.1: sticky failover cache (anti-flap)
|
|
sticky_cache: Optional[StickyNodeCache] = None,
|
|
on_sticky_set: Optional[Callable[[str, str], None]] = None, # (node_id, scope)
|
|
# M8.2: HA state persistence config
|
|
ha_health_snapshot_interval_s: int = 60,
|
|
ha_health_max_age_s: int = 600,
|
|
# M9.0: Two-step confirmation store for dangerous commands
|
|
confirm_store: Optional[ConfirmStore] = None,
|
|
# M10.0: Auto-backup retention policy (days; 0 = keep forever)
|
|
policy_export_retention_days: int = 30,
|
|
# M10.2: max rows to keep in policy_changes history (0 = unlimited)
|
|
policy_history_limit: int = 100,
|
|
) -> None:
|
|
self._hs_url = matrix_homeserver_url
|
|
self._token = matrix_access_token
|
|
self._user_id = matrix_user_id
|
|
self._router_url = router_url
|
|
self._node_id = node_id
|
|
self._room_map = room_map
|
|
self._console_url = sofiia_console_url
|
|
self._internal_token = sofiia_internal_token
|
|
self._rate_limiter = rate_limiter
|
|
self._queue_max = queue_max_events
|
|
self._worker_count = worker_concurrency
|
|
self._drain_timeout_s = queue_drain_timeout_s
|
|
# Callbacks
|
|
self._on_message_received = on_message_received
|
|
self._on_message_replied = on_message_replied
|
|
self._on_gateway_error = on_gateway_error
|
|
self._on_rate_limited = on_rate_limited
|
|
self._on_queue_dropped = on_queue_dropped
|
|
self._on_queue_size = on_queue_size
|
|
self._on_invoke_latency = on_invoke_latency
|
|
self._on_send_latency = on_send_latency
|
|
self._on_queue_wait = on_queue_wait
|
|
self._mixed_room_config = mixed_room_config
|
|
self._control_config = control_config
|
|
self._control_unauthorized_behavior = control_unauthorized_behavior
|
|
self._control_token = sofiia_control_token
|
|
self._unknown_agent_behavior = unknown_agent_behavior
|
|
self._max_slash_len = max_slash_len
|
|
self._mixed_concurrency_cap = mixed_concurrency_cap
|
|
self._on_routed = on_routed
|
|
self._on_route_rejected = on_route_rejected
|
|
self._on_control_command = on_control_command
|
|
self._on_control_rate_limited = on_control_rate_limited
|
|
# M3.4: control channel safety
|
|
self._control_limiter = control_limiter
|
|
# M2.3: persistent event deduplication
|
|
self._event_store: Optional[EventStore] = event_store
|
|
self._on_dedupe_persistent_hit = on_dedupe_persistent_hit
|
|
self._on_dedupe_persistent_insert = on_dedupe_persistent_insert
|
|
# M5.0: node routing callbacks
|
|
self._on_node_selected = on_node_selected
|
|
self._on_node_rejected = on_node_rejected
|
|
# M5.1: per-node stats getter for !status reply
|
|
self._node_stats_getter = node_stats_getter
|
|
# M6.0: persistent policy store
|
|
self._policy_store = policy_store
|
|
# M6.2: policy exports directory
|
|
self._bridge_data_dir: Optional[str] = bridge_data_dir
|
|
self._policy_last_export_at: Optional[int] = None
|
|
self._policy_last_import_at: Optional[int] = None
|
|
# M10.0: auto-backup retention
|
|
self._policy_export_retention_days: int = policy_export_retention_days
|
|
# M10.2: history table row limit
|
|
self._policy_history_limit: int = policy_history_limit
|
|
# M8.0: node health + soft-failover
|
|
self._node_health_tracker: Optional[NodeHealthTracker] = node_health_tracker
|
|
self._on_failover = on_failover
|
|
# M8.1: sticky failover cache
|
|
self._sticky_cache: Optional[StickyNodeCache] = sticky_cache
|
|
self._on_sticky_set = on_sticky_set
|
|
# M9.0: two-step confirmation store
|
|
self._confirm_store: Optional[ConfirmStore] = confirm_store
|
|
# M8.2: HA state persistence
|
|
self._ha_health_snapshot_interval_s: int = ha_health_snapshot_interval_s
|
|
self._ha_health_max_age_s: int = ha_health_max_age_s
|
|
self._ha_sticky_loaded: int = 0 # count of sticky entries loaded on startup
|
|
self._ha_health_loaded: bool = False # whether health state was loaded on startup
|
|
# M4.0: agent discovery — simple per-room sliding window (reuses InMemoryRateLimiter logic)
|
|
self._discovery_rpm = discovery_rpm
|
|
# M5.0: node-aware routing policy
|
|
self._node_policy: Optional[NodePolicy] = node_policy
|
|
from collections import defaultdict, deque
|
|
self._discovery_windows: dict = defaultdict(deque)
|
|
# Lazily populated semaphores keyed by "{room_id}:{agent_id}"
|
|
self._concurrency_locks: Dict[str, asyncio.Semaphore] = {}
|
|
self._next_batch: Optional[str] = None
|
|
self._queue: Optional[asyncio.Queue] = None # exposed for /health
|
|
|
|
@property
|
|
def next_batch(self) -> Optional[str]:
|
|
return self._next_batch
|
|
|
|
@property
|
|
def queue_size(self) -> int:
|
|
return self._queue.qsize() if self._queue else 0
|
|
|
|
@property
|
|
def worker_count(self) -> int:
|
|
return self._worker_count
|
|
|
|
def get_status(self) -> Dict[str, Any]:
|
|
"""Return a simple bridge status dict for health/ops queries."""
|
|
status: Dict[str, Any] = {
|
|
"queue_size": self._queue.qsize() if self._queue else 0,
|
|
"queue_max": self._queue_max,
|
|
"worker_count": self._worker_count,
|
|
}
|
|
if self._node_policy is not None:
|
|
status["node_policy"] = self._node_policy.as_info_dict()
|
|
# M5.1: per-node routed/rejected counters
|
|
if self._node_stats_getter is not None:
|
|
status["nodes"] = self._node_stats_getter()
|
|
# M6.0: policy store info
|
|
if self._policy_store is not None:
|
|
try:
|
|
status["policy_store_ok"] = self._policy_store.is_open
|
|
status["policy_store_path"] = self._policy_store.db_path
|
|
status["policy_overrides_count"] = self._policy_store.count_overrides()
|
|
status["policy_agent_overrides_count"] = self._policy_store.count_agent_overrides() # M6.1
|
|
except Exception as exc: # noqa: BLE001
|
|
status["policy_store_ok"] = False
|
|
status["policy_store_error"] = str(exc)
|
|
# M6.2: policy snapshot timestamps
|
|
if self._policy_last_export_at is not None:
|
|
status["policy_last_export_at"] = self._policy_last_export_at
|
|
if self._policy_last_import_at is not None:
|
|
status["policy_last_import_at"] = self._policy_last_import_at
|
|
# M6.2: policy DB mtime (best-effort)
|
|
if self._policy_store is not None:
|
|
try:
|
|
db_path = self._policy_store.db_path
|
|
if db_path and _os.path.exists(db_path):
|
|
status["policy_db_mtime"] = int(_os.path.getmtime(db_path))
|
|
except Exception: # noqa: BLE001
|
|
pass
|
|
# M10.2: policy change history count
|
|
if self._policy_store is not None and self._policy_store.is_open:
|
|
try:
|
|
status["policy_changes_count"] = self._policy_store.get_policy_changes_count()
|
|
except Exception: # noqa: BLE001
|
|
pass
|
|
# M8.0: node health tracker state
|
|
if self._node_health_tracker is not None:
|
|
allowed = (
|
|
self._node_policy.allowed_nodes
|
|
if self._node_policy is not None
|
|
else None
|
|
)
|
|
status["node_health"] = self._node_health_tracker.all_info(allowed)
|
|
# M8.1: sticky failover cache info
|
|
if self._sticky_cache is not None:
|
|
status["sticky_active_keys"] = self._sticky_cache.active_count()
|
|
status["sticky_ttl_s"] = self._sticky_cache.ttl_s
|
|
# M9.0: pending confirmations
|
|
if self._confirm_store is not None:
|
|
status["confirm_pending"] = self._confirm_store.pending_count()
|
|
status["confirm_ttl_s"] = self._confirm_store.ttl_s
|
|
# M8.2: HA persistence info
|
|
status["ha_sticky_loaded"] = self._ha_sticky_loaded
|
|
status["ha_health_loaded"] = self._ha_health_loaded
|
|
status["ha_health_snapshot_interval_s"] = self._ha_health_snapshot_interval_s
|
|
return status
|
|
|
|
@property
|
|
def active_lock_count(self) -> int:
|
|
"""Number of room-agent pairs currently holding a concurrency lock."""
|
|
return sum(1 for lock in self._concurrency_locks.values() if lock.locked())
|
|
|
|
def _get_concurrency_lock(self, room_id: str, agent_id: str) -> asyncio.Semaphore:
|
|
"""Lazily create and return the semaphore for a (room, agent) pair."""
|
|
key = f"{room_id}:{agent_id}"
|
|
if key not in self._concurrency_locks:
|
|
cap = self._mixed_concurrency_cap if self._mixed_concurrency_cap > 0 else 2 ** 31
|
|
self._concurrency_locks[key] = asyncio.Semaphore(cap)
|
|
return self._concurrency_locks[key]
|
|
|
|
# ── Public run ─────────────────────────────────────────────────────────────
|
|
|
|
async def run(self, stop_event: asyncio.Event) -> None:
|
|
mixed_rooms_count = self._mixed_room_config.total_rooms if self._mixed_room_config else 0
|
|
logger.info(
|
|
"Matrix ingress loop started | hs=%s node=%s mappings=%d mixed_rooms=%d "
|
|
"queue_max=%d workers=%d",
|
|
self._hs_url, self._node_id, self._room_map.total_mappings, mixed_rooms_count,
|
|
self._queue_max, self._worker_count,
|
|
)
|
|
|
|
if self._room_map.total_mappings == 0 and mixed_rooms_count == 0:
|
|
logger.warning("No room mappings — ingress loop is idle")
|
|
|
|
queue: asyncio.Queue[Optional[_QueueEntry]] = asyncio.Queue(
|
|
maxsize=self._queue_max
|
|
)
|
|
self._queue = queue
|
|
|
|
async with MatrixClient(self._hs_url, self._token, self._user_id) as client:
|
|
for mapping in self._room_map.mappings:
|
|
if mapping.agent_id in self._room_map.allowed_agents:
|
|
try:
|
|
await client.join_room(mapping.room_id)
|
|
except Exception as exc:
|
|
logger.warning("Could not join room %s: %s", mapping.room_id, exc)
|
|
if self._mixed_room_config:
|
|
for room_id in self._mixed_room_config.rooms:
|
|
try:
|
|
await client.join_room(room_id)
|
|
except Exception as exc:
|
|
logger.warning("Could not join mixed room %s: %s", room_id, exc)
|
|
if self._control_config and self._control_config.is_enabled:
|
|
for room_id in self._control_config.control_rooms:
|
|
try:
|
|
await client.join_room(room_id)
|
|
except Exception as exc:
|
|
logger.warning("Could not join control room %s: %s", room_id, exc)
|
|
logger.info(
|
|
"Control channel: %d rooms, %d operators",
|
|
len(self._control_config.control_rooms),
|
|
len(self._control_config.operator_allowlist),
|
|
)
|
|
|
|
async with httpx.AsyncClient() as http_client:
|
|
# M8.2: Load persisted HA state before processing any messages
|
|
await self._load_ha_state()
|
|
|
|
# Start workers
|
|
worker_tasks = [
|
|
asyncio.create_task(
|
|
self._worker(queue, client, http_client),
|
|
name=f"matrix_worker_{i}",
|
|
)
|
|
for i in range(self._worker_count)
|
|
]
|
|
|
|
# M8.2: Start periodic node health snapshot task
|
|
_health_snapshot_task = None
|
|
if (
|
|
self._ha_health_snapshot_interval_s > 0
|
|
and self._policy_store is not None
|
|
and self._node_health_tracker is not None
|
|
):
|
|
_health_snapshot_task = asyncio.create_task(
|
|
self._node_health_snapshot_loop(),
|
|
name="ha_health_snapshot",
|
|
)
|
|
|
|
# Run reader until stop_event
|
|
await self._reader(client, queue, http_client, stop_event)
|
|
|
|
# Drain: wait for all enqueued items to be processed
|
|
logger.info(
|
|
"Reader stopped. Draining queue (%d items, timeout=%.1fs)...",
|
|
queue.qsize(), self._drain_timeout_s,
|
|
)
|
|
try:
|
|
await asyncio.wait_for(queue.join(), timeout=self._drain_timeout_s)
|
|
logger.info("Queue drained successfully")
|
|
except asyncio.TimeoutError:
|
|
remaining = queue.qsize()
|
|
logger.warning(
|
|
"Drain timeout (%.1fs): %d items not processed",
|
|
self._drain_timeout_s, remaining,
|
|
)
|
|
|
|
# Cancel workers
|
|
for task in worker_tasks:
|
|
task.cancel()
|
|
# M8.2: Cancel health snapshot task if running
|
|
if _health_snapshot_task is not None and not _health_snapshot_task.done():
|
|
_health_snapshot_task.cancel()
|
|
results = await asyncio.gather(*worker_tasks, return_exceptions=True)
|
|
cancelled = sum(1 for r in results if isinstance(r, asyncio.CancelledError))
|
|
logger.info("Workers stopped (%d cancelled)", cancelled)
|
|
|
|
self._queue = None
|
|
logger.info("Matrix ingress loop stopped")
|
|
|
|
# ── Reader ─────────────────────────────────────────────────────────────────
|
|
|
|
async def _reader(
|
|
self,
|
|
client: MatrixClient,
|
|
queue: "asyncio.Queue[Optional[_QueueEntry]]",
|
|
http_client: httpx.AsyncClient,
|
|
stop_event: asyncio.Event,
|
|
) -> None:
|
|
backoff = _INIT_RETRY_BACKOFF
|
|
while not stop_event.is_set():
|
|
try:
|
|
sync_resp = await client.sync_poll(since=self._next_batch)
|
|
self._next_batch = sync_resp.get("next_batch")
|
|
backoff = _INIT_RETRY_BACKOFF
|
|
await self._enqueue_from_sync(client, queue, http_client, sync_resp)
|
|
except asyncio.CancelledError:
|
|
break
|
|
except Exception as exc:
|
|
logger.error("Reader error (retry in %.1fs): %s", backoff, exc)
|
|
if self._on_gateway_error:
|
|
self._on_gateway_error("sync_error")
|
|
try:
|
|
await asyncio.wait_for(stop_event.wait(), timeout=backoff)
|
|
except asyncio.TimeoutError:
|
|
pass
|
|
backoff = min(backoff * 2, _MAX_RETRY_BACKOFF)
|
|
|
|
async def _enqueue_from_sync(
|
|
self,
|
|
client: MatrixClient,
|
|
queue: "asyncio.Queue[Optional[_QueueEntry]]",
|
|
http_client: httpx.AsyncClient,
|
|
sync_resp: Dict[str, Any],
|
|
) -> None:
|
|
# M3.0: Control rooms — handled first, not forwarded to agents
|
|
if self._control_config and self._control_config.is_enabled:
|
|
for room_id in self._control_config.control_rooms:
|
|
messages = client.extract_room_messages(sync_resp, room_id)
|
|
for event in messages:
|
|
await self._try_control(client, http_client, event, room_id)
|
|
|
|
# Regular rooms: 1 room → 1 agent (M1 / M2.0)
|
|
for mapping in self._room_map.mappings:
|
|
if mapping.agent_id not in self._room_map.allowed_agents:
|
|
continue
|
|
messages = client.extract_room_messages(sync_resp, mapping.room_id)
|
|
for event in messages:
|
|
text = event.get("content", {}).get("body", "").strip()
|
|
# M4.0: agent discovery before routing
|
|
if is_discovery_message(text):
|
|
await self._handle_discovery(client, http_client, event, mapping.room_id)
|
|
continue
|
|
await self._try_enqueue(client, queue, http_client, event, mapping)
|
|
|
|
# Mixed rooms: 1 room → N agents, routing per message (M2.1)
|
|
if self._mixed_room_config:
|
|
for room_id in self._mixed_room_config.rooms:
|
|
messages = client.extract_room_messages(sync_resp, room_id)
|
|
for event in messages:
|
|
text = event.get("content", {}).get("body", "").strip()
|
|
# M4.0: agent discovery before routing
|
|
if is_discovery_message(text):
|
|
await self._handle_discovery(client, http_client, event, room_id)
|
|
continue
|
|
await self._try_enqueue_mixed(client, queue, http_client, event, room_id)
|
|
|
|
async def _try_enqueue(
|
|
self,
|
|
client: MatrixClient,
|
|
queue: "asyncio.Queue[Optional[_QueueEntry]]",
|
|
http_client: httpx.AsyncClient,
|
|
event: Dict[str, Any],
|
|
mapping: RoomMapping,
|
|
) -> None:
|
|
event_id = event.get("event_id", "")
|
|
sender = event.get("sender", "")
|
|
text = event.get("content", {}).get("body", "").strip()
|
|
room_id = mapping.room_id
|
|
agent_id = mapping.agent_id
|
|
|
|
if not text:
|
|
return
|
|
|
|
# H1: Rate limit (before mark_seen — don't charge quota on drop)
|
|
if self._rate_limiter is not None:
|
|
allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
|
|
if not allowed:
|
|
logger.warning(
|
|
"Rate limited: room=%s sender=%s limit_type=%s event=%s",
|
|
room_id, sender, limit_type, event_id,
|
|
)
|
|
if self._on_rate_limited:
|
|
self._on_rate_limited(room_id, agent_id, limit_type or "unknown")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.rate_limited",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code=f"rate_limit_{limit_type}",
|
|
data={"sender": sender, "limit_type": limit_type},
|
|
)
|
|
return
|
|
|
|
# Dedupe — mark before enqueue (prevents double-enqueue on retry)
|
|
client.mark_seen(event_id)
|
|
|
|
# M2.3: Persistent dedupe (cross-restart protection)
|
|
if self._event_store is not None:
|
|
try:
|
|
already = await self._event_store.is_processed(room_id, event_id)
|
|
except Exception as exc:
|
|
logger.warning("EventStore.is_processed error (degraded): %s", exc)
|
|
already = False
|
|
|
|
if already:
|
|
logger.debug("Persistent dedupe hit: event=%s room=%s", event_id, room_id)
|
|
if self._on_dedupe_persistent_hit:
|
|
self._on_dedupe_persistent_hit(room_id, agent_id)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.dedupe.persistent_hit",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={"sender": sender},
|
|
)
|
|
return
|
|
|
|
# H2: Enqueue or drop
|
|
entry = _QueueEntry(
|
|
event=event,
|
|
room_id=room_id,
|
|
agent_id=agent_id,
|
|
enqueue_time=time.monotonic(),
|
|
)
|
|
enqueued = False
|
|
try:
|
|
queue.put_nowait(entry)
|
|
enqueued = True
|
|
qsize = queue.qsize()
|
|
logger.debug("Enqueued event=%s qsize=%d", event_id, qsize)
|
|
if self._on_queue_size:
|
|
self._on_queue_size(qsize)
|
|
except asyncio.QueueFull:
|
|
logger.warning(
|
|
"Queue full (max=%d): dropping event=%s room=%s agent=%s",
|
|
self._queue_max, event_id, room_id, agent_id,
|
|
)
|
|
if self._on_queue_dropped:
|
|
self._on_queue_dropped(room_id, agent_id)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.queue_full",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="queue_full",
|
|
data={"queue_max": self._queue_max, "sender": sender},
|
|
)
|
|
|
|
# M2.3: Mark as processed ONLY after successful enqueue
|
|
if enqueued and self._event_store is not None:
|
|
sender_hash = _sender_hash(sender)
|
|
try:
|
|
await self._event_store.mark_processed(room_id, event_id, sender_hash)
|
|
if self._on_dedupe_persistent_insert:
|
|
self._on_dedupe_persistent_insert()
|
|
except Exception as exc:
|
|
logger.warning("EventStore.mark_processed error (degraded): %s", exc)
|
|
|
|
async def _try_enqueue_mixed(
|
|
self,
|
|
client: MatrixClient,
|
|
queue: "asyncio.Queue[Optional[_QueueEntry]]",
|
|
http_client: httpx.AsyncClient,
|
|
event: Dict[str, Any],
|
|
room_id: str,
|
|
) -> None:
|
|
"""Enqueue a message from a mixed room, routing to the appropriate agent."""
|
|
assert self._mixed_room_config is not None
|
|
event_id = event.get("event_id", "")
|
|
sender = event.get("sender", "")
|
|
text = event.get("content", {}).get("body", "").strip()
|
|
|
|
if not text:
|
|
return
|
|
|
|
# M6.1: look up dynamic agent override for this room
|
|
_routing_config = self._mixed_room_config
|
|
if self._policy_store is not None and self._policy_store.is_open:
|
|
try:
|
|
_agent_ov = await asyncio.to_thread(
|
|
self._policy_store.get_agent_override, room_id
|
|
)
|
|
if _agent_ov is not None:
|
|
_ov_agents, _ov_default = _agent_ov
|
|
_effective_default = _ov_default or (_ov_agents[0] if _ov_agents else None)
|
|
if _ov_agents and _effective_default:
|
|
_routing_config = build_override_config(
|
|
self._mixed_room_config, room_id,
|
|
_ov_agents, _effective_default,
|
|
)
|
|
except Exception as _exc: # noqa: BLE001
|
|
logger.warning("PolicyStore get_agent_override failed: %s", _exc)
|
|
|
|
# Route message to determine target agent
|
|
agent_id, routing_reason, effective_text = route_message(
|
|
text, room_id, _routing_config, self._room_map.allowed_agents,
|
|
max_slash_len=self._max_slash_len,
|
|
)
|
|
|
|
if agent_id is None:
|
|
# M2.2: routing rejected — audit + metric + optional error reply
|
|
logger.warning(
|
|
"Mixed room %s: routing rejected reason=%s event=%s",
|
|
room_id, routing_reason, event_id,
|
|
)
|
|
if self._on_route_rejected:
|
|
self._on_route_rejected(room_id, routing_reason)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.route.rejected",
|
|
agent_id="unknown", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code=routing_reason,
|
|
data={"routing_reason": routing_reason, "sender": sender, "text_len": len(text)},
|
|
)
|
|
# M2.2: optional user-facing error reply in room
|
|
if self._unknown_agent_behavior == "reply_error" and routing_reason == REASON_REJECTED_UNKNOWN_AGENT:
|
|
available = self._mixed_room_config.agents_for_room(room_id)
|
|
# Extract agent name from text (first slash token, if any)
|
|
slash_token = text.strip().split()[0].lstrip("/") if text.strip().startswith("/") else ""
|
|
label = f"`/{slash_token}`" if slash_token else "this command"
|
|
error_msg = (
|
|
f"⚠️ Unknown agent {label}. "
|
|
f"Available in this room: {', '.join(available)}"
|
|
)
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_reject")
|
|
try:
|
|
await client.send_text(room_id, error_msg, txn_id)
|
|
except Exception as exc:
|
|
logger.warning("Could not send route-error reply: %s", exc)
|
|
return
|
|
|
|
# M2.2: successful route — fire metric callback
|
|
if self._on_routed:
|
|
self._on_routed(agent_id, routing_reason)
|
|
|
|
# H1: Rate limit (uses final agent_id for metric tagging)
|
|
if self._rate_limiter is not None:
|
|
allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
|
|
if not allowed:
|
|
logger.warning(
|
|
"Rate limited (mixed): room=%s sender=%s agent=%s limit_type=%s",
|
|
room_id, sender, agent_id, limit_type,
|
|
)
|
|
if self._on_rate_limited:
|
|
self._on_rate_limited(room_id, agent_id, limit_type or "unknown")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.rate_limited",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code=f"rate_limit_{limit_type}",
|
|
data={"sender": sender, "limit_type": limit_type, "routing_reason": routing_reason},
|
|
)
|
|
return
|
|
|
|
client.mark_seen(event_id)
|
|
|
|
# M2.3: Persistent dedupe (cross-restart protection, mixed rooms)
|
|
if self._event_store is not None:
|
|
try:
|
|
already = await self._event_store.is_processed(room_id, event_id)
|
|
except Exception as exc:
|
|
logger.warning("EventStore.is_processed error mixed (degraded): %s", exc)
|
|
already = False
|
|
|
|
if already:
|
|
logger.debug("Persistent dedupe hit (mixed): event=%s room=%s agent=%s", event_id, room_id, agent_id)
|
|
if self._on_dedupe_persistent_hit:
|
|
self._on_dedupe_persistent_hit(room_id, agent_id)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.dedupe.persistent_hit",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={"sender": sender, "routing_reason": routing_reason},
|
|
)
|
|
return
|
|
|
|
# Store effective_text (stripped of routing token) in a patched event copy
|
|
effective_event = dict(event)
|
|
effective_event["content"] = dict(event.get("content", {}))
|
|
effective_event["content"]["body"] = effective_text
|
|
|
|
entry = _QueueEntry(
|
|
event=effective_event,
|
|
room_id=room_id,
|
|
agent_id=agent_id,
|
|
enqueue_time=time.monotonic(),
|
|
routing_reason=routing_reason,
|
|
is_mixed=True,
|
|
)
|
|
enqueued_mixed = False
|
|
try:
|
|
queue.put_nowait(entry)
|
|
enqueued_mixed = True
|
|
qsize = queue.qsize()
|
|
logger.debug(
|
|
"Enqueued (mixed): event=%s agent=%s reason=%s qsize=%d",
|
|
event_id, agent_id, routing_reason, qsize,
|
|
)
|
|
if self._on_queue_size:
|
|
self._on_queue_size(qsize)
|
|
except asyncio.QueueFull:
|
|
logger.warning(
|
|
"Queue full (max=%d): dropping mixed event=%s room=%s agent=%s",
|
|
self._queue_max, event_id, room_id, agent_id,
|
|
)
|
|
if self._on_queue_dropped:
|
|
self._on_queue_dropped(room_id, agent_id)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.queue_full",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="queue_full",
|
|
data={"queue_max": self._queue_max, "sender": sender},
|
|
)
|
|
|
|
# M2.3: Mark as processed ONLY after successful enqueue
|
|
if enqueued_mixed and self._event_store is not None:
|
|
sender_hash = _sender_hash(sender)
|
|
try:
|
|
await self._event_store.mark_processed(room_id, event_id, sender_hash)
|
|
if self._on_dedupe_persistent_insert:
|
|
self._on_dedupe_persistent_insert()
|
|
except Exception as exc:
|
|
logger.warning("EventStore.mark_processed error mixed (degraded): %s", exc)
|
|
|
|
# ── M4.0: Agent discovery ──────────────────────────────────────────────────
|
|
|
|
def _check_discovery_rate(self, room_id: str) -> bool:
|
|
"""Sliding-window per-room rate check for discovery replies. Returns True if allowed."""
|
|
if self._discovery_rpm <= 0:
|
|
return True
|
|
now = time.monotonic()
|
|
window = self._discovery_windows[room_id]
|
|
cutoff = now - 60.0
|
|
while window and window[0] < cutoff:
|
|
window.popleft()
|
|
if len(window) >= self._discovery_rpm:
|
|
return False
|
|
window.append(now)
|
|
return True
|
|
|
|
async def _handle_discovery(
|
|
self,
|
|
client: MatrixClient,
|
|
http_client: httpx.AsyncClient,
|
|
event: Dict[str, Any],
|
|
room_id: str,
|
|
) -> None:
|
|
"""
|
|
Reply to !agents in any user room (no auth required).
|
|
Rate-limited per room. Marks event as seen + persisted (no router enqueue).
|
|
"""
|
|
event_id = event.get("event_id", "")
|
|
sender = event.get("sender", "")
|
|
|
|
# Rate limit for discovery replies
|
|
if not self._check_discovery_rate(room_id):
|
|
logger.debug("Discovery rate limited: room=%s", room_id)
|
|
client.mark_seen(event_id)
|
|
return
|
|
|
|
client.mark_seen(event_id)
|
|
|
|
# M6.1: use store-based agent config if available for accurate discovery
|
|
_disc_config = self._mixed_room_config
|
|
if self._policy_store is not None and self._policy_store.is_open:
|
|
try:
|
|
_disc_ov = await asyncio.to_thread(
|
|
self._policy_store.get_agent_override, room_id
|
|
)
|
|
if _disc_ov is not None:
|
|
_d_agents, _d_default = _disc_ov
|
|
_d_eff_default = _d_default or (_d_agents[0] if _d_agents else None)
|
|
if _d_agents and _d_eff_default and self._mixed_room_config is not None:
|
|
_disc_config = build_override_config(
|
|
self._mixed_room_config, room_id, _d_agents, _d_eff_default,
|
|
)
|
|
except Exception: # noqa: BLE001
|
|
pass
|
|
|
|
reply = agents_reply(room_id, self._room_map, _disc_config)
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_discovery")
|
|
try:
|
|
await client.send_text(room_id, reply, txn_id)
|
|
except Exception as exc:
|
|
logger.warning("Could not send discovery reply: %s", exc)
|
|
|
|
# Persist dedupe so restart doesn't re-deliver this discovery
|
|
if self._event_store is not None:
|
|
sender_hash = _sender_hash(sender)
|
|
await self._event_store.mark_processed(room_id, event_id, sender_hash)
|
|
|
|
# ── M6.1: Dynamic mixed room agent overrides via !room agents command ─────
|
|
|
|
async def _handle_room_cmd(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
sender: str,
|
|
ctrl_room_id: str,
|
|
event_id: str,
|
|
cmd_subcommand: str,
|
|
cmd_args: tuple,
|
|
cmd_kwargs: Dict[str, str],
|
|
) -> str:
|
|
"""Handle `!room agents <set|add|remove|get|list|unset>` from authorized operator."""
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
return "⚠️ Policy store not available."
|
|
|
|
if cmd_subcommand != ROOM_SUBCMD_AGENTS:
|
|
return room_agents_reply_error(
|
|
f"Unknown subcommand: `{cmd_subcommand or '?'}`. Use `!room agents <action>`."
|
|
)
|
|
|
|
action, room_id, agents_list, single_agent, default_agent = parse_room_agents_cmd(
|
|
cmd_subcommand, cmd_args, cmd_kwargs,
|
|
)
|
|
|
|
if action not in (ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
|
|
ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET):
|
|
return room_agents_reply_error(f"Unknown action: `{action or '?'}`")
|
|
|
|
# Validate allowed agents from global policy
|
|
allowed_all = self._room_map.allowed_agents # global allowed agents set
|
|
|
|
# ── list ──────────────────────────────────────────────────────────────
|
|
if action == ROOM_ACTION_LIST:
|
|
try:
|
|
rows = await asyncio.to_thread(self._policy_store.list_agent_overrides, 10)
|
|
total = await asyncio.to_thread(self._policy_store.count_agent_overrides)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore list_agent_overrides error: %s", exc)
|
|
return "⚠️ Could not read policy store."
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.room.agents.list",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "total": total},
|
|
)
|
|
return room_agents_reply_list(rows, total)
|
|
|
|
# ── subcommands that require room_id ──────────────────────────────────
|
|
if not room_id:
|
|
return room_agents_reply_error("Missing `room=` argument.")
|
|
if not node_cmd_validate_room(room_id):
|
|
return room_agents_reply_error(
|
|
f"Invalid room ID format: `{room_id}`\nExpected: `!localpart:server`"
|
|
)
|
|
|
|
# ── get ───────────────────────────────────────────────────────────────
|
|
if action == ROOM_ACTION_GET:
|
|
try:
|
|
ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore get_agent_override error: %s", exc)
|
|
return "⚠️ Could not read policy store."
|
|
ov_agents, ov_default = (ov if ov else (None, None))
|
|
env_room = (
|
|
self._mixed_room_config.rooms.get(room_id) if self._mixed_room_config else None
|
|
)
|
|
env_agents = list(env_room.agents) if env_room else None
|
|
env_default = env_room.default_agent if env_room else None
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.room.agents.get",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "queried_room": room_id},
|
|
)
|
|
return room_agents_reply_get(room_id, ov_agents, ov_default, env_agents, env_default)
|
|
|
|
# ── unset ─────────────────────────────────────────────────────────────
|
|
if action == ROOM_ACTION_UNSET:
|
|
try:
|
|
deleted = await asyncio.to_thread(self._policy_store.delete_agent_override, room_id)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore delete_agent_override error: %s", exc)
|
|
return "⚠️ Could not write to policy store."
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.room.agents.unset",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted},
|
|
)
|
|
return room_agents_reply_unset_ok(room_id) if deleted else room_agents_reply_unset_not_found(room_id)
|
|
|
|
# ── add ───────────────────────────────────────────────────────────────
|
|
if action == ROOM_ACTION_ADD:
|
|
if not single_agent:
|
|
return room_agents_reply_error("Missing `agent=` argument for `add`.")
|
|
if single_agent not in allowed_all:
|
|
allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
|
|
return room_agents_reply_error(f"Agent `{single_agent}` not in allowed agents: {allowed_str}")
|
|
try:
|
|
new_agents, new_default = await asyncio.to_thread(
|
|
self._policy_store.add_agent_to_room, room_id, single_agent, sender
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore add_agent_to_room error: %s", exc)
|
|
return "⚠️ Could not write to policy store."
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.room.agents.add",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent},
|
|
)
|
|
return room_agents_reply_add(room_id, single_agent, new_agents, new_default)
|
|
|
|
# ── remove ────────────────────────────────────────────────────────────
|
|
if action == ROOM_ACTION_REMOVE:
|
|
if not single_agent:
|
|
return room_agents_reply_error("Missing `agent=` argument for `remove`.")
|
|
try:
|
|
removed, err = await asyncio.to_thread(
|
|
self._policy_store.remove_agent_from_room, room_id, single_agent, sender
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore remove_agent_from_room error: %s", exc)
|
|
return "⚠️ Could not write to policy store."
|
|
if not removed:
|
|
return room_agents_reply_error(err or "Could not remove agent.")
|
|
# Get updated state
|
|
try:
|
|
ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id)
|
|
except Exception: # noqa: BLE001
|
|
ov = None
|
|
remaining = ov[0] if ov else []
|
|
new_default_r = ov[1] if ov else None
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.room.agents.remove",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent},
|
|
)
|
|
return room_agents_reply_remove(room_id, single_agent, remaining, new_default_r)
|
|
|
|
# ── set ───────────────────────────────────────────────────────────────
|
|
if action == ROOM_ACTION_SET:
|
|
if not agents_list:
|
|
return room_agents_reply_error("Missing `agents=` argument for `set`.")
|
|
invalid = [a for a in agents_list if a not in allowed_all]
|
|
if invalid:
|
|
allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all))
|
|
return room_agents_reply_error(
|
|
f"Unknown agents: {', '.join(f'`{a}`' for a in invalid)}\nAllowed: {allowed_str}"
|
|
)
|
|
effective_default = default_agent if default_agent else agents_list[0]
|
|
if effective_default not in agents_list:
|
|
return room_agents_reply_error(
|
|
f"Default agent `{effective_default}` not in provided agents list."
|
|
)
|
|
try:
|
|
await asyncio.to_thread(
|
|
self._policy_store.set_agent_override,
|
|
room_id, agents_list, effective_default, sender,
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore set_agent_override error: %s", exc)
|
|
return "⚠️ Could not write to policy store."
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.room.agents.set",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={
|
|
"sender": sender, "target_room": room_id,
|
|
"agents": agents_list, "default": effective_default,
|
|
},
|
|
)
|
|
return room_agents_reply_set(room_id, agents_list, effective_default)
|
|
|
|
return room_agents_reply_error("Unhandled action.")
|
|
|
|
# ── M6.2: Policy snapshot export / import ─────────────────────────────────
|
|
|
|
async def _handle_policy_cmd(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
sender: str,
|
|
ctrl_room_id: str,
|
|
event_id: str,
|
|
subcommand: Optional[str],
|
|
cmd_kwargs: Dict[str, str],
|
|
) -> str:
|
|
"""
|
|
Handle `!policy export`, `!policy import`, and `!policy prune_exports`.
|
|
Requires policy_store and bridge_data_dir to be configured.
|
|
"""
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
return policy_cmd_error("Policy store is not available.")
|
|
|
|
if not self._bridge_data_dir:
|
|
return policy_cmd_error("BRIDGE_DATA_DIR is not configured.")
|
|
|
|
exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
|
|
|
|
# ── export ────────────────────────────────────────────────────────────
|
|
if subcommand == "export":
|
|
try:
|
|
snapshot = await asyncio.to_thread(self._policy_store.export_all)
|
|
node_count = len(snapshot.get("room_node_overrides", []))
|
|
agent_count = len(snapshot.get("room_agent_overrides", []))
|
|
|
|
import datetime as _dt
|
|
ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d-%H%M%S")
|
|
filename = f"policy-{ts}.json"
|
|
|
|
await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
|
|
export_path = _os.path.join(exports_dir, filename)
|
|
await asyncio.to_thread(
|
|
_write_json_file, export_path, snapshot
|
|
)
|
|
|
|
self._policy_last_export_at = int(time.time())
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.policy.export",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={
|
|
"sender": sender, "file": filename,
|
|
"node_overrides": node_count, "agent_overrides": agent_count,
|
|
},
|
|
)
|
|
return policy_export_reply(export_path, node_count, agent_count)
|
|
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception("_handle_policy_cmd export error: %s", exc)
|
|
return policy_cmd_error(f"Export failed: {exc}")
|
|
|
|
# ── import ────────────────────────────────────────────────────────────
|
|
if subcommand == "import":
|
|
filename = cmd_kwargs.get("path", "").strip()
|
|
if not filename:
|
|
return policy_cmd_error("Missing `path=` argument.")
|
|
|
|
safe_path = validate_export_path(exports_dir, filename)
|
|
if safe_path is None:
|
|
return policy_cmd_error(
|
|
f"Invalid path `{filename}`. Only simple filenames within the exports "
|
|
f"directory are allowed."
|
|
)
|
|
|
|
mode_raw = cmd_kwargs.get("mode", "merge").strip().lower()
|
|
if mode_raw not in ("merge", "replace"):
|
|
return policy_cmd_error("mode must be `merge` or `replace`.")
|
|
|
|
dry_raw = cmd_kwargs.get("dry_run", "1").strip()
|
|
dry_run = dry_raw not in ("0", "false", "no")
|
|
|
|
try:
|
|
raw_text = await asyncio.to_thread(_read_json_file, safe_path)
|
|
except FileNotFoundError:
|
|
return policy_cmd_error(f"File not found: `{filename}`")
|
|
except Exception as exc: # noqa: BLE001
|
|
return policy_cmd_error(f"Cannot read file: {exc}")
|
|
|
|
try:
|
|
data = raw_text if isinstance(raw_text, dict) else {}
|
|
stats = await asyncio.to_thread(
|
|
self._policy_store.import_snapshot,
|
|
data, mode_raw, dry_run, sender,
|
|
)
|
|
except ValueError as ve:
|
|
return policy_cmd_error(str(ve))
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception("_handle_policy_cmd import error: %s", exc)
|
|
return policy_cmd_error(f"Import failed: {exc}")
|
|
|
|
if not dry_run:
|
|
self._policy_last_import_at = int(time.time())
|
|
# M10.2: record in policy change history
|
|
_is_destr = (
|
|
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
|
|
) > 0
|
|
_ds = (
|
|
f"node: +{stats['node_added']} ~{stats['node_updated']} "
|
|
f"-{stats['node_deleted']}; "
|
|
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
|
|
f"-{stats['agent_deleted']}"
|
|
)
|
|
try:
|
|
await asyncio.to_thread(
|
|
self._policy_store.record_policy_change,
|
|
"policy.import", mode_raw, filename,
|
|
_sender_hash(sender), _ds, _is_destr,
|
|
stats.get("node_added", 0), stats.get("node_updated", 0),
|
|
stats.get("node_deleted", 0),
|
|
stats.get("agent_added", 0), stats.get("agent_updated", 0),
|
|
stats.get("agent_deleted", 0),
|
|
self._policy_history_limit,
|
|
)
|
|
except Exception as _exc: # noqa: BLE001
|
|
logger.warning("Failed to record import history (non-fatal): %s", _exc)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.policy.import",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={
|
|
"sender": sender, "file": filename,
|
|
"mode": mode_raw, "dry_run": dry_run, "stats": stats,
|
|
},
|
|
)
|
|
return (
|
|
policy_import_dry_run_reply(stats, mode_raw)
|
|
if dry_run
|
|
else policy_import_reply(stats, mode_raw)
|
|
)
|
|
|
|
# ── restore (M10.1) ───────────────────────────────────────────────────
|
|
if subcommand == "restore":
|
|
filename = cmd_kwargs.get("path", "").strip()
|
|
if not filename:
|
|
return policy_cmd_error("Missing `path=` argument.")
|
|
|
|
safe_path = validate_export_path(exports_dir, filename)
|
|
if safe_path is None:
|
|
return policy_cmd_error(
|
|
f"Invalid path `{filename}`. Only simple filenames within the exports "
|
|
"directory are allowed."
|
|
)
|
|
|
|
mode_raw = cmd_kwargs.get("mode", "replace").strip().lower()
|
|
if mode_raw not in ("merge", "replace"):
|
|
return policy_cmd_error("mode must be `merge` or `replace`.")
|
|
|
|
try:
|
|
raw_data = await asyncio.to_thread(_read_json_file, safe_path)
|
|
except FileNotFoundError:
|
|
return policy_cmd_error(f"File not found: `{filename}`")
|
|
except Exception as exc: # noqa: BLE001
|
|
return policy_cmd_error(f"Cannot read file: {exc}")
|
|
|
|
data = raw_data if isinstance(raw_data, dict) else {}
|
|
try:
|
|
stats = await asyncio.to_thread(
|
|
self._policy_store.import_snapshot,
|
|
data, mode_raw, False, sender,
|
|
)
|
|
self._policy_last_import_at = int(time.time())
|
|
except ValueError as ve:
|
|
return policy_cmd_error(str(ve))
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception("_handle_policy_cmd restore error: %s", exc)
|
|
return policy_cmd_error(f"Restore failed: {exc}")
|
|
|
|
# M10.2: record in policy change history
|
|
_is_destr = (
|
|
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
|
|
) > 0
|
|
_rds = (
|
|
f"restore/{mode_raw}: "
|
|
f"node: +{stats['node_added']} ~{stats['node_updated']} "
|
|
f"-{stats['node_deleted']}; "
|
|
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
|
|
f"-{stats['agent_deleted']}"
|
|
)
|
|
try:
|
|
await asyncio.to_thread(
|
|
self._policy_store.record_policy_change,
|
|
"policy.restore", mode_raw, filename,
|
|
_sender_hash(sender), _rds, _is_destr,
|
|
stats.get("node_added", 0), stats.get("node_updated", 0),
|
|
stats.get("node_deleted", 0),
|
|
stats.get("agent_added", 0), stats.get("agent_updated", 0),
|
|
stats.get("agent_deleted", 0),
|
|
self._policy_history_limit,
|
|
)
|
|
except Exception as _exc: # noqa: BLE001
|
|
logger.warning("Failed to record restore history (non-fatal): %s", _exc)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.policy.restore",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender, "file": filename,
|
|
"mode": mode_raw, "stats": stats,
|
|
},
|
|
)
|
|
return policy_restore_applied_reply(stats, mode_raw)
|
|
|
|
# ── prune_exports ─────────────────────────────────────────────────────
|
|
if subcommand == "prune_exports":
|
|
dry_raw = cmd_kwargs.get("dry_run", "1").strip()
|
|
dry_run = dry_raw not in ("0", "false", "no")
|
|
try:
|
|
retention_days = int(
|
|
cmd_kwargs.get(
|
|
"retention_days", str(self._policy_export_retention_days)
|
|
)
|
|
)
|
|
except (ValueError, TypeError):
|
|
return policy_cmd_error("`retention_days` must be a positive integer.")
|
|
|
|
try:
|
|
await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True)
|
|
result = await asyncio.to_thread(
|
|
self._policy_store.prune_exports,
|
|
exports_dir, retention_days, dry_run,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception("_handle_policy_cmd prune error: %s", exc)
|
|
return policy_cmd_error(f"Prune failed: {exc}")
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.policy.prune_exports",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender,
|
|
"dry_run": dry_run,
|
|
"retention_days": retention_days,
|
|
"files_to_delete": result.count,
|
|
"bytes_to_free": result.total_bytes,
|
|
},
|
|
)
|
|
return (
|
|
policy_prune_preview_reply(result, retention_days)
|
|
if dry_run
|
|
else policy_prune_applied_reply(result, retention_days)
|
|
)
|
|
|
|
# ── history (M10.2) ───────────────────────────────────────────────────
|
|
if subcommand == "history":
|
|
try:
|
|
limit_raw = int(cmd_kwargs.get("limit", "10"))
|
|
except (ValueError, TypeError):
|
|
return policy_cmd_error("`limit` must be a positive integer.")
|
|
safe_limit = max(1, min(limit_raw, 20))
|
|
try:
|
|
changes = await asyncio.to_thread(
|
|
self._policy_store.list_policy_changes, safe_limit,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception("_handle_policy_cmd history error: %s", exc)
|
|
return policy_cmd_error(f"History fetch failed: {exc}")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.policy.history",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "limit": safe_limit, "count": len(changes)},
|
|
)
|
|
return policy_history_reply(changes)
|
|
|
|
# ── change (M10.3) ────────────────────────────────────────────────────
|
|
if subcommand == "change":
|
|
try:
|
|
change_id = int(cmd_kwargs.get("id", "0"))
|
|
except (ValueError, TypeError):
|
|
return policy_cmd_error("`id` must be a positive integer (DB change id).")
|
|
if change_id <= 0:
|
|
return policy_cmd_error("Missing or invalid `id=` argument. "
|
|
"Use `!policy history` to get change ids.")
|
|
try:
|
|
change = await asyncio.to_thread(
|
|
self._policy_store.get_policy_change_by_id, change_id,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception("_handle_policy_cmd change detail error: %s", exc)
|
|
return policy_cmd_error(f"DB error: {exc}")
|
|
if change is None:
|
|
return policy_cmd_error(
|
|
f"Change id={change_id} not found. "
|
|
"Use `!policy history` to see available ids."
|
|
)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.policy.change_detail",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "change_id": change_id},
|
|
)
|
|
return policy_change_detail_reply(change)
|
|
|
|
return policy_cmd_error(f"Unknown subcommand: `{subcommand!r}`.")
|
|
|
|
# ── M5.1: Nodes overview for operators ────────────────────────────────────
|
|
|
|
async def _handle_nodes(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""Return node policy overview for `!nodes` in control room."""
|
|
try:
|
|
policy_info = (
|
|
self._node_policy.as_info_dict()
|
|
if self._node_policy is not None
|
|
else {}
|
|
)
|
|
node_stats = (
|
|
self._node_stats_getter()
|
|
if self._node_stats_getter is not None
|
|
else {}
|
|
)
|
|
# M8.0: merge health state into node_stats
|
|
if self._node_health_tracker is not None:
|
|
allowed = (
|
|
self._node_policy.allowed_nodes
|
|
if self._node_policy is not None
|
|
else None
|
|
)
|
|
health_all = self._node_health_tracker.all_info(allowed)
|
|
for nid, info in health_all.items():
|
|
if nid not in node_stats:
|
|
node_stats[nid] = {}
|
|
node_stats[nid]["health"] = info.get("state", NODE_STATE_HEALTHY)
|
|
node_stats[nid]["ewma_latency_s"] = info.get("ewma_latency_s")
|
|
node_stats[nid]["consecutive_failures"] = info.get("consecutive_failures", 0)
|
|
# M8.1: include sticky cache info
|
|
sticky_info = None
|
|
if self._sticky_cache is not None:
|
|
sticky_entries = self._sticky_cache.active_entries()
|
|
sticky_info = {
|
|
"active_keys": len(sticky_entries),
|
|
"ttl_s": self._sticky_cache.ttl_s,
|
|
"entries": [
|
|
{"key": k, "node": n, "remaining_s": round(r, 0)}
|
|
for k, n, r in sticky_entries[:5] # show at most 5
|
|
],
|
|
}
|
|
if len(sticky_entries) > 5:
|
|
sticky_info["truncated"] = len(sticky_entries) - 5
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.nodes",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender},
|
|
)
|
|
return nodes_reply(policy_info, node_stats, sticky_info=sticky_info)
|
|
except Exception as exc:
|
|
logger.warning("_handle_nodes error: %s", exc)
|
|
return "⚠️ Node info not available."
|
|
|
|
# ── M8.2: HA state persistence helpers ────────────────────────────────────
|
|
|
|
async def _load_ha_state(self) -> None:
|
|
"""
|
|
Load persisted HA state (sticky entries + node health) from PolicyStore on startup.
|
|
Non-fatal: any error is logged and bridge continues in in-memory-only mode.
|
|
"""
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
return
|
|
|
|
# Load sticky entries
|
|
if self._sticky_cache is not None:
|
|
try:
|
|
entries = await asyncio.to_thread(self._policy_store.load_sticky_entries)
|
|
now_unix = int(time.time())
|
|
loaded = 0
|
|
for key, node_id, expires_at_unix in entries:
|
|
remaining_s = expires_at_unix - now_unix
|
|
if remaining_s > 0:
|
|
self._sticky_cache.set(key, node_id, ttl_s=float(remaining_s))
|
|
loaded += 1
|
|
self._ha_sticky_loaded = loaded
|
|
logger.info("HA: loaded %d sticky entries from DB", loaded)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("HA: failed to load sticky entries (non-fatal): %s", exc)
|
|
|
|
# Load node health snapshot
|
|
if self._node_health_tracker is not None and self._ha_health_max_age_s > 0:
|
|
try:
|
|
snapshot = await asyncio.to_thread(
|
|
self._policy_store.load_node_health, self._ha_health_max_age_s
|
|
)
|
|
if snapshot:
|
|
for node_id, info in snapshot.items():
|
|
self._node_health_tracker.restore_node(
|
|
node_id,
|
|
ewma_latency_s=info.get("ewma_latency_s"),
|
|
consecutive_failures=int(info.get("consecutive_failures", 0)),
|
|
)
|
|
self._ha_health_loaded = True
|
|
logger.info(
|
|
"HA: loaded node health for %d nodes from DB", len(snapshot)
|
|
)
|
|
else:
|
|
logger.info("HA: no fresh node health snapshot found in DB")
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("HA: failed to load node health (non-fatal): %s", exc)
|
|
|
|
async def _sticky_persist_set(self, key: str, node_id: str) -> None:
|
|
"""Persist a sticky entry to DB after setting it in-memory (M8.2)."""
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
return
|
|
if self._sticky_cache is None:
|
|
return
|
|
expires_at_unix = int(time.time()) + int(self._sticky_cache.ttl_s)
|
|
try:
|
|
await asyncio.to_thread(
|
|
self._policy_store.upsert_sticky, key, node_id, expires_at_unix
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("HA: failed to persist sticky key=%s (non-fatal): %s", key, exc)
|
|
|
|
async def _sticky_persist_delete(self, key: str) -> None:
|
|
"""Remove a sticky entry from DB after deleting it in-memory (M8.2)."""
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
return
|
|
try:
|
|
await asyncio.to_thread(self._policy_store.delete_sticky, key)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("HA: failed to delete sticky key=%s (non-fatal): %s", key, exc)
|
|
|
|
async def _node_health_snapshot_loop(self) -> None:
|
|
"""
|
|
Background task: periodically write node health state to DB (M8.2).
|
|
Runs until policy_store becomes unavailable or interval is 0.
|
|
"""
|
|
if self._ha_health_snapshot_interval_s <= 0:
|
|
return
|
|
logger.debug(
|
|
"HA: health snapshot loop started (interval=%ds)", self._ha_health_snapshot_interval_s
|
|
)
|
|
while True:
|
|
await asyncio.sleep(self._ha_health_snapshot_interval_s)
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
break
|
|
if self._node_health_tracker is None or self._node_policy is None:
|
|
break
|
|
try:
|
|
for node_id in sorted(self._node_policy.allowed_nodes):
|
|
info = self._node_health_tracker.as_info_dict(node_id)
|
|
await asyncio.to_thread(
|
|
self._policy_store.upsert_node_health,
|
|
node_id,
|
|
info.get("ewma_latency_s"),
|
|
int(info.get("consecutive_failures", 0)),
|
|
)
|
|
logger.debug(
|
|
"HA: health snapshot written for %d nodes",
|
|
len(self._node_policy.allowed_nodes),
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("HA: health snapshot write failed (non-fatal): %s", exc)
|
|
|
|
# ── M9.0: Two-step confirmation for dangerous control commands ─────────────
|
|
|
|
async def _handle_policy_import_intent(
|
|
self,
|
|
http_client: "httpx.AsyncClient",
|
|
cmd: "ControlCommand",
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
action_summary: str,
|
|
normalized: str,
|
|
) -> str:
|
|
"""
|
|
M9.1: Intent step for !policy import with diff preview and hash binding.
|
|
|
|
Reads the file, computes a diff preview, stores a hash-bound callback,
|
|
and returns a formatted preview reply containing the nonce.
|
|
The confirm callback verifies the file hasn't changed before applying.
|
|
"""
|
|
assert self._confirm_store is not None
|
|
|
|
# ── Validate args ──────────────────────────────────────────────────────
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
return policy_cmd_error("Policy store not available.")
|
|
if not self._bridge_data_dir:
|
|
return policy_cmd_error("BRIDGE_DATA_DIR not configured.")
|
|
|
|
filename = cmd.kwargs.get("path", "").strip()
|
|
if not filename:
|
|
return policy_cmd_error("Missing `path=` argument.")
|
|
|
|
exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
|
|
safe_path = validate_export_path(exports_dir, filename)
|
|
if safe_path is None:
|
|
return policy_cmd_error(
|
|
f"Invalid path `{filename}`. Only simple filenames within the exports "
|
|
"directory are allowed."
|
|
)
|
|
|
|
mode_raw = cmd.kwargs.get("mode", "merge").strip().lower()
|
|
if mode_raw not in ("merge", "replace"):
|
|
return policy_cmd_error("mode must be `merge` or `replace`.")
|
|
|
|
# ── Read file + compute diff preview ───────────────────────────────────
|
|
try:
|
|
raw_data = await asyncio.to_thread(_read_json_file, safe_path)
|
|
except FileNotFoundError:
|
|
return policy_cmd_error(f"File not found: `{filename}`")
|
|
except Exception as exc: # noqa: BLE001
|
|
return policy_cmd_error(f"Cannot read file: {exc}")
|
|
|
|
if not isinstance(raw_data, dict):
|
|
return policy_cmd_error("Invalid JSON format (expected object).")
|
|
|
|
try:
|
|
diff = await asyncio.to_thread(
|
|
self._policy_store.compute_import_diff, raw_data, mode_raw,
|
|
)
|
|
except ValueError as ve:
|
|
return policy_cmd_error(str(ve))
|
|
except Exception as exc: # noqa: BLE001
|
|
return policy_cmd_error(f"Preview failed: {exc}")
|
|
|
|
# ── Compute snapshot hash for confirm binding ─────────────────────────
|
|
_content_bytes = _json.dumps(
|
|
raw_data, sort_keys=True, ensure_ascii=True
|
|
).encode("utf-8")
|
|
snapshot_hash = hashlib.sha256(
|
|
(filename + ":" + mode_raw + ":").encode("utf-8") + _content_bytes
|
|
).hexdigest()[:32]
|
|
|
|
sender_hash = _sender_hash(sender)
|
|
_captured_hash = snapshot_hash
|
|
_captured_data = raw_data
|
|
_captured_mode = mode_raw
|
|
_captured_path = safe_path
|
|
_captured_fname = filename
|
|
_captured_sender = sender
|
|
|
|
# ── Build hash-bound callback ──────────────────────────────────────────
|
|
# Late-capture of nonce for autobackup filename (set after add() below)
|
|
_nonce_holder: list = []
|
|
|
|
async def _callback():
|
|
# Re-read file and verify hash to detect tampering (anti-TOCTOU)
|
|
try:
|
|
fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
|
|
except Exception as exc: # noqa: BLE001
|
|
return (
|
|
f"❌ Cannot re-read file `{_captured_fname}` at apply time: {exc}\n"
|
|
"Re-issue the command.",
|
|
"",
|
|
)
|
|
|
|
fresh_bytes = _json.dumps(
|
|
fresh_data if isinstance(fresh_data, dict) else {},
|
|
sort_keys=True, ensure_ascii=True,
|
|
).encode("utf-8")
|
|
fresh_hash = hashlib.sha256(
|
|
(_captured_fname + ":" + _captured_mode + ":").encode("utf-8") + fresh_bytes
|
|
).hexdigest()[:32]
|
|
|
|
if fresh_hash != _captured_hash:
|
|
logger.warning(
|
|
"Policy import confirm rejected: file changed since preview "
|
|
"(sender=%s file=%s)", _captured_sender, _captured_fname,
|
|
)
|
|
return (
|
|
f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
|
|
"Re-issue `!policy import ...` to get a new preview.",
|
|
"",
|
|
)
|
|
|
|
# M10.0: Auto-backup current policy before applying changes
|
|
_autobackup_basename = ""
|
|
_autobackup_hash = ""
|
|
if self._policy_store is not None and self._bridge_data_dir:
|
|
_exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
|
|
_nonce_suffix = _nonce_holder[0] if _nonce_holder else "BACKUP"
|
|
_sender_hash8 = _sender_hash(_captured_sender)[:8]
|
|
try:
|
|
_os.makedirs(_exports_dir, exist_ok=True)
|
|
_ab_path, _autobackup_hash = await asyncio.to_thread(
|
|
self._policy_store.write_autobackup,
|
|
_exports_dir, _sender_hash8, _nonce_suffix,
|
|
)
|
|
_autobackup_basename = _os.path.basename(_ab_path)
|
|
logger.info(
|
|
"Policy auto-backup written: %s hash=%s",
|
|
_autobackup_basename, _autobackup_hash,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("Policy auto-backup failed (non-fatal): %s", exc)
|
|
|
|
# Apply the import using the captured (previewed) data
|
|
try:
|
|
stats = await asyncio.to_thread(
|
|
self._policy_store.import_snapshot,
|
|
_captured_data, _captured_mode, False, _captured_sender,
|
|
)
|
|
self._policy_last_import_at = int(time.time())
|
|
diff_summary = (
|
|
f"node: +{stats['node_added']} ~{stats['node_updated']} "
|
|
f"-{stats['node_deleted']}; "
|
|
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
|
|
f"-{stats['agent_deleted']}"
|
|
)
|
|
if _autobackup_basename:
|
|
diff_summary += f"; autobackup={_autobackup_basename}"
|
|
# M10.2: record in policy change history
|
|
_is_destr = (
|
|
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
|
|
) > 0
|
|
try:
|
|
await asyncio.to_thread(
|
|
self._policy_store.record_policy_change,
|
|
"policy.import", _captured_mode, _captured_fname,
|
|
_sender_hash(_captured_sender), diff_summary, _is_destr,
|
|
stats.get("node_added", 0), stats.get("node_updated", 0),
|
|
stats.get("node_deleted", 0),
|
|
stats.get("agent_added", 0), stats.get("agent_updated", 0),
|
|
stats.get("agent_deleted", 0),
|
|
self._policy_history_limit,
|
|
)
|
|
except Exception as _exc: # noqa: BLE001
|
|
logger.warning("Failed to record policy change history: %s", _exc)
|
|
reply = policy_import_reply(stats, _captured_mode)
|
|
if _autobackup_basename:
|
|
reply += (
|
|
f"\n\n💾 Auto-backup saved: `{_autobackup_basename}` "
|
|
f"(hash `{_autobackup_hash}`)"
|
|
)
|
|
return reply, diff_summary
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception(
|
|
"Policy import apply failed: sender=%s file=%s", _captured_sender, _captured_fname,
|
|
)
|
|
return f"❌ Import failed: {exc}", ""
|
|
|
|
# ── Store pending confirmation ─────────────────────────────────────────
|
|
nonce = self._confirm_store.add(
|
|
sender_hash=sender_hash,
|
|
verb="policy.import",
|
|
normalized_args=normalized,
|
|
action_summary=action_summary,
|
|
room_id=room_id,
|
|
callback=_callback,
|
|
)
|
|
# M10.0: make nonce available inside _callback for backup filename
|
|
_nonce_holder.append(nonce)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.intent",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender_hash": sender_hash,
|
|
"verb": "policy.import",
|
|
"normalized": normalized,
|
|
"nonce": nonce,
|
|
"expires_in_s": int(self._confirm_store.ttl_s),
|
|
"snapshot_hash_prefix": snapshot_hash[:8],
|
|
"diff": {
|
|
"node_added": diff.node_added,
|
|
"node_updated": diff.node_updated,
|
|
"node_deleted": diff.node_deleted,
|
|
"agent_added": diff.agent_added,
|
|
"agent_updated": diff.agent_updated,
|
|
"agent_deleted": diff.agent_deleted,
|
|
"sample_keys": diff.sample_keys,
|
|
},
|
|
},
|
|
)
|
|
|
|
logger.info(
|
|
"Confirm policy import intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
|
|
sender, mode_raw, filename, snapshot_hash[:8], nonce,
|
|
)
|
|
return policy_import_intent_reply(
|
|
diff=diff,
|
|
action_summary=action_summary,
|
|
nonce=nonce,
|
|
ttl_s=int(self._confirm_store.ttl_s),
|
|
)
|
|
|
|
async def _handle_policy_restore_intent(
|
|
self,
|
|
http_client: "httpx.AsyncClient",
|
|
cmd: "ControlCommand",
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
*,
|
|
action_summary: str,
|
|
normalized: str,
|
|
) -> str:
|
|
"""
|
|
Intent step for !policy restore (M10.1).
|
|
|
|
Reads the snapshot file, computes a diff preview, binds a SHA-256 hash
|
|
to the exact file content + mode, stores a callback in ConfirmStore, and
|
|
returns a rollback preview reply with the nonce.
|
|
"""
|
|
assert self._confirm_store is not None
|
|
assert self._policy_store is not None
|
|
assert self._bridge_data_dir is not None
|
|
|
|
filename = cmd.kwargs.get("path", "").strip()
|
|
if not filename:
|
|
return policy_cmd_error("Missing `path=` argument.")
|
|
|
|
exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
|
|
safe_path = validate_export_path(exports_dir, filename)
|
|
if safe_path is None:
|
|
return policy_cmd_error(
|
|
f"Invalid path `{filename}`. Only simple filenames within the exports "
|
|
"directory are allowed."
|
|
)
|
|
|
|
mode_raw = cmd.kwargs.get("mode", "replace").strip().lower()
|
|
if mode_raw not in ("merge", "replace"):
|
|
return policy_cmd_error("mode must be `merge` or `replace`.")
|
|
|
|
# ── Read file + compute diff preview ──────────────────────────────────
|
|
try:
|
|
raw_data = await asyncio.to_thread(_read_json_file, safe_path)
|
|
except FileNotFoundError:
|
|
return policy_cmd_error(f"File not found: `{filename}`")
|
|
except Exception as exc: # noqa: BLE001
|
|
return policy_cmd_error(f"Cannot read file: {exc}")
|
|
|
|
if not isinstance(raw_data, dict):
|
|
return policy_cmd_error("Invalid JSON format (expected object).")
|
|
|
|
try:
|
|
diff = await asyncio.to_thread(
|
|
self._policy_store.compute_import_diff, raw_data, mode_raw,
|
|
)
|
|
except ValueError as ve:
|
|
return policy_cmd_error(str(ve))
|
|
except Exception as exc: # noqa: BLE001
|
|
return policy_cmd_error(f"Preview failed: {exc}")
|
|
|
|
# ── Compute snapshot hash for confirm binding (anti-TOCTOU) ──────────
|
|
_content_bytes = _json.dumps(
|
|
raw_data, sort_keys=True, ensure_ascii=True
|
|
).encode("utf-8")
|
|
snapshot_hash = hashlib.sha256(
|
|
(filename + ":" + mode_raw + ":restore:").encode("utf-8") + _content_bytes
|
|
).hexdigest()[:32]
|
|
|
|
sender_hash = _sender_hash(sender)
|
|
_captured_hash = snapshot_hash
|
|
_captured_data = raw_data
|
|
_captured_mode = mode_raw
|
|
_captured_path = safe_path
|
|
_captured_fname = filename
|
|
_captured_sender = sender
|
|
|
|
# Late-capture of nonce for autobackup filename
|
|
_nonce_holder: list = []
|
|
|
|
# ── Build hash-bound callback ─────────────────────────────────────────
|
|
async def _callback():
|
|
# Re-read + verify hash (anti-TOCTOU)
|
|
try:
|
|
fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
|
|
except Exception as exc: # noqa: BLE001
|
|
return (
|
|
f"❌ Cannot re-read `{_captured_fname}` at apply time: {exc}\n"
|
|
"Re-issue the command.",
|
|
"",
|
|
)
|
|
|
|
fresh_bytes = _json.dumps(
|
|
fresh_data if isinstance(fresh_data, dict) else {},
|
|
sort_keys=True, ensure_ascii=True,
|
|
).encode("utf-8")
|
|
fresh_hash = hashlib.sha256(
|
|
(_captured_fname + ":" + _captured_mode + ":restore:").encode("utf-8")
|
|
+ fresh_bytes
|
|
).hexdigest()[:32]
|
|
|
|
if fresh_hash != _captured_hash:
|
|
logger.warning(
|
|
"Policy restore confirm rejected: file changed since preview "
|
|
"(sender=%s file=%s)", _captured_sender, _captured_fname,
|
|
)
|
|
return (
|
|
f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
|
|
"Re-issue `!policy restore ...` to get a new preview.",
|
|
"",
|
|
)
|
|
|
|
# Auto-backup current state before overwriting
|
|
_autobackup_basename = ""
|
|
_autobackup_hash = ""
|
|
if self._policy_store is not None and self._bridge_data_dir:
|
|
_exp_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
|
|
_nonce_suffix = _nonce_holder[0] if _nonce_holder else "RESTORE"
|
|
_sender_hash8 = _sender_hash(_captured_sender)[:8]
|
|
try:
|
|
_os.makedirs(_exp_dir, exist_ok=True)
|
|
_ab_path, _autobackup_hash = await asyncio.to_thread(
|
|
self._policy_store.write_autobackup,
|
|
_exp_dir, _sender_hash8, _nonce_suffix,
|
|
)
|
|
_autobackup_basename = _os.path.basename(_ab_path)
|
|
logger.info(
|
|
"Pre-restore backup written: %s hash=%s",
|
|
_autobackup_basename, _autobackup_hash,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("Pre-restore backup failed (non-fatal): %s", exc)
|
|
|
|
# Apply restore
|
|
try:
|
|
stats = await asyncio.to_thread(
|
|
self._policy_store.import_snapshot,
|
|
_captured_data, _captured_mode, False, _captured_sender,
|
|
)
|
|
self._policy_last_import_at = int(time.time())
|
|
diff_summary = (
|
|
f"restore/{_captured_mode}: "
|
|
f"node: +{stats['node_added']} ~{stats['node_updated']} "
|
|
f"-{stats['node_deleted']}; "
|
|
f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
|
|
f"-{stats['agent_deleted']}"
|
|
)
|
|
if _autobackup_basename:
|
|
diff_summary += f"; autobackup={_autobackup_basename}"
|
|
# M10.2: record in policy change history
|
|
_is_destr = (
|
|
stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
|
|
) > 0
|
|
try:
|
|
await asyncio.to_thread(
|
|
self._policy_store.record_policy_change,
|
|
"policy.restore", _captured_mode, _captured_fname,
|
|
_sender_hash(_captured_sender), diff_summary, _is_destr,
|
|
stats.get("node_added", 0), stats.get("node_updated", 0),
|
|
stats.get("node_deleted", 0),
|
|
stats.get("agent_added", 0), stats.get("agent_updated", 0),
|
|
stats.get("agent_deleted", 0),
|
|
self._policy_history_limit,
|
|
)
|
|
except Exception as _exc: # noqa: BLE001
|
|
logger.warning("Failed to record restore history: %s", _exc)
|
|
reply = policy_restore_applied_reply(
|
|
stats, _captured_mode, _autobackup_basename
|
|
)
|
|
return reply, diff_summary
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.exception(
|
|
"Policy restore apply failed: sender=%s file=%s",
|
|
_captured_sender, _captured_fname,
|
|
)
|
|
return f"❌ Restore failed: {exc}", ""
|
|
|
|
# ── Store pending confirmation ────────────────────────────────────────
|
|
nonce = self._confirm_store.add(
|
|
sender_hash=sender_hash,
|
|
verb="policy.restore",
|
|
normalized_args=normalized,
|
|
action_summary=action_summary,
|
|
room_id=room_id,
|
|
callback=_callback,
|
|
)
|
|
_nonce_holder.append(nonce)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.policy.restore",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender_hash": sender_hash,
|
|
"verb": "policy.restore",
|
|
"normalized": normalized,
|
|
"nonce": nonce,
|
|
"expires_in_s": int(self._confirm_store.ttl_s),
|
|
"mode": mode_raw,
|
|
"snapshot_hash_prefix": snapshot_hash[:8],
|
|
"diff": {
|
|
"node_added": diff.node_added,
|
|
"node_updated": diff.node_updated,
|
|
"node_deleted": diff.node_deleted,
|
|
"agent_added": diff.agent_added,
|
|
"agent_updated": diff.agent_updated,
|
|
"agent_deleted": diff.agent_deleted,
|
|
"sample_keys": diff.sample_keys,
|
|
},
|
|
},
|
|
)
|
|
|
|
logger.info(
|
|
"Confirm policy restore intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
|
|
sender, mode_raw, filename, snapshot_hash[:8], nonce,
|
|
)
|
|
return policy_restore_intent_reply(
|
|
diff=diff,
|
|
action_summary=action_summary,
|
|
nonce=nonce,
|
|
ttl_s=int(self._confirm_store.ttl_s),
|
|
)
|
|
|
|
async def _handle_dangerous_intent(
|
|
self,
|
|
http_client: "httpx.AsyncClient",
|
|
cmd: "ControlCommand",
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""
|
|
First leg of the two-step confirm flow (M9.0).
|
|
|
|
Does NOT apply the command. Stores a pending confirmation with a
|
|
callback that will execute the original handler, and returns a reply
|
|
containing the nonce that the operator must send via !confirm <nonce>.
|
|
"""
|
|
assert self._confirm_store is not None
|
|
|
|
sender_hash = _sender_hash(sender)
|
|
normalized = build_normalized_args(cmd)
|
|
action_summary = (
|
|
f"!{cmd.verb} {cmd.subcommand or ''} {normalized}".strip()
|
|
)
|
|
|
|
# M9.1: policy import gets a richer preview with diff + hash binding
|
|
if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "import":
|
|
return await self._handle_policy_import_intent(
|
|
http_client, cmd, sender, room_id, event_id,
|
|
action_summary=action_summary, normalized=normalized,
|
|
)
|
|
|
|
# M10.1: policy restore — rollback with diff preview + hash binding
|
|
if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "restore":
|
|
if self._policy_store is None or not self._bridge_data_dir:
|
|
return policy_cmd_error(
|
|
"Policy store or data directory not configured."
|
|
)
|
|
return await self._handle_policy_restore_intent(
|
|
http_client, cmd, sender, room_id, event_id,
|
|
action_summary=action_summary, normalized=normalized,
|
|
)
|
|
|
|
# Build the callback: calls the actual handler when confirmed.
|
|
# We capture all args by closure so the callback is self-contained.
|
|
_verb = cmd.verb
|
|
_subcmd = cmd.subcommand
|
|
_args = cmd.args
|
|
_kw = dict(cmd.kwargs)
|
|
|
|
async def _callback():
|
|
if _verb == VERB_NODE:
|
|
# Reconstruct args_text (same as _try_control does)
|
|
_parts = []
|
|
if _subcmd:
|
|
_parts.append(_subcmd)
|
|
_parts.extend(_args)
|
|
_parts.extend(f"{k}={v}" for k, v in _kw.items())
|
|
reply = await self._handle_node_cmd(
|
|
http_client, sender, room_id, event_id, " ".join(_parts),
|
|
)
|
|
elif _verb == VERB_ROOM:
|
|
reply = await self._handle_room_cmd(
|
|
http_client, sender, room_id, event_id,
|
|
_subcmd, tuple(_args), _kw,
|
|
)
|
|
elif _verb == VERB_POLICY:
|
|
reply = await self._handle_policy_cmd(
|
|
http_client, sender, room_id, event_id, _subcmd, _kw,
|
|
)
|
|
else:
|
|
reply = f"❌ Unknown dangerous verb: {_verb}"
|
|
return reply, action_summary
|
|
|
|
nonce = self._confirm_store.add(
|
|
sender_hash=sender_hash,
|
|
verb=f"{cmd.verb}.{cmd.subcommand or ''}",
|
|
normalized_args=normalized,
|
|
action_summary=action_summary,
|
|
room_id=room_id,
|
|
callback=_callback,
|
|
)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.intent",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender_hash": sender_hash,
|
|
"verb": cmd.verb,
|
|
"subcommand": cmd.subcommand or "",
|
|
"normalized": normalized,
|
|
"nonce": nonce,
|
|
"expires_in_s": int(self._confirm_store.ttl_s),
|
|
},
|
|
)
|
|
|
|
logger.info(
|
|
"Confirm intent: sender=%s verb=%s/%s nonce=%s ttl=%.0fs",
|
|
sender, cmd.verb, cmd.subcommand, nonce, self._confirm_store.ttl_s,
|
|
)
|
|
return confirm_intent_reply(action_summary, nonce, int(self._confirm_store.ttl_s))
|
|
|
|
async def _handle_confirm_cmd(
|
|
self,
|
|
http_client: "httpx.AsyncClient",
|
|
cmd: "ControlCommand",
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""
|
|
Second leg of the two-step confirm flow (M9.0).
|
|
|
|
Validates the nonce and sender, executes the stored callback, and
|
|
emits confirmed + applied audit events.
|
|
"""
|
|
if self._confirm_store is None:
|
|
return "❌ Confirmation store not active."
|
|
|
|
# Nonce may come as subcommand (token right after !confirm)
|
|
nonce = (cmd.subcommand or "").strip().upper()
|
|
if not nonce and cmd.args:
|
|
nonce = cmd.args[0].strip().upper()
|
|
if not nonce:
|
|
return "❌ Usage: `!confirm <code>` — provide the confirmation code."
|
|
|
|
sender_hash = _sender_hash(sender)
|
|
entry = self._confirm_store.pop(nonce, sender_hash)
|
|
|
|
if entry is None:
|
|
logger.info(
|
|
"Confirm rejected: sender=%s nonce=%s (invalid/expired/wrong-sender)",
|
|
sender, nonce,
|
|
)
|
|
return confirm_expired_reply()
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.confirmed",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender_hash": sender_hash,
|
|
"nonce": nonce,
|
|
"verb": entry.verb,
|
|
"action_summary": entry.action_summary,
|
|
},
|
|
)
|
|
|
|
logger.info(
|
|
"Confirm accepted: sender=%s nonce=%s verb=%s",
|
|
sender, nonce, entry.verb,
|
|
)
|
|
|
|
try:
|
|
reply_text, diff_summary = await entry.callback()
|
|
except Exception as exc:
|
|
logger.exception(
|
|
"Confirm callback failed: sender=%s nonce=%s verb=%s",
|
|
sender, nonce, entry.verb,
|
|
)
|
|
return f"❌ Apply failed: {exc}"
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.applied",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender_hash": sender_hash,
|
|
"verb": entry.verb,
|
|
"normalized": entry.normalized_args,
|
|
"diff_summary": diff_summary,
|
|
},
|
|
)
|
|
|
|
return confirm_success_reply(reply_text)
|
|
|
|
# ── M6.0: Dynamic room-node overrides via !node command ───────────────────
|
|
|
|
async def _handle_node_cmd(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
sender: str,
|
|
ctrl_room_id: str,
|
|
event_id: str,
|
|
args_text: str,
|
|
) -> str:
|
|
"""Handle `!node <set|unset|get|list>` from an authorized operator."""
|
|
if self._policy_store is None or not self._policy_store.is_open:
|
|
return "⚠️ Policy store not available."
|
|
|
|
subcmd, room_id, node_id = parse_node_cmd(args_text)
|
|
|
|
if subcmd not in (NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST):
|
|
return node_cmd_reply_error(
|
|
f"Unknown subcommand: `{subcmd or '?'}`"
|
|
)
|
|
|
|
# ── list ──────────────────────────────────────────────────────────────
|
|
if subcmd == NODE_SUBCMD_LIST:
|
|
try:
|
|
rows = await asyncio.to_thread(self._policy_store.list_overrides, 10)
|
|
total = await asyncio.to_thread(self._policy_store.count_overrides)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore list_overrides error: %s", exc)
|
|
return "⚠️ Could not read policy store."
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.node.list",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "total": total},
|
|
)
|
|
return node_cmd_reply_list(rows, total)
|
|
|
|
# ── subcommands that require room_id ──────────────────────────────────
|
|
if not room_id:
|
|
return node_cmd_reply_error("Missing `room=` argument.")
|
|
if not node_cmd_validate_room(room_id):
|
|
return node_cmd_reply_error(
|
|
f"Invalid room ID format: `{room_id}`\n"
|
|
"Expected: `!localpart:server`"
|
|
)
|
|
|
|
# ── get ───────────────────────────────────────────────────────────────
|
|
if subcmd == NODE_SUBCMD_GET:
|
|
try:
|
|
override = await asyncio.to_thread(self._policy_store.get_override, room_id)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore get_override error: %s", exc)
|
|
return "⚠️ Could not read policy store."
|
|
# env map lookup for context
|
|
env_node: Optional[str] = None
|
|
if self._node_policy is not None:
|
|
env_node = self._node_policy.room_node_map.get(room_id)
|
|
default = self._node_policy.default_node if self._node_policy else self._node_id
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.node.get",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "queried_room": room_id},
|
|
)
|
|
return node_cmd_reply_get(room_id, override, env_node, default)
|
|
|
|
# ── unset ─────────────────────────────────────────────────────────────
|
|
if subcmd == NODE_SUBCMD_UNSET:
|
|
try:
|
|
deleted = await asyncio.to_thread(self._policy_store.delete_override, room_id)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore delete_override error: %s", exc)
|
|
return "⚠️ Could not write to policy store."
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.node.unset",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted},
|
|
)
|
|
return node_cmd_reply_unset_ok(room_id) if deleted else node_cmd_reply_unset_not_found(room_id)
|
|
|
|
# ── set ───────────────────────────────────────────────────────────────
|
|
if subcmd == NODE_SUBCMD_SET:
|
|
if not node_id:
|
|
return node_cmd_reply_error("Missing `node=` argument for `set`.")
|
|
allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset([self._node_id])
|
|
if node_id not in allowed:
|
|
allowed_list = ", ".join(f"`{n}`" for n in sorted(allowed))
|
|
return node_cmd_reply_error(
|
|
f"Node `{node_id}` is not in allowed list: {allowed_list}"
|
|
)
|
|
try:
|
|
await asyncio.to_thread(self._policy_store.set_override, room_id, node_id, sender)
|
|
except Exception as exc:
|
|
logger.warning("PolicyStore set_override error: %s", exc)
|
|
return "⚠️ Could not write to policy store."
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.node.set",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=ctrl_room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender, "target_room": room_id, "set_node": node_id},
|
|
)
|
|
return node_cmd_reply_set(room_id, node_id)
|
|
|
|
return node_cmd_reply_error("Unhandled subcommand.")
|
|
|
|
# ── M4.1: Bridge status for operators ─────────────────────────────────────
|
|
|
|
async def _handle_bridge_status(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""Build and return a bridge health snapshot for `!status` in control room."""
|
|
try:
|
|
snapshot: Dict[str, Any] = {
|
|
"node_id": self._node_id,
|
|
"worker_count": self._worker_count,
|
|
"room_count": len(self._room_map.mappings),
|
|
"mixed_room_count": (
|
|
len(self._mixed_room_config.rooms)
|
|
if self._mixed_room_config
|
|
else 0
|
|
),
|
|
"operators_count": (
|
|
len(self._control_config.operator_allowlist)
|
|
if self._control_config and self._control_config.operator_allowlist
|
|
else 0
|
|
),
|
|
}
|
|
# Queue info (exposed by MatrixIngressLoop via get_status)
|
|
status = self.get_status()
|
|
snapshot["queue_size"] = status.get("queue_size", "?")
|
|
snapshot["queue_max"] = status.get("queue_max", "?")
|
|
# Control safety
|
|
if self._control_limiter is not None:
|
|
snapshot["control_safety"] = self._control_limiter.as_health_dict()
|
|
# Persistent dedupe
|
|
if self._event_store is not None:
|
|
snapshot["persistent_dedupe"] = self._event_store.as_health_dict()
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.bridge_status",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok", data={"sender": sender},
|
|
)
|
|
return bridge_status_reply(snapshot)
|
|
except Exception as exc:
|
|
logger.warning("_handle_bridge_status error: %s", exc)
|
|
return status_not_available_reply()
|
|
|
|
# ── Control command handler ────────────────────────────────────────────────
|
|
|
|
async def _try_control(
|
|
self,
|
|
client: MatrixClient,
|
|
http_client: httpx.AsyncClient,
|
|
event: Dict[str, Any],
|
|
room_id: str,
|
|
) -> None:
|
|
"""
|
|
Process a message from a control room.
|
|
|
|
Non-command messages (not starting with '!') are silently ignored.
|
|
All command attempts are audited regardless of authorization.
|
|
"""
|
|
assert self._control_config is not None
|
|
event_id = event.get("event_id", "")
|
|
sender = event.get("sender", "")
|
|
text = event.get("content", {}).get("body", "").strip()
|
|
|
|
if not text or not is_control_message(text):
|
|
return # not a command, ignore
|
|
|
|
client.mark_seen(event_id)
|
|
|
|
# Authorization check
|
|
authorized, rejection_reason = check_authorization(sender, room_id, self._control_config)
|
|
|
|
if not authorized:
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.unauthorized",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code=rejection_reason,
|
|
data={"sender": sender, "command_preview": text[:80]},
|
|
)
|
|
logger.warning(
|
|
"Unauthorized control command: sender=%s room=%s reason=%s cmd=%r",
|
|
sender, room_id, rejection_reason, text[:60],
|
|
)
|
|
if self._control_unauthorized_behavior == "reply_error":
|
|
try:
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_unauth")
|
|
await client.send_text(room_id, unauthorized_reply(rejection_reason), txn_id)
|
|
except Exception as exc:
|
|
logger.warning("Could not send unauthorized reply: %s", exc)
|
|
return
|
|
|
|
# M3.4: Rate limiting + cooldown (after auth, before parse/dispatch)
|
|
if self._control_limiter is not None:
|
|
sender_hash_ctrl = _sender_hash(sender)
|
|
|
|
allowed_room, retry_room = self._control_limiter.check_room(room_id)
|
|
if not allowed_room:
|
|
scope = "room"
|
|
logger.info("Control rate limited: scope=%s room=%s sender=%s", scope, room_id, sender)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.rate_limited",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="rate_limited_room",
|
|
data={"sender": sender, "scope": scope, "retry_after_s": retry_room},
|
|
)
|
|
if self._on_control_rate_limited:
|
|
self._on_control_rate_limited(scope)
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl")
|
|
await client.send_text(room_id, rate_limited_reply(scope, retry_room), txn_id)
|
|
return
|
|
|
|
allowed_op, retry_op = self._control_limiter.check_operator(sender_hash_ctrl)
|
|
if not allowed_op:
|
|
scope = "operator"
|
|
logger.info("Control rate limited: scope=%s sender=%s", scope, sender)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.rate_limited",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="rate_limited_operator",
|
|
data={"sender": sender, "scope": scope, "retry_after_s": retry_op},
|
|
)
|
|
if self._on_control_rate_limited:
|
|
self._on_control_rate_limited(scope)
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl")
|
|
await client.send_text(room_id, rate_limited_reply(scope, retry_op), txn_id)
|
|
return
|
|
|
|
# Parse command
|
|
cmd = parse_command(text)
|
|
if cmd is None:
|
|
logger.warning("Control message from %s could not be parsed: %r", sender, text[:60])
|
|
return
|
|
|
|
# M3.4: Cooldown check (anti-double-click, per operator+verb+subcommand)
|
|
if self._control_limiter is not None:
|
|
sender_hash_ctrl = _sender_hash(sender)
|
|
allowed_cd, wait_cd = self._control_limiter.check_cooldown(
|
|
sender_hash_ctrl, cmd.verb, cmd.subcommand or "",
|
|
)
|
|
if not allowed_cd:
|
|
scope = "cooldown"
|
|
logger.info(
|
|
"Control cooldown: sender=%s verb=%s sub=%s wait=%.1fs",
|
|
sender, cmd.verb, cmd.subcommand, wait_cd,
|
|
)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.rate_limited",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="cooldown",
|
|
data={
|
|
"sender": sender, "scope": scope,
|
|
"verb": cmd.verb, "subcommand": cmd.subcommand,
|
|
"wait_s": wait_cd,
|
|
},
|
|
)
|
|
if self._on_control_rate_limited:
|
|
self._on_control_rate_limited(scope)
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_cd")
|
|
await client.send_text(room_id, rate_limited_reply(scope, wait_cd), txn_id)
|
|
return
|
|
|
|
# Metric callback
|
|
if self._on_control_command:
|
|
self._on_control_command(sender, cmd.verb, cmd.subcommand)
|
|
|
|
# Audit every authorized command
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.command",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender,
|
|
"verb": cmd.verb,
|
|
"subcommand": cmd.subcommand,
|
|
"args": list(cmd.args),
|
|
"kwargs": dict(cmd.kwargs),
|
|
"is_known": cmd.is_known,
|
|
},
|
|
)
|
|
|
|
logger.info(
|
|
"Control command: sender=%s verb=%s sub=%s args=%s",
|
|
sender, cmd.verb, cmd.subcommand, cmd.args,
|
|
)
|
|
|
|
# Dispatch command
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_ctrl")
|
|
|
|
# M9.0: Dangerous commands → two-step confirmation (intent leg)
|
|
if (
|
|
self._confirm_store is not None
|
|
and is_dangerous_cmd(cmd)
|
|
and cmd.verb != VERB_CONFIRM
|
|
):
|
|
reply_text = await self._handle_dangerous_intent(
|
|
http_client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_CONFIRM:
|
|
# M9.0: !confirm <nonce> (second leg)
|
|
reply_text = await self._handle_confirm_cmd(
|
|
http_client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_HELP:
|
|
reply_text = help_reply()
|
|
elif cmd.verb == VERB_CONFIRM:
|
|
# Fallback if confirm_store is None (disabled) — inform the operator
|
|
reply_text = "❌ Confirmation system is disabled."
|
|
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_START:
|
|
reply_text = await self._handle_runbook_start(
|
|
http_client, client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_NEXT:
|
|
# M3.2: advance to next step
|
|
reply_text = await self._handle_runbook_next(
|
|
http_client, client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_COMPLETE:
|
|
# M3.2: mark manual step complete
|
|
reply_text = await self._handle_runbook_complete(
|
|
http_client, client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_STATUS:
|
|
# M3.3: show run status
|
|
reply_text = await self._handle_runbook_status(
|
|
http_client, client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_EVIDENCE:
|
|
# M3.3: generate release evidence
|
|
reply_text = await self._handle_runbook_evidence(
|
|
http_client, client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_POST_REVIEW:
|
|
# M3.3: generate post-release review
|
|
reply_text = await self._handle_runbook_post_review(
|
|
http_client, client, cmd, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_STATUS:
|
|
# M4.1: bridge health snapshot for operators
|
|
reply_text = await self._handle_bridge_status(
|
|
http_client, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_NODES:
|
|
# M5.1: node policy overview for operators
|
|
reply_text = await self._handle_nodes(
|
|
http_client, sender, room_id, event_id,
|
|
)
|
|
elif cmd.verb == VERB_NODE:
|
|
# M6.0: dynamic room-node override commands
|
|
# Reconstruct args_text from parsed command parts
|
|
_node_args_parts = []
|
|
if cmd.subcommand:
|
|
_node_args_parts.append(cmd.subcommand)
|
|
_node_args_parts.extend(cmd.args)
|
|
_node_args_parts.extend(f"{k}={v}" for k, v in cmd.kwargs.items())
|
|
_node_args_text = " ".join(_node_args_parts)
|
|
reply_text = await self._handle_node_cmd(
|
|
http_client, sender, room_id, event_id, _node_args_text,
|
|
)
|
|
elif cmd.verb == VERB_ROOM:
|
|
# M6.1: dynamic mixed room agent overrides
|
|
reply_text = await self._handle_room_cmd(
|
|
http_client, sender, room_id, event_id,
|
|
cmd.subcommand, cmd.args, cmd.kwargs,
|
|
)
|
|
elif cmd.verb == VERB_POLICY:
|
|
# M6.2: policy snapshot export/import
|
|
reply_text = await self._handle_policy_cmd(
|
|
http_client, sender, room_id, event_id,
|
|
cmd.subcommand, cmd.kwargs,
|
|
)
|
|
elif not cmd.is_known:
|
|
reply_text = unknown_command_reply(cmd)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.unknown_cmd",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="unknown_verb",
|
|
data={"verb": cmd.verb, "sender": sender},
|
|
)
|
|
else:
|
|
reply_text = not_implemented_reply(cmd)
|
|
|
|
try:
|
|
await client.send_text(room_id, reply_text, txn_id)
|
|
except Exception as exc:
|
|
logger.error("Could not send control reply: %s", exc)
|
|
|
|
async def _handle_runbook_start(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
client: "MatrixClient",
|
|
cmd: ControlCommand,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""
|
|
M3.1: Execute !runbook start <path> [node=NODA1].
|
|
|
|
Calls sofiia-console POST /api/runbooks/internal/runs.
|
|
Returns reply text (success or failure) for delivery to the control room.
|
|
Audits matrix.control.runbook.start regardless of outcome.
|
|
"""
|
|
# Extract positional runbook_path
|
|
runbook_path = cmd.args[0].strip() if cmd.args else ""
|
|
node_id = cmd.kwargs.get("node", "NODA1").strip()
|
|
|
|
# Validate path before calling the console
|
|
path_error = _ctrl_runner.validate_runbook_path(runbook_path)
|
|
if path_error:
|
|
logger.warning(
|
|
"!runbook start invalid path: sender=%s path=%r error=%s",
|
|
sender, runbook_path, path_error,
|
|
)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.start",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="invalid_path",
|
|
data={"sender": sender, "runbook_path": runbook_path, "error": path_error},
|
|
)
|
|
return start_usage_reply()
|
|
|
|
# Call sofiia-console internal API
|
|
run_id: Optional[str] = None
|
|
http_status: Optional[int] = None
|
|
try:
|
|
result = await _ctrl_runner.start_runbook_run(
|
|
http_client=http_client,
|
|
console_url=self._console_url,
|
|
control_token=self._control_token,
|
|
runbook_path=runbook_path,
|
|
operator_id=sender,
|
|
node_id=node_id,
|
|
)
|
|
run_id = result.get("run_id", "")
|
|
steps_total = result.get("steps_total", 0)
|
|
status = result.get("status", "running")
|
|
http_status = 200
|
|
|
|
logger.info(
|
|
"Runbook started: run_id=%s path=%s node=%s steps=%d by sender=%s",
|
|
run_id, runbook_path, node_id, steps_total, sender,
|
|
)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.start",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender,
|
|
"runbook_path": runbook_path,
|
|
"node_id": node_id,
|
|
"run_id": run_id,
|
|
"steps_total": steps_total,
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
return runbook_started_reply(run_id, steps_total, status)
|
|
|
|
except _ctrl_runner.RunnerError as exc:
|
|
reason = str(exc)
|
|
logger.error(
|
|
"!runbook start failed: sender=%s path=%r node=%s error=%s",
|
|
sender, runbook_path, node_id, reason,
|
|
)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.start",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="runner_error",
|
|
data={
|
|
"sender": sender,
|
|
"runbook_path": runbook_path,
|
|
"node_id": node_id,
|
|
"error": reason,
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
return runbook_start_error_reply(reason)
|
|
|
|
async def _handle_runbook_next(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
client: "MatrixClient",
|
|
cmd: ControlCommand,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""
|
|
M3.2: Execute !runbook next <run_id>.
|
|
|
|
Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/next.
|
|
Returns reply text for the control room.
|
|
Audits matrix.control.runbook.next.
|
|
"""
|
|
run_id = cmd.args[0].strip() if cmd.args else ""
|
|
if not run_id:
|
|
return next_usage_reply()
|
|
|
|
# M3.4: per-run rate limit for !runbook next
|
|
if self._control_limiter is not None:
|
|
allowed_run, retry_run = self._control_limiter.check_run_next(run_id)
|
|
if not allowed_run:
|
|
scope = "run"
|
|
if self._on_control_rate_limited:
|
|
self._on_control_rate_limited(scope)
|
|
return rate_limited_reply(scope, retry_run)
|
|
|
|
http_status: Optional[int] = None
|
|
try:
|
|
result = await _ctrl_runner.next_runbook_step(
|
|
http_client=http_client,
|
|
console_url=self._console_url,
|
|
control_token=self._control_token,
|
|
run_id=run_id,
|
|
operator_id=sender,
|
|
)
|
|
http_status = 200
|
|
step_type = result.get("type", "unknown")
|
|
step_index = result.get("step_index", 0)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.next",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender,
|
|
"run_id": run_id,
|
|
"step_index": step_index,
|
|
"step_type": step_type,
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
|
|
if step_type == "manual":
|
|
return next_manual_reply(
|
|
run_id=run_id,
|
|
step_index=step_index,
|
|
steps_total=result.get("steps_total"),
|
|
title=result.get("title", ""),
|
|
instructions_md=result.get("instructions_md", ""),
|
|
)
|
|
else:
|
|
# http_check / script
|
|
result_dict = result.get("result") or {}
|
|
duration_ms = int(result_dict.get("duration_ms", 0)) if isinstance(result_dict, dict) else 0
|
|
return next_auto_reply(
|
|
run_id=run_id,
|
|
step_index=step_index,
|
|
action_type=step_type,
|
|
step_status=result.get("step_status", "ok"),
|
|
duration_ms=duration_ms or None,
|
|
completed=bool(result.get("completed", False)),
|
|
)
|
|
|
|
except _ctrl_runner.RunnerError as exc:
|
|
reason = str(exc)
|
|
logger.error(
|
|
"!runbook next failed: sender=%s run_id=%r error=%s",
|
|
sender, run_id, reason,
|
|
)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.next",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="runner_error",
|
|
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
|
|
)
|
|
return next_error_reply(run_id, reason)
|
|
|
|
async def _handle_runbook_complete(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
client: "MatrixClient",
|
|
cmd: ControlCommand,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""
|
|
M3.2: Execute !runbook complete <run_id> step=<n> status=ok|warn|fail [notes=...]
|
|
|
|
Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/steps/{n}/complete.
|
|
Audits matrix.control.runbook.complete.
|
|
"""
|
|
run_id = cmd.args[0].strip() if cmd.args else ""
|
|
if not run_id:
|
|
return complete_usage_reply()
|
|
|
|
# step kwarg required
|
|
step_raw = cmd.kwargs.get("step", "").strip()
|
|
if not step_raw or not step_raw.isdigit():
|
|
return complete_usage_reply()
|
|
step_index = int(step_raw)
|
|
|
|
# status kwarg required
|
|
status = cmd.kwargs.get("status", "").strip().lower()
|
|
if status not in ("ok", "warn", "fail", "skipped"):
|
|
return complete_usage_reply()
|
|
|
|
# notes: kwarg or remaining positional args (joined with space)
|
|
notes = cmd.kwargs.get("notes", "").strip()
|
|
if not notes and len(cmd.args) > 1:
|
|
notes = " ".join(cmd.args[1:])
|
|
notes = sanitize_notes(notes) # M3.4: strip control chars + truncate to MAX_NOTES_LEN
|
|
|
|
http_status: Optional[int] = None
|
|
try:
|
|
result = await _ctrl_runner.complete_runbook_step(
|
|
http_client=http_client,
|
|
console_url=self._console_url,
|
|
control_token=self._control_token,
|
|
run_id=run_id,
|
|
step_index=step_index,
|
|
status=status,
|
|
notes=notes,
|
|
operator_id=sender,
|
|
)
|
|
http_status = 200
|
|
run_completed = bool(result.get("run_completed", False))
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.complete",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender,
|
|
"run_id": run_id,
|
|
"step_index": step_index,
|
|
"status": status,
|
|
"run_completed": run_completed,
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
return complete_ok_reply(run_id, step_index, status, run_completed)
|
|
|
|
except _ctrl_runner.RunnerError as exc:
|
|
reason = str(exc)
|
|
logger.error(
|
|
"!runbook complete failed: sender=%s run_id=%r step=%d error=%s",
|
|
sender, run_id, step_index, reason,
|
|
)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.complete",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="runner_error",
|
|
data={
|
|
"sender": sender,
|
|
"run_id": run_id,
|
|
"step_index": step_index,
|
|
"error": reason,
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
return complete_error_reply(run_id, reason)
|
|
|
|
async def _handle_runbook_status(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
client: "MatrixClient",
|
|
cmd: ControlCommand,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""M3.3: !runbook status <run_id> — GET run info + format status."""
|
|
run_id = cmd.args[0].strip() if cmd.args else ""
|
|
if not run_id:
|
|
return status_usage_reply()
|
|
|
|
http_status: Optional[int] = None
|
|
try:
|
|
result = await _ctrl_runner.get_runbook_run(
|
|
http_client=http_client,
|
|
console_url=self._console_url,
|
|
control_token=self._control_token,
|
|
run_id=run_id,
|
|
)
|
|
http_status = 200
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.status",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender, "run_id": run_id,
|
|
"run_status": result.get("status"),
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
return status_reply(result)
|
|
|
|
except _ctrl_runner.RunnerError as exc:
|
|
reason = str(exc)
|
|
logger.error("!runbook status failed: sender=%s run_id=%r error=%s", sender, run_id, reason)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.status",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="runner_error",
|
|
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
|
|
)
|
|
return status_error_reply(run_id, reason)
|
|
|
|
async def _handle_runbook_evidence(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
client: "MatrixClient",
|
|
cmd: ControlCommand,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""M3.3: !runbook evidence <run_id> — generate release evidence."""
|
|
run_id = cmd.args[0].strip() if cmd.args else ""
|
|
if not run_id:
|
|
return evidence_usage_reply()
|
|
|
|
http_status: Optional[int] = None
|
|
try:
|
|
result = await _ctrl_runner.generate_evidence(
|
|
http_client=http_client,
|
|
console_url=self._console_url,
|
|
control_token=self._control_token,
|
|
run_id=run_id,
|
|
)
|
|
http_status = 200
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.evidence",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender, "run_id": run_id,
|
|
"evidence_path": result.get("evidence_path"),
|
|
"bytes": result.get("bytes"),
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
return evidence_reply(result)
|
|
|
|
except _ctrl_runner.RunnerError as exc:
|
|
reason = str(exc)
|
|
logger.error("!runbook evidence failed: sender=%s run_id=%r error=%s", sender, run_id, reason)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.evidence",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="runner_error",
|
|
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
|
|
)
|
|
return evidence_error_reply(run_id, reason)
|
|
|
|
async def _handle_runbook_post_review(
|
|
self,
|
|
http_client: httpx.AsyncClient,
|
|
client: "MatrixClient",
|
|
cmd: ControlCommand,
|
|
sender: str,
|
|
room_id: str,
|
|
event_id: str,
|
|
) -> str:
|
|
"""M3.3: !runbook post_review <run_id> — generate post-release review."""
|
|
run_id = cmd.args[0].strip() if cmd.args else ""
|
|
if not run_id:
|
|
return post_review_usage_reply()
|
|
|
|
http_status: Optional[int] = None
|
|
try:
|
|
result = await _ctrl_runner.generate_post_review(
|
|
http_client=http_client,
|
|
console_url=self._console_url,
|
|
control_token=self._control_token,
|
|
run_id=run_id,
|
|
)
|
|
http_status = 200
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.post_review",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender, "run_id": run_id,
|
|
"path": result.get("path"),
|
|
"bytes": result.get("bytes"),
|
|
"http_status": http_status,
|
|
},
|
|
)
|
|
return post_review_reply(result)
|
|
|
|
except _ctrl_runner.RunnerError as exc:
|
|
reason = str(exc)
|
|
logger.error("!runbook post_review failed: sender=%s run_id=%r error=%s", sender, run_id, reason)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.control.runbook.post_review",
|
|
agent_id="control", node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="runner_error",
|
|
data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status},
|
|
)
|
|
return post_review_error_reply(run_id, reason)
|
|
|
|
# ── Worker ─────────────────────────────────────────────────────────────────
|
|
|
|
async def _worker(
|
|
self,
|
|
queue: "asyncio.Queue[Optional[_QueueEntry]]",
|
|
client: MatrixClient,
|
|
http_client: httpx.AsyncClient,
|
|
) -> None:
|
|
"""Consume queue entries until cancelled."""
|
|
while True:
|
|
entry = await queue.get() # blocks until item available; raises CancelledError on cancel
|
|
try:
|
|
await self._process_entry(client, http_client, entry)
|
|
except Exception as exc:
|
|
logger.error("Worker unhandled error: %s", exc)
|
|
finally:
|
|
queue.task_done()
|
|
if self._on_queue_size:
|
|
self._on_queue_size(queue.qsize())
|
|
|
|
# ── Process (invoke + send + audit) ───────────────────────────────────────
|
|
|
|
async def _process_entry(
|
|
self,
|
|
client: MatrixClient,
|
|
http_client: httpx.AsyncClient,
|
|
entry: _QueueEntry,
|
|
) -> None:
|
|
event = entry.event
|
|
event_id = event.get("event_id", "")
|
|
sender = event.get("sender", "")
|
|
text = event.get("content", {}).get("body", "").strip()
|
|
room_id = entry.room_id
|
|
agent_id = entry.agent_id
|
|
|
|
# H3: Queue wait latency
|
|
wait_s = time.monotonic() - entry.enqueue_time
|
|
if self._on_queue_wait:
|
|
self._on_queue_wait(agent_id, wait_s)
|
|
|
|
routing_reason = entry.routing_reason
|
|
is_mixed = entry.is_mixed
|
|
|
|
logger.info(
|
|
"Processing: room=%s agent=%s event=%s len=%d wait=%.3fs mixed=%s reason=%s",
|
|
room_id, agent_id, event_id, len(text), wait_s, is_mixed, routing_reason,
|
|
)
|
|
|
|
if self._on_message_received:
|
|
self._on_message_received(room_id, agent_id)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.message.received",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={
|
|
"sender": sender,
|
|
"text_len": len(text),
|
|
"queue_wait_ms": int(wait_s * 1000),
|
|
"routing_reason": routing_reason,
|
|
"is_mixed": is_mixed,
|
|
},
|
|
)
|
|
|
|
# Session Scope v2: canonical key + PII-safe sender hash.
|
|
# Scope is always room_agent for user-initiated messages.
|
|
# Control room messages never reach _process_entry (handled by _try_control).
|
|
scope = SCOPE_ROOM_AGENT
|
|
session_id = _build_session_key(room_id, agent_id, scope=scope)
|
|
sender_hash = _sender_hash(sender)
|
|
|
|
logger.debug(
|
|
"Session scope v2: session_key=%s scope=%s sender_hash=%s",
|
|
session_id, scope, sender_hash,
|
|
)
|
|
|
|
# M5.0: Node-aware routing
|
|
# Extract node=X kwarg from body (mixed rooms only, to avoid breaking direct rooms)
|
|
explicit_node: Optional[str] = None
|
|
effective_text = text
|
|
if is_mixed and self._node_policy is not None:
|
|
explicit_node, effective_text = extract_node_kwarg(text)
|
|
|
|
# M6.0: look up dynamic policy store override for this room
|
|
store_override: Optional[str] = None
|
|
if self._policy_store is not None and self._policy_store.is_open:
|
|
try:
|
|
store_override = await asyncio.to_thread(
|
|
self._policy_store.get_override, room_id
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
logger.warning("PolicyStore get_override failed: %s", exc)
|
|
|
|
node_res = (
|
|
self._node_policy.resolve(room_id, explicit_node, store_override=store_override)
|
|
if self._node_policy is not None
|
|
else NodeResolution(node_id=self._node_id, source=NODE_SOURCE_DEFAULT)
|
|
)
|
|
|
|
if node_res.rejected_node:
|
|
logger.info(
|
|
"Node kwarg rejected: requested=%s allowed=%s room=%s agent=%s",
|
|
node_res.rejected_node, self._node_policy.allowed_nodes if self._node_policy else {}, room_id, agent_id,
|
|
)
|
|
if self._on_node_rejected:
|
|
self._on_node_rejected(node_res.rejected_node)
|
|
txn_rej = MatrixClient.make_txn_id(room_id, event_id + "_node_rej")
|
|
allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset()
|
|
reply_rej = node_rejected_reply(node_res.rejected_node, allowed)
|
|
try:
|
|
await client.send_text(room_id, reply_rej, txn_rej)
|
|
except Exception as exc:
|
|
logger.warning("Could not send node rejection reply: %s", exc)
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.route.node_rejected",
|
|
agent_id=agent_id, node_id=self._node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="node_rejected",
|
|
data={"requested_node": node_res.rejected_node, "resolved_node": node_res.node_id},
|
|
)
|
|
# Continue with fallback node (do not drop the message)
|
|
|
|
if self._on_node_selected:
|
|
self._on_node_selected(agent_id, node_res.node_id, node_res.source)
|
|
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.route.node_selected",
|
|
agent_id=agent_id, node_id=node_res.node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok",
|
|
data={"node_id": node_res.node_id, "source": node_res.source},
|
|
)
|
|
|
|
# M2.2: per-room-agent concurrency cap (only for mixed rooms; single-agent rooms unaffected)
|
|
_lock = self._get_concurrency_lock(room_id, agent_id) if is_mixed and self._mixed_concurrency_cap > 0 else None
|
|
if _lock is not None:
|
|
await _lock.acquire()
|
|
try:
|
|
await self._invoke_and_send(
|
|
client, http_client, entry, session_id, wait_s, is_mixed, routing_reason,
|
|
sender_hash=sender_hash, scope=scope,
|
|
effective_node_id=node_res.node_id, node_source=node_res.source,
|
|
effective_text=effective_text,
|
|
)
|
|
finally:
|
|
if _lock is not None:
|
|
_lock.release()
|
|
|
|
async def _invoke_and_send(
|
|
self,
|
|
client: MatrixClient,
|
|
http_client: httpx.AsyncClient,
|
|
entry: _QueueEntry,
|
|
session_id: str,
|
|
wait_s: float,
|
|
is_mixed: bool,
|
|
routing_reason: str,
|
|
sender_hash: str = "",
|
|
scope: str = SCOPE_ROOM_AGENT,
|
|
# M5.0: resolved node
|
|
effective_node_id: Optional[str] = None,
|
|
node_source: str = NODE_SOURCE_DEFAULT,
|
|
effective_text: Optional[str] = None, # text with node=X kwarg stripped
|
|
) -> None:
|
|
"""Inner: invoke Router + send reply (separated for concurrency lock wrapping)."""
|
|
event = entry.event
|
|
event_id = event.get("event_id", "")
|
|
# Use effective_text if provided (node kwarg stripped), otherwise original body
|
|
text = effective_text if effective_text is not None else event.get("content", {}).get("body", "").strip()
|
|
room_id = entry.room_id
|
|
agent_id = entry.agent_id
|
|
node_id = effective_node_id if effective_node_id is not None else self._node_id
|
|
|
|
# H3 + M8.0 + M8.1: Invoke with latency tracking, soft-failover, and sticky routing
|
|
t0 = time.monotonic()
|
|
reply_text: Optional[str] = None
|
|
invoke_ok = False
|
|
invoke_duration_s = 0.0
|
|
used_node_id = node_id # may change on failover
|
|
|
|
# M8.1: check sticky cache (skip primary if sticky is set for this room:agent)
|
|
sticky_key = make_sticky_key(room_id, agent_id)
|
|
sticky_node: Optional[str] = None
|
|
if node_source != NODE_SOURCE_EXPLICIT and self._sticky_cache is not None:
|
|
sticky_node = self._sticky_cache.get(sticky_key)
|
|
|
|
async def _do_invoke(target_node: str, target_source: str) -> Optional[str]:
|
|
"""Single invoke attempt; returns reply text or None on failure."""
|
|
nonlocal invoke_duration_s
|
|
_t = time.monotonic()
|
|
try:
|
|
result = await _invoke_router(
|
|
http_client, self._router_url,
|
|
agent_id=agent_id, node_id=target_node,
|
|
prompt=text, session_id=session_id,
|
|
sender_hash=sender_hash, scope=scope,
|
|
node_source=target_source,
|
|
)
|
|
invoke_duration_s = time.monotonic() - _t
|
|
if self._node_health_tracker is not None:
|
|
self._node_health_tracker.record_ok(target_node, invoke_duration_s)
|
|
if self._on_invoke_latency:
|
|
self._on_invoke_latency(agent_id, invoke_duration_s, target_node)
|
|
logger.info(
|
|
"Invoke ok: agent=%s node=%s event=%s reply_len=%d duration=%dms",
|
|
agent_id, target_node, event_id, len(result or ""), int(invoke_duration_s * 1000),
|
|
)
|
|
return result
|
|
except httpx.HTTPStatusError as exc:
|
|
invoke_duration_s = time.monotonic() - _t
|
|
_reason = FAILOVER_REASON_HTTP_5XX if exc.response.status_code >= 500 else "http_4xx"
|
|
logger.error(
|
|
"Router HTTP %d agent=%s node=%s event=%s duration=%dms",
|
|
exc.response.status_code, agent_id, target_node, event_id,
|
|
int(invoke_duration_s * 1000),
|
|
)
|
|
if self._node_health_tracker and exc.response.status_code >= 500:
|
|
self._node_health_tracker.record_error(target_node, _reason)
|
|
if self._on_gateway_error:
|
|
self._on_gateway_error(f"http_{exc.response.status_code}")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.error", agent_id=agent_id, node_id=target_node,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code=f"router_http_{exc.response.status_code}",
|
|
duration_ms=int(invoke_duration_s * 1000),
|
|
)
|
|
if exc.response.status_code >= 500:
|
|
raise # eligible for failover
|
|
return None # 4xx: not a node issue, don't failover
|
|
|
|
except (httpx.ConnectError, httpx.TimeoutException) as exc:
|
|
invoke_duration_s = time.monotonic() - _t
|
|
_reason = (
|
|
FAILOVER_REASON_TIMEOUT
|
|
if isinstance(exc, httpx.TimeoutException)
|
|
else FAILOVER_REASON_NETWORK
|
|
)
|
|
logger.error(
|
|
"Router network error agent=%s node=%s event=%s: %s",
|
|
agent_id, target_node, event_id, exc,
|
|
)
|
|
if self._node_health_tracker:
|
|
self._node_health_tracker.record_error(target_node, _reason)
|
|
if self._on_gateway_error:
|
|
self._on_gateway_error("network_error")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.error", agent_id=agent_id, node_id=target_node,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="router_network_error",
|
|
duration_ms=int(invoke_duration_s * 1000),
|
|
)
|
|
raise # eligible for failover
|
|
|
|
except Exception as exc:
|
|
invoke_duration_s = time.monotonic() - _t
|
|
logger.error(
|
|
"Unexpected invoke error agent=%s node=%s event=%s: %s",
|
|
agent_id, target_node, event_id, exc,
|
|
)
|
|
if self._node_health_tracker:
|
|
self._node_health_tracker.record_error(target_node, "unexpected")
|
|
if self._on_gateway_error:
|
|
self._on_gateway_error("unexpected")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.error", agent_id=agent_id, node_id=target_node,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="router_unexpected",
|
|
duration_ms=int(invoke_duration_s * 1000),
|
|
)
|
|
return None # unexpected errors: no failover (could be code bug)
|
|
|
|
if sticky_node is not None:
|
|
# M8.1: sticky path — route directly to known-good fallback, skip primary
|
|
logger.info(
|
|
"Sticky: routing %s→%s (skipping primary=%s) agent=%s event=%s",
|
|
sticky_key, sticky_node, node_id, agent_id, event_id,
|
|
)
|
|
try:
|
|
reply_text = await _do_invoke(sticky_node, NODE_SOURCE_DEFAULT)
|
|
invoke_ok = reply_text is not None
|
|
used_node_id = sticky_node
|
|
except Exception: # noqa: BLE001
|
|
# Sticky node also failed — clear sticky and leave reply_text=None
|
|
self._sticky_cache.delete(sticky_key) # type: ignore[union-attr]
|
|
logger.warning(
|
|
"Sticky node %s failed for %s — cleared (agent=%s event=%s)",
|
|
sticky_node, sticky_key, agent_id, event_id,
|
|
)
|
|
# M8.2: remove from DB as well
|
|
await self._sticky_persist_delete(sticky_key)
|
|
|
|
else:
|
|
# Normal path: try primary; attempt failover on eligible errors
|
|
try:
|
|
reply_text = await _do_invoke(node_id, node_source)
|
|
invoke_ok = reply_text is not None
|
|
used_node_id = node_id
|
|
|
|
except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError):
|
|
# Primary failed with a failover-eligible error.
|
|
# Failover only for non-explicit routing (explicit = user chose node).
|
|
if node_source == NODE_SOURCE_EXPLICIT:
|
|
logger.info(
|
|
"Node %s failed for explicit routing — no failover (agent=%s event=%s)",
|
|
node_id, agent_id, event_id,
|
|
)
|
|
# reply_text stays None; error already audited
|
|
else:
|
|
# Attempt failover
|
|
fallback_node: Optional[str] = None
|
|
if self._node_health_tracker is not None and self._node_policy is not None:
|
|
fallback_node = self._node_health_tracker.pick_fallback(
|
|
node_id, self._node_policy.allowed_nodes
|
|
)
|
|
elif self._node_policy is not None:
|
|
# No tracker — pick any other allowed node deterministically
|
|
others = sorted(
|
|
n for n in self._node_policy.allowed_nodes if n != node_id
|
|
)
|
|
fallback_node = others[0] if others else None
|
|
|
|
if fallback_node:
|
|
logger.warning(
|
|
"Failover: %s → %s agent=%s event=%s",
|
|
node_id, fallback_node, agent_id, event_id,
|
|
)
|
|
try:
|
|
reply_text = await _do_invoke(fallback_node, NODE_SOURCE_DEFAULT)
|
|
invoke_ok = reply_text is not None
|
|
used_node_id = fallback_node
|
|
if invoke_ok:
|
|
# Fire failover callback and audit
|
|
if self._on_failover:
|
|
self._on_failover(node_id, fallback_node, "invoke_error")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.node.failover",
|
|
agent_id=agent_id, node_id=fallback_node,
|
|
room_id=room_id, event_id=event_id,
|
|
status="ok", data={
|
|
"from_node": node_id,
|
|
"to_node": fallback_node,
|
|
"original_source": node_source,
|
|
},
|
|
)
|
|
# M8.1: set sticky — future messages skip primary
|
|
if self._sticky_cache is not None:
|
|
self._sticky_cache.set(sticky_key, fallback_node)
|
|
scope = "mixed" if is_mixed else "direct"
|
|
if self._on_sticky_set:
|
|
self._on_sticky_set(fallback_node, scope)
|
|
logger.info(
|
|
"Sticky set: %s → %s scope=%s ttl=%.0fs",
|
|
sticky_key, fallback_node, scope,
|
|
self._sticky_cache.ttl_s,
|
|
)
|
|
# M8.2: persist sticky to DB
|
|
await self._sticky_persist_set(sticky_key, fallback_node)
|
|
except Exception: # noqa: BLE001
|
|
pass # errors already audited inside _do_invoke
|
|
|
|
if not invoke_ok or not reply_text:
|
|
if invoke_ok:
|
|
logger.warning("Empty reply from router agent=%s event=%s", agent_id, event_id)
|
|
return
|
|
|
|
# H3: Send with latency
|
|
# M2.1: prefix reply with agent identity in mixed rooms ("Sofiia: ...")
|
|
prefix = reply_prefix(agent_id, is_mixed)
|
|
raw_reply = reply_text[:_REPLY_TEXT_MAX - len(prefix)]
|
|
send_text = prefix + raw_reply
|
|
txn_id = MatrixClient.make_txn_id(room_id, event_id)
|
|
send_t0 = time.monotonic()
|
|
|
|
try:
|
|
await client.send_text(room_id, send_text, txn_id)
|
|
send_duration_s = time.monotonic() - send_t0
|
|
if self._on_send_latency:
|
|
self._on_send_latency(agent_id, send_duration_s)
|
|
if self._on_message_replied:
|
|
self._on_message_replied(room_id, agent_id, "ok")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.agent.replied", agent_id=agent_id, node_id=used_node_id,
|
|
room_id=room_id, event_id=event_id, status="ok",
|
|
duration_ms=int(send_duration_s * 1000),
|
|
data={
|
|
"reply_len": len(send_text),
|
|
"truncated": len(reply_text) > _REPLY_TEXT_MAX,
|
|
"router_duration_ms": int(invoke_duration_s * 1000),
|
|
"queue_wait_ms": int(wait_s * 1000),
|
|
"routing_reason": routing_reason,
|
|
"is_mixed": is_mixed,
|
|
"node_source": node_source,
|
|
"failover": used_node_id != node_id, # M8.0: failover flag
|
|
},
|
|
)
|
|
logger.info(
|
|
"Reply sent: agent=%s event=%s reply_len=%d send_ms=%d",
|
|
agent_id, event_id, len(send_text), int(send_duration_s * 1000),
|
|
)
|
|
|
|
except Exception as exc:
|
|
send_duration_s = time.monotonic() - send_t0
|
|
logger.error("Send failed agent=%s event=%s: %s", agent_id, event_id, exc)
|
|
if self._on_message_replied:
|
|
self._on_message_replied(room_id, agent_id, "error")
|
|
if self._on_gateway_error:
|
|
self._on_gateway_error("matrix_send_error")
|
|
await _write_audit(
|
|
http_client, self._console_url, self._internal_token,
|
|
event="matrix.error", agent_id=agent_id, node_id=node_id,
|
|
room_id=room_id, event_id=event_id,
|
|
status="error", error_code="matrix_send_failed",
|
|
duration_ms=int(send_duration_s * 1000),
|
|
)
|