feat(matrix-bridge-dagi): add operator allowlist for control commands (M3.0)

New: app/control.py
  - ControlConfig: operator_allowlist + control_rooms (frozensets)
  - parse_control_config(): validates @user:server + !room:server formats, fail-fast
  - parse_command(): parses !verb subcommand [args] [key=value] up to 512 chars
  - check_authorization(): AND(is_control_room, is_operator) → (bool, reason)
  - Reply helpers: not_implemented, unknown_command, unauthorized, help
  - KNOWN_VERBS: runbook, status, help (M3.1+ stubs)
  - MAX_CMD_LEN=512, MAX_CMD_TOKENS=20

ingress.py:
  - _try_control(): dispatch for control rooms (authorized → audit + reply, unauthorized → audit + optional ⛔ reply)
  - join control rooms on startup
  - _enqueue_from_sync: control rooms processed first, never forwarded to agents
  - on_control_command(sender, verb, subcommand) metric callback
  - CONTROL_UNAUTHORIZED_BEHAVIOR: "ignore" | "reply_error"

Audit events:
  matrix.control.command       — authorised command (verb, subcommand, args, kwargs)
  matrix.control.unauthorized  — rejected by allowlist (reason: not_operator | not_control_room)
  matrix.control.unknown_cmd   — authorised but unrecognised verb

Config + main:
  - bridge_operator_allowlist, bridge_control_rooms, control_unauthorized_behavior
  - matrix_bridge_control_commands_total{sender,verb,subcommand} counter
  - /health: control_channel section (enabled, rooms_count, operators_count, behavior)
  - /bridge/mappings: control_rooms + control_operators_count
  - docker-compose: BRIDGE_OPERATOR_ALLOWLIST, BRIDGE_CONTROL_ROOMS, CONTROL_UNAUTHORIZED_BEHAVIOR

Tests: 40 new → 148 total pass
Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 01:50:04 -08:00
parent d40b1e87c6
commit fe6e3d30ae
6 changed files with 945 additions and 5 deletions

View File

@@ -1,5 +1,5 @@
"""
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2: mixed rooms + guard rails)
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0)
"""
import os
from dataclasses import dataclass, field
@@ -46,6 +46,14 @@ class BridgeConfig:
unknown_agent_behavior: str # "ignore" | "reply_error"
mixed_concurrency_cap: int # max parallel invokes per (room, agent); 0 = unlimited
# M3.0: Operator control channel
# "@ivan:daarion.space,@sergiy:daarion.space"
bridge_operator_allowlist: str
# "!opsroom:server,!opsroom2:server2"
bridge_control_rooms: str
# "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt)
control_unauthorized_behavior: str
# Service identity
node_id: str
build_sha: str
@@ -88,6 +96,9 @@ def load_config() -> BridgeConfig:
max_slash_len=max(4, int(_optional("MAX_SLASH_LEN", "32"))),
unknown_agent_behavior=_optional("UNKNOWN_AGENT_BEHAVIOR", "ignore"),
mixed_concurrency_cap=max(0, int(_optional("MIXED_CONCURRENCY_CAP", "1"))),
bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""),
bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""),
control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"),
node_id=_optional("NODE_ID", "NODA1"),
build_sha=_optional("BUILD_SHA", "dev"),
build_time=_optional("BUILD_TIME", "local"),

View File

@@ -0,0 +1,263 @@
"""
Matrix Bridge — Control Command Layer (M3.0)
Handles operator commands from designated control rooms.
Access policy (AND):
1. Message arrives in a BRIDGE_CONTROL_ROOM
2. Sender is in BRIDGE_OPERATOR_ALLOWLIST
3. Message starts with "!" prefix (e.g. "!runbook start ...")
Design principles:
- Bridge is a TRANSPORT only — it never executes scripts directly.
- All actions go via sofiia-console internal API (M3.1+).
- Every command attempt is audited regardless of authorization.
- Unknown commands acknowledged but not executed (forward-compatible).
Audit events emitted:
matrix.control.command — authorised command parsed (emitted for known and unknown verbs)
matrix.control.unauthorized — command from non-operator or wrong room
matrix.control.unknown_cmd — authorised but unrecognised verb
"""
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, FrozenSet, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ── Constants ─────────────────────────────────────────────────────────────────
# Supported control verbs (M3.1+ will implement them fully)
VERB_RUNBOOK = "runbook"
VERB_STATUS = "status"
VERB_HELP = "help"
KNOWN_VERBS: FrozenSet[str] = frozenset({VERB_RUNBOOK, VERB_STATUS, VERB_HELP})

# Max command line length to guard against garbage injection
_MAX_CMD_LEN = 512
# Max number of tokens in a single command
_MAX_CMD_TOKENS = 20

# Matrix user ID format: @localpart:server[:port]
# The optional ":port" suffix is accepted because Matrix server names may
# carry an explicit port (e.g. "@ops:localhost:8448"); without it, fail-fast
# config validation would reject valid IDs on non-default-port homeservers.
_MATRIX_USER_RE = re.compile(
    r"^@[A-Za-z0-9._\-/=+]+:[A-Za-z0-9.\-]+(?::[0-9]{1,5})?$"
)
# Room ID format: !localpart:server[:port]
_ROOM_ID_RE = re.compile(
    r"^![A-Za-z0-9\-_.]+:[A-Za-z0-9\-_.]+(?::[0-9]{1,5})?$"
)
# ── Data structures ────────────────────────────────────────────────────────────
@dataclass(frozen=True)
class ControlCommand:
    """Parsed control command from a Matrix message.

    The dataclass is frozen, so fields cannot be reassigned after
    construction (the ``kwargs`` dict object itself remains mutable).
    """

    verb: str               # lower-cased first token, e.g. "runbook"; "" if no tokens
    subcommand: str         # lower-cased second token, e.g. "start"; "" if absent
    args: Tuple[str, ...]   # remaining positional args, in original order
    kwargs: Dict[str, str]  # key=value pairs parsed from args (e.g. node=NODA1)
    raw: str                # original message text
    is_known: bool          # True if verb in KNOWN_VERBS

    @classmethod
    def from_tokens(cls, tokens: List[str], raw: str) -> "ControlCommand":
        """Build a ControlCommand from pre-split tokens.

        Args:
            tokens: Whitespace-split command words; the first token must not
                include the leading '!'.
            raw: Original message text, stored unchanged on the result.

        Returns:
            A ControlCommand whose verb/subcommand are lower-cased. Tokens
            after the first two are split into ``kwargs`` (``key=value``,
            key lower-cased, split at the first '=') and positional ``args``.
        """
        verb = tokens[0].lower() if tokens else ""
        subcommand = tokens[1].lower() if len(tokens) > 1 else ""

        positional: List[str] = []
        kw: Dict[str, str] = {}
        for token in tokens[2:]:
            key, sep, value = token.partition("=")
            key = key.lower().strip()
            if sep and key:
                kw[key] = value.strip()
            else:
                # No '=' at all, or nothing before it (e.g. "=value"): keep
                # the token verbatim as a positional arg instead of storing
                # it under an empty kwargs key.
                positional.append(token)

        return cls(
            verb=verb,
            subcommand=subcommand,
            args=tuple(positional),
            kwargs=kw,
            raw=raw,
            is_known=verb in KNOWN_VERBS,
        )
@dataclass
class ControlConfig:
    """
    Operator access policy for the control channel, in parsed form.

    operator_allowlist: Matrix user IDs that may issue control commands.
    control_rooms:      Room IDs that act as control channels.

    Both default to an empty frozenset.
    """

    operator_allowlist: FrozenSet[str] = field(default_factory=frozenset)
    control_rooms: FrozenSet[str] = field(default_factory=frozenset)

    @property
    def is_enabled(self) -> bool:
        """True only when there is at least one operator AND one control room."""
        if not self.operator_allowlist:
            return False
        return bool(self.control_rooms)
# ── Parsers ────────────────────────────────────────────────────────────────────
def _collect_valid_ids(
    raw: str,
    pattern: "re.Pattern[str]",
    error_label: str,
    errors: List[str],
) -> List[str]:
    """Split a comma-separated ID list, returning entries that match *pattern*.

    Blank entries are skipped; each non-matching entry appends
    ``"<error_label>: <repr>"`` to *errors* instead of being returned.
    """
    valid: List[str] = []
    for entry in raw.split(","):
        candidate = entry.strip()
        if not candidate:
            continue
        if pattern.match(candidate):
            valid.append(candidate)
        else:
            errors.append(f"{error_label}: {candidate!r}")
    return valid


def parse_control_config(
    raw_allowlist: str,
    raw_control_rooms: str,
) -> ControlConfig:
    """
    Parse BRIDGE_OPERATOR_ALLOWLIST and BRIDGE_CONTROL_ROOMS.

    Allowlist format:  "@ivan:daarion.space,@sergiy:daarion.space"
    Control rooms fmt: "!opsroom:server,!opsroom2:server2"

    Blank entries (e.g. from a trailing comma) are ignored and duplicates
    collapse because the result stores frozensets.

    Returns:
        ControlConfig with the validated operators and rooms.

    Raises:
        ValueError: if any entry is a malformed Matrix user ID or room ID.
            All offending entries (operators first, then rooms) are listed
            in a single message.
    """
    errors: List[str] = []
    operators = _collect_valid_ids(
        raw_allowlist, _MATRIX_USER_RE, "Invalid operator user_id", errors
    )
    rooms = _collect_valid_ids(
        raw_control_rooms, _ROOM_ID_RE, "Invalid control room_id", errors
    )

    if errors:
        raise ValueError(f"Control config parse errors: {'; '.join(errors)}")

    cfg = ControlConfig(
        operator_allowlist=frozenset(operators),
        control_rooms=frozenset(rooms),
    )

    if cfg.is_enabled:
        # Log the de-duplicated set sizes so the numbers match what the
        # config actually holds (the raw lists may contain repeats).
        logger.info(
            "Control channel enabled: %d operators, %d rooms",
            len(cfg.operator_allowlist), len(cfg.control_rooms),
        )
    else:
        logger.info("Control channel disabled (empty allowlist or no control rooms)")

    return cfg
# ── Message inspection ────────────────────────────────────────────────────────
def is_control_message(text: str) -> bool:
    """Tell whether *text* looks like a control command.

    A message qualifies when, ignoring surrounding whitespace, it begins
    with '!'. Empty or missing text never qualifies.
    """
    if not text:
        return False
    return text.lstrip().startswith("!")
def is_control_room(room_id: str, config: ControlConfig) -> bool:
    """Return True when *room_id* is one of the configured control rooms."""
    designated_rooms = config.control_rooms
    return room_id in designated_rooms
def is_operator(sender: str, config: ControlConfig) -> bool:
    """Return True when *sender* appears in the operator allowlist."""
    allowed_senders = config.operator_allowlist
    return sender in allowed_senders
def parse_command(text: str) -> Optional[ControlCommand]:
    """
    Turn a control message into a ControlCommand.

    Returns None when the text (after trimming whitespace) does not start
    with '!', exceeds _MAX_CMD_LEN characters, has nothing after the '!',
    or splits into more than _MAX_CMD_TOKENS tokens. The two limit
    violations are also logged as warnings.
    """
    message = text.strip()
    if message[:1] != "!":
        return None

    length = len(message)
    if length > _MAX_CMD_LEN:
        logger.warning("Control command too long (%d chars) — rejected", length)
        return None

    # Everything after the leading '!' is split on whitespace.
    words = message[1:].split()
    word_count = len(words)
    if word_count == 0:
        return None
    if word_count > _MAX_CMD_TOKENS:
        logger.warning("Control command has too many tokens (%d) — rejected", word_count)
        return None

    return ControlCommand.from_tokens(words, raw=message)
# ── Authorization check ───────────────────────────────────────────────────────
def check_authorization(
    sender: str,
    room_id: str,
    config: ControlConfig,
) -> Tuple[bool, str]:
    """
    Decide whether *sender* may issue control commands in *room_id*.

    Returns a ``(authorized, reason)`` pair where reason is one of:
      - "not_control_room": room not in control_rooms (checked first)
      - "not_operator":     sender not in allowlist (also logged as a warning)
      - "ok":               authorized
    """
    in_control_room = is_control_room(room_id, config)
    if not in_control_room:
        return False, "not_control_room"

    if is_operator(sender, config):
        return True, "ok"

    logger.warning(
        "Unauthorized control attempt: sender=%s room=%s not in allowlist",
        sender, room_id,
    )
    return False, "not_operator"
# ── Reply helpers ─────────────────────────────────────────────────────────────
def not_implemented_reply(cmd: ControlCommand) -> str:
    """Reply for known commands not yet implemented (M3.0 stub).

    Echoes the raw command, then the normalised ``!verb subcommand`` form.
    The subcommand is omitted when empty, so a bare ``!status`` is not
    rendered with a trailing space inside the code span.
    """
    command = " ".join(part for part in (cmd.verb, cmd.subcommand) if part)
    return (
        f"✅ Command acknowledged: `{cmd.raw}`\n"
        f"⏳ `!{command}` — implementation pending (M3.1+)."
    )
def unknown_command_reply(cmd: ControlCommand) -> str:
    """Build the reply sent when the verb is not one of KNOWN_VERBS."""
    verb_list = ", ".join(sorted(KNOWN_VERBS))
    reply_lines = [
        f"⚠️ Unknown command: `{cmd.raw}`",
        f"Known verbs: {verb_list}.",
        "Type `!help` for usage.",
    ]
    return "\n".join(reply_lines)
def unauthorized_reply(reason: str) -> str:
    """Build the rejection text for an unauthorized command attempt.

    "not_operator" yields the allowlist message; any other reason yields
    the "not a control channel" message.
    """
    if reason != "not_operator":
        return "⛔ Not authorised: this room is not a control channel."
    return "⛔ Not authorised: your Matrix ID is not in the operator allowlist."
def help_reply() -> str:
    """Return the short usage summary listing every control command."""
    usage_lines = (
        "`!runbook start <path> [node=NODA1]` — Start a runbook run",
        "`!runbook next <run_id>` — Advance to next step",
        "`!runbook complete <run_id> step=<n> status=ok` — Mark step complete",
        "`!runbook evidence <run_id>` — Get evidence artifact path",
        "`!runbook status <run_id>` — Show current run state",
        "`!status` — Bridge health summary",
        "`!help` — This message",
    )
    sections = (
        "**DAGI Bridge — Control Commands**",
        "\n".join(usage_lines),
        "_Only authorised operators can issue control commands._",
    )
    # Sections are separated by a blank line.
    return "\n\n".join(sections)

View File

@@ -1,5 +1,5 @@
"""
Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 (mixed rooms hardening)
Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 (control channel)
Architecture (H2):
Reader task → asyncio.Queue(maxsize) → N Worker tasks
@@ -33,6 +33,12 @@ from typing import Any, Callable, Dict, List, Optional
import httpx
from .control import (
ControlConfig, ControlCommand,
check_authorization, parse_command, is_control_message,
not_implemented_reply, unknown_command_reply, unauthorized_reply, help_reply,
VERB_HELP,
)
from .matrix_client import MatrixClient
from .mixed_routing import (
MixedRoomConfig, route_message, reply_prefix,
@@ -178,6 +184,9 @@ class MatrixIngressLoop:
unknown_agent_behavior: str = "ignore", # "ignore" | "reply_error"
max_slash_len: int = 32,
mixed_concurrency_cap: int = 1, # 0 = unlimited
# M3.0: control channel
control_config: Optional[ControlConfig] = None,
control_unauthorized_behavior: str = "ignore", # "ignore" | "reply_error"
# Callbacks
on_message_received: Optional[Callable[[str, str], None]] = None,
on_message_replied: Optional[Callable[[str, str, str], None]] = None,
@@ -190,6 +199,7 @@ class MatrixIngressLoop:
on_queue_wait: Optional[Callable[[str, float], None]] = None,
on_routed: Optional[Callable[[str, str], None]] = None,
on_route_rejected: Optional[Callable[[str, str], None]] = None,
on_control_command: Optional[Callable[[str, str, str], None]] = None,
) -> None:
self._hs_url = matrix_homeserver_url
self._token = matrix_access_token
@@ -214,11 +224,14 @@ class MatrixIngressLoop:
self._on_send_latency = on_send_latency
self._on_queue_wait = on_queue_wait
self._mixed_room_config = mixed_room_config
self._control_config = control_config
self._control_unauthorized_behavior = control_unauthorized_behavior
self._unknown_agent_behavior = unknown_agent_behavior
self._max_slash_len = max_slash_len
self._mixed_concurrency_cap = mixed_concurrency_cap
self._on_routed = on_routed
self._on_route_rejected = on_route_rejected
self._on_control_command = on_control_command
# Lazily populated semaphores keyed by "{room_id}:{agent_id}"
self._concurrency_locks: Dict[str, asyncio.Semaphore] = {}
self._next_batch: Optional[str] = None
@@ -281,6 +294,17 @@ class MatrixIngressLoop:
await client.join_room(room_id)
except Exception as exc:
logger.warning("Could not join mixed room %s: %s", room_id, exc)
if self._control_config and self._control_config.is_enabled:
for room_id in self._control_config.control_rooms:
try:
await client.join_room(room_id)
except Exception as exc:
logger.warning("Could not join control room %s: %s", room_id, exc)
logger.info(
"Control channel: %d rooms, %d operators",
len(self._control_config.control_rooms),
len(self._control_config.operator_allowlist),
)
async with httpx.AsyncClient() as http_client:
# Start workers
@@ -355,6 +379,13 @@ class MatrixIngressLoop:
http_client: httpx.AsyncClient,
sync_resp: Dict[str, Any],
) -> None:
# M3.0: Control rooms — handled first, not forwarded to agents
if self._control_config and self._control_config.is_enabled:
for room_id in self._control_config.control_rooms:
messages = client.extract_room_messages(sync_resp, room_id)
for event in messages:
await self._try_control(client, http_client, event, room_id)
# Regular rooms: 1 room → 1 agent (M1 / M2.0)
for mapping in self._room_map.mappings:
if mapping.agent_id not in self._room_map.allowed_agents:
@@ -559,6 +590,110 @@ class MatrixIngressLoop:
data={"queue_max": self._queue_max, "sender": sender},
)
# ── Control command handler ────────────────────────────────────────────────
async def _try_control(
    self,
    client: MatrixClient,
    http_client: httpx.AsyncClient,
    event: Dict[str, Any],
    room_id: str,
) -> None:
    """
    Process a message from a control room.

    Non-command messages (missing/non-text body, or not starting with '!')
    are silently ignored. Unauthorized attempts and every successfully
    parsed authorized command are audited; an authorized message that
    fails to parse (e.g. too long) is only logged.

    Args:
        client: Matrix client used to mark the event seen and send replies.
        http_client: HTTP client passed through to the audit writer.
        event: Raw Matrix event dict (reads event_id, sender, content.body).
        room_id: Room the event was extracted from.
    """
    control_config = self._control_config
    if control_config is None:
        # Explicit guard rather than `assert`, which is stripped under -O.
        return

    event_id = event.get("event_id", "")
    sender = event.get("sender", "")

    # Event content is remote-controlled JSON: tolerate a missing/non-dict
    # "content" or a non-string "body" instead of raising on .strip().
    content = event.get("content")
    body = content.get("body") if isinstance(content, dict) else None
    if not isinstance(body, str):
        return
    text = body.strip()
    if not text or not is_control_message(text):
        return  # not a command, ignore

    client.mark_seen(event_id)

    # Authorization check
    authorized, rejection_reason = check_authorization(sender, room_id, control_config)

    if not authorized:
        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.unauthorized",
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="error", error_code=rejection_reason,
            data={"sender": sender, "command_preview": text[:80]},
        )
        logger.warning(
            "Unauthorized control command: sender=%s room=%s reason=%s cmd=%r",
            sender, room_id, rejection_reason, text[:60],
        )
        if self._control_unauthorized_behavior == "reply_error":
            # Best-effort notification; a send failure must not abort sync handling.
            try:
                txn_id = MatrixClient.make_txn_id(room_id, event_id + "_unauth")
                await client.send_text(room_id, unauthorized_reply(rejection_reason), txn_id)
            except Exception as exc:
                logger.warning("Could not send unauthorized reply: %s", exc)
        return

    # Parse command
    cmd = parse_command(text)
    if cmd is None:
        logger.warning("Control message from %s could not be parsed: %r", sender, text[:60])
        return

    # Metric callback
    if self._on_control_command:
        self._on_control_command(sender, cmd.verb, cmd.subcommand)

    # Audit every authorized command (known or not; see "is_known")
    await _write_audit(
        http_client, self._console_url, self._internal_token,
        event="matrix.control.command",
        agent_id="control", node_id=self._node_id,
        room_id=room_id, event_id=event_id,
        status="ok",
        data={
            "sender": sender,
            "verb": cmd.verb,
            "subcommand": cmd.subcommand,
            "args": list(cmd.args),
            "kwargs": dict(cmd.kwargs),
            "is_known": cmd.is_known,
        },
    )

    logger.info(
        "Control command: sender=%s verb=%s sub=%s args=%s",
        sender, cmd.verb, cmd.subcommand, cmd.args,
    )

    # Build reply
    txn_id = MatrixClient.make_txn_id(room_id, event_id + "_ctrl")

    if cmd.verb == VERB_HELP:
        reply_text = help_reply()
    elif not cmd.is_known:
        reply_text = unknown_command_reply(cmd)
        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.unknown_cmd",
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="error", error_code="unknown_verb",
            data={"verb": cmd.verb, "sender": sender},
        )
    else:
        # M3.1+ will implement actual runbook/status commands
        reply_text = not_implemented_reply(cmd)

    try:
        await client.send_text(room_id, reply_text, txn_id)
    except Exception as exc:
        logger.error("Could not send control reply: %s", exc)
# ── Worker ─────────────────────────────────────────────────────────────────
async def _worker(

View File

@@ -32,6 +32,7 @@ except ImportError: # pragma: no cover
_PROM_OK = False
from .config import BridgeConfig, load_config
from .control import ControlConfig, parse_control_config
from .ingress import MatrixIngressLoop
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
from .rate_limit import InMemoryRateLimiter
@@ -120,6 +121,12 @@ if _PROM_OK:
"matrix_bridge_active_room_agent_locks",
"Number of room-agent pairs currently holding a concurrency lock",
)
# M3.0: Control channel
_control_commands_total = Counter(
"matrix_bridge_control_commands_total",
"Total control commands received from authorized operators",
["sender", "verb", "subcommand"],
)
# ── Startup state ─────────────────────────────────────────────────────────────
_START_TIME = time.monotonic()
@@ -129,6 +136,7 @@ _matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None
_room_map: Optional[RoomMappingConfig] = None
_mixed_room_config: Optional[MixedRoomConfig] = None
_control_config: Optional[ControlConfig] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size
_ingress_task: Optional[asyncio.Task] = None
@@ -150,7 +158,7 @@ async def _probe_url(url: str, timeout: float = 5.0) -> bool:
@asynccontextmanager
async def lifespan(app_: Any):
global _cfg, _config_error, _matrix_reachable, _gateway_reachable
global _room_map, _mixed_room_config, _rate_limiter, _ingress_loop
global _room_map, _mixed_room_config, _control_config, _rate_limiter, _ingress_loop
try:
_cfg = load_config()
@@ -186,13 +194,24 @@ async def lifespan(app_: Any):
_cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
)
# M3.0: Operator control channel
if _cfg.bridge_operator_allowlist or _cfg.bridge_control_rooms:
_control_config = parse_control_config(
_cfg.bridge_operator_allowlist,
_cfg.bridge_control_rooms,
)
else:
_control_config = None
mixed_count = _mixed_room_config.total_rooms if _mixed_room_config else 0
ctrl_rooms = len(_control_config.control_rooms) if _control_config else 0
ctrl_ops = len(_control_config.operator_allowlist) if _control_config else 0
logger.info(
"✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
"agents=%s mappings=%d mixed_rooms=%d",
"agents=%s mappings=%d mixed_rooms=%d ctrl_rooms=%d ctrl_operators=%d",
_cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url,
list(_cfg.bridge_allowed_agents),
_room_map.total_mappings, mixed_count,
_room_map.total_mappings, mixed_count, ctrl_rooms, ctrl_ops,
)
# Connectivity smoke probes (non-blocking failures)
@@ -274,6 +293,13 @@ async def lifespan(app_: Any):
if _PROM_OK:
_route_rejected_total.labels(room_id=room_id, reason=reason).inc()
# M3.0 callbacks
def _on_control_command(sender: str, verb: str, subcommand: str) -> None:
    """Count one control command, labelled by sender, verb and subcommand.

    Does nothing when _PROM_OK is false.
    """
    if not _PROM_OK:
        return
    labelled_counter = _control_commands_total.labels(
        sender=sender, verb=verb, subcommand=subcommand
    )
    labelled_counter.inc()
ingress = MatrixIngressLoop(
matrix_homeserver_url=_cfg.matrix_homeserver_url,
matrix_access_token=_cfg.matrix_access_token,
@@ -302,6 +328,9 @@ async def lifespan(app_: Any):
on_queue_wait=_on_queue_wait,
on_routed=_on_routed,
on_route_rejected=_on_route_rejected,
control_config=_control_config,
control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
on_control_command=_on_control_command,
)
logger.info(
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
@@ -400,6 +429,12 @@ async def health() -> Dict[str, Any]:
"concurrency_cap": _cfg.mixed_concurrency_cap,
"active_room_agent_locks": _ingress_loop.active_lock_count if _ingress_loop else 0,
},
"control_channel": {
"enabled": _control_config.is_enabled if _control_config else False,
"control_rooms_count": len(_control_config.control_rooms) if _control_config else 0,
"operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
"unauthorized_behavior": _cfg.control_unauthorized_behavior,
},
}
@@ -424,6 +459,8 @@ async def bridge_mappings() -> Dict[str, Any]:
"mappings": _room_map.as_summary(),
"mixed_rooms_total": _mixed_room_config.total_rooms if _mixed_room_config else 0,
"mixed_rooms": _mixed_room_config.as_summary() if _mixed_room_config else [],
"control_rooms": sorted(_control_config.control_rooms) if _control_config else [],
"control_operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
}