feat(matrix-bridge-dagi): harden mixed rooms with safe defaults and ops visibility (M2.2)

Guard rails (mixed_routing.py):
  - MAX_AGENTS_PER_MIXED_ROOM (default 5): fail-fast at parse time
  - MAX_SLASH_LEN (default 32): reject garbage/injection slash tokens
  - Unified rejection reasons: unknown_agent, slash_too_long, no_mapping
  - REASON_REJECTED_* constants (separate from success REASON_*)

Ingress (ingress.py):
  - per-room-agent concurrency semaphore (MIXED_CONCURRENCY_CAP, default 1)
  - active_lock_count property for /health + prometheus
  - UNKNOWN_AGENT_BEHAVIOR: "ignore" (silent) | "reply_error" (inform user)
  - on_routed(agent_id, reason) callback for routing metrics
  - on_route_rejected(room_id, reason) callback for rejection metrics
  - matrix.route.rejected audit event on every rejection

Config + main:
  - max_agents_per_mixed_room, max_slash_len, unknown_agent_behavior, mixed_concurrency_cap
  - matrix_bridge_routed_total{agent_id, reason} counter
  - matrix_bridge_route_rejected_total{room_id, reason} counter
  - matrix_bridge_active_room_agent_locks gauge
  - /health: mixed_guard_rails section + total_agents_in_mixed_rooms
  - docker-compose: all 4 new guard rail env vars

Runbook: section 9 — mixed room debug guide (6 acceptance tests, routing metrics, session isolation, lock hang, config guard)

Tests: 108 pass (94 → 108, +14 new tests for guard rails + callbacks + concurrency)
Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 01:41:20 -08:00
parent a85a11984b
commit d40b1e87c6
8 changed files with 576 additions and 21 deletions

View File

@@ -1,5 +1,5 @@
"""
matrix-bridge-dagi — configuration and validation (M2.1: mixed rooms)
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2: mixed rooms + guard rails)
"""
import os
from dataclasses import dataclass, field
@@ -40,6 +40,12 @@ class BridgeConfig:
# "!roomX:server=helion" — explicit default per mixed room (optional)
bridge_mixed_defaults: str
# M2.2: Mixed room guard rails
max_agents_per_mixed_room: int # fail-fast if room defines more agents than this
max_slash_len: int # reject slash token longer than this (anti-garbage)
unknown_agent_behavior: str # "ignore" | "reply_error"
mixed_concurrency_cap: int # max parallel invokes per (room, agent); 0 = unlimited
# Service identity
node_id: str
build_sha: str
@@ -78,6 +84,10 @@ def load_config() -> BridgeConfig:
queue_drain_timeout_s=max(1.0, float(_optional("QUEUE_DRAIN_TIMEOUT_S", "5"))),
bridge_mixed_room_map=_optional("BRIDGE_MIXED_ROOM_MAP", ""),
bridge_mixed_defaults=_optional("BRIDGE_MIXED_DEFAULTS", ""),
max_agents_per_mixed_room=max(1, int(_optional("MAX_AGENTS_PER_MIXED_ROOM", "5"))),
max_slash_len=max(4, int(_optional("MAX_SLASH_LEN", "32"))),
unknown_agent_behavior=_optional("UNKNOWN_AGENT_BEHAVIOR", "ignore"),
mixed_concurrency_cap=max(0, int(_optional("MIXED_CONCURRENCY_CAP", "1"))),
node_id=_optional("NODE_ID", "NODA1"),
build_sha=_optional("BUILD_SHA", "dev"),
build_time=_optional("BUILD_TIME", "local"),

View File

@@ -1,5 +1,5 @@
"""
Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 (mixed rooms)
Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 (mixed rooms hardening)
Architecture (H2):
Reader task → asyncio.Queue(maxsize) → N Worker tasks
@@ -34,7 +34,10 @@ from typing import Any, Callable, Dict, List, Optional
import httpx
from .matrix_client import MatrixClient
from .mixed_routing import MixedRoomConfig, route_message, reply_prefix
from .mixed_routing import (
MixedRoomConfig, route_message, reply_prefix,
REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING,
)
from .rate_limit import InMemoryRateLimiter
from .room_mapping import RoomMappingConfig, RoomMapping
@@ -152,6 +155,8 @@ class MatrixIngressLoop:
on_invoke_latency(agent_id, duration_seconds)
on_send_latency(agent_id, duration_seconds)
on_queue_wait(agent_id, wait_seconds)
on_routed(agent_id, reason) M2.2: successful routing
on_route_rejected(room_id, reason) M2.2: routing rejection
"""
def __init__(
@@ -169,6 +174,10 @@ class MatrixIngressLoop:
worker_concurrency: int = 2,
queue_drain_timeout_s: float = 5.0,
mixed_room_config: Optional[MixedRoomConfig] = None,
# M2.2: guard rails
unknown_agent_behavior: str = "ignore", # "ignore" | "reply_error"
max_slash_len: int = 32,
mixed_concurrency_cap: int = 1, # 0 = unlimited
# Callbacks
on_message_received: Optional[Callable[[str, str], None]] = None,
on_message_replied: Optional[Callable[[str, str, str], None]] = None,
@@ -179,6 +188,8 @@ class MatrixIngressLoop:
on_invoke_latency: Optional[Callable[[str, float], None]] = None,
on_send_latency: Optional[Callable[[str, float], None]] = None,
on_queue_wait: Optional[Callable[[str, float], None]] = None,
on_routed: Optional[Callable[[str, str], None]] = None,
on_route_rejected: Optional[Callable[[str, str], None]] = None,
) -> None:
self._hs_url = matrix_homeserver_url
self._token = matrix_access_token
@@ -203,6 +214,13 @@ class MatrixIngressLoop:
self._on_send_latency = on_send_latency
self._on_queue_wait = on_queue_wait
self._mixed_room_config = mixed_room_config
self._unknown_agent_behavior = unknown_agent_behavior
self._max_slash_len = max_slash_len
self._mixed_concurrency_cap = mixed_concurrency_cap
self._on_routed = on_routed
self._on_route_rejected = on_route_rejected
# Lazily populated semaphores keyed by "{room_id}:{agent_id}"
self._concurrency_locks: Dict[str, asyncio.Semaphore] = {}
self._next_batch: Optional[str] = None
self._queue: Optional[asyncio.Queue] = None # exposed for /health
@@ -218,6 +236,19 @@ class MatrixIngressLoop:
def worker_count(self) -> int:
return self._worker_count
@property
def active_lock_count(self) -> int:
    """Count the (room, agent) semaphores that currently report ``locked()``.

    An ``asyncio.Semaphore`` reports ``locked()`` only when it cannot be
    acquired immediately, so an entry is counted once all of its permits
    are taken; with a concurrency cap above 1, a pair that still has spare
    permits is not counted.
    """
    # bool is an int subclass, so summing the flags yields the count directly.
    return sum(semaphore.locked() for semaphore in self._concurrency_locks.values())
def _get_concurrency_lock(self, room_id: str, agent_id: str) -> asyncio.Semaphore:
    """Return the semaphore guarding one (room, agent) pair, creating it on first use.

    Semaphores are cached in ``self._concurrency_locks`` under the key
    ``"{room_id}:{agent_id}"``, so repeated calls for the same pair return
    the same object.
    """
    pair_key = f"{room_id}:{agent_id}"
    try:
        return self._concurrency_locks[pair_key]
    except KeyError:
        pass

    permits = self._mixed_concurrency_cap
    if permits <= 0:
        # A non-positive cap means "unlimited": use a permit count that is
        # effectively never exhausted.
        permits = 2 ** 31
    semaphore = asyncio.Semaphore(permits)
    self._concurrency_locks[pair_key] = semaphore
    return semaphore
# ── Public run ─────────────────────────────────────────────────────────────
async def run(self, stop_event: asyncio.Event) -> None:
@@ -428,23 +459,46 @@ class MatrixIngressLoop:
# Route message to determine target agent
agent_id, routing_reason, effective_text = route_message(
text, room_id, self._mixed_room_config, self._room_map.allowed_agents,
max_slash_len=self._max_slash_len,
)
if agent_id is None:
# M2.2: routing rejected — audit + metric + optional error reply
logger.warning(
"Mixed room %s: unresolvable routing reason=%s event=%s — skipping",
"Mixed room %s: routing rejected reason=%s event=%s",
room_id, routing_reason, event_id,
)
if self._on_route_rejected:
self._on_route_rejected(room_id, routing_reason)
await _write_audit(
http_client, self._console_url, self._internal_token,
event="matrix.error",
event="matrix.route.rejected",
agent_id="unknown", node_id=self._node_id,
room_id=room_id, event_id=event_id,
status="error", error_code="no_agent_for_message",
data={"routing_reason": routing_reason, "sender": sender},
status="error", error_code=routing_reason,
data={"routing_reason": routing_reason, "sender": sender, "text_len": len(text)},
)
# M2.2: optional user-facing error reply in room
if self._unknown_agent_behavior == "reply_error" and routing_reason == REASON_REJECTED_UNKNOWN_AGENT:
available = self._mixed_room_config.agents_for_room(room_id)
# Extract agent name from text (first slash token, if any)
slash_token = text.strip().split()[0].lstrip("/") if text.strip().startswith("/") else ""
label = f"`/{slash_token}`" if slash_token else "this command"
error_msg = (
f"⚠️ Unknown agent {label}. "
f"Available in this room: {', '.join(available)}"
)
txn_id = MatrixClient.make_txn_id(room_id, event_id + "_reject")
try:
await client.send_text(room_id, error_msg, txn_id)
except Exception as exc:
logger.warning("Could not send route-error reply: %s", exc)
return
# M2.2: successful route — fire metric callback
if self._on_routed:
self._on_routed(agent_id, routing_reason)
# H1: Rate limit (uses final agent_id for metric tagging)
if self._rate_limiter is not None:
allowed, limit_type = self._rate_limiter.check(room_id=room_id, sender=sender)
@@ -578,6 +632,35 @@ class MatrixIngressLoop:
else:
session_id = f"matrix:{room_key}"
# M2.2: per-room-agent concurrency cap (only for mixed rooms; single-agent rooms unaffected)
_lock = self._get_concurrency_lock(room_id, agent_id) if is_mixed and self._mixed_concurrency_cap > 0 else None
if _lock is not None:
await _lock.acquire()
try:
await self._invoke_and_send(
client, http_client, entry, session_id, wait_s, is_mixed, routing_reason,
)
finally:
if _lock is not None:
_lock.release()
async def _invoke_and_send(
self,
client: MatrixClient,
http_client: httpx.AsyncClient,
entry: _QueueEntry,
session_id: str,
wait_s: float,
is_mixed: bool,
routing_reason: str,
) -> None:
"""Inner: invoke Router + send reply (separated for concurrency lock wrapping)."""
event = entry.event
event_id = event.get("event_id", "")
text = event.get("content", {}).get("body", "").strip()
room_id = entry.room_id
agent_id = entry.agent_id
# H3: Invoke with latency
t0 = time.monotonic()
reply_text: Optional[str] = None

View File

@@ -105,6 +105,21 @@ if _PROM_OK:
["agent_id"],
buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
)
# M2.2: Mixed room routing metrics
# Incremented by the _on_routed callback, once per call, labelled with the
# agent_id and routing reason it receives.
_routed_total = Counter(
    "matrix_bridge_routed_total",
    "Successful message routing by reason",
    ["agent_id", "reason"],
)
# Incremented by the _on_route_rejected callback, labelled with the room and
# the rejection reason.
# NOTE(review): room_id is a label, so series count grows with the number of
# distinct rooms that see rejections — confirm that set is bounded.
_route_rejected_total = Counter(
    "matrix_bridge_route_rejected_total",
    "Messages rejected during routing (unknown agent, bad slash, etc.)",
    ["room_id", "reason"],
)
# Set from the ingress loop's active_lock_count inside the /health handler,
# so the exported value reflects the most recent /health request.
_active_room_agent_locks = Gauge(
    "matrix_bridge_active_room_agent_locks",
    "Number of room-agent pairs currently holding a concurrency lock",
)
# ── Startup state ─────────────────────────────────────────────────────────────
_START_TIME = time.monotonic()
@@ -151,6 +166,7 @@ async def lifespan(app_: Any):
_cfg.bridge_mixed_room_map,
_cfg.bridge_mixed_defaults,
_cfg.bridge_allowed_agents,
max_agents_per_room=_cfg.max_agents_per_mixed_room,
)
logger.info(
"✅ Mixed room config: %d rooms, agents=%s",
@@ -249,6 +265,15 @@ async def lifespan(app_: Any):
if _PROM_OK:
_queue_wait.labels(agent_id=agent_id).observe(wait_s)
# M2.2 callbacks
def _on_routed(agent_id: str, reason: str) -> None:
    """Count one successfully routed message in ``_routed_total``.

    Does nothing when ``_PROM_OK`` is falsy.
    """
    if not _PROM_OK:
        return
    series = _routed_total.labels(agent_id=agent_id, reason=reason)
    series.inc()
def _on_route_rejected(room_id: str, reason: str) -> None:
    """Count one rejected routing attempt in ``_route_rejected_total``.

    Does nothing when ``_PROM_OK`` is falsy.
    """
    if not _PROM_OK:
        return
    series = _route_rejected_total.labels(room_id=room_id, reason=reason)
    series.inc()
ingress = MatrixIngressLoop(
matrix_homeserver_url=_cfg.matrix_homeserver_url,
matrix_access_token=_cfg.matrix_access_token,
@@ -263,6 +288,9 @@ async def lifespan(app_: Any):
worker_concurrency=_cfg.worker_concurrency,
queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
mixed_room_config=_mixed_room_config,
unknown_agent_behavior=_cfg.unknown_agent_behavior,
max_slash_len=_cfg.max_slash_len,
mixed_concurrency_cap=_cfg.mixed_concurrency_cap,
on_message_received=_on_msg,
on_message_replied=_on_replied,
on_gateway_error=_on_gw_error,
@@ -272,6 +300,8 @@ async def lifespan(app_: Any):
on_invoke_latency=_on_invoke_latency,
on_send_latency=_on_send_latency,
on_queue_wait=_on_queue_wait,
on_routed=_on_routed,
on_route_rejected=_on_route_rejected,
)
logger.info(
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
@@ -321,6 +351,8 @@ app.add_middleware(
@app.get("/health")
async def health() -> Dict[str, Any]:
uptime = int(time.monotonic() - _START_TIME)
if _PROM_OK and _ingress_loop is not None:
_active_room_agent_locks.set(_ingress_loop.active_lock_count)
if _config_error or _cfg is None:
return {
"ok": False,
@@ -351,6 +383,9 @@ async def health() -> Dict[str, Any]:
"gateway_reachable": _gateway_reachable,
"mappings_count": _room_map.total_mappings if _room_map else 0,
"mixed_rooms_count": _mixed_room_config.total_rooms if _mixed_room_config else 0,
"total_agents_in_mixed_rooms": sum(
len(r.agents) for r in _mixed_room_config.rooms.values()
) if _mixed_room_config else 0,
"config_ok": True,
"rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
"queue": {
@@ -358,6 +393,13 @@ async def health() -> Dict[str, Any]:
"max": _cfg.queue_max_events,
"workers": _cfg.worker_concurrency,
},
"mixed_guard_rails": {
"max_agents_per_room": _cfg.max_agents_per_mixed_room,
"max_slash_len": _cfg.max_slash_len,
"unknown_agent_behavior": _cfg.unknown_agent_behavior,
"concurrency_cap": _cfg.mixed_concurrency_cap,
"active_room_agent_locks": _ingress_loop.active_lock_count if _ingress_loop else 0,
},
}

View File

@@ -1,5 +1,5 @@
"""
Mixed-Room Routing — Phase M2.1
Mixed-Room Routing — Phase M2.1 + M2.2 (guard rails + rejection audit)
Supports 1 room → N agents with deterministic message routing.
@@ -33,12 +33,21 @@ _SLASH_RE = re.compile(r"^/([A-Za-z0-9_\-]+)\s*(.*)", re.DOTALL)
_MENTION_AT_RE = re.compile(r"^@([A-Za-z0-9_\-]+)\s*(.*)", re.DOTALL)
_MENTION_COLON_RE = re.compile(r"^([A-Za-z0-9_\-]+):\s+(.*)", re.DOTALL)
# Routing reason labels
# Routing reason labels (successful routes)
REASON_SLASH = "slash_command"
REASON_AT_MENTION = "at_mention"
REASON_COLON_MENTION = "colon_mention"
REASON_DEFAULT = "default"
# Rejection reason labels (route_message returns agent_id=None + one of these)
REASON_REJECTED_UNKNOWN_AGENT = "unknown_agent"
REASON_REJECTED_SLASH_TOO_LONG = "slash_too_long"
REASON_REJECTED_NO_MAPPING = "no_mapping"
# Hard guards
_DEFAULT_MAX_AGENTS_PER_ROOM = 5
_DEFAULT_MAX_SLASH_LEN = 32
# ── Data structures ────────────────────────────────────────────────────────────
@@ -94,6 +103,7 @@ def parse_mixed_room_map(
raw_map: str,
raw_defaults: str,
allowed_agents: FrozenSet[str],
max_agents_per_room: int = _DEFAULT_MAX_AGENTS_PER_ROOM,
) -> MixedRoomConfig:
"""
Parse BRIDGE_MIXED_ROOM_MAP and BRIDGE_MIXED_DEFAULTS into MixedRoomConfig.
@@ -104,6 +114,7 @@ def parse_mixed_room_map(
Raises ValueError on:
- Malformed room_id
- Empty agent list
- Too many agents per room (> max_agents_per_room)
- Agent not in allowed_agents
- Duplicate room_id in map
"""
@@ -154,6 +165,13 @@ def parse_mixed_room_map(
errors.append(f"Empty agent list for room {room_id!r}")
continue
# M2.2 guard: fail-fast if too many agents per room
if len(agents) > max_agents_per_room:
errors.append(
f"Room {room_id!r} has {len(agents)} agents > MAX_AGENTS_PER_MIXED_ROOM={max_agents_per_room}"
)
continue
invalid = [a for a in agents if a not in allowed_agents]
if invalid:
errors.append(
@@ -189,6 +207,7 @@ def route_message(
room_id: str,
config: MixedRoomConfig,
allowed_agents: FrozenSet[str],
max_slash_len: int = _DEFAULT_MAX_SLASH_LEN,
) -> Tuple[Optional[str], str, str]:
"""
Determine which agent should handle this message.
@@ -196,8 +215,8 @@ def route_message(
Returns:
(agent_id, routing_reason, effective_text)
agent_id: matched agent or None if unresolvable
routing_reason: one of REASON_* constants
agent_id: matched agent or None if unresolvable / rejected
routing_reason: one of REASON_* or REASON_REJECTED_* constants
effective_text: text with routing prefix stripped (for cleaner invoke)
Priority:
@@ -205,10 +224,14 @@ def route_message(
2. @agentname ... (at-mention)
3. agentname: ... (colon-mention)
4. default agent for room (fallback)
Guard rails (M2.2):
- Slash command token longer than max_slash_len → REASON_REJECTED_SLASH_TOO_LONG
- Unknown agent in slash → REASON_REJECTED_UNKNOWN_AGENT (no fallthrough)
"""
room = config.rooms.get(room_id)
if room is None:
return None, "no_mapping", text
return None, REASON_REJECTED_NO_MAPPING, text
stripped = text.strip()
@@ -217,16 +240,25 @@ def route_message(
if m:
candidate = m.group(1).lower()
body = m.group(2).strip() or stripped # keep original if body empty
# M2.2: reject suspiciously long slash tokens (garbage / injection attempts)
if len(candidate) > max_slash_len:
logger.warning(
"Slash token too long (%d > %d) in room %s — rejected",
len(candidate), max_slash_len, room_id,
)
return None, REASON_REJECTED_SLASH_TOO_LONG, text
agent = _resolve_agent(candidate, room, allowed_agents)
if agent:
logger.debug("Slash route: /%s%s", candidate, agent)
return agent, REASON_SLASH, body
# Unknown agent → return None + log; do not fall through to default
# Unknown agent → hard reject, do NOT fall through to default
logger.warning(
"Slash command /%s in room %s: agent not recognised or not allowed",
candidate, room_id,
"Slash command /%s in room %s: agent not recognised or not allowed (available: %s)",
candidate, room_id, room.agents,
)
return None, f"unknown_slash_{candidate}", text
return None, REASON_REJECTED_UNKNOWN_AGENT, text
# 2. @mention: @sofiia hello
m = _MENTION_AT_RE.match(stripped)