feat(matrix-bridge-dagi): harden mixed rooms with safe defaults and ops visibility (M2.2)

Guard rails (mixed_routing.py):
  - MAX_AGENTS_PER_MIXED_ROOM (default 5): fail-fast at parse time
  - MAX_SLASH_LEN (default 32): reject garbage/injection slash tokens
  - Unified rejection reasons: unknown_agent, slash_too_long, no_mapping
  - REASON_REJECTED_* constants (separate from success REASON_*)

Ingress (ingress.py):
  - per-room-agent concurrency semaphore (MIXED_CONCURRENCY_CAP, default 1)
  - active_lock_count property for /health + prometheus
  - UNKNOWN_AGENT_BEHAVIOR: "ignore" (silent) | "reply_error" (inform user)
  - on_routed(agent_id, reason) callback for routing metrics
  - on_route_rejected(room_id, reason) callback for rejection metrics
  - matrix.route.rejected audit event on every rejection

Config + main:
  - max_agents_per_mixed_room, max_slash_len, unknown_agent_behavior, mixed_concurrency_cap
  - matrix_bridge_routed_total{agent_id, reason} counter
  - matrix_bridge_route_rejected_total{room_id, reason} counter
  - matrix_bridge_active_room_agent_locks gauge
  - /health: mixed_guard_rails section + total_agents_in_mixed_rooms
  - docker-compose: all 4 new guard rail env vars

Runbook: section 9 — mixed room debug guide (6 acceptance tests, routing metrics, session isolation, lock hang, config guard)

Tests: 108 pass (94 → 108, +14 new tests for guard rails + callbacks + concurrency)
Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 01:41:20 -08:00
parent a85a11984b
commit d40b1e87c6
8 changed files with 576 additions and 21 deletions

View File

@@ -308,6 +308,158 @@ def test_rate_limited_mixed_room_event_dropped():
assert len(dropped) == 2 # two dropped by rate limiter
def test_on_route_rejected_callback_fires():
"""on_route_rejected fires when /unknown slash is used in mixed room."""
ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
rejected_calls: List[tuple] = []
ingress._on_route_rejected = lambda room, reason: rejected_calls.append((room, reason))
client = _fake_client({ROOM_MIXED: [_make_event("/unknownbot hello", event_id="r1")]})
queue: asyncio.Queue = asyncio.Queue(maxsize=50)
ingress._queue = queue
async def _run():
with patch("app.ingress._write_audit", new=AsyncMock()):
await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
run(_run())
assert queue.qsize() == 0
assert len(rejected_calls) == 1
room, reason = rejected_calls[0]
assert room == ROOM_MIXED
assert "unknown" in reason
def test_on_routed_callback_fires_with_reason():
"""on_routed fires with correct agent_id and routing_reason on successful route."""
ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
routed_calls: List[tuple] = []
ingress._on_routed = lambda agent, reason: routed_calls.append((agent, reason))
client = _fake_client({ROOM_MIXED: [_make_event("/helion hello", event_id="rt1")]})
queue: asyncio.Queue = asyncio.Queue(maxsize=50)
ingress._queue = queue
async def _run():
with patch("app.ingress._write_audit", new=AsyncMock()):
await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
run(_run())
assert len(routed_calls) == 1
agent, reason = routed_calls[0]
assert agent == "helion"
assert reason == "slash_command"
def test_unknown_agent_reply_error_sends_message():
"""UNKNOWN_AGENT_BEHAVIOR=reply_error → error message sent to room."""
ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
ingress._unknown_agent_behavior = "reply_error"
sent_texts: List[str] = []
mock_client = _fake_client({ROOM_MIXED: [_make_event("/druid hello", event_id="ue1")]})
mock_client.send_text = AsyncMock(side_effect=lambda room, text, txn_id=None: sent_texts.append(text))
queue: asyncio.Queue = asyncio.Queue(maxsize=50)
ingress._queue = queue
async def _run():
with patch("app.ingress._write_audit", new=AsyncMock()):
await ingress._enqueue_from_sync(mock_client, queue, AsyncMock(), {})
run(_run())
assert queue.qsize() == 0 # not enqueued
assert len(sent_texts) == 1
assert "Unknown agent" in sent_texts[0]
assert "sofiia" in sent_texts[0] or "helion" in sent_texts[0]
def test_unknown_agent_ignore_sends_nothing():
"""UNKNOWN_AGENT_BEHAVIOR=ignore (default) → no reply sent."""
ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
ingress._unknown_agent_behavior = "ignore"
sent_texts: List[str] = []
mock_client = _fake_client({ROOM_MIXED: [_make_event("/druid hello", event_id="ui1")]})
mock_client.send_text = AsyncMock(side_effect=lambda room, text, txn_id=None: sent_texts.append(text))
queue: asyncio.Queue = asyncio.Queue(maxsize=50)
ingress._queue = queue
async def _run():
with patch("app.ingress._write_audit", new=AsyncMock()):
await ingress._enqueue_from_sync(mock_client, queue, AsyncMock(), {})
run(_run())
assert queue.qsize() == 0
assert len(sent_texts) == 0 # silent
def test_concurrency_cap_active_lock_count():
"""active_lock_count returns 1 while semaphore is held."""
ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
ingress._mixed_concurrency_cap = 1
async def _run():
lock = ingress._get_concurrency_lock(ROOM_MIXED, "sofiia")
assert ingress.active_lock_count == 0
await lock.acquire()
assert ingress.active_lock_count == 1
lock.release()
assert ingress.active_lock_count == 0
run(_run())
def test_slash_too_long_rejected_and_not_enqueued():
"""Slash token longer than max_slash_len → rejected, not enqueued."""
ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
ingress._max_slash_len = 4 # very short for test
client = _fake_client({ROOM_MIXED: [_make_event("/toolongtoken hello", event_id="tl1")]})
queue: asyncio.Queue = asyncio.Queue(maxsize=50)
ingress._queue = queue
rejected_calls: List[str] = []
ingress._on_route_rejected = lambda room, reason: rejected_calls.append(reason)
async def _run():
with patch("app.ingress._write_audit", new=AsyncMock()):
await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
run(_run())
assert queue.qsize() == 0
assert len(rejected_calls) == 1
assert rejected_calls[0] == "slash_too_long"
def test_route_rejected_audit_event_written():
"""On routing rejection, matrix.route.rejected audit event must be written."""
ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
audit_events: List[str] = []
async def fake_audit(*args, **kwargs):
audit_events.append(kwargs.get("event", ""))
client = _fake_client({ROOM_MIXED: [_make_event("/unknownbot test", event_id="ra1")]})
queue: asyncio.Queue = asyncio.Queue(maxsize=50)
ingress._queue = queue
async def _run():
with patch("app.ingress._write_audit", side_effect=fake_audit):
await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
run(_run())
assert "matrix.route.rejected" in audit_events
def test_direct_room_reply_has_no_prefix():
"""Reply in single-agent (direct) room must NOT have a prefix."""
ingress = _make_ingress(direct_raw=f"druid:{ROOM_DIRECT}", allowed=frozenset({"druid"}))

View File

@@ -26,6 +26,9 @@ from app.mixed_routing import ( # noqa: E402
REASON_AT_MENTION,
REASON_COLON_MENTION,
REASON_DEFAULT,
REASON_REJECTED_UNKNOWN_AGENT,
REASON_REJECTED_SLASH_TOO_LONG,
REASON_REJECTED_NO_MAPPING,
)
ROOM_X = "!roomX:daarion.space"
@@ -146,7 +149,7 @@ def test_slash_unknown_agent_returns_none():
cfg = _make_cfg()
agent, reason, _ = route_message("/druid hello", ROOM_X, cfg, frozenset({"sofiia", "helion"}))
assert agent is None
assert "unknown_slash_druid" in reason
assert reason == REASON_REJECTED_UNKNOWN_AGENT
# ── Routing — @mention ────────────────────────────────────────────────────────
@@ -225,3 +228,70 @@ def test_reply_prefix_single_room_empty():
def test_reply_prefix_capitalises_first_letter():
assert reply_prefix("druid", is_mixed=True) == "Druid: "
assert reply_prefix("NUTRA", is_mixed=True) == "Nutra: " # capitalize() normalises case
# ── M2.2: Guard rails ─────────────────────────────────────────────────────────
def test_max_agents_per_room_raises():
"""More agents than max → ValueError at parse time."""
raw = f"{ROOM_X}=sofiia,helion,druid,nutra,alateya,yaromir" # 6 agents
allowed_6 = frozenset({"sofiia", "helion", "druid", "nutra", "alateya", "yaromir"})
with pytest.raises(ValueError, match="MAX_AGENTS_PER_MIXED_ROOM"):
parse_mixed_room_map(raw, "", allowed_6, max_agents_per_room=5)
def test_max_agents_per_room_exactly_at_limit_ok():
"""Exactly at limit should succeed."""
raw = f"{ROOM_X}=sofiia,helion,druid,nutra,alateya" # 5 = default limit
allowed_5 = frozenset({"sofiia", "helion", "druid", "nutra", "alateya"})
cfg = parse_mixed_room_map(raw, "", allowed_5, max_agents_per_room=5)
assert len(cfg.agents_for_room(ROOM_X)) == 5
def test_slash_too_long_returns_rejected_reason():
"""Slash command token longer than max_slash_len → rejection, no fallthrough."""
cfg = _make_cfg()
long_token = "a" * 33 # > default 32
agent, reason, _ = route_message(
f"/{long_token} hello", ROOM_X, cfg, frozenset({"sofiia", "helion"}),
max_slash_len=32,
)
assert agent is None
assert reason == REASON_REJECTED_SLASH_TOO_LONG
def test_slash_exactly_at_max_len_ok():
"""Slash token exactly at max_slash_len should NOT be rejected."""
allowed = frozenset({"sofiia", "helion"})
raw = f"{ROOM_X}=sofiia,helion"
# Create a 10-char agent name (within limit) — use a mock allowed set
cfg = parse_mixed_room_map(raw, "", allowed, max_agents_per_room=5)
agent, reason, _ = route_message("/sofiia hi", ROOM_X, cfg, allowed, max_slash_len=32)
assert agent == "sofiia"
assert reason == REASON_SLASH
def test_unknown_slash_returns_rejected_unknown_agent():
"""Slash with valid-length token but unknown agent → REASON_REJECTED_UNKNOWN_AGENT."""
cfg = _make_cfg()
agent, reason, _ = route_message(
"/druid hello", ROOM_X, cfg, frozenset({"sofiia", "helion"}),
max_slash_len=32,
)
assert agent is None
assert reason == REASON_REJECTED_UNKNOWN_AGENT
def test_no_mapping_returns_rejected_no_mapping():
"""Room not in config → REASON_REJECTED_NO_MAPPING."""
cfg = _make_cfg(room_id=ROOM_X)
agent, reason, _ = route_message("hello", ROOM_Y, cfg, ALLOWED, max_slash_len=32)
assert agent is None
assert reason == REASON_REJECTED_NO_MAPPING
def test_rejection_reasons_are_distinct_constants():
"""All rejection reason strings must differ from success reasons."""
success = {REASON_SLASH, REASON_AT_MENTION, REASON_COLON_MENTION, REASON_DEFAULT}
rejected = {REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING}
assert not success.intersection(rejected), "Rejection reasons must not overlap with success reasons"