feat(matrix-bridge-dagi): harden mixed rooms with safe defaults and ops visibility (M2.2)

Guard rails (mixed_routing.py): - MAX_AGENTS_PER_MIXED_ROOM (default 5): fail-fast at parse time - MAX_SLASH_LEN (default 32): reject garbage/injection slash tokens - Unified rejection reasons: unknown_agent, slash_too_long, no_mapping - REASON_REJECTED_* constants (separate from success REASON_*) Ingress (ingress.py): - per-room-agent concurrency semaphore (MIXED_CONCURRENCY_CAP, default 1) - active_lock_count property for /health + prometheus - UNKNOWN_AGENT_BEHAVIOR: "ignore" (silent) | "reply_error" (inform user) - on_routed(agent_id, reason) callback for routing metrics - on_route_rejected(room_id, reason) callback for rejection metrics - matrix.route.rejected audit event on every rejection Config + main: - max_agents_per_mixed_room, max_slash_len, unknown_agent_behavior, mixed_concurrency_cap - matrix_bridge_routed_total{agent_id, reason} counter - matrix_bridge_route_rejected_total{room_id, reason} counter - matrix_bridge_active_room_agent_locks gauge - /health: mixed_guard_rails section + total_agents_in_mixed_rooms - docker-compose: all 4 new guard rail env vars Runbook: section 9 — mixed room debug guide (6 acceptance tests, routing metrics, session isolation, lock hang, config guard) Tests: 108 pass (94 → 108, +14 new tests for guard rails + callbacks + concurrency) Made-with: Cursor
2026-03-05 01:41:20 -08:00
parent a85a11984b
commit d40b1e87c6
8 changed files with 576 additions and 21 deletions
--- a/tests/test_matrix_bridge_mixed_ingress.py
+++ b/tests/test_matrix_bridge_mixed_ingress.py
@@ -308,6 +308,158 @@ def test_rate_limited_mixed_room_event_dropped():
    assert len(dropped) == 2     # two dropped by rate limiter


+def test_on_route_rejected_callback_fires():
+    """on_route_rejected fires when /unknown slash is used in mixed room."""
+    ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
+    rejected_calls: List[tuple] = []
+    ingress._on_route_rejected = lambda room, reason: rejected_calls.append((room, reason))
+
+    client = _fake_client({ROOM_MIXED: [_make_event("/unknownbot hello", event_id="r1")]})
+    queue: asyncio.Queue = asyncio.Queue(maxsize=50)
+    ingress._queue = queue
+
+    async def _run():
+        with patch("app.ingress._write_audit", new=AsyncMock()):
+            await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
+
+    run(_run())
+
+    assert queue.qsize() == 0
+    assert len(rejected_calls) == 1
+    room, reason = rejected_calls[0]
+    assert room == ROOM_MIXED
+    assert "unknown" in reason
+
+
+def test_on_routed_callback_fires_with_reason():
+    """on_routed fires with correct agent_id and routing_reason on successful route."""
+    ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
+    routed_calls: List[tuple] = []
+    ingress._on_routed = lambda agent, reason: routed_calls.append((agent, reason))
+
+    client = _fake_client({ROOM_MIXED: [_make_event("/helion hello", event_id="rt1")]})
+    queue: asyncio.Queue = asyncio.Queue(maxsize=50)
+    ingress._queue = queue
+
+    async def _run():
+        with patch("app.ingress._write_audit", new=AsyncMock()):
+            await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
+
+    run(_run())
+
+    assert len(routed_calls) == 1
+    agent, reason = routed_calls[0]
+    assert agent == "helion"
+    assert reason == "slash_command"
+
+
+def test_unknown_agent_reply_error_sends_message():
+    """UNKNOWN_AGENT_BEHAVIOR=reply_error → error message sent to room."""
+    ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
+    ingress._unknown_agent_behavior = "reply_error"
+
+    sent_texts: List[str] = []
+    mock_client = _fake_client({ROOM_MIXED: [_make_event("/druid hello", event_id="ue1")]})
+    mock_client.send_text = AsyncMock(side_effect=lambda room, text, txn_id=None: sent_texts.append(text))
+
+    queue: asyncio.Queue = asyncio.Queue(maxsize=50)
+    ingress._queue = queue
+
+    async def _run():
+        with patch("app.ingress._write_audit", new=AsyncMock()):
+            await ingress._enqueue_from_sync(mock_client, queue, AsyncMock(), {})
+
+    run(_run())
+
+    assert queue.qsize() == 0   # not enqueued
+    assert len(sent_texts) == 1
+    assert "Unknown agent" in sent_texts[0]
+    assert "sofiia" in sent_texts[0] or "helion" in sent_texts[0]
+
+
+def test_unknown_agent_ignore_sends_nothing():
+    """UNKNOWN_AGENT_BEHAVIOR=ignore (default) → no reply sent."""
+    ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
+    ingress._unknown_agent_behavior = "ignore"
+
+    sent_texts: List[str] = []
+    mock_client = _fake_client({ROOM_MIXED: [_make_event("/druid hello", event_id="ui1")]})
+    mock_client.send_text = AsyncMock(side_effect=lambda room, text, txn_id=None: sent_texts.append(text))
+
+    queue: asyncio.Queue = asyncio.Queue(maxsize=50)
+    ingress._queue = queue
+
+    async def _run():
+        with patch("app.ingress._write_audit", new=AsyncMock()):
+            await ingress._enqueue_from_sync(mock_client, queue, AsyncMock(), {})
+
+    run(_run())
+
+    assert queue.qsize() == 0
+    assert len(sent_texts) == 0   # silent
+
+
+def test_concurrency_cap_active_lock_count():
+    """active_lock_count returns 1 while semaphore is held."""
+    ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
+    ingress._mixed_concurrency_cap = 1
+
+    async def _run():
+        lock = ingress._get_concurrency_lock(ROOM_MIXED, "sofiia")
+        assert ingress.active_lock_count == 0
+        await lock.acquire()
+        assert ingress.active_lock_count == 1
+        lock.release()
+        assert ingress.active_lock_count == 0
+
+    run(_run())
+
+
+def test_slash_too_long_rejected_and_not_enqueued():
+    """Slash token longer than max_slash_len → rejected, not enqueued."""
+    ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
+    ingress._max_slash_len = 4   # very short for test
+
+    client = _fake_client({ROOM_MIXED: [_make_event("/toolongtoken hello", event_id="tl1")]})
+    queue: asyncio.Queue = asyncio.Queue(maxsize=50)
+    ingress._queue = queue
+
+    rejected_calls: List[str] = []
+    ingress._on_route_rejected = lambda room, reason: rejected_calls.append(reason)
+
+    async def _run():
+        with patch("app.ingress._write_audit", new=AsyncMock()):
+            await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
+
+    run(_run())
+
+    assert queue.qsize() == 0
+    assert len(rejected_calls) == 1
+    assert rejected_calls[0] == "slash_too_long"
+
+
+def test_route_rejected_audit_event_written():
+    """On routing rejection, matrix.route.rejected audit event must be written."""
+    ingress = _make_ingress(mixed_raw=f"{ROOM_MIXED}=sofiia,helion")
+
+    audit_events: List[str] = []
+
+    async def fake_audit(*args, **kwargs):
+        audit_events.append(kwargs.get("event", ""))
+
+    client = _fake_client({ROOM_MIXED: [_make_event("/unknownbot test", event_id="ra1")]})
+    queue: asyncio.Queue = asyncio.Queue(maxsize=50)
+    ingress._queue = queue
+
+    async def _run():
+        with patch("app.ingress._write_audit", side_effect=fake_audit):
+            await ingress._enqueue_from_sync(client, queue, AsyncMock(), {})
+
+    run(_run())
+
+    assert "matrix.route.rejected" in audit_events
+
+
 def test_direct_room_reply_has_no_prefix():
    """Reply in single-agent (direct) room must NOT have a prefix."""
    ingress = _make_ingress(direct_raw=f"druid:{ROOM_DIRECT}", allowed=frozenset({"druid"}))
--- a/tests/test_matrix_bridge_mixed_routing.py
+++ b/tests/test_matrix_bridge_mixed_routing.py
@@ -26,6 +26,9 @@ from app.mixed_routing import (  # noqa: E402
    REASON_AT_MENTION,
    REASON_COLON_MENTION,
    REASON_DEFAULT,
+    REASON_REJECTED_UNKNOWN_AGENT,
+    REASON_REJECTED_SLASH_TOO_LONG,
+    REASON_REJECTED_NO_MAPPING,
 )

 ROOM_X = "!roomX:daarion.space"
@@ -146,7 +149,7 @@ def test_slash_unknown_agent_returns_none():
    cfg = _make_cfg()
    agent, reason, _ = route_message("/druid hello", ROOM_X, cfg, frozenset({"sofiia", "helion"}))
    assert agent is None
-    assert "unknown_slash_druid" in reason
+    assert reason == REASON_REJECTED_UNKNOWN_AGENT


 # ── Routing — @mention ────────────────────────────────────────────────────────
@@ -225,3 +228,70 @@ def test_reply_prefix_single_room_empty():
 def test_reply_prefix_capitalises_first_letter():
    assert reply_prefix("druid", is_mixed=True) == "Druid: "
    assert reply_prefix("NUTRA", is_mixed=True) == "Nutra: "   # capitalize() normalises case
+
+
+# ── M2.2: Guard rails ─────────────────────────────────────────────────────────
+
+def test_max_agents_per_room_raises():
+    """More agents than max → ValueError at parse time."""
+    raw = f"{ROOM_X}=sofiia,helion,druid,nutra,alateya,yaromir"   # 6 agents
+    allowed_6 = frozenset({"sofiia", "helion", "druid", "nutra", "alateya", "yaromir"})
+    with pytest.raises(ValueError, match="MAX_AGENTS_PER_MIXED_ROOM"):
+        parse_mixed_room_map(raw, "", allowed_6, max_agents_per_room=5)
+
+
+def test_max_agents_per_room_exactly_at_limit_ok():
+    """Exactly at limit should succeed."""
+    raw = f"{ROOM_X}=sofiia,helion,druid,nutra,alateya"   # 5 = default limit
+    allowed_5 = frozenset({"sofiia", "helion", "druid", "nutra", "alateya"})
+    cfg = parse_mixed_room_map(raw, "", allowed_5, max_agents_per_room=5)
+    assert len(cfg.agents_for_room(ROOM_X)) == 5
+
+
+def test_slash_too_long_returns_rejected_reason():
+    """Slash command token longer than max_slash_len → rejection, no fallthrough."""
+    cfg = _make_cfg()
+    long_token = "a" * 33   # > default 32
+    agent, reason, _ = route_message(
+        f"/{long_token} hello", ROOM_X, cfg, frozenset({"sofiia", "helion"}),
+        max_slash_len=32,
+    )
+    assert agent is None
+    assert reason == REASON_REJECTED_SLASH_TOO_LONG
+
+
+def test_slash_exactly_at_max_len_ok():
+    """Slash token exactly at max_slash_len should NOT be rejected."""
+    allowed = frozenset({"sofiia", "helion"})
+    raw = f"{ROOM_X}=sofiia,helion"
+    # Create a 10-char agent name (within limit) — use a mock allowed set
+    cfg = parse_mixed_room_map(raw, "", allowed, max_agents_per_room=5)
+    agent, reason, _ = route_message("/sofiia hi", ROOM_X, cfg, allowed, max_slash_len=32)
+    assert agent == "sofiia"
+    assert reason == REASON_SLASH
+
+
+def test_unknown_slash_returns_rejected_unknown_agent():
+    """Slash with valid-length token but unknown agent → REASON_REJECTED_UNKNOWN_AGENT."""
+    cfg = _make_cfg()
+    agent, reason, _ = route_message(
+        "/druid hello", ROOM_X, cfg, frozenset({"sofiia", "helion"}),
+        max_slash_len=32,
+    )
+    assert agent is None
+    assert reason == REASON_REJECTED_UNKNOWN_AGENT
+
+
+def test_no_mapping_returns_rejected_no_mapping():
+    """Room not in config → REASON_REJECTED_NO_MAPPING."""
+    cfg = _make_cfg(room_id=ROOM_X)
+    agent, reason, _ = route_message("hello", ROOM_Y, cfg, ALLOWED, max_slash_len=32)
+    assert agent is None
+    assert reason == REASON_REJECTED_NO_MAPPING
+
+
+def test_rejection_reasons_are_distinct_constants():
+    """All rejection reason strings must differ from success reasons."""
+    success = {REASON_SLASH, REASON_AT_MENTION, REASON_COLON_MENTION, REASON_DEFAULT}
+    rejected = {REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING}
+    assert not success.intersection(rejected), "Rejection reasons must not overlap with success reasons"