feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)

Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
2026-03-05 07:51:37 -08:00
parent fe6e3d30ae
commit 82d5ff2a4f
21 changed files with 9123 additions and 93 deletions
--- a/services/matrix-bridge-dagi/app/discovery.py
+++ b/services/matrix-bridge-dagi/app/discovery.py
@@ -0,0 +1,210 @@
+"""
+discovery — M4.0: Agent discovery helpers for Matrix user rooms.
+
+Provides formatted replies for `!agents` and `!agents status` commands.
+These commands are available to all room members (no auth required) and
+are processed BEFORE routing to the LLM agent.
+
+Supports:
+  - Mixed rooms: list all agents, default, usage examples
+  - Direct rooms: show single agent mapping
+  - Unknown rooms: "no mapping" notice
+"""
+from __future__ import annotations
+
+import datetime
+from typing import Optional
+
+from .mixed_routing import MixedRoomConfig
+from .room_mapping import RoomMappingConfig  # noqa: F401 — used in type hints
+
+
+def _fmt_ts(ts: float) -> str:
+    """Format a Unix timestamp (seconds since the epoch) as a compact UTC string.
+
+    Returns ``YYYY-MM-DDTHH:MM:SSZ``; fractional seconds are dropped.
+    A value that cannot be converted (wrong type, NaN/inf, outside the
+    platform's supported range) is returned as ``str(ts)`` so that one bad
+    timestamp never breaks the reply it is embedded in.
+    """
+    try:
+        moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
+        return moment.strftime("%Y-%m-%dT%H:%M:%SZ")
+    except (TypeError, ValueError, OverflowError, OSError):
+        # TypeError: not a number; ValueError/OverflowError: NaN, inf or a
+        # value out of range; OSError: rejected by the platform time functions.
+        return str(ts)
+
+# Discovery command prefix
+DISCOVERY_CMD = "!agents"
+
+# Reply length cap (characters) for one Matrix message; a status reply that
+# grows past this is truncated with an ellipsis.
+_MAX_REPLY_LEN = 3500
+
+
+def is_discovery_message(text: str) -> bool:
+    """Return True if the message is a !agents discovery command.
+
+    The first whitespace-delimited token must equal ``DISCOVERY_CMD``
+    (case-insensitive). Anything after it (e.g. ``status``) is ignored here.
+    Any whitespace may separate the command from its arguments — space, tab,
+    newline, non-breaking space — so ``"!agents\\tstatus"`` matches, while
+    ``"!agentsfoo"`` and ``"hello !agents"`` do not.
+    """
+    tokens = text.split()
+    return bool(tokens) and tokens[0].lower() == DISCOVERY_CMD
+
+
+def agents_reply(
+    room_id: str,
+    room_map: RoomMappingConfig,
+    mixed_room_config: Optional[MixedRoomConfig],
+) -> str:
+    """
+    Produce the `!agents` answer for one room.
+
+    Resolution order:
+      1. room is flagged mixed AND has an entry in the mixed config
+         → agent list, default agent, usage examples
+      2. room_map resolves the room to an agent → single-agent notice
+      3. otherwise → 'no mapping' notice
+    """
+    # A room flagged as mixed but missing from `rooms` falls through to the
+    # direct-mapping lookup, hence the separate None test on the entry.
+    mixed_entry = None
+    if mixed_room_config and mixed_room_config.is_mixed(room_id):
+        mixed_entry = mixed_room_config.rooms.get(room_id)
+    if mixed_entry is not None:
+        return _mixed_room_reply(room_id, mixed_entry)
+
+    mapped_agent = room_map.agent_for_room(room_id)
+    if mapped_agent is None:
+        return _unknown_room_reply()
+    return _direct_room_reply(mapped_agent)
+
+
+def _mixed_room_reply(room_id: str, room) -> str:
+    """Render the `!agents` answer for a room that hosts several agents.
+
+    Reads ``room.agents`` and ``room.default_agent``; ``room_id`` is accepted
+    but does not appear in the text.
+    """
+    names = room.agents
+    default = room.default_agent
+    if not default:
+        # No explicit default: use the first listed agent, or "?" if none.
+        default = names[0] if names else "?"
+
+    usage = []
+    for name in names[:5]:  # usage examples for at most five agents
+        usage += [
+            f"  • `/{name} <message>` — slash command",
+            f"  • `@{name} <message>` — mention",
+            f"  • `{name}: <message>` — colon prefix",
+        ]
+
+    listing = ", ".join(f"**{name}**" for name in names)
+    return "\n".join([
+        f"🤖 **Agents available in this room:** {listing}",
+        f"⭐ **Default:** {default}",
+        "",
+        "**How to address an agent:**",
+        *usage,
+        "",
+        f"_Messages without prefix go to **{default}** by default._",
+    ])
+
+
+def _direct_room_reply(agent_id: str) -> str:
+    """Render the `!agents` answer for a room bound to exactly one agent."""
+    bold_agent = f"**{agent_id}**"
+    return "\n".join((
+        f"🤖 This room is mapped to agent: {bold_agent}",
+        "",
+        f"All messages are forwarded to {bold_agent} automatically.",
+        "No prefix needed — just write your message.",
+    ))
+
+
+def _unknown_room_reply() -> str:
+    """Render the `!agents` answer for a room with no agent configured."""
+    notice = "⚠️ This room has no agent mapping."
+    next_step = "Contact an operator to configure an agent for this room."
+    return f"{notice}\n\n{next_step}"
+
+
+# ── Bridge status reply (M4.1) ────────────────────────────────────────────────
+
+def bridge_status_reply(snapshot: dict) -> str:
+    """
+    Format a concise bridge health snapshot for `!status` in control room.
+
+    Every key is optional; a missing or empty section is left out of the reply.
+
+    snapshot keys read here:
+      node_id, queue_size, queue_max, worker_count     (shown as "?" if absent)
+      room_count, mixed_room_count, operators_count    (shown as 0 if absent)
+      node_policy (dict): default_node, allowed_nodes, room_overrides
+      control_safety (dict): enabled, room_rpm, operator_rpm, cooldown_s
+      persistent_dedupe (dict): ok, ttl_h, pruned_rows_last, db_path
+      policy_store (dict): ok, overrides_count, agent_overrides_count,
+        policy_store_path (or path)
+      policy_agent_overrides_count: fallback when policy_store has no
+        agent_overrides_count
+      policy_last_export_at, policy_last_import_at, policy_db_mtime:
+        Unix timestamps, skipped when falsy
+      nodes (dict): node id -> {routed, rejected}
+
+    Returns the reply text; anything longer than _MAX_REPLY_LEN characters is
+    cut and ends with an ellipsis.
+    """
+    node_id = snapshot.get("node_id", "?")
+    q_size = snapshot.get("queue_size", "?")
+    q_max = snapshot.get("queue_max", "?")
+    workers = snapshot.get("worker_count", "?")
+    rooms = snapshot.get("room_count", 0)
+    mixed = snapshot.get("mixed_room_count", 0)
+    ops = snapshot.get("operators_count", 0)
+
+    lines = [
+        f"📡 **Bridge status** — node: `{node_id}`",
+        "",
+        f"**Queue:** {q_size}/{q_max}  workers: {workers}",
+        f"**Rooms:** {rooms} direct  {mixed} mixed  ops: {ops} operators",
+        "",
+    ]
+
+    # Optional one-line sections in display order; a helper returns None when
+    # its section has nothing to show.
+    optional_lines = (
+        _node_policy_line(snapshot.get("node_policy") or {}, node_id),  # M5.0
+        _control_safety_line(snapshot.get("control_safety") or {}),
+        _dedupe_line(snapshot.get("persistent_dedupe") or {}),
+        _policy_store_line(snapshot),  # M6.0/M6.1
+        _policy_snapshots_line(snapshot),  # M6.2
+    )
+    lines.extend(line for line in optional_lines if line is not None)
+    lines.extend(_per_node_lines(snapshot.get("nodes") or {}))  # M5.1
+
+    reply = "\n".join(lines)
+    if len(reply) > _MAX_REPLY_LEN:
+        # The ellipsis is one character, so the result stays below the cap.
+        reply = reply[:_MAX_REPLY_LEN - 3] + "…"
+    return reply
+
+
+def _node_policy_line(node_policy: dict, node_id) -> Optional[str]:
+    """Return the node-policy line, or None when `allowed_nodes` is empty."""
+    allowed_nodes = node_policy.get("allowed_nodes") or []
+    if not allowed_nodes:
+        return None
+    default_node = node_policy.get("default_node", node_id)
+    room_overrides = node_policy.get("room_overrides", 0)
+    allowed_str = ", ".join(f"`{n}`" for n in sorted(allowed_nodes))
+    return (
+        f"**Node policy:** default=`{default_node}`  allowed={allowed_str}  "
+        f"room_overrides={room_overrides}"
+    )
+
+
+def _control_safety_line(safety: dict) -> Optional[str]:
+    """Return the control-safety rate-limit line, or None for an empty dict."""
+    if not safety:
+        return None
+    enabled = "✅" if safety.get("enabled") else "⬜"
+    return (
+        f"**Control safety {enabled}:** "
+        f"room={safety.get('room_rpm', '?')}rpm  "
+        f"op={safety.get('operator_rpm', '?')}rpm  "
+        f"cooldown={safety.get('cooldown_s', '?')}s"
+    )
+
+
+def _dedupe_line(dedupe: dict) -> Optional[str]:
+    """Return the persistent-dedupe line, or None for an empty dict."""
+    if not dedupe:
+        return None
+    ok_emoji = "✅" if dedupe.get("ok") else "❌"
+    pruned = dedupe.get("pruned_rows_last", 0)
+    ttl = dedupe.get("ttl_h", "?")
+    return (
+        f"**Dedupe {ok_emoji}:** ttl={ttl}h  pruned_last={pruned}  "
+        f"db=`{dedupe.get('db_path') or 'n/a'}`"
+    )
+
+
+def _policy_store_line(snapshot: dict) -> Optional[str]:
+    """Return the policy-store line, or None when `policy_store` is absent/empty."""
+    ps = snapshot.get("policy_store") or {}
+    if not ps:
+        return None
+    ps_ok = "✅" if ps.get("ok") else "❌"
+    ps_node_count = ps.get("overrides_count", 0)
+    # The agent override count may live on the snapshot itself instead.
+    ps_agent_count = ps.get(
+        "agent_overrides_count",
+        snapshot.get("policy_agent_overrides_count", 0),
+    )
+    ps_path = ps.get("policy_store_path") or ps.get("path") or "n/a"
+    return (
+        f"**Policy store {ps_ok}:** node_overrides={ps_node_count}  "
+        f"agent_overrides={ps_agent_count}  db=`{ps_path}`"
+    )
+
+
+def _policy_snapshots_line(snapshot: dict) -> Optional[str]:
+    """Return the export/import/DB-mtime line, or None when no timestamp is set."""
+    stamps = (
+        ("last_export", snapshot.get("policy_last_export_at")),
+        ("last_import", snapshot.get("policy_last_import_at")),
+        ("db_mtime", snapshot.get("policy_db_mtime")),
+    )
+    parts = [f"{label}=`{_fmt_ts(ts)}`" for label, ts in stamps if ts]
+    if not parts:
+        return None
+    return "**Policy snapshots:** " + "  ".join(parts)
+
+
+def _per_node_lines(node_stats: dict) -> list:
+    """Return the per-node routed/rejected lines (empty list when no nodes)."""
+    if not node_stats:
+        return []
+    out = ["\n**Per-node stats:**"]
+    for nid in sorted(node_stats):
+        # A None entry is shown as zero counters instead of raising
+        # AttributeError and losing the whole status reply.
+        ns = node_stats[nid] or {}
+        out.append(
+            f"  `{nid}`: routed={ns.get('routed', 0)}  rejected={ns.get('rejected', 0)}"
+        )
+    return out