feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)

Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00
parent fe6e3d30ae
commit 82d5ff2a4f
21 changed files with 9123 additions and 93 deletions
--- a/services/matrix-bridge-dagi/app/control_limiter.py
+++ b/services/matrix-bridge-dagi/app/control_limiter.py
@@ -0,0 +1,138 @@
+"""
+control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel.
+
+Protection layers:
+  1. Per-room sliding window     — CONTROL_ROOM_RPM (default 60)
+  2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30)
+  3. Per-run sliding window      — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next)
+  4. Per-operator cooldown       — CONTROL_COOLDOWN_S (default 2s, anti-double-click)
+
+All state is in-memory (lost on restart), which is intentional — limits reset with the bridge.
+
+Thread safety: not needed (asyncio single-threaded event loop).
+"""
+from __future__ import annotations
+
+import time
+from collections import defaultdict, deque
+from typing import Dict, Tuple
+
+
# Sentinel for "unlimited": an rpm at or below this value disables that check.
_UNLIMITED = 0

# Length of the rolling window behind every rpm limit, in seconds.
_WINDOW_S = 60.0

# Minimum spacing between sweeps that drop idle keys, in seconds.
_PRUNE_INTERVAL_S = 60.0


class ControlRateLimiter:
    """
    Sliding-window rate limiter + cooldown for the Matrix control channel.

    All rpm values are requests-per-minute over a 60-second rolling window.
    An rpm of 0 (or any non-positive value) disables that particular limit.
    cooldown_s is a per-(operator, verb, subcommand) debounce window
    (anti-double-click); a non-positive value disables the cooldown.

    Every check method returns ``(allowed, wait_seconds)``. A request that is
    allowed is recorded immediately; a rejected request is NOT recorded, so
    hammering a blocked key does not extend the block.

    Keys that have been idle for longer than their window are dropped by a
    periodic sweep, so memory stays proportional to recently active rooms,
    operators and runs rather than to every key ever seen.
    """

    def __init__(
        self,
        room_rpm: int = 60,
        operator_rpm: int = 30,
        run_next_rpm: int = 20,
        cooldown_s: float = 2.0,
    ) -> None:
        self.room_rpm = room_rpm
        self.operator_rpm = operator_rpm
        self.run_next_rpm = run_next_rpm
        self.cooldown_s = cooldown_s

        # Sliding-window storage: key → deque[float] (monotonic timestamps,
        # oldest first).
        self._room_windows: Dict[str, deque] = defaultdict(deque)
        self._op_windows: Dict[str, deque] = defaultdict(deque)
        self._run_windows: Dict[str, deque] = defaultdict(deque)

        # Cooldown: (sender_hash, verb, subcommand) → last accepted timestamp.
        # A tuple key keeps the three parts distinct even when one of them
        # contains the ":" character.
        self._cooldown_times: Dict[Tuple[str, str, str], float] = {}

        # Monotonic time of the last idle-key sweep.
        self._last_prune: float = time.monotonic()

    # ── Housekeeping ──────────────────────────────────────────────────────────

    def _prune_idle(self) -> None:
        """
        Drop keys that can no longer influence any decision.

        Runs at most once per _PRUNE_INTERVAL_S. A window whose newest entry
        is already outside the rolling window would be fully evicted on its
        next check, and a cooldown entry older than cooldown_s would be
        overwritten on its next check, so removing them changes no outcome.
        """
        now = time.monotonic()
        if now - self._last_prune < _PRUNE_INTERVAL_S:
            return
        self._last_prune = now

        cutoff = now - _WINDOW_S
        for windows in (self._room_windows, self._op_windows, self._run_windows):
            idle = [k for k, w in windows.items() if not w or w[-1] < cutoff]
            for k in idle:
                del windows[k]

        expired = [
            k
            for k, last in self._cooldown_times.items()
            if now - last >= self.cooldown_s
        ]
        for k in expired:
            del self._cooldown_times[k]

    # ── Sliding window helpers ─────────────────────────────────────────────────

    @staticmethod
    def _check_window(
        windows: Dict[str, deque],
        key: str,
        rpm: int,
    ) -> Tuple[bool, float]:
        """
        Sliding-window check over a 60-second window.

        Returns (allowed, retry_after_seconds).
        If rpm <= 0 the limit is disabled: always allowed, nothing recorded.
        """
        # Non-positive rpm means "no limit". Without this guard a negative
        # rpm would reach window[0] on an empty deque and raise IndexError.
        if rpm <= _UNLIMITED:
            return True, 0.0

        now = time.monotonic()
        window = windows[key]
        cutoff = now - _WINDOW_S

        # Evict expired entries
        while window and window[0] < cutoff:
            window.popleft()

        if len(window) >= rpm:
            # Time until the oldest entry expires. rpm >= 1 here, so the
            # window is guaranteed to be non-empty.
            retry_after = max(0.0, _WINDOW_S - (now - window[0]))
            return False, retry_after

        window.append(now)
        return True, 0.0

    # ── Public check methods ───────────────────────────────────────────────────

    def check_room(self, room_id: str) -> Tuple[bool, float]:
        """Per-room rate limit check. Returns (allowed, retry_after_s)."""
        self._prune_idle()
        return self._check_window(self._room_windows, room_id, self.room_rpm)

    def check_operator(self, sender_hash: str) -> Tuple[bool, float]:
        """Per-operator rate limit check. Returns (allowed, retry_after_s)."""
        self._prune_idle()
        return self._check_window(self._op_windows, sender_hash, self.operator_rpm)

    def check_run_next(self, run_id: str) -> Tuple[bool, float]:
        """
        Per-run rate limit for !runbook next — prevents rapid-fire advancement.
        Returns (allowed, retry_after_s).
        """
        self._prune_idle()
        return self._check_window(self._run_windows, run_id, self.run_next_rpm)

    def check_cooldown(
        self,
        sender_hash: str,
        verb: str,
        subcommand: str,
    ) -> Tuple[bool, float]:
        """
        Anti-double-click cooldown per (operator, verb, subcommand).

        Returns (allowed, wait_s). On first call → records timestamp and allows.
        On subsequent calls within cooldown_s → blocks and returns remaining wait.
        A blocked call does not reset the timer.
        """
        if self.cooldown_s <= 0:
            return True, 0.0

        self._prune_idle()

        key = (sender_hash, verb, subcommand)
        now = time.monotonic()
        last = self._cooldown_times.get(key)

        if last is not None:
            elapsed = now - last
            if elapsed < self.cooldown_s:
                return False, self.cooldown_s - elapsed

        self._cooldown_times[key] = now
        return True, 0.0

    # ── Summary ───────────────────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Return the configured limits (not live counters) for health output."""
        return {
            "room_rpm": self.room_rpm,
            "operator_rpm": self.operator_rpm,
            "run_next_rpm": self.run_next_rpm,
            "cooldown_s": self.cooldown_s,
        }