feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
This commit is contained in:
138
services/matrix-bridge-dagi/app/control_limiter.py
Normal file
138
services/matrix-bridge-dagi/app/control_limiter.py
Normal file
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel.
|
||||
|
||||
Protection layers:
|
||||
1. Per-room sliding window — CONTROL_ROOM_RPM (default 60)
|
||||
2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30)
|
||||
3. Per-run sliding window — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next)
|
||||
4. Per-(operator, verb, subcommand) cooldown — CONTROL_COOLDOWN_S (default 2s, anti-double-click)
|
||||
|
||||
All state is in-memory (lost on restart), which is intentional — limits reset with the bridge.
|
||||
|
||||
Thread safety: not needed (asyncio single-threaded event loop).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from collections import defaultdict, deque
|
||||
from typing import Dict, Tuple
|
||||
|
||||
|
||||
# Sentinel value for "unlimited" (rpm == 0 → skip check).
# Negative rpm values are treated the same way (see _check_window).
_UNLIMITED = 0


class ControlRateLimiter:
    """
    Sliding-window rate limiter + cooldown for the Matrix control channel.

    All rpm values are requests-per-minute over a 60-second rolling window;
    an rpm of 0 (or any non-positive value) disables that particular check.
    cooldown_s is a per-{operator, verb, subcommand} debounce window
    (anti-double-click); a value <= 0 disables the cooldown.

    Every check method returns ``(allowed, wait_seconds)``. An allowed call is
    recorded immediately; a rejected call is not recorded.
    """

    # Length of the rolling window used by every rpm limit, in seconds.
    _WINDOW_S = 60.0

    # Once a tracking dict holds more keys than this, entries that can no
    # longer influence any decision (fully expired) are swept out so the
    # in-memory state does not grow without bound on a long-running process.
    _PRUNE_THRESHOLD = 1024

    def __init__(
        self,
        room_rpm: int = 60,
        operator_rpm: int = 30,
        run_next_rpm: int = 20,
        cooldown_s: float = 2.0,
    ) -> None:
        """
        Store the configured limits and create empty tracking state.

        Args:
            room_rpm: Max accepted requests per room per 60 s (<= 0: unlimited).
            operator_rpm: Max accepted requests per operator per 60 s
                (<= 0: unlimited).
            run_next_rpm: Max accepted requests per run per 60 s
                (<= 0: unlimited).
            cooldown_s: Minimum spacing in seconds between two accepted calls
                sharing the same (operator, verb, subcommand); <= 0 disables it.
        """
        self.room_rpm = room_rpm
        self.operator_rpm = operator_rpm
        self.run_next_rpm = run_next_rpm
        self.cooldown_s = cooldown_s

        # Sliding-window storage: key → deque[float] (monotonic timestamps,
        # oldest on the left)
        self._room_windows: Dict[str, deque] = defaultdict(deque)
        self._op_windows: Dict[str, deque] = defaultdict(deque)
        self._run_windows: Dict[str, deque] = defaultdict(deque)

        # Cooldown: "sender_hash:verb:subcommand" → last accepted timestamp
        self._cooldown_times: Dict[str, float] = {}

    # ── Sliding window helpers ─────────────────────────────────────────────────

    @staticmethod
    def _prune_windows(windows: Dict[str, deque], cutoff: float) -> None:
        """Drop keys whose window is empty or whose newest entry is expired."""
        # Timestamps are appended in increasing order, so the rightmost entry
        # is the newest; if it is expired, the whole window is.
        stale = [k for k, w in windows.items() if not w or w[-1] < cutoff]
        for k in stale:
            del windows[k]

    @staticmethod
    def _check_window(
        windows: Dict[str, deque],
        key: str,
        rpm: int,
    ) -> Tuple[bool, float]:
        """
        Sliding-window check over a 60-second window.

        Args:
            windows: Mapping of key → deque of accepted-request timestamps.
                Missing keys are created via item access, so this is expected
                to be a ``defaultdict(deque)``.
            key: Identifier of the window to check.
            rpm: Max accepted requests inside the window.

        Returns:
            (allowed, retry_after_seconds). If rpm <= 0, always (True, 0.0)
            and nothing is recorded.
        """
        # 0 is the documented "unlimited" sentinel. Negative values are handled
        # identically: otherwise `len(window) >= rpm` would be true for an
        # empty window and `window[0]` below would raise IndexError.
        if rpm <= _UNLIMITED:
            return True, 0.0

        now = time.monotonic()
        cutoff = now - ControlRateLimiter._WINDOW_S

        # Opportunistic cleanup of keys that have been idle for a full window.
        if len(windows) > ControlRateLimiter._PRUNE_THRESHOLD:
            ControlRateLimiter._prune_windows(windows, cutoff)

        window = windows[key]

        # Evict expired entries
        while window and window[0] < cutoff:
            window.popleft()

        if len(window) >= rpm:
            # Time until oldest entry expires
            retry_after = max(
                0.0, ControlRateLimiter._WINDOW_S - (now - window[0])
            )
            return False, retry_after

        window.append(now)
        return True, 0.0

    # ── Public check methods ───────────────────────────────────────────────────

    def check_room(self, room_id: str) -> Tuple[bool, float]:
        """Per-room rate limit check. Returns (allowed, retry_after_s)."""
        return self._check_window(self._room_windows, room_id, self.room_rpm)

    def check_operator(self, sender_hash: str) -> Tuple[bool, float]:
        """Per-operator rate limit check. Returns (allowed, retry_after_s)."""
        return self._check_window(self._op_windows, sender_hash, self.operator_rpm)

    def check_run_next(self, run_id: str) -> Tuple[bool, float]:
        """
        Per-run rate limit for !runbook next — prevents rapid-fire advancement.

        Returns (allowed, retry_after_s).
        """
        return self._check_window(self._run_windows, run_id, self.run_next_rpm)

    def _prune_cooldowns(self, now: float) -> None:
        """Drop cooldown entries whose debounce window has already elapsed."""
        expired = [
            k for k, ts in self._cooldown_times.items()
            if now - ts >= self.cooldown_s
        ]
        for k in expired:
            del self._cooldown_times[k]

    def check_cooldown(
        self,
        sender_hash: str,
        verb: str,
        subcommand: str,
    ) -> Tuple[bool, float]:
        """
        Anti-double-click cooldown per (operator, verb, subcommand).

        Returns (allowed, wait_s). On first call → records timestamp and allows.
        On subsequent calls within cooldown_s → blocks and returns remaining wait
        (the stored timestamp is left untouched, so blocked calls do not extend
        the cooldown). If cooldown_s <= 0 the check is disabled.
        """
        if self.cooldown_s <= 0:
            return True, 0.0

        key = f"{sender_hash}:{verb}:{subcommand}"
        now = time.monotonic()
        last = self._cooldown_times.get(key)

        if last is not None:
            elapsed = now - last
            if elapsed < self.cooldown_s:
                return False, self.cooldown_s - elapsed

        # Expired entries behave exactly like absent ones, so sweeping them
        # bounds memory without changing any decision.
        if len(self._cooldown_times) > self._PRUNE_THRESHOLD:
            self._prune_cooldowns(now)

        self._cooldown_times[key] = now
        return True, 0.0

    # ── Summary ───────────────────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Return the configured limits (not the live counters) as a plain dict."""
        return {
            "room_rpm": self.room_rpm,
            "operator_rpm": self.operator_rpm,
            "run_next_rpm": self.run_next_rpm,
            "cooldown_s": self.cooldown_s,
        }
|
||||
Reference in New Issue
Block a user