Files
microdao-daarion/services/matrix-bridge-dagi/app/control_limiter.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

139 lines
4.9 KiB
Python

"""
control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel.
Protection layers:
1. Per-room sliding window — CONTROL_ROOM_RPM (default 60)
2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30)
3. Per-run sliding window — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next)
4. Per-(operator, verb, subcommand) cooldown — CONTROL_COOLDOWN_S (default 2s, anti-double-click)
All state is in-memory (lost on restart), which is intentional — limits reset with the bridge.
Thread safety: not needed (asyncio single-threaded event loop).
"""
from __future__ import annotations
import time
from collections import defaultdict, deque
from typing import Dict, Tuple
# Sentinel value for "unlimited": an rpm of 0 (or any non-positive value)
# disables the corresponding sliding-window check.
_UNLIMITED = 0

# Length, in seconds, of the rolling window every rpm limit is measured over.
_WINDOW_S = 60.0

# Minimum spacing, in seconds, between opportunistic sweeps of stale state.
_PRUNE_INTERVAL_S = 60.0


class ControlRateLimiter:
    """
    Sliding-window rate limiter + cooldown for the Matrix control channel.

    All rpm values are requests-per-minute over a 60-second rolling window;
    a non-positive rpm disables that particular check.
    cooldown_s is a per-{operator, verb, subcommand} debounce window
    (anti-double-click); a non-positive value disables the cooldown.

    State lives only in this object. No locking is performed, so the limiter
    is meant to be driven from a single thread / event loop.
    """

    def __init__(
        self,
        room_rpm: int = 60,
        operator_rpm: int = 30,
        run_next_rpm: int = 20,
        cooldown_s: float = 2.0,
    ) -> None:
        """
        Args:
            room_rpm: Allowed requests per minute per room key.
            operator_rpm: Allowed requests per minute per operator key.
            run_next_rpm: Allowed requests per minute per run key.
            cooldown_s: Minimum seconds between two accepted calls for the
                same (operator, verb, subcommand).
        """
        self.room_rpm = room_rpm
        self.operator_rpm = operator_rpm
        self.run_next_rpm = run_next_rpm
        self.cooldown_s = cooldown_s

        # Sliding-window storage: key → deque[float] (monotonic timestamps,
        # appended in non-decreasing order, oldest on the left).
        self._room_windows: Dict[str, deque] = defaultdict(deque)
        self._op_windows: Dict[str, deque] = defaultdict(deque)
        self._run_windows: Dict[str, deque] = defaultdict(deque)

        # Cooldown: "sender_hash:verb:subcommand" → last accepted timestamp.
        self._cooldown_times: Dict[str, float] = {}

        # Monotonic time of the last stale-state sweep (see prune()).
        self._last_prune_at: float = time.monotonic()

    # ── Sliding window helpers ─────────────────────────────────────────────────

    @staticmethod
    def _check_window(
        windows: Dict[str, deque],
        key: str,
        rpm: int,
    ) -> Tuple[bool, float]:
        """
        Sliding-window check over a 60-second window.

        Records the request when it is allowed; a rejected request is not
        recorded, so it does not extend the caller's wait.

        Returns (allowed, retry_after_seconds).
        If rpm <= 0, the check is skipped and the call is always allowed.
        """
        # `<=` rather than `==`: a negative rpm would otherwise reach the
        # `len(window) >= rpm` branch with an empty deque and crash on window[0].
        if rpm <= _UNLIMITED:
            return True, 0.0

        now = time.monotonic()
        window = windows[key]
        cutoff = now - _WINDOW_S

        # Evict expired entries
        while window and window[0] < cutoff:
            window.popleft()

        if len(window) >= rpm:
            # Time until oldest entry expires
            retry_after = max(0.0, _WINDOW_S - (now - window[0]))
            return False, retry_after

        window.append(now)
        return True, 0.0

    # ── Housekeeping ───────────────────────────────────────────────────────────

    def prune(self) -> int:
        """
        Drop state that can no longer influence any decision.

        Removes windows whose newest timestamp is already outside the rolling
        window (or that are empty) and cooldown entries whose cooldown has
        elapsed. Decisions are unaffected: such entries would be evicted or
        overwritten by the next check for the same key anyway.

        Returns the number of keys removed.
        """
        now = time.monotonic()
        cutoff = now - _WINDOW_S
        removed = 0

        for windows in (self._room_windows, self._op_windows, self._run_windows):
            # Timestamps are appended in order, so an expired newest entry
            # means the whole window is expired.
            stale = [k for k, w in windows.items() if not w or w[-1] < cutoff]
            for k in stale:
                del windows[k]
            removed += len(stale)

        expired = [
            k
            for k, last in self._cooldown_times.items()
            if now - last >= self.cooldown_s
        ]
        for k in expired:
            del self._cooldown_times[k]
        removed += len(expired)

        self._last_prune_at = now
        return removed

    def _maybe_prune(self) -> None:
        """Run prune() at most once per _PRUNE_INTERVAL_S to bound memory use."""
        if time.monotonic() - self._last_prune_at >= _PRUNE_INTERVAL_S:
            self.prune()

    # ── Public check methods ───────────────────────────────────────────────────

    def check_room(self, room_id: str) -> Tuple[bool, float]:
        """Per-room rate limit check. Returns (allowed, retry_after_s)."""
        self._maybe_prune()
        return self._check_window(self._room_windows, room_id, self.room_rpm)

    def check_operator(self, sender_hash: str) -> Tuple[bool, float]:
        """Per-operator rate limit check. Returns (allowed, retry_after_s)."""
        self._maybe_prune()
        return self._check_window(self._op_windows, sender_hash, self.operator_rpm)

    def check_run_next(self, run_id: str) -> Tuple[bool, float]:
        """
        Per-run rate limit for !runbook next — prevents rapid-fire advancement.

        Returns (allowed, retry_after_s).
        """
        self._maybe_prune()
        return self._check_window(self._run_windows, run_id, self.run_next_rpm)

    def check_cooldown(
        self,
        sender_hash: str,
        verb: str,
        subcommand: str,
    ) -> Tuple[bool, float]:
        """
        Anti-double-click cooldown per (operator, verb, subcommand).

        Returns (allowed, wait_s). On first call → records timestamp and allows.
        On subsequent calls within cooldown_s → blocks and returns remaining wait;
        a blocked call does not reset the timer.
        """
        if self.cooldown_s <= 0:
            return True, 0.0

        self._maybe_prune()

        key = f"{sender_hash}:{verb}:{subcommand}"
        now = time.monotonic()
        last = self._cooldown_times.get(key)
        if last is not None:
            elapsed = now - last
            if elapsed < self.cooldown_s:
                return False, self.cooldown_s - elapsed

        self._cooldown_times[key] = now
        return True, 0.0

    # ── Summary ───────────────────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Return the configured limits (not the live counters) as a plain dict."""
        return {
            "room_rpm": self.room_rpm,
            "operator_rpm": self.operator_rpm,
            "run_next_rpm": self.run_next_rpm,
            "cooldown_s": self.cooldown_s,
        }