Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
139 lines
4.9 KiB
Python
139 lines
4.9 KiB
Python
"""
|
|
control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel.
|
|
|
|
Protection layers:
|
|
1. Per-room sliding window — CONTROL_ROOM_RPM (default 60)
|
|
2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30)
|
|
3. Per-run sliding window — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next)
|
|
4. Per-operator, per-command cooldown — CONTROL_COOLDOWN_S (default 2s, anti-double-click; keyed on operator + verb + subcommand)
|
|
|
|
All state is in-memory (lost on restart), which is intentional — limits reset with the bridge.
|
|
|
|
Thread safety: not needed (asyncio single-threaded event loop).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from collections import defaultdict, deque
|
|
from typing import Dict, Tuple
|
|
|
|
|
|
# Sentinel value for "unlimited" (rpm == 0 → skip check)
|
|
_UNLIMITED: int = 0  # an rpm equal to this value disables the corresponding sliding-window check
|
|
|
|
|
|
class ControlRateLimiter:
    """
    Sliding-window rate limiter + cooldown for the Matrix control channel.

    All rpm values are requests-per-minute over a 60-second rolling window.
    An rpm of 0 (or any non-positive value) disables that particular limit.
    cooldown_s is a per-{operator, verb, subcommand} debounce window
    (anti-double-click); a value <= 0 disables the cooldown.

    All state is kept in plain in-memory dictionaries on the instance.
    NOTE: this class never removes keys from those dictionaries, so an entry
    remains for every room / operator / run / command that was ever checked.
    """

    # Length of the rolling window (seconds) used by every rpm limit.
    _WINDOW_S = 60.0

    def __init__(
        self,
        room_rpm: int = 60,
        operator_rpm: int = 30,
        run_next_rpm: int = 20,
        cooldown_s: float = 2.0,
    ) -> None:
        """
        Store the configured limits and create empty tracking state.

        Args:
            room_rpm: Max accepted requests per room per 60 s (<= 0 disables).
            operator_rpm: Max accepted requests per operator per 60 s
                (<= 0 disables).
            run_next_rpm: Max accepted requests per run per 60 s
                (<= 0 disables).
            cooldown_s: Minimum seconds between two accepted identical
                commands from the same operator (<= 0 disables).
        """
        self.room_rpm = room_rpm
        self.operator_rpm = operator_rpm
        self.run_next_rpm = run_next_rpm
        self.cooldown_s = cooldown_s

        # Sliding-window storage: key → deque[float] (monotonic timestamps
        # of accepted requests, oldest first).
        self._room_windows: Dict[str, deque] = defaultdict(deque)
        self._op_windows: Dict[str, deque] = defaultdict(deque)
        self._run_windows: Dict[str, deque] = defaultdict(deque)

        # Cooldown: "sender_hash:verb:subcommand" → last accepted timestamp.
        self._cooldown_times: Dict[str, float] = {}

    # ── Sliding window helpers ─────────────────────────────────────────────

    @staticmethod
    def _check_window(
        windows: Dict[str, deque],
        key: str,
        rpm: int,
    ) -> Tuple[bool, float]:
        """
        Sliding-window check over a 60-second window.

        Args:
            windows: Mapping of key → deque of accepted-request timestamps.
                Mutated in place (expired entries are evicted; the current
                timestamp is appended when the request is allowed).
            key: Identifier whose window is checked.
            rpm: Maximum accepted requests inside the window.

        Returns:
            (allowed, retry_after_seconds). retry_after_seconds is 0.0 when
            allowed; otherwise it is the time until the oldest recorded
            request leaves the window.

        If rpm <= 0 the check is disabled and the request is always allowed
        without touching ``windows``.
        """
        # Non-positive rpm means "no limit". Comparing with `<=` rather than
        # `== 0` keeps a negative value from reaching the `window[0]` lookup
        # below with an empty deque (IndexError), and mirrors how
        # check_cooldown treats cooldown_s <= 0.
        if rpm <= 0:
            return True, 0.0

        window_s = ControlRateLimiter._WINDOW_S
        now = time.monotonic()
        window = windows[key]
        cutoff = now - window_s

        # Evict entries that have fallen out of the rolling window.
        while window and window[0] < cutoff:
            window.popleft()

        if len(window) >= rpm:
            # Blocked: report the time until the oldest entry expires.
            # Rejected requests are not recorded.
            retry_after = max(0.0, window_s - (now - window[0]))
            return False, retry_after

        window.append(now)
        return True, 0.0

    # ── Public check methods ───────────────────────────────────────────────

    def check_room(self, room_id: str) -> Tuple[bool, float]:
        """Per-room rate limit check. Returns (allowed, retry_after_s)."""
        return self._check_window(self._room_windows, room_id, self.room_rpm)

    def check_operator(self, sender_hash: str) -> Tuple[bool, float]:
        """Per-operator rate limit check. Returns (allowed, retry_after_s)."""
        return self._check_window(self._op_windows, sender_hash, self.operator_rpm)

    def check_run_next(self, run_id: str) -> Tuple[bool, float]:
        """
        Per-run rate limit for !runbook next — prevents rapid-fire advancement.

        Returns (allowed, retry_after_s).
        """
        return self._check_window(self._run_windows, run_id, self.run_next_rpm)

    def check_cooldown(
        self,
        sender_hash: str,
        verb: str,
        subcommand: str,
    ) -> Tuple[bool, float]:
        """
        Anti-double-click cooldown per (operator, verb, subcommand).

        Returns (allowed, wait_s). On first call → records timestamp and
        allows. On subsequent calls within cooldown_s → blocks and returns
        the remaining wait; a blocked call does not reset the timer.
        If cooldown_s <= 0 the cooldown is disabled and nothing is recorded.
        """
        if self.cooldown_s <= 0:
            return True, 0.0

        key = f"{sender_hash}:{verb}:{subcommand}"
        now = time.monotonic()
        last = self._cooldown_times.get(key)

        if last is not None:
            elapsed = now - last
            if elapsed < self.cooldown_s:
                return False, self.cooldown_s - elapsed

        self._cooldown_times[key] = now
        return True, 0.0

    # ── Summary ────────────────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Return the configured limits (not the live counters) as a dict."""
        return {
            "room_rpm": self.room_rpm,
            "operator_rpm": self.operator_rpm,
            "run_next_rpm": self.run_next_rpm,
            "cooldown_s": self.cooldown_s,
        }
|