feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
This commit is contained in:
149
services/matrix-bridge-dagi/app/sticky_cache.py
Normal file
149
services/matrix-bridge-dagi/app/sticky_cache.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover.
|
||||
|
||||
After a successful failover (primary → fallback), the bridge remembers the
|
||||
fallback node per room:agent pair for `ttl_s` seconds. Subsequent messages
|
||||
for the same pair skip the primary entirely and go directly to the known-good
|
||||
fallback, preventing oscillation ("flapping") while the primary recovers.
|
||||
|
||||
Key design
|
||||
----------
|
||||
key = "{room_id}:{agent_id}"
|
||||
ttl = FAILOVER_STICKY_TTL_S (default 300 s)
|
||||
|
||||
Priority in routing (when source != explicit):
|
||||
1. sticky cache (temporary)
|
||||
2. store override (desired long-term policy)
|
||||
3. env room_node_map
|
||||
4. env default
|
||||
|
||||
Sticky expires naturally; recovery is automatic — no operator action needed.
|
||||
If the sticky node also fails, the entry is removed and normal failover logic
|
||||
takes over again.
|
||||
|
||||
Thread safety
|
||||
-------------
|
||||
Uses threading.RLock — safe to call from asyncio callbacks without to_thread.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DEFAULT_TTL_S = 300.0
|
||||
|
||||
|
||||
@dataclass
|
||||
class _StickyEntry:
|
||||
node_id: str
|
||||
expires_at: float # time.monotonic() deadline
|
||||
|
||||
|
||||
class StickyNodeCache:
    """
    In-memory sticky node preference cache.

    Maps a string key to a node id plus a deadline taken from
    time.monotonic(). An entry is "live" while its deadline is strictly
    in the future; everything else (including an entry whose deadline is
    NaN) is treated as expired by every method of this class.

    All access to the underlying dict happens under a threading.RLock.

    Usage:
        cache = StickyNodeCache(ttl_s=300)

        # After successful failover:
        cache.set("!room:srv:sofiia", "NODA2")

        # Before routing the next message:
        node = cache.get("!room:srv:sofiia")  # → "NODA2" or None if expired/missing

        # If sticky node also fails:
        cache.delete("!room:srv:sofiia")
    """

    def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
        """
        Create an empty cache.

        Args:
            ttl_s: Default lifetime, in seconds, applied by set() when no
                per-call override is given.

        Raises:
            ValueError: If ttl_s is not strictly positive (NaN included).
        """
        # `not (x > 0)` instead of `x <= 0`: every comparison with NaN is
        # False, so the latter would silently accept a NaN TTL.
        if not (ttl_s > 0):
            raise ValueError(f"ttl_s must be > 0, got {ttl_s}")
        self._ttl_s = ttl_s
        self._cache: Dict[str, _StickyEntry] = {}
        self._lock = threading.RLock()

    # ── Internal helpers ──────────────────────────────────────────────────────

    @staticmethod
    def _is_live(entry: _StickyEntry, now: float) -> bool:
        """
        Return True while *entry* has not reached its deadline.

        Single source of truth for expiry so that get(), active_count(),
        active_entries() and cleanup() always agree. Expressed as a
        positive comparison so that a NaN deadline counts as expired
        rather than as an entry that never expires.
        """
        return entry.expires_at > now

    # ── Public API ────────────────────────────────────────────────────────────

    def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None:
        """
        Set sticky preference; overwrites existing entry.

        Args:
            key: Cache key.
            node_id: Node to remember for this key.
            ttl_s: Optional per-call lifetime in seconds; the cache default
                is used when None. A non-positive or NaN value is not
                rejected: it stores an entry that is already expired.
        """
        ttl = self._ttl_s if ttl_s is None else ttl_s
        entry = _StickyEntry(node_id=node_id, expires_at=time.monotonic() + ttl)
        with self._lock:
            self._cache[key] = entry
        # Logged after releasing the lock to keep the critical section short.
        logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, ttl)

    def get(self, key: str) -> Optional[str]:
        """
        Return sticky node_id if entry exists and not expired; else None.

        Expired entries are lazily removed on access.
        """
        with self._lock:
            entry = self._cache.get(key)
            if entry is None:
                return None
            if self._is_live(entry, time.monotonic()):
                return entry.node_id
            del self._cache[key]
        logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id)
        return None

    def delete(self, key: str) -> bool:
        """
        Remove an entry. Returns True if it existed.

        An entry that is present but already expired still counts as
        having existed.
        """
        with self._lock:
            # Single pop instead of a membership test followed by a pop;
            # stored values are always _StickyEntry objects, never None.
            existed = self._cache.pop(key, None) is not None
        if existed:
            logger.debug("StickyCache.delete: key=%s", key)
        return existed

    def active_count(self) -> int:
        """Count of non-expired entries (best-effort; no eviction)."""
        now = time.monotonic()
        with self._lock:
            return sum(1 for e in self._cache.values() if self._is_live(e, now))

    def active_entries(self) -> List[Tuple[str, str, float]]:
        """
        Return (key, node_id, ttl_remaining_s) for all non-expired entries.

        The list is sorted by key. Expired entries are skipped but not
        evicted.
        Useful for ops visibility in !status/!nodes.
        """
        now = time.monotonic()
        with self._lock:
            live = [
                (k, e.node_id, e.expires_at - now)
                for k, e in self._cache.items()
                if self._is_live(e, now)
            ]
        live.sort(key=lambda item: item[0])
        return live

    def cleanup(self) -> int:
        """
        Remove all expired entries.

        Call periodically (e.g. in a background task) to reclaim memory.
        Returns count of removed entries.
        """
        now = time.monotonic()
        with self._lock:
            # Collect first: the dict must not change size while iterating.
            expired_keys = [
                k for k, e in self._cache.items() if not self._is_live(e, now)
            ]
            for k in expired_keys:
                del self._cache[k]
        if expired_keys:
            logger.debug("StickyCache.cleanup: removed %d expired entries", len(expired_keys))
        return len(expired_keys)

    @property
    def ttl_s(self) -> float:
        """Default TTL in seconds, as passed to the constructor."""
        return self._ttl_s
|
||||
|
||||
|
||||
def make_sticky_key(room_id: str, agent_id: str) -> str:
    """
    Build the canonical sticky cache key for a room+agent pair.

    The two ids are joined with a single ':' (room first, agent second).
    No escaping is applied, so the result is meant to be used as an opaque
    lookup key rather than parsed back into its parts.
    """
    sticky_key = f"{room_id}:{agent_id}"
    return sticky_key
|
||||
Reference in New Issue
Block a user