Files
microdao-daarion/services/matrix-bridge-dagi/app/sticky_cache.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

150 lines
5.0 KiB
Python

"""
StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover.
After a successful failover (primary → fallback), the bridge remembers the
fallback node per room:agent pair for `ttl_s` seconds. Subsequent messages
for the same pair skip the primary entirely and go directly to the known-good
fallback, preventing oscillation ("flapping") while the primary recovers.
Key design
----------
key = "{room_id}:{agent_id}"
ttl = FAILOVER_STICKY_TTL_S (default 300 s)
Priority in routing (when source != explicit):
1. sticky cache (temporary)
2. store override (desired long-term policy)
3. env room_node_map
4. env default
Sticky expires naturally; recovery is automatic — no operator action needed.
If the sticky node also fails, the entry is removed and normal failover logic
takes over again.
Thread safety
-------------
Uses threading.RLock — safe to call from asyncio callbacks without to_thread.
"""
from __future__ import annotations
import logging
import threading
import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# TTL in seconds that StickyNodeCache uses when it is constructed without ttl_s.
_DEFAULT_TTL_S = 300.0
@dataclass
class _StickyEntry:
    """A single cached sticky preference: which node, and until when."""

    # Identifier of the node this preference points at.
    node_id: str
    # Deadline expressed on the time.monotonic() clock (not wall-clock time).
    expires_at: float
class StickyNodeCache:
    """
    In-memory sticky node preference cache.

    Maps a string key to a node id plus a deadline on the time.monotonic()
    clock. Every access to the internal dict happens while holding
    ``self._lock`` (a ``threading.RLock``).

    Usage:
        cache = StickyNodeCache(ttl_s=300)
        # After successful failover:
        cache.set("!room:srv:sofiia", "NODA2")
        # Before routing the next message:
        node = cache.get("!room:srv:sofiia")  # → "NODA2" or None if expired/missing
        # If sticky node also fails:
        cache.delete("!room:srv:sofiia")
    """

    def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
        """
        Create an empty cache.

        Args:
            ttl_s: Default lifetime, in seconds, of entries stored by set().

        Raises:
            ValueError: If ``ttl_s`` is zero, negative, or NaN.
        """
        # `not ttl_s > 0` instead of `ttl_s <= 0`: every comparison with NaN
        # is False, so the latter would silently accept a NaN TTL.
        if not ttl_s > 0:
            raise ValueError(f"ttl_s must be > 0, got {ttl_s}")
        self._ttl_s = ttl_s
        self._cache: Dict[str, _StickyEntry] = {}
        self._lock = threading.RLock()

    # ── Public API ────────────────────────────────────────────────────────────

    def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None:
        """
        Set sticky preference; overwrites existing entry.

        Args:
            key: Cache key.
            node_id: Node to remember for ``key``.
            ttl_s: Per-call lifetime in seconds; ``None`` means use the
                cache-wide default passed to the constructor.

        A per-call ``ttl_s`` that is not a positive number (zero, negative or
        NaN) leaves no entry for ``key``: any existing entry is dropped and
        nothing is stored. No exception is raised in that case.
        """
        ttl = ttl_s if ttl_s is not None else self._ttl_s
        if not ttl > 0:
            # A non-positive TTL would store a record that is already expired,
            # and a NaN TTL would store one that never compares as expired
            # (nor as active). Clearing the key gives the "no live entry"
            # result without keeping a useless or immortal record around.
            with self._lock:
                self._cache.pop(key, None)
            logger.debug(
                "StickyCache.set: key=%s node=%s skipped, ttl=%s is not > 0",
                key, node_id, ttl,
            )
            return
        with self._lock:
            self._cache[key] = _StickyEntry(
                node_id=node_id,
                expires_at=time.monotonic() + ttl,
            )
        logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, ttl)

    def get(self, key: str) -> Optional[str]:
        """
        Return sticky node_id if entry exists and not expired; else None.

        Expired entries are lazily removed on access.
        """
        with self._lock:
            entry = self._cache.get(key)
            if entry is None:
                return None
            if time.monotonic() >= entry.expires_at:
                del self._cache[key]
                expired = True
            else:
                expired = False
        if expired:
            logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id)
            return None
        return entry.node_id

    def delete(self, key: str) -> bool:
        """Remove an entry. Returns True if it existed (expired or not)."""
        with self._lock:
            # Stored values are always _StickyEntry objects, never None, so a
            # single pop both removes the entry and tells us whether it existed.
            existed = self._cache.pop(key, None) is not None
        if existed:
            logger.debug("StickyCache.delete: key=%s", key)
        return existed

    def active_count(self) -> int:
        """Count of non-expired entries (best-effort; no eviction)."""
        now = time.monotonic()
        with self._lock:
            return sum(1 for e in self._cache.values() if e.expires_at > now)

    def active_entries(self) -> List[Tuple[str, str, float]]:
        """
        Return (key, node_id, ttl_remaining_s) for all non-expired entries.

        The list is sorted by key. Expired entries are skipped but not
        evicted. Useful for ops visibility in !status/!nodes.
        """
        now = time.monotonic()
        with self._lock:
            live = [
                (key, entry.node_id, entry.expires_at - now)
                for key, entry in self._cache.items()
                if entry.expires_at > now
            ]
        return sorted(live, key=lambda item: item[0])

    def cleanup(self) -> int:
        """
        Remove all expired entries.

        Call periodically (e.g. in a background task) to reclaim memory.
        Returns count of removed entries.
        """
        now = time.monotonic()
        with self._lock:
            # Collect first, delete second: a dict must not change size while
            # it is being iterated.
            expired_keys = [k for k, e in self._cache.items() if e.expires_at <= now]
            for k in expired_keys:
                del self._cache[k]
        if expired_keys:
            logger.debug("StickyCache.cleanup: removed %d expired entries", len(expired_keys))
        return len(expired_keys)

    @property
    def ttl_s(self) -> float:
        """Default entry lifetime in seconds, as given to the constructor."""
        return self._ttl_s
def make_sticky_key(room_id: str, agent_id: str) -> str:
    """Build the sticky-cache key for a room/agent pair: ``room_id`` + ":" + ``agent_id``."""
    return "{}:{}".format(room_id, agent_id)