""" StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover. After a successful failover (primary → fallback), the bridge remembers the fallback node per room:agent pair for `ttl_s` seconds. Subsequent messages for the same pair skip the primary entirely and go directly to the known-good fallback, preventing oscillation ("flapping") while the primary recovers. Key design ---------- key = "{room_id}:{agent_id}" ttl = FAILOVER_STICKY_TTL_S (default 300 s) Priority in routing (when source != explicit): 1. sticky cache (temporary) 2. store override (desired long-term policy) 3. env room_node_map 4. env default Sticky expires naturally; recovery is automatic — no operator action needed. If the sticky node also fails, the entry is removed and normal failover logic takes over again. Thread safety ------------- Uses threading.RLock — safe to call from asyncio callbacks without to_thread. """ from __future__ import annotations import logging import threading import time from dataclasses import dataclass from typing import Dict, List, Optional, Tuple logger = logging.getLogger(__name__) _DEFAULT_TTL_S = 300.0 @dataclass class _StickyEntry: node_id: str expires_at: float # time.monotonic() deadline class StickyNodeCache: """ In-memory sticky node preference cache. Usage: cache = StickyNodeCache(ttl_s=300) # After successful failover: cache.set("!room:srv:sofiia", "NODA2") # Before routing the next message: node = cache.get("!room:srv:sofiia") # → "NODA2" or None if expired/missing # If sticky node also fails: cache.delete("!room:srv:sofiia") """ def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None: if ttl_s <= 0: raise ValueError(f"ttl_s must be > 0, got {ttl_s}") self._ttl_s = ttl_s self._cache: Dict[str, _StickyEntry] = {} self._lock = threading.RLock() # ── Public API ──────────────────────────────────────────────────────────── def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None: """Set sticky preference; overwrites existing entry.""" ttl = ttl_s if ttl_s is not None else self._ttl_s with self._lock: self._cache[key] = _StickyEntry( node_id=node_id, expires_at=time.monotonic() + ttl, ) logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, ttl) def get(self, key: str) -> Optional[str]: """ Return sticky node_id if entry exists and not expired; else None. Expired entries are lazily removed on access. """ with self._lock: entry = self._cache.get(key) if entry is None: return None if time.monotonic() >= entry.expires_at: del self._cache[key] logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id) return None return entry.node_id def delete(self, key: str) -> bool: """Remove an entry. Returns True if it existed.""" with self._lock: existed = key in self._cache self._cache.pop(key, None) if existed: logger.debug("StickyCache.delete: key=%s", key) return existed def active_count(self) -> int: """Count of non-expired entries (best-effort; no eviction).""" now = time.monotonic() with self._lock: return sum(1 for e in self._cache.values() if e.expires_at > now) def active_entries(self) -> List[Tuple[str, str, float]]: """ Return (key, node_id, ttl_remaining_s) for all non-expired entries. Useful for ops visibility in !status/!nodes. """ now = time.monotonic() with self._lock: result = [] for k, e in self._cache.items(): remaining = e.expires_at - now if remaining > 0: result.append((k, e.node_id, remaining)) return sorted(result, key=lambda x: x[0]) def cleanup(self) -> int: """ Remove all expired entries. Call periodically (e.g. in a background task) to reclaim memory. Returns count of removed entries. """ now = time.monotonic() with self._lock: expired_keys = [k for k, e in self._cache.items() if e.expires_at <= now] for k in expired_keys: del self._cache[k] if expired_keys: logger.debug("StickyCache.cleanup: removed %d expired entries", len(expired_keys)) return len(expired_keys) @property def ttl_s(self) -> float: return self._ttl_s def make_sticky_key(room_id: str, agent_id: str) -> str: """Canonical sticky cache key for a room+agent pair.""" return f"{room_id}:{agent_id}"