# Includes all milestones M4 through M11:
# - M4: agent discovery (!agents / !status)
# - M5: node-aware routing + per-node observability
# - M6: dynamic policy store (node/agent overrides, import/export)
# - M7: Prometheus alerts + Grafana dashboard + metrics contract
# - M8: node health tracker + soft failover + sticky cache + HA persistence
# - M9: two-step confirm + diff preview for dangerous commands
# - M10: auto-backup, restore, retention, policy history + change detail
# - M11: soak scenarios (CI tests) + live soak script
#
# Soak infrastructure (this commit):
# - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
# - _preflight_inject() and _check_wal() in soak script
# - --db-path arg for WAL delta reporting
# - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands
"""
|
|
StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover.
|
|
|
|
After a successful failover (primary → fallback), the bridge remembers the
|
|
fallback node per room:agent pair for `ttl_s` seconds. Subsequent messages
|
|
for the same pair skip the primary entirely and go directly to the known-good
|
|
fallback, preventing oscillation ("flapping") while the primary recovers.
|
|
|
|
Key design
|
|
----------
|
|
key = "{room_id}:{agent_id}"
|
|
ttl = FAILOVER_STICKY_TTL_S (default 300 s)
|
|
|
|
Priority in routing (when source != explicit):
|
|
1. sticky cache (temporary)
|
|
2. store override (desired long-term policy)
|
|
3. env room_node_map
|
|
4. env default
|
|
|
|
Sticky expires naturally; recovery is automatic — no operator action needed.
|
|
If the sticky node also fails, the entry is removed and normal failover logic
|
|
takes over again.
|
|
|
|
Thread safety
|
|
-------------
|
|
Uses threading.RLock — safe to call from asyncio callbacks without to_thread.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import threading
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_DEFAULT_TTL_S = 300.0
|
|
|
|
|
|
@dataclass
|
|
class _StickyEntry:
|
|
node_id: str
|
|
expires_at: float # time.monotonic() deadline
|
|
|
|
|
|
class StickyNodeCache:
    """
    In-memory sticky node preference cache.

    Usage:
        cache = StickyNodeCache(ttl_s=300)

        # After successful failover:
        cache.set("!room:srv:sofiia", "NODA2")

        # Before routing the next message:
        node = cache.get("!room:srv:sofiia")  # → "NODA2" or None if expired/missing

        # If sticky node also fails:
        cache.delete("!room:srv:sofiia")
    """

    def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
        """
        :param ttl_s: default time-to-live for new entries, in seconds.
        :raises ValueError: if ``ttl_s`` is not strictly positive.
        """
        if ttl_s <= 0:
            raise ValueError(f"ttl_s must be > 0, got {ttl_s}")
        self._ttl_s = ttl_s
        self._cache: Dict[str, _StickyEntry] = {}
        # RLock (not Lock): safe if a locked method is re-entered on the
        # same thread, e.g. from asyncio callbacks without to_thread.
        self._lock = threading.RLock()

    # ── Public API ────────────────────────────────────────────────────────────

    def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None:
        """Set sticky preference; overwrites existing entry.

        :param key: canonical "{room_id}:{agent_id}" key (see make_sticky_key).
        :param node_id: node the pair should stick to.
        :param ttl_s: optional per-entry TTL override; falls back to the
            cache-wide default when None.
        """
        ttl = ttl_s if ttl_s is not None else self._ttl_s
        with self._lock:
            self._cache[key] = _StickyEntry(
                node_id=node_id,
                expires_at=time.monotonic() + ttl,
            )
        logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, ttl)

    def get(self, key: str) -> Optional[str]:
        """
        Return sticky node_id if entry exists and not expired; else None.
        Expired entries are lazily removed on access.
        """
        with self._lock:
            entry = self._cache.get(key)
            if entry is None:
                return None
            if time.monotonic() >= entry.expires_at:
                # Lazy eviction: drop the stale entry the first time it is read.
                del self._cache[key]
                logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id)
                return None
            return entry.node_id

    def delete(self, key: str) -> bool:
        """Remove an entry. Returns True if it existed."""
        with self._lock:
            existed = key in self._cache
            self._cache.pop(key, None)
        if existed:
            logger.debug("StickyCache.delete: key=%s", key)
        return existed

    def active_count(self) -> int:
        """Count of non-expired entries (best-effort; no eviction)."""
        now = time.monotonic()
        with self._lock:
            return sum(1 for e in self._cache.values() if e.expires_at > now)

    def active_entries(self) -> List[Tuple[str, str, float]]:
        """
        Return (key, node_id, ttl_remaining_s) for all non-expired entries.
        Useful for ops visibility in !status/!nodes.
        """
        now = time.monotonic()
        with self._lock:
            result = []
            for k, e in self._cache.items():
                remaining = e.expires_at - now
                if remaining > 0:
                    result.append((k, e.node_id, remaining))
            # Sorted by key for deterministic display in ops commands.
            return sorted(result, key=lambda x: x[0])

    def cleanup(self) -> int:
        """
        Remove all expired entries.
        Call periodically (e.g. in a background task) to reclaim memory.
        Returns count of removed entries.
        """
        now = time.monotonic()
        with self._lock:
            # Collect keys first: cannot delete from a dict while iterating it.
            expired_keys = [k for k, e in self._cache.items() if e.expires_at <= now]
            for k in expired_keys:
                del self._cache[k]
        if expired_keys:
            logger.debug("StickyCache.cleanup: removed %d expired entries", len(expired_keys))
        return len(expired_keys)

    @property
    def ttl_s(self) -> float:
        """Cache-wide default TTL in seconds (read-only)."""
        return self._ttl_s
|
|
|
|
|
|
def make_sticky_key(room_id: str, agent_id: str) -> str:
    """Canonical sticky cache key for a room+agent pair.

    :param room_id: Matrix-style room identifier (e.g. "!room:srv").
    :param agent_id: agent identifier within the room.
    :return: "{room_id}:{agent_id}" — matches the key format documented
        on StickyNodeCache.
    """
    return f"{room_id}:{agent_id}"
|