Files
microdao-daarion/services/matrix-bridge-dagi/app/node_health.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

263 lines
10 KiB
Python

"""
NodeHealthTracker — M8.0: per-node health state tracking for soft-failover.
Tracks invoke outcomes per node and maintains:
- EWMA latency estimate
- consecutive failure counter
- last ok / last error timestamps
- derived health state: "healthy" | "degraded" | "down"
State transitions
-----------------
Any state → "down" : consecutive_failures >= fail_consecutive
Any state → "degraded" : ewma_latency_s >= lat_ewma_threshold
(and not yet "down")
"down"/"degraded""healthy" : record_ok() resets consecutive_failures to 0
and ewma is updated towards the actual latency
Thread safety
-------------
All mutations are protected by a threading.Lock so this can be called from
asyncio callbacks (e.g. in `_invoke_and_send` on the event loop thread).
Use `record_ok` / `record_error` from within coroutines; they are synchronous
(no blocking I/O) so they are safe to call directly without to_thread.
"""
from __future__ import annotations
import logging
import threading
import time
from dataclasses import dataclass, field
from typing import Dict, FrozenSet, Optional, Tuple
logger = logging.getLogger(__name__)
# ── State constants ────────────────────────────────────────────────────────────
# Derived health states returned by NodeHealthTracker.state() and reported
# in as_info_dict() / all_info().
NODE_STATE_HEALTHY = "healthy"
NODE_STATE_DEGRADED = "degraded"
NODE_STATE_DOWN = "down"
# Failover-triggering error classes
# Passed as the `reason` argument to NodeHealthTracker.record_error();
# the reason is logged for diagnostics but not stored in node state.
FAILOVER_REASON_TIMEOUT = "timeout"
FAILOVER_REASON_HTTP_5XX = "http_5xx"
FAILOVER_REASON_NETWORK = "network"
# ── Config ────────────────────────────────────────────────────────────────────
@dataclass(frozen=True)
class NodeHealthConfig:
    """
    Thresholds controlling when a node is considered degraded or down.

    fail_consecutive : consecutive failures at/above which a node is "down"
    lat_ewma_s       : EWMA latency estimate (seconds) at/above which a node
                       is "degraded"
    ewma_alpha       : EWMA smoothing factor in (0, 1]; higher = more reactive

    Raises ValueError from __post_init__ if any threshold is out of range.
    """

    fail_consecutive: int = 3
    lat_ewma_s: float = 12.0
    ewma_alpha: float = 0.3

    def __post_init__(self) -> None:
        # Validate fields in a fixed order; the first violated check raises.
        # The boolean forms deliberately mirror the negated originals so
        # that edge values (including NaN floats) behave identically.
        checks = (
            (0 < self.ewma_alpha <= 1,
             f"ewma_alpha must be in (0, 1], got {self.ewma_alpha}"),
            (not (self.fail_consecutive < 1),
             f"fail_consecutive must be ≥ 1, got {self.fail_consecutive}"),
            (not (self.lat_ewma_s <= 0),
             f"lat_ewma_s must be > 0, got {self.lat_ewma_s}"),
        )
        for is_valid, message in checks:
            if not is_valid:
                raise ValueError(message)
# ── Per-node state ────────────────────────────────────────────────────────────
@dataclass
class _NodeState:
invoke_ok_total: int = 0
invoke_err_total: int = 0
consecutive_failures: int = 0
last_ok_ts: Optional[float] = None
last_err_ts: Optional[float] = None
ewma_latency_s: Optional[float] = None # None until first ok record
# ── Tracker ───────────────────────────────────────────────────────────────────
class NodeHealthTracker:
    """
    Thread-safe per-node health tracker.

    Usage:
        tracker = NodeHealthTracker(NodeHealthConfig())

        # On successful invoke
        tracker.record_ok("NODA1", latency_s=1.4)

        # On failed invoke
        tracker.record_error("NODA1", reason=FAILOVER_REASON_TIMEOUT)

        # Read health state
        state = tracker.state("NODA1")  # "healthy" | "degraded" | "down"
        fallback = tracker.pick_fallback(
            "NODA1", allowed_nodes=frozenset({"NODA1", "NODA2"})
        )
    """

    def __init__(self, config: Optional[NodeHealthConfig] = None) -> None:
        """config: thresholds for degraded/down; defaults to NodeHealthConfig()."""
        self._cfg = config or NodeHealthConfig()
        self._nodes: Dict[str, _NodeState] = {}
        # RLock (not Lock): all_info() calls as_info_dict() while already
        # holding the lock, so the lock must be re-entrant.
        self._lock = threading.RLock()

    # ── Public mutation API ────────────────────────────────────────────────────

    def record_ok(self, node_id: str, latency_s: float) -> None:
        """Record a successful invoke for node_id with given latency (seconds)."""
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.invoke_ok_total += 1
            # A single success ends the failure streak, which is what lets a
            # "down" node recover (see _state_unlocked).
            ns.consecutive_failures = 0
            ns.last_ok_ts = time.monotonic()
            if ns.ewma_latency_s is None:
                # First observation seeds the EWMA directly.
                ns.ewma_latency_s = latency_s
            else:
                alpha = self._cfg.ewma_alpha
                ns.ewma_latency_s = alpha * latency_s + (1 - alpha) * ns.ewma_latency_s

    def record_error(self, node_id: str, reason: str = "unknown") -> None:
        """Record a failed invoke for node_id; `reason` is logged, not stored."""
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.invoke_err_total += 1
            ns.consecutive_failures += 1
            ns.last_err_ts = time.monotonic()
            logger.debug(
                "NodeHealth: node=%s consecutive_failures=%d reason=%s",
                node_id, ns.consecutive_failures, reason,
            )

    # ── Public read API ───────────────────────────────────────────────────────

    def state(self, node_id: str) -> str:
        """Return current health state for node_id ("healthy"|"degraded"|"down")."""
        with self._lock:
            return self._state_unlocked(node_id)

    def pick_fallback(
        self,
        primary: str,
        allowed_nodes: FrozenSet[str],
    ) -> Optional[str]:
        """
        Return the best alternative node for failover.

        Priority: healthy > degraded > (never down). Candidates are scanned
        in sorted order so the choice is deterministic across calls.
        Returns None if no acceptable fallback exists.
        """
        with self._lock:
            candidates = sorted(n for n in allowed_nodes if n != primary)
            # Prefer healthy first
            for n in candidates:
                if self._state_unlocked(n) == NODE_STATE_HEALTHY:
                    return n
            # Accept degraded if no healthy available
            for n in candidates:
                if self._state_unlocked(n) == NODE_STATE_DEGRADED:
                    return n
            # Do not failover to "down" nodes
            return None

    def as_info_dict(self, node_id: str) -> dict:
        """Return a JSON-safe status dict for one node (zeroed if never seen)."""
        with self._lock:
            ns = self._nodes.get(node_id)
            if ns is None:
                return {
                    "node_id": node_id,
                    "state": NODE_STATE_HEALTHY,
                    "invoke_ok": 0,
                    "invoke_err": 0,
                    "consecutive_failures": 0,
                    "ewma_latency_s": None,
                    "last_ok_ts": None,
                    "last_err_ts": None,
                }
            return {
                "node_id": node_id,
                "state": self._state_unlocked(node_id),
                "invoke_ok": ns.invoke_ok_total,
                "invoke_err": ns.invoke_err_total,
                "consecutive_failures": ns.consecutive_failures,
                # BUGFIX: explicit None check — the previous truthiness test
                # reported a genuine 0.0 EWMA as None (0.0 is falsy).
                "ewma_latency_s": (
                    round(ns.ewma_latency_s, 3)
                    if ns.ewma_latency_s is not None
                    else None
                ),
                "last_ok_ts": ns.last_ok_ts,
                "last_err_ts": ns.last_err_ts,
            }

    def all_info(self, allowed_nodes: Optional[FrozenSet[str]] = None) -> Dict[str, dict]:
        """
        Return status dicts for all tracked (or specified) nodes.

        If allowed_nodes provided, also include entries for unseen nodes
        (they report state=healthy with zeroed counters).
        """
        with self._lock:
            keys = set(self._nodes.keys())
            if allowed_nodes:
                keys |= set(allowed_nodes)
            # as_info_dict re-acquires self._lock; safe because it is an RLock.
            return {n: self.as_info_dict(n) for n in sorted(keys)}

    def reset(self, node_id: str) -> None:
        """Reset health state for a node (e.g. after manual recovery)."""
        with self._lock:
            self._nodes.pop(node_id, None)

    def restore_node(
        self,
        node_id: str,
        ewma_latency_s: Optional[float],
        consecutive_failures: int,
    ) -> None:
        """
        Restore persisted node state after a restart (M8.2).

        Only restores ewma_latency_s and consecutive_failures; counters
        (invoke_ok_total, invoke_err_total) start from 0 since they are
        runtime metrics for the current session.
        """
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.ewma_latency_s = ewma_latency_s
            # Clamp: persisted data could contain a negative value.
            ns.consecutive_failures = max(0, consecutive_failures)

    # ── Internal ──────────────────────────────────────────────────────────────

    def _get_or_create(self, node_id: str) -> _NodeState:
        """Return the state record for node_id, creating it on first use.

        Caller must hold self._lock.
        """
        if node_id not in self._nodes:
            self._nodes[node_id] = _NodeState()
        return self._nodes[node_id]

    def _state_unlocked(self, node_id: str) -> str:
        """Derive the health state for node_id. Caller must hold self._lock."""
        ns = self._nodes.get(node_id)
        if ns is None:
            return NODE_STATE_HEALTHY  # unseen nodes are assumed healthy
        # "down" takes precedence over "degraded": an active failure streak
        # is a stronger signal than high latency.
        if ns.consecutive_failures >= self._cfg.fail_consecutive:
            return NODE_STATE_DOWN
        if (
            ns.ewma_latency_s is not None
            and ns.ewma_latency_s >= self._cfg.lat_ewma_s
        ):
            return NODE_STATE_DEGRADED
        return NODE_STATE_HEALTHY
# ── Parser (env vars → NodeHealthConfig) ──────────────────────────────────────
def parse_node_health_config(
    fail_consecutive: int = 3,
    lat_ewma_s: float = 12.0,
    ewma_alpha: float = 0.3,
) -> NodeHealthConfig:
    """Build a NodeHealthConfig from already-parsed env values.

    Defaults mirror NodeHealthConfig's own defaults, so callers may supply
    only the values they override. Propagates ValueError from
    NodeHealthConfig.__post_init__ on out-of-range values.
    """
    cfg = NodeHealthConfig(
        fail_consecutive=fail_consecutive,
        lat_ewma_s=lat_ewma_s,
        ewma_alpha=ewma_alpha,
    )
    return cfg