feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
This commit is contained in:
262
services/matrix-bridge-dagi/app/node_health.py
Normal file
262
services/matrix-bridge-dagi/app/node_health.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""
NodeHealthTracker — M8.0: per-node health state tracking for soft-failover.

Tracks invoke outcomes per node and maintains:
  - EWMA latency estimate
  - consecutive failure counter
  - last ok / last error timestamps (time.monotonic() readings, not wall-clock)
  - derived health state: "healthy" | "degraded" | "down"

State transitions
-----------------
  Any state → "down"            : consecutive_failures >= fail_consecutive
  Any state → "degraded"        : ewma_latency_s >= lat_ewma_s
                                  (and not yet "down")
  "down"/"degraded" → "healthy" : record_ok() resets consecutive_failures to 0
                                  and moves the EWMA towards the observed latency;
                                  the node reads "healthy" again once the EWMA is
                                  below lat_ewma_s (until then it stays "degraded")

Thread safety
-------------
All mutations are protected by a threading.RLock, so the tracker can be called
from asyncio callbacks (e.g. in `_invoke_and_send` on the event loop thread).
Use `record_ok` / `record_error` from within coroutines; they are synchronous
(no blocking I/O), so they are safe to call directly without to_thread.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, FrozenSet, Optional, Tuple
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ── State constants ────────────────────────────────────────────────────────────

# Derived health states, as returned by NodeHealthTracker.state().
NODE_STATE_HEALTHY = "healthy"
NODE_STATE_DEGRADED = "degraded"
NODE_STATE_DOWN = "down"

# Failover-triggering error classes; passed as the `reason` argument of
# NodeHealthTracker.record_error(), which only logs it.
FAILOVER_REASON_TIMEOUT = "timeout"
FAILOVER_REASON_HTTP_5XX = "http_5xx"
FAILOVER_REASON_NETWORK = "network"
|
||||
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass(frozen=True)
class NodeHealthConfig:
    """
    Thresholds controlling when a node is considered degraded or down.

    fail_consecutive : int    number of consecutive failures → "down"
    lat_ewma_s       : float  EWMA latency (seconds) at or above which a node
                              is "degraded"
    ewma_alpha       : float  EWMA smoothing factor in (0, 1]; higher = more
                              reactive

    Raises:
        ValueError: at construction time if any value is out of range
            (including NaN for either float field).
    """

    fail_consecutive: int = 3
    lat_ewma_s: float = 12.0
    ewma_alpha: float = 0.3

    def __post_init__(self) -> None:
        """Validate the thresholds; the instance is frozen so this runs once."""
        if not (0 < self.ewma_alpha <= 1):
            raise ValueError(f"ewma_alpha must be in (0, 1], got {self.ewma_alpha}")
        if self.fail_consecutive < 1:
            raise ValueError(f"fail_consecutive must be ≥ 1, got {self.fail_consecutive}")
        # Written as `not (x > 0)` instead of `x <= 0` so NaN is rejected too:
        # every comparison against NaN is False, and a NaN threshold would
        # silently disable "degraded" detection (ewma >= NaN never holds).
        if not (self.lat_ewma_s > 0):
            raise ValueError(f"lat_ewma_s must be > 0, got {self.lat_ewma_s}")
|
||||
|
||||
|
||||
# ── Per-node state ────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class _NodeState:
    """Mutable bookkeeping record for a single node; every field starts empty."""

    # Running totals of recorded invoke outcomes.
    invoke_ok_total: int = 0
    invoke_err_total: int = 0

    # Length of the current failure streak.
    consecutive_failures: int = 0

    # Timestamps of the most recent success / failure; None if none recorded.
    last_ok_ts: Optional[float] = None
    last_err_ts: Optional[float] = None

    # Smoothed latency in seconds; stays None until a first value is recorded.
    ewma_latency_s: Optional[float] = None
|
||||
|
||||
|
||||
# ── Tracker ───────────────────────────────────────────────────────────────────
|
||||
|
||||
class NodeHealthTracker:
    """
    Thread-safe per-node health tracker.

    Usage:
        tracker = NodeHealthTracker(NodeHealthConfig())

        # On successful invoke
        tracker.record_ok("NODA1", latency_s=1.4)

        # On failed invoke
        tracker.record_error("NODA1", reason=FAILOVER_REASON_TIMEOUT)

        # Read health state
        state = tracker.state("NODA1")  # "healthy" | "degraded" | "down"
        fallback = tracker.pick_fallback("NODA1", allowed_nodes=frozenset({"NODA1","NODA2"}))
    """

    def __init__(self, config: Optional[NodeHealthConfig] = None) -> None:
        """Create a tracker; default thresholds are used when config is omitted."""
        self._cfg = config or NodeHealthConfig()
        self._nodes: Dict[str, _NodeState] = {}
        # RLock (re-entrant): all_info() calls as_info_dict() while it is
        # already holding the lock.
        self._lock = threading.RLock()

    # ── Public mutation API ────────────────────────────────────────────────────

    def record_ok(self, node_id: str, latency_s: float) -> None:
        """
        Record a successful invoke for node_id with the given latency.

        Resets the failure streak and folds latency_s (seconds) into the EWMA;
        the first sample seeds the EWMA directly.
        """
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.invoke_ok_total += 1
            ns.consecutive_failures = 0
            ns.last_ok_ts = time.monotonic()
            if ns.ewma_latency_s is None:
                ns.ewma_latency_s = latency_s
            else:
                alpha = self._cfg.ewma_alpha
                ns.ewma_latency_s = alpha * latency_s + (1 - alpha) * ns.ewma_latency_s

    def record_error(self, node_id: str, reason: str = "unknown") -> None:
        """
        Record a failed invoke for node_id.

        Extends the failure streak. `reason` is only used for the debug log
        line; it does not influence the derived state.
        """
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.invoke_err_total += 1
            ns.consecutive_failures += 1
            ns.last_err_ts = time.monotonic()
            streak = ns.consecutive_failures
        # Log outside the critical section so handlers never run under the lock.
        logger.debug(
            "NodeHealth: node=%s consecutive_failures=%d reason=%s",
            node_id, streak, reason,
        )

    # ── Public read API ───────────────────────────────────────────────────────

    def state(self, node_id: str) -> str:
        """Return current health state for node_id (unseen nodes are "healthy")."""
        with self._lock:
            return self._state_unlocked(node_id)

    def pick_fallback(
        self,
        primary: str,
        allowed_nodes: FrozenSet[str],
    ) -> Optional[str]:
        """
        Return the best alternative node for failover.

        Priority: healthy > degraded > (never down). Within a tier the
        alphabetically first node id wins, so the choice is deterministic.
        Returns None if no acceptable fallback exists.
        """
        with self._lock:
            first_degraded: Optional[str] = None
            for candidate in sorted(allowed_nodes):
                if candidate == primary:
                    continue
                candidate_state = self._state_unlocked(candidate)
                if candidate_state == NODE_STATE_HEALTHY:
                    return candidate
                if candidate_state == NODE_STATE_DEGRADED and first_degraded is None:
                    # Remember it, but keep scanning for a healthy node.
                    first_degraded = candidate
            # Either a degraded node or None; "down" nodes are never returned.
            return first_degraded

    def as_info_dict(self, node_id: str) -> dict:
        """
        Return a JSON-safe status dict for one node.

        Unseen nodes are reported with zeroed counters and state "healthy";
        asking about a node does not start tracking it. `last_ok_ts` and
        `last_err_ts` are time.monotonic() readings, not wall-clock times.
        """
        with self._lock:
            ns = self._nodes.get(node_id)
            if ns is None:
                # Throwaway default record: gives unseen nodes the same dict
                # shape without registering them in self._nodes.
                ns = _NodeState()
            ewma = ns.ewma_latency_s
            return {
                "node_id": node_id,
                "state": self._state_unlocked(node_id),
                "invoke_ok": ns.invoke_ok_total,
                "invoke_err": ns.invoke_err_total,
                "consecutive_failures": ns.consecutive_failures,
                # `is not None`, not truthiness: an EWMA of exactly 0.0 is a
                # real measurement and must not be reported as "no data".
                "ewma_latency_s": round(ewma, 3) if ewma is not None else None,
                "last_ok_ts": ns.last_ok_ts,
                "last_err_ts": ns.last_err_ts,
            }

    def all_info(self, allowed_nodes: Optional[FrozenSet[str]] = None) -> Dict[str, dict]:
        """
        Return status dicts for all tracked (or specified) nodes.

        If allowed_nodes is provided, entries for unseen nodes in it are
        included as well (state=healthy). Keys are in sorted node-id order.
        """
        with self._lock:
            keys = set(self._nodes)
            if allowed_nodes:
                keys |= set(allowed_nodes)
            # Re-enters the lock via as_info_dict(); safe because it is an RLock.
            return {n: self.as_info_dict(n) for n in sorted(keys)}

    def reset(self, node_id: str) -> None:
        """Reset health state for a node (e.g. after manual recovery)."""
        with self._lock:
            # Dropping the record makes the node "unseen", i.e. healthy again.
            self._nodes.pop(node_id, None)

    def restore_node(
        self,
        node_id: str,
        ewma_latency_s: Optional[float],
        consecutive_failures: int,
    ) -> None:
        """
        Restore persisted node state after a restart (M8.2).

        Only restores ewma_latency_s and consecutive_failures; counters
        (invoke_ok_total, invoke_err_total) start from 0 since they are
        runtime metrics for the current session. A negative
        consecutive_failures is clamped to 0.
        """
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.ewma_latency_s = ewma_latency_s
            ns.consecutive_failures = max(0, consecutive_failures)

    # ── Internal ──────────────────────────────────────────────────────────────

    def _get_or_create(self, node_id: str) -> _NodeState:
        """Return the record for node_id, creating it on first use (lock held by caller)."""
        ns = self._nodes.get(node_id)
        if ns is None:
            ns = self._nodes[node_id] = _NodeState()
        return ns

    def _state_unlocked(self, node_id: str) -> str:
        """Derive the health state for node_id; the caller must hold the lock."""
        ns = self._nodes.get(node_id)
        if ns is None:
            return NODE_STATE_HEALTHY  # unseen nodes are assumed healthy

        # "down" takes precedence over "degraded".
        if ns.consecutive_failures >= self._cfg.fail_consecutive:
            return NODE_STATE_DOWN

        if (
            ns.ewma_latency_s is not None
            and ns.ewma_latency_s >= self._cfg.lat_ewma_s
        ):
            return NODE_STATE_DEGRADED

        return NODE_STATE_HEALTHY
|
||||
|
||||
|
||||
# ── Parser (env vars → NodeHealthConfig) ──────────────────────────────────────
|
||||
|
||||
def parse_node_health_config(
    fail_consecutive: int = 3,
    lat_ewma_s: float = 12.0,
    ewma_alpha: float = 0.3,
) -> NodeHealthConfig:
    """
    Build a NodeHealthConfig from already-parsed threshold values.

    The function does no parsing or conversion of its own; it forwards the
    three values by keyword, so any error raised by NodeHealthConfig
    propagates to the caller.
    """
    thresholds = {
        "fail_consecutive": fail_consecutive,
        "lat_ewma_s": lat_ewma_s,
        "ewma_alpha": ewma_alpha,
    }
    return NodeHealthConfig(**thresholds)
|
||||
Reference in New Issue
Block a user