feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)

Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
2026-03-05 07:51:37 -08:00
parent fe6e3d30ae
commit 82d5ff2a4f
21 changed files with 9123 additions and 93 deletions
--- a/services/matrix-bridge-dagi/app/node_health.py
+++ b/services/matrix-bridge-dagi/app/node_health.py
@@ -0,0 +1,262 @@
+"""
+NodeHealthTracker — M8.0: per-node health state tracking for soft-failover.
+
+Tracks invoke outcomes per node and maintains:
+  - EWMA latency estimate
+  - consecutive failure counter
+  - last ok / last error timestamps
+  - derived health state: "healthy" | "degraded" | "down"
+
+State transitions
+-----------------
+  Any state → "down"      : consecutive_failures >= fail_consecutive
+  Any state → "degraded"  : ewma_latency_s >= lat_ewma_s
+                             (and not yet "down")
+  "down"/"degraded" → "healthy"  : record_ok() resets consecutive_failures to 0
+                                    and moves the EWMA towards the observed latency;
+                                    the node reads "healthy" again once the EWMA is
+                                    below lat_ewma_s
+
+Thread safety
+-------------
+  All reads and mutations are protected by a re-entrant threading.RLock, so the
+  tracker can be called from asyncio callbacks (e.g. in `_invoke_and_send` on the
+  event loop thread). Use `record_ok` / `record_error` from within coroutines;
+  they are synchronous (no blocking I/O), so they are safe to call directly
+  without to_thread.
+"""
+from __future__ import annotations
+
+import logging
+import threading
+import time
+from dataclasses import dataclass, field
+from typing import Dict, FrozenSet, Optional, Tuple
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# ── State constants ────────────────────────────────────────────────────────────
+
+# Health states derived by NodeHealthTracker from its per-node counters.
+# A node that has never been recorded is reported as "healthy".
+NODE_STATE_HEALTHY  = "healthy"
+NODE_STATE_DEGRADED = "degraded"
+NODE_STATE_DOWN     = "down"
+
+# Failover-triggering error classes
+# Intended as `reason` values for NodeHealthTracker.record_error (see the usage
+# example in its docstring); this module only logs the reason it is given.
+FAILOVER_REASON_TIMEOUT  = "timeout"
+FAILOVER_REASON_HTTP_5XX = "http_5xx"
+FAILOVER_REASON_NETWORK  = "network"
+
+
+# ── Config ────────────────────────────────────────────────────────────────────
+
+@dataclass(frozen=True)
+class NodeHealthConfig:
+    """
+    Immutable thresholds controlling when a node is considered degraded or down.
+
+    fail_consecutive : int   number of consecutive failures → "down" (must be ≥ 1)
+    lat_ewma_s       : float EWMA latency estimate (seconds) threshold → "degraded"
+                             (must be > 0)
+    ewma_alpha       : float EWMA smoothing factor in (0, 1]; higher = more reactive
+
+    Raises ValueError at construction time when a value is out of range or NaN.
+    """
+    fail_consecutive: int   = 3
+    lat_ewma_s:       float = 12.0
+    ewma_alpha:       float = 0.3
+
+    def __post_init__(self) -> None:
+        """Validate all thresholds right after the dataclass-generated __init__."""
+        # Every check is written in the positive form `not (value in range)`:
+        # any comparison with NaN is False, so NaN is rejected instead of
+        # slipping through a `value < limit` test and disabling the threshold.
+        if not (0 < self.ewma_alpha <= 1):
+            raise ValueError(f"ewma_alpha must be in (0, 1], got {self.ewma_alpha}")
+        if not (self.fail_consecutive >= 1):
+            raise ValueError(f"fail_consecutive must be ≥ 1, got {self.fail_consecutive}")
+        if not (self.lat_ewma_s > 0):
+            raise ValueError(f"lat_ewma_s must be > 0, got {self.lat_ewma_s}")
+
+
+# ── Per-node state ────────────────────────────────────────────────────────────
+
+@dataclass
+class _NodeState:
+    """Mutable per-node bookkeeping held by NodeHealthTracker (one instance per node id)."""
+    # Number of record_ok() calls seen for this node.
+    invoke_ok_total:      int   = 0
+    # Number of record_error() calls seen for this node.
+    invoke_err_total:     int   = 0
+    # Errors since the last success; reset to 0 by record_ok().
+    consecutive_failures: int   = 0
+    # time.monotonic() reading of the most recent success (None if never).
+    last_ok_ts:           Optional[float] = None
+    # time.monotonic() reading of the most recent error (None if never).
+    last_err_ts:          Optional[float] = None
+    # Exponentially weighted moving average of successful-invoke latency, seconds.
+    ewma_latency_s:       Optional[float] = None   # None until first ok record
+
+
+# ── Tracker ───────────────────────────────────────────────────────────────────
+
+class NodeHealthTracker:
+    """
+    Thread-safe per-node health tracker.
+
+    Keeps one _NodeState per node id and derives a health state from it:
+      - "down"     when consecutive_failures >= config.fail_consecutive
+      - "degraded" when the EWMA latency >= config.lat_ewma_s (and not "down")
+      - "healthy"  otherwise, including nodes that were never recorded
+
+    Usage:
+        tracker = NodeHealthTracker(NodeHealthConfig())
+
+        # On successful invoke
+        tracker.record_ok("NODA1", latency_s=1.4)
+
+        # On failed invoke
+        tracker.record_error("NODA1", reason=FAILOVER_REASON_TIMEOUT)
+
+        # Read health state
+        state = tracker.state("NODA1")       # "healthy" | "degraded" | "down"
+        fallback = tracker.pick_fallback("NODA1", allowed_nodes=frozenset({"NODA1","NODA2"}))
+    """
+
+    def __init__(self, config: Optional[NodeHealthConfig] = None) -> None:
+        """Create an empty tracker; default thresholds are used when config is None."""
+        self._cfg = config or NodeHealthConfig()
+        self._nodes: Dict[str, _NodeState] = {}
+        self._lock = threading.RLock()   # RLock: re-entrant (needed for all_info → as_info_dict)
+
+    # ── Public mutation API ────────────────────────────────────────────────────
+
+    def record_ok(self, node_id: str, latency_s: float) -> None:
+        """
+        Record a successful invoke for node_id with given latency.
+
+        Resets the consecutive-failure counter and folds latency_s into the
+        EWMA; the very first sample seeds the EWMA directly.
+        """
+        with self._lock:
+            ns = self._get_or_create(node_id)
+            ns.invoke_ok_total += 1
+            ns.consecutive_failures = 0
+            ns.last_ok_ts = time.monotonic()
+            if ns.ewma_latency_s is None:
+                ns.ewma_latency_s = latency_s
+            else:
+                alpha = self._cfg.ewma_alpha
+                ns.ewma_latency_s = alpha * latency_s + (1 - alpha) * ns.ewma_latency_s
+
+    def record_error(self, node_id: str, reason: str = "unknown") -> None:
+        """
+        Record a failed invoke for node_id.
+
+        `reason` is only written to the debug log; it does not influence the
+        derived health state.
+        """
+        with self._lock:
+            ns = self._get_or_create(node_id)
+            ns.invoke_err_total += 1
+            ns.consecutive_failures += 1
+            ns.last_err_ts = time.monotonic()
+            logger.debug(
+                "NodeHealth: node=%s consecutive_failures=%d reason=%s",
+                node_id, ns.consecutive_failures, reason,
+            )
+
+    # ── Public read API ───────────────────────────────────────────────────────
+
+    def state(self, node_id: str) -> str:
+        """Return current health state for node_id ("healthy" for unseen nodes)."""
+        with self._lock:
+            return self._state_unlocked(node_id)
+
+    def pick_fallback(
+        self,
+        primary: str,
+        allowed_nodes: FrozenSet[str],
+    ) -> Optional[str]:
+        """
+        Return the best alternative node for failover.
+
+        Priority: healthy > degraded > (never down)
+        Candidates are every allowed node except `primary`, visited in sorted
+        order so the choice is deterministic.
+        Returns None if no acceptable fallback exists.
+        """
+        with self._lock:
+            candidates = sorted(n for n in allowed_nodes if n != primary)
+            # Prefer healthy first
+            for n in candidates:
+                if self._state_unlocked(n) == NODE_STATE_HEALTHY:
+                    return n
+            # Accept degraded if no healthy available
+            for n in candidates:
+                if self._state_unlocked(n) == NODE_STATE_DEGRADED:
+                    return n
+            # Do not failover to "down" nodes
+            return None
+
+    def as_info_dict(self, node_id: str) -> dict:
+        """
+        Return a JSON-safe status dict for one node.
+
+        Unseen nodes yield a zeroed "healthy" entry. `last_ok_ts` and
+        `last_err_ts` are time.monotonic() readings, not wall-clock times.
+        `ewma_latency_s` is rounded to 3 decimals, or None when no latency
+        has been recorded or restored yet.
+        """
+        with self._lock:
+            ns = self._nodes.get(node_id)
+            if ns is None:
+                return {
+                    "node_id": node_id,
+                    "state": NODE_STATE_HEALTHY,
+                    "invoke_ok": 0,
+                    "invoke_err": 0,
+                    "consecutive_failures": 0,
+                    "ewma_latency_s": None,
+                    "last_ok_ts": None,
+                    "last_err_ts": None,
+                }
+            # Explicit None check: a legitimate EWMA of 0.0 is falsy and would
+            # otherwise be reported as None ("no data").
+            ewma = round(ns.ewma_latency_s, 3) if ns.ewma_latency_s is not None else None
+            return {
+                "node_id": node_id,
+                "state": self._state_unlocked(node_id),
+                "invoke_ok": ns.invoke_ok_total,
+                "invoke_err": ns.invoke_err_total,
+                "consecutive_failures": ns.consecutive_failures,
+                "ewma_latency_s": ewma,
+                "last_ok_ts": ns.last_ok_ts,
+                "last_err_ts": ns.last_err_ts,
+            }
+
+    def all_info(self, allowed_nodes: Optional[FrozenSet[str]] = None) -> Dict[str, dict]:
+        """
+        Return status dicts for all tracked (or specified) nodes.
+        If allowed_nodes provided, also include entries for unseen nodes (state=healthy).
+        The result is keyed by node id, inserted in sorted order.
+        """
+        with self._lock:
+            keys = set(self._nodes.keys())
+            if allowed_nodes:
+                keys |= set(allowed_nodes)
+            # as_info_dict re-acquires self._lock, which is why it is an RLock.
+            return {n: self.as_info_dict(n) for n in sorted(keys)}
+
+    def reset(self, node_id: str) -> None:
+        """
+        Reset health state for a node (e.g. after manual recovery).
+
+        The node's record is dropped entirely, so it reads as an unseen
+        ("healthy") node afterwards. Unknown node ids are ignored.
+        """
+        with self._lock:
+            self._nodes.pop(node_id, None)
+
+    def restore_node(
+        self,
+        node_id: str,
+        ewma_latency_s: Optional[float],
+        consecutive_failures: int,
+    ) -> None:
+        """
+        Restore persisted node state after a restart (M8.2).
+
+        Only restores ewma_latency_s and consecutive_failures; counters
+        (invoke_ok_total, invoke_err_total) start from 0 since they are
+        runtime metrics for the current session.
+        A negative consecutive_failures value is clamped to 0.
+        """
+        with self._lock:
+            ns = self._get_or_create(node_id)
+            ns.ewma_latency_s = ewma_latency_s
+            ns.consecutive_failures = max(0, consecutive_failures)
+
+    # ── Internal ──────────────────────────────────────────────────────────────
+
+    def _get_or_create(self, node_id: str) -> _NodeState:
+        """Return the state record for node_id, creating it on first use (no locking here)."""
+        if node_id not in self._nodes:
+            self._nodes[node_id] = _NodeState()
+        return self._nodes[node_id]
+
+    def _state_unlocked(self, node_id: str) -> str:
+        """Derive the health state for node_id; does not take the lock itself."""
+        ns = self._nodes.get(node_id)
+        if ns is None:
+            return NODE_STATE_HEALTHY   # unseen nodes are assumed healthy
+
+        # "down" takes precedence over "degraded".
+        if ns.consecutive_failures >= self._cfg.fail_consecutive:
+            return NODE_STATE_DOWN
+
+        if (
+            ns.ewma_latency_s is not None
+            and ns.ewma_latency_s >= self._cfg.lat_ewma_s
+        ):
+            return NODE_STATE_DEGRADED
+
+        return NODE_STATE_HEALTHY
+
+
+# ── Parser (env vars → NodeHealthConfig) ──────────────────────────────────────
+
+def parse_node_health_config(
+    fail_consecutive: int = 3,
+    lat_ewma_s: float = 12.0,
+    ewma_alpha: float = 0.3,
+) -> NodeHealthConfig:
+    """
+    Build a NodeHealthConfig from already-parsed threshold values.
+
+    Thin factory: the three arguments are forwarded unchanged, so a
+    ValueError raised by NodeHealthConfig's own validation propagates to
+    the caller.
+    """
+    thresholds = {
+        "fail_consecutive": fail_consecutive,
+        "lat_ewma_s": lat_ewma_s,
+        "ewma_alpha": ewma_alpha,
+    }
+    return NodeHealthConfig(**thresholds)