Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
180 lines
5.9 KiB
Python
180 lines
5.9 KiB
Python
"""
|
|
node_policy — Node-aware routing for matrix-bridge-dagi.
|
|
|
|
Resolves which NODA (NODA1, NODA2, …) a message should be tagged with based on:
|
|
1. Explicit `node=X` kwarg in the message body (mixed rooms only)
|
|
2. Dynamic store override (PolicyStore, set by operators via !node set) ← M6.0
|
|
3. Static per-room mapping from BRIDGE_ROOM_NODE_MAP env
|
|
4. BRIDGE_DEFAULT_NODE (fallback)
|
|
|
|
The resolved node_id is embedded in the Router metadata so downstream
|
|
services (Router / Memory / Agent) can apply per-node policies.
|
|
|
|
This module does NOT change the HTTP endpoint called — the Router URL
|
|
stays the same.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, FrozenSet, Optional, Tuple
|
|
|
|
# Regex to find 'node=X' anywhere in message text (case-insensitive).
# Group 1 captures the requested node name (one or more word characters).
_NODE_KWARG_RE = re.compile(r"\bnode=(\w+)\b", re.IGNORECASE)

# Node resolution sources, listed in priority order (highest first).
# The chosen value is reported in NodeResolution.source.
NODE_SOURCE_EXPLICIT = "explicit"  # user-supplied node=X kwarg
NODE_SOURCE_STORE = "store"  # M6.0: dynamic PolicyStore override
NODE_SOURCE_ROOM_MAP = "room_map"  # static per-room mapping
NODE_SOURCE_DEFAULT = "default"  # policy default node (fallback)
|
|
|
|
|
|
@dataclass(frozen=True)
class NodeResolution:
    """
    Immutable outcome of choosing the target node for one message.

    Attributes:
        node_id:       Node the message was resolved to.
        source:        Which rule produced ``node_id`` (a NODE_SOURCE_* value).
        rejected_node: The explicitly requested node when it was not in the
                       allowlist; ``None`` otherwise.
    """

    node_id: str
    source: str
    rejected_node: Optional[str] = None
|
|
|
|
|
|
@dataclass
class NodePolicy:
    """
    Rules for deciding which node a message is tagged with.

    Attributes:
        allowed_nodes: Allowlist of node names. Candidates are upper-cased
                       before being checked against it.
        default_node:  Node returned when no other rule yields an allowed node.
        room_node_map: Static room_id → node_id mapping (empty by default).
    """

    allowed_nodes: FrozenSet[str]
    default_node: str
    room_node_map: Dict[str, str] = field(default_factory=dict)

    def resolve(
        self,
        room_id: str,
        explicit_node: Optional[str] = None,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """
        Pick the node for a message in ``room_id``.

        Rules are tried from highest to lowest priority:
          1. explicit_node  — value the user supplied in the message
          2. store_override — dynamic PolicyStore entry (M6.0)
          3. room_node_map  — static per-room mapping
          4. default_node

        An explicit node that is not allowlisted does not fail the call:
        the lower-priority rules decide the node, and the refused value
        (upper-cased) is reported in ``rejected_node``.
        """
        if explicit_node is None:
            return self._fallback(room_id, store_override)

        requested = explicit_node.upper()
        if requested in self.allowed_nodes:
            return NodeResolution(node_id=requested, source=NODE_SOURCE_EXPLICIT)

        # Refused: keep the best non-explicit answer but record what was asked.
        best = self._fallback(room_id, store_override)
        return NodeResolution(
            node_id=best.node_id,
            source=best.source,
            rejected_node=requested,
        )

    def _fallback(
        self,
        room_id: str,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """Resolve without an explicit kwarg: store → room map → default."""

        def candidates():
            # Lazily yield (raw_node, source) pairs in priority order so the
            # room map is only consulted when the store override did not win.
            if store_override is not None:
                yield store_override, NODE_SOURCE_STORE
            if room_id in self.room_node_map:
                yield self.room_node_map[room_id], NODE_SOURCE_ROOM_MAP

        for raw_node, source in candidates():
            node_id = raw_node.upper()
            # Candidates outside the allowlist are skipped silently.
            if node_id in self.allowed_nodes:
                return NodeResolution(node_id=node_id, source=source)

        return NodeResolution(node_id=self.default_node, source=NODE_SOURCE_DEFAULT)

    def as_info_dict(self) -> dict:
        """Summarise the policy for health/ops snapshots (no room ids exposed)."""
        return dict(
            default_node=self.default_node,
            allowed_nodes=sorted(self.allowed_nodes),
            room_overrides=len(self.room_node_map),
        )
|
|
|
|
|
|
def parse_node_policy(
    raw_allowed: str,
    default_node: str,
    raw_room_map: str,
) -> NodePolicy:
    """
    Build a NodePolicy from env-style config strings.

    Args:
        raw_allowed:  Comma-separated node names, e.g. "NODA1,NODA2".
        default_node: Fallback node, e.g. "NODA1"; blank falls back to "NODA1".
        raw_room_map: Semicolon-separated ``room=node`` pairs, e.g.
                      "!roomA:server=NODA2;!roomB:server=NODA1".

    All node names are stripped and upper-cased. The default node is always
    part of the resulting allowlist. Room-map entries without "=", or with
    an empty room id or node, are ignored; a later entry for the same room
    replaces an earlier one.
    """
    fallback_node = default_node.strip().upper() or "NODA1"

    # Union with the default keeps it reachable and also covers an empty list.
    listed = (name.strip().upper() for name in raw_allowed.split(","))
    permitted: FrozenSet[str] = frozenset(n for n in listed if n) | {fallback_node}

    overrides: Dict[str, str] = {}
    for chunk in raw_room_map.split(";"):
        # Split at the first "=" only; the remainder is the node name.
        room_part, sep, node_part = chunk.partition("=")
        room_key = room_part.strip()
        node_name = node_part.strip().upper()
        if sep and room_key and node_name:
            overrides[room_key] = node_name

    return NodePolicy(
        allowed_nodes=permitted,
        default_node=fallback_node,
        room_node_map=overrides,
    )
|
|
|
|
|
|
def extract_node_kwarg(text: str) -> Tuple[Optional[str], str]:
    """
    Extract the first 'node=X' kwarg from message text.

    Returns (node_id_or_None, cleaned_text_without_kwarg). The node id is
    upper-cased. When no kwarg is present the text is returned untouched.

    Only the whitespace next to the removed kwarg is tidied: spaces/tabs on
    either side of it are merged into a single space, and the result is
    stripped of leading/trailing whitespace. Line breaks and spacing in the
    rest of the message are preserved, so multi-line bodies (code blocks,
    lists) are not flattened.

    Example:
        "/sofiia node=NODA2 Hello!"
            → ("NODA2", "/sofiia Hello!")
    """
    m = _NODE_KWARG_RE.search(text)
    if m is None:
        return None, text

    node = m.group(1).upper()
    head = text[: m.start()]
    tail = text[m.end():]

    # Trim only the horizontal whitespace that touched the kwarg; newlines
    # adjacent to it are kept so the message's line structure survives.
    head_trimmed = head.rstrip(" \t")
    tail_trimmed = tail.lstrip(" \t")
    trimmed_any = len(head_trimmed) < len(head) or len(tail_trimmed) < len(tail)

    # Re-insert one space only where whitespace was actually removed and both
    # sides still end/start with visible text (e.g. "(node=X)" stays "()").
    needs_gap = (
        trimmed_any
        and bool(head_trimmed)
        and bool(tail_trimmed)
        and not head_trimmed[-1].isspace()
        and not tail_trimmed[0].isspace()
    )
    cleaned = head_trimmed + (" " if needs_gap else "") + tail_trimmed

    # Outer strip kept for backward compatibility with the previous result
    # when the kwarg sits at the very start or end of the message.
    return node, cleaned.strip()
|
|
|
|
|
|
def node_rejected_reply(requested: str, allowed: FrozenSet[str]) -> str:
    """
    Build the reply shown when a user asks for a node outside the allowlist.

    The message has three lines: the rejected value, the allowed nodes
    (sorted, each in backticks, comma-separated), and a usage example.
    """
    quoted_nodes = [f"`{name}`" for name in sorted(allowed)]
    lines = [
        f"⚠️ Unknown node: `{requested}`",
        "Allowed: " + ", ".join(quoted_nodes),
        "_Example: `/sofiia node=NODA1 Hello!`_",
    ]
    return "\n".join(lines)
|