Files
microdao-daarion/services/matrix-bridge-dagi/app/node_policy.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

180 lines
5.9 KiB
Python

"""
node_policy — Node-aware routing for matrix-bridge-dagi.
Resolves which NODA (NODA1, NODA2, …) a message should be tagged with based on:
1. Explicit `node=X` kwarg in the message body (mixed rooms only)
2. Dynamic store override (PolicyStore, set by operators via !node set) ← M6.0
3. Static per-room mapping from BRIDGE_ROOM_NODE_MAP env
4. BRIDGE_DEFAULT_NODE (fallback)
The resolved node_id is embedded in the Router metadata so downstream
services (Router / Memory / Agent) can apply per-node policies.
This module does NOT change the HTTP endpoint called — the Router URL
stays the same.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from typing import Dict, FrozenSet, Optional, Tuple
# Matches a 'node=X' token wherever it appears in a message. X is captured
# as a run of word characters; the 'node' keyword is matched case-insensitively.
_NODE_KWARG_RE = re.compile(r"\bnode=(\w+)\b", flags=re.IGNORECASE)

# Labels recording where a resolved node came from, listed from highest
# to lowest priority.
NODE_SOURCE_EXPLICIT = "explicit"  # node=X kwarg supplied in the message
NODE_SOURCE_STORE = "store"  # M6.0: dynamic PolicyStore override
NODE_SOURCE_ROOM_MAP = "room_map"  # static per-room mapping
NODE_SOURCE_DEFAULT = "default"  # nothing else matched
@dataclass(frozen=True)
class NodeResolution:
    """Immutable outcome of picking the target node for one message."""

    # Node the message should be tagged with.
    node_id: str
    # Label naming which rule produced node_id.
    source: str
    # Upper-cased explicit node that was requested but is not allowlisted;
    # stays None when nothing was rejected.
    rejected_node: Optional[str] = None
@dataclass
class NodePolicy:
    """
    Rules for choosing which node a message is tagged with.

    Attributes:
        allowed_nodes: Set of valid node names (uppercase).
        default_node: Node used when no other rule yields an allowed node.
        room_node_map: Optional per-room override (room_id → node_id).
    """

    allowed_nodes: FrozenSet[str]
    default_node: str
    room_node_map: Dict[str, str] = field(default_factory=dict)

    def resolve(
        self,
        room_id: str,
        explicit_node: Optional[str] = None,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """
        Pick the target node for a message.

        Rules are tried from highest to lowest priority:
            1. explicit_node — user-supplied kwarg
            2. store_override — dynamic PolicyStore entry (M6.0)
            3. room_node_map — static per-room entry
            4. default_node

        An explicit node that is not allowlisted does not fail the call:
        rules 2–4 still choose the node, and the upper-cased bad value is
        reported in ``rejected_node`` of the result.
        """
        # No explicit request: the lower-priority rules decide alone.
        if explicit_node is None:
            return self._fallback(room_id, store_override)

        requested = explicit_node.upper()
        if requested in self.allowed_nodes:
            return NodeResolution(node_id=requested, source=NODE_SOURCE_EXPLICIT)

        # Not allowlisted: use the best remaining rule, but remember what
        # was asked for so the caller can see the rejected value.
        resolved = self._fallback(room_id, store_override)
        return NodeResolution(
            node_id=resolved.node_id,
            source=resolved.source,
            rejected_node=requested,
        )

    def _fallback(
        self,
        room_id: str,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """Pick a node without an explicit kwarg (store → room map → default)."""
        # Dynamic override; ignored when it names a node outside the allowlist.
        if store_override is not None:
            from_store = store_override.upper()
            if from_store in self.allowed_nodes:
                return NodeResolution(node_id=from_store, source=NODE_SOURCE_STORE)

        # Static per-room mapping; likewise ignored when not allowlisted.
        try:
            from_room = self.room_node_map[room_id].upper()
        except KeyError:
            from_room = None
        if from_room is not None and from_room in self.allowed_nodes:
            return NodeResolution(node_id=from_room, source=NODE_SOURCE_ROOM_MAP)

        return NodeResolution(node_id=self.default_node, source=NODE_SOURCE_DEFAULT)

    def as_info_dict(self) -> dict:
        """Summarise the policy for health/ops snapshots (no secrets)."""
        node_names = sorted(self.allowed_nodes)
        # Only the number of room overrides is reported, not the room ids.
        override_count = len(self.room_node_map)
        return {
            "default_node": self.default_node,
            "allowed_nodes": node_names,
            "room_overrides": override_count,
        }
def parse_node_policy(
    raw_allowed: str,
    default_node: str,
    raw_room_map: str,
) -> NodePolicy:
    """
    Build a NodePolicy from env-style config strings.

    raw_allowed: comma-separated node names, e.g. "NODA1,NODA2"
    default_node: fallback node, e.g. "NODA1" (a blank value becomes "NODA1")
    raw_room_map: semicolon-separated room=node pairs, e.g.
        "!roomA:server=NODA2;!roomB:server=NODA1"

    Node names are stripped and upper-cased. Room-map entries without '=',
    or with an empty room id or node name, are skipped. If a room id
    appears more than once, the last entry wins.
    """
    fallback_node = default_node.strip().upper() or "NODA1"

    # Seeding the allowlist with the fallback covers both an empty
    # raw_allowed and one that omits the default: it is always reachable.
    permitted = {fallback_node}
    permitted.update(
        token.strip().upper() for token in raw_allowed.split(",") if token.strip()
    )

    overrides: Dict[str, str] = {}
    for chunk in raw_room_map.split(";"):
        # Split on the first '=' only; an empty separator means no '=' was
        # present (this also covers blank chunks).
        room_part, separator, node_part = chunk.strip().partition("=")
        if not separator:
            continue
        room_key = room_part.strip()
        node_name = node_part.strip().upper()
        if room_key and node_name:
            overrides[room_key] = node_name

    return NodePolicy(
        allowed_nodes=frozenset(permitted),
        default_node=fallback_node,
        room_node_map=overrides,
    )
def extract_node_kwarg(text: str) -> Tuple[Optional[str], str]:
    """
    Extract the first 'node=X' kwarg from message text.

    Returns (node_id_or_None, cleaned_text_without_kwarg). The node id is
    upper-cased. When no kwarg is present the text is returned unchanged.

    Only the spot where the kwarg was removed is tidied: the spaces/tabs
    around the removed token collapse to a single space, and leading and
    trailing whitespace of the message is trimmed. Line breaks and spacing
    elsewhere in the message are preserved, so multi-line bodies keep
    their layout.

    Example:
        "/sofiia node=NODA2 Hello!"
            → ("NODA2", "/sofiia Hello!")
    """
    m = _NODE_KWARG_RE.search(text)
    if m is None:
        return None, text

    node = m.group(1).upper()
    before = text[: m.start()]
    after = text[m.end():]

    # Trim only the horizontal whitespace that touched the removed token;
    # newlines on either side are left alone.
    head = before.rstrip(" \t")
    tail = after.lstrip(" \t")
    gap_removed = len(head) != len(before) or len(tail) != len(after)

    # Put back exactly one separating space when the token sat between two
    # pieces of text on the same line. Nothing is added when either side is
    # empty or already ends/starts with whitespace (e.g. a line break).
    joiner = ""
    if gap_removed and head and tail:
        if not head[-1].isspace() and not tail[0].isspace():
            joiner = " "

    cleaned = (head + joiner + tail).strip()
    return node, cleaned
def node_rejected_reply(requested: str, allowed: FrozenSet[str]) -> str:
    """
    Build the reply sent when a user asks for a node outside the allowlist.

    The message has three lines: the rejected value, the allowed nodes in
    sorted order (each wrapped in backticks), and a usage example.
    """
    quoted_nodes = [f"`{name}`" for name in sorted(allowed)]
    reply_lines = [
        f"⚠️ Unknown node: `{requested}`",
        "Allowed: " + ", ".join(quoted_nodes),
        "_Example: `/sofiia node=NODA1 Hello!`_",
    ]
    return "\n".join(reply_lines)