Files
microdao-daarion/services/matrix-bridge-dagi/app/discovery.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

211 lines
7.2 KiB
Python

"""
discovery — M4.0: Agent discovery helpers for Matrix user rooms.
Provides formatted replies for `!agents` and `!agents status` commands.
These commands are available to all room members (no auth required) and
are processed BEFORE routing to the LLM agent.
Supports:
- Mixed rooms: list all agents, default, usage examples
- Direct rooms: show single agent mapping
- Unknown rooms: "no mapping" notice
"""
from __future__ import annotations
import datetime
from typing import Optional
from .mixed_routing import MixedRoomConfig
from .room_mapping import RoomMappingConfig # noqa: F401 — used in type hints
def _fmt_ts(ts: int) -> str:
    """Render a Unix timestamp as a compact ``YYYY-MM-DDTHH:MM:SSZ`` UTC string.

    Best-effort: if the value cannot be converted for any reason, its plain
    ``str()`` form is returned instead of raising.
    """
    try:
        moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
        rendered = moment.strftime("%Y-%m-%dT%H:%M:%SZ")
    except Exception:  # noqa: BLE001
        # Deliberate catch-all: this only feeds a status message.
        rendered = str(ts)
    return rendered
# Discovery command prefix; is_discovery_message() compares it against the
# stripped, lower-cased message text.
DISCOVERY_CMD = "!agents"
# Reply length cap in characters. Replies are kept short by design, but
# bridge_status_reply() still cuts anything longer than this.
_MAX_REPLY_LEN = 3500
def is_discovery_message(text: str) -> bool:
    """Tell whether *text* is the `!agents` discovery command.

    Matching ignores surrounding whitespace and letter case. The command
    matches either on its own or when followed by a space and further text.
    """
    normalized = text.strip().lower()
    if normalized == DISCOVERY_CMD:
        return True
    return normalized.startswith(f"{DISCOVERY_CMD} ")
def agents_reply(
    room_id: str,
    room_map: RoomMappingConfig,
    mixed_room_config: Optional[MixedRoomConfig],
) -> str:
    """
    Produce the discovery reply text for one room.

    Resolution order:
      1. Mixed room  → agent list, default agent and usage examples.
      2. Direct room → the single agent the room is mapped to.
      3. Otherwise   → 'no mapping' notice.
    """
    # A mixed-room entry takes precedence over a direct mapping.
    mixed_room = None
    if mixed_room_config and mixed_room_config.is_mixed(room_id):
        mixed_room = mixed_room_config.rooms.get(room_id)
    if mixed_room is not None:
        return _mixed_room_reply(room_id, mixed_room)

    # Fall back to the direct room → agent mapping.
    mapped_agent = room_map.agent_for_room(room_id)
    if mapped_agent is None:
        return _unknown_room_reply()
    return _direct_room_reply(mapped_agent)
def _mixed_room_reply(room_id: str, room) -> str:
    """Build the discovery reply for a mixed (multi-agent) room.

    Reads ``room.agents`` and ``room.default_agent``. When no default is set,
    the first listed agent is used, or "?" if the list is empty. Addressing
    examples are shown for the first five agents only. ``room_id`` is accepted
    but not used here.
    """
    members = room.agents
    fallback = room.default_agent
    if not fallback:
        fallback = members[0] if members else "?"
    roster = ", ".join(f"**{name}**" for name in members)

    header = [
        f"🤖 **Agents available in this room:** {roster}",
        f"⭐ **Default:** {fallback}",
        "",
        "**How to address an agent:**",
    ]
    # Three addressing styles per agent, capped at five agents.
    examples = [
        example
        for name in members[:5]
        for example in (
            f" • `/{name} <message>` — slash command",
            f" • `@{name} <message>` — mention",
            f" • `{name}: <message>` — colon prefix",
        )
    ]
    footer = [
        "",
        f"_Messages without prefix go to **{fallback}** by default._",
    ]
    return "\n".join(header + examples + footer)
def _direct_room_reply(agent_id: str) -> str:
    """Build the discovery reply for a room mapped to exactly one agent."""
    mapping_line = f"🤖 This room is mapped to agent: **{agent_id}**"
    forwarding_line = f"All messages are forwarded to **{agent_id}** automatically."
    hint_line = "No prefix needed — just write your message."
    # The empty element yields the blank line after the first sentence.
    return "\n".join((mapping_line, "", forwarding_line, hint_line))
def _unknown_room_reply() -> str:
    """Build the notice shown when the room has no agent mapping at all."""
    notice = "⚠️ This room has no agent mapping."
    advice = "Contact an operator to configure an agent for this room."
    return f"{notice}\n\n{advice}"
# ── Bridge status reply (M4.1) ────────────────────────────────────────────────
def bridge_status_reply(snapshot: dict) -> str:
    """
    Format a concise bridge health snapshot for `!status` in control room.

    Every key is optional; a section whose data is missing or empty is left
    out of the reply.

    snapshot keys read here:
      node_id, queue_size, queue_max, worker_count    (shown as "?" if absent)
      room_count, mixed_room_count, operators_count   (shown as 0 if absent)
      node_policy (dict): default_node, allowed_nodes, room_overrides
      control_safety (dict): enabled, room_rpm, operator_rpm, cooldown_s
      persistent_dedupe (dict): ok, ttl_h, pruned_rows_last, db_path
      policy_store (dict): ok, overrides_count, agent_overrides_count,
          policy_store_path (or path)
      policy_agent_overrides_count: fallback when policy_store has no
          agent_overrides_count
      policy_last_export_at, policy_last_import_at, policy_db_mtime:
          rendered through _fmt_ts
      nodes (dict): node id -> dict with routed / rejected counters

    Returns the Markdown reply text, at most _MAX_REPLY_LEN characters; a
    longer reply is cut and ends with "..." so the truncation is visible.
    """
    node_id = snapshot.get("node_id", "?")
    q_size = snapshot.get("queue_size", "?")
    q_max = snapshot.get("queue_max", "?")
    workers = snapshot.get("worker_count", "?")
    rooms = snapshot.get("room_count", 0)
    mixed = snapshot.get("mixed_room_count", 0)
    ops = snapshot.get("operators_count", 0)
    # `or {}` also covers keys that are present but explicitly None.
    safety = snapshot.get("control_safety") or {}
    dedupe = snapshot.get("persistent_dedupe") or {}
    node_policy = snapshot.get("node_policy") or {}
    default_node = node_policy.get("default_node", node_id)
    allowed_nodes = node_policy.get("allowed_nodes") or []
    room_overrides = node_policy.get("room_overrides", 0)

    lines = [
        f"📡 **Bridge status** — node: `{node_id}`",
        "",
        f"**Queue:** {q_size}/{q_max} workers: {workers}",
        f"**Rooms:** {rooms} direct {mixed} mixed ops: {ops} operators",
        "",
    ]

    # M5.0: node policy
    if allowed_nodes:
        allowed_str = ", ".join(f"`{n}`" for n in sorted(allowed_nodes))
        lines.append(
            f"**Node policy:** default=`{default_node}` allowed={allowed_str} room_overrides={room_overrides}"
        )

    # Control safety
    if safety:
        # The marker must differ between the two states, otherwise the
        # heading carries no information about whether safety is enabled.
        safety_mark = "✅" if safety.get("enabled") else "❌"
        lines.append(
            f"**Control safety {safety_mark}:** "
            f"room={safety.get('room_rpm', '?')}rpm "
            f"op={safety.get('operator_rpm', '?')}rpm "
            f"cooldown={safety.get('cooldown_s', '?')}s"
        )

    # Persistent dedupe
    if dedupe:
        dedupe_mark = "✅" if dedupe.get("ok") else "❌"
        pruned = dedupe.get("pruned_rows_last", 0)
        ttl = dedupe.get("ttl_h", "?")
        lines.append(
            f"**Dedupe {dedupe_mark}:** ttl={ttl}h pruned_last={pruned} "
            f"db=`{dedupe.get('db_path') or 'n/a'}`"
        )

    # M6.0/M6.1: policy store status
    ps = snapshot.get("policy_store") or {}
    if ps:
        ps_mark = "✅" if ps.get("ok") else "❌"
        ps_node_count = ps.get("overrides_count", 0)
        ps_agent_count = ps.get(
            "agent_overrides_count",
            snapshot.get("policy_agent_overrides_count", 0),
        )
        ps_path = ps.get("policy_store_path") or ps.get("path") or "n/a"
        lines.append(
            f"**Policy store {ps_mark}:** node_overrides={ps_node_count} "
            f"agent_overrides={ps_agent_count} db=`{ps_path}`"
        )

    # M6.2: last export/import timestamps + DB mtime (falsy values are skipped)
    snap_parts = [
        f"{label}=`{_fmt_ts(value)}`"
        for label, value in (
            ("last_export", snapshot.get("policy_last_export_at")),
            ("last_import", snapshot.get("policy_last_import_at")),
            ("db_mtime", snapshot.get("policy_db_mtime")),
        )
        if value
    ]
    if snap_parts:
        lines.append("**Policy snapshots:** " + " ".join(snap_parts))

    # M5.1: per-node routed/rejected breakdown
    node_stats = snapshot.get("nodes") or {}
    if node_stats:
        lines.append("\n**Per-node stats:**")
        for nid in sorted(node_stats):
            # Tolerate a node entry that is present but None.
            ns = node_stats[nid] or {}
            lines.append(
                f" `{nid}`: routed={ns.get('routed', 0)} rejected={ns.get('rejected', 0)}"
            )

    reply = "\n".join(lines)
    if len(reply) > _MAX_REPLY_LEN:
        # Three characters are reserved for the marker, so the final reply
        # is exactly _MAX_REPLY_LEN long.
        reply = reply[:_MAX_REPLY_LEN - 3] + "..."
    return reply