Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
211 lines
7.2 KiB
Python
211 lines
7.2 KiB
Python
"""
|
|
discovery — M4.0: Agent discovery helpers for Matrix user rooms.
|
|
|
|
Provides formatted replies for `!agents` and `!agents status` commands.
|
|
These commands are available to all room members (no auth required) and
|
|
are processed BEFORE routing to the LLM agent.
|
|
|
|
Supports:
|
|
- Mixed rooms: list all agents, default, usage examples
|
|
- Direct rooms: show single agent mapping
|
|
- Unknown rooms: "no mapping" notice
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
from typing import Optional
|
|
|
|
from .mixed_routing import MixedRoomConfig
|
|
from .room_mapping import RoomMappingConfig # noqa: F401 — used in type hints
|
|
|
|
|
|
def _fmt_ts(ts: int) -> str:
|
|
"""Format a Unix timestamp as compact UTC string."""
|
|
try:
|
|
return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
except Exception: # noqa: BLE001
|
|
return str(ts)
|
|
|
|
# Discovery command prefix
|
|
DISCOVERY_CMD = "!agents"  # keep lowercase: is_discovery_message compares it against lower-cased text
|
|
|
|
# Reply length cap for a Matrix message; bridge_status_reply truncates anything longer
|
|
_MAX_REPLY_LEN = 3500  # counted in characters (compared against len() of the reply string)
|
|
|
|
|
|
def is_discovery_message(text: str) -> bool:
    """
    Tell whether ``text`` is a `!agents` discovery command.

    Surrounding whitespace and letter case are ignored. The message matches
    when it is exactly the command, or the command followed by a space and
    arguments.
    """
    normalized = text.strip().lower()
    if normalized == DISCOVERY_CMD:
        return True
    # Require a space separator so that e.g. "!agentsfoo" is not a match.
    return normalized.startswith(f"{DISCOVERY_CMD} ")
|
|
|
|
|
|
def agents_reply(
    room_id: str,
    room_map: RoomMappingConfig,
    mixed_room_config: Optional[MixedRoomConfig],
) -> str:
    """
    Produce the discovery reply text for ``room_id``.

    Resolution order:
      1. Mixed room  → agent list, default agent and usage examples.
      2. Direct room → the single mapped agent.
      3. Otherwise   → 'no mapping' notice.

    A room that is flagged as mixed but has no entry in
    ``mixed_room_config.rooms`` falls through to the direct-room lookup.
    """
    # Mixed rooms take precedence over direct mappings.
    if mixed_room_config and mixed_room_config.is_mixed(room_id):
        mixed_room = mixed_room_config.rooms.get(room_id)
        if mixed_room is not None:
            return _mixed_room_reply(room_id, mixed_room)

    mapped_agent = room_map.agent_for_room(room_id)
    if mapped_agent is None:
        return _unknown_room_reply()
    return _direct_room_reply(mapped_agent)
|
|
|
|
|
|
def _mixed_room_reply(room_id: str, room) -> str:
|
|
"""Format reply for a mixed room."""
|
|
agents = room.agents
|
|
default = room.default_agent or (agents[0] if agents else "?")
|
|
agent_list = ", ".join(f"**{a}**" for a in agents)
|
|
|
|
lines = [
|
|
f"🤖 **Agents available in this room:** {agent_list}",
|
|
f"⭐ **Default:** {default}",
|
|
"",
|
|
"**How to address an agent:**",
|
|
]
|
|
for agent in agents[:5]: # show max 5 examples
|
|
lines.append(f" • `/{agent} <message>` — slash command")
|
|
lines.append(f" • `@{agent} <message>` — mention")
|
|
lines.append(f" • `{agent}: <message>` — colon prefix")
|
|
lines.extend([
|
|
"",
|
|
f"_Messages without prefix go to **{default}** by default._",
|
|
])
|
|
return "\n".join(lines)
|
|
|
|
|
|
def _direct_room_reply(agent_id: str) -> str:
|
|
"""Format reply for a directly-mapped room (1 agent)."""
|
|
return (
|
|
f"🤖 This room is mapped to agent: **{agent_id}**\n\n"
|
|
f"All messages are forwarded to **{agent_id}** automatically.\n"
|
|
f"No prefix needed — just write your message."
|
|
)
|
|
|
|
|
|
def _unknown_room_reply() -> str:
|
|
"""Format reply when room has no mapping."""
|
|
return (
|
|
"⚠️ This room has no agent mapping.\n\n"
|
|
"Contact an operator to configure an agent for this room."
|
|
)
|
|
|
|
|
|
# ── Bridge status reply (M4.1) ────────────────────────────────────────────────
|
|
|
|
def bridge_status_reply(snapshot: dict) -> str:
    """
    Render the `!status` control-room reply from a bridge health snapshot.

    Every key is optional; a missing value falls back to "?" or 0, and a
    missing/empty section is simply left out of the reply.

    Keys read from ``snapshot``:
        node_id, queue_size, queue_max, worker_count,
        room_count, mixed_room_count, operators_count,
        node_policy (dict), control_safety (dict), persistent_dedupe (dict),
        policy_store (dict), policy_agent_overrides_count,
        policy_last_export_at, policy_last_import_at, policy_db_mtime,
        nodes (dict: node id → dict with ``routed`` / ``rejected``)

    Returns the Markdown reply, truncated with a trailing "…" when it would
    exceed ``_MAX_REPLY_LEN`` characters.
    """
    get = snapshot.get
    this_node = get("node_id", "?")

    out = [
        f"📡 **Bridge status** — node: `{this_node}`",
        "",
        f"**Queue:** {get('queue_size', '?')}/{get('queue_max', '?')} "
        f"workers: {get('worker_count', '?')}",
        f"**Rooms:** {get('room_count', 0)} direct {get('mixed_room_count', 0)} mixed "
        f"ops: {get('operators_count', 0)} operators",
        "",
    ]

    # M5.0: node policy — shown only when an allow-list is present
    policy = get("node_policy") or {}
    permitted = policy.get("allowed_nodes") or []
    if permitted:
        default_node = policy.get("default_node", this_node)
        override_count = policy.get("room_overrides", 0)
        permitted_md = ", ".join(f"`{name}`" for name in sorted(permitted))
        out.append(
            f"**Node policy:** default=`{default_node}` allowed={permitted_md} "
            f"room_overrides={override_count}"
        )

    # Control safety
    safety = get("control_safety") or {}
    if safety:
        flag = "✅" if safety.get("enabled") else "⬜"
        room_rpm = safety.get("room_rpm", "?")
        operator_rpm = safety.get("operator_rpm", "?")
        cooldown = safety.get("cooldown_s", "?")
        out.append(
            f"**Control safety {flag}:** room={room_rpm}rpm "
            f"op={operator_rpm}rpm cooldown={cooldown}s"
        )

    # Persistent dedupe
    dedupe = get("persistent_dedupe") or {}
    if dedupe:
        flag = "✅" if dedupe.get("ok") else "❌"
        dedupe_db = dedupe.get("db_path") or "n/a"
        out.append(
            f"**Dedupe {flag}:** ttl={dedupe.get('ttl_h', '?')}h "
            f"pruned_last={dedupe.get('pruned_rows_last', 0)} db=`{dedupe_db}`"
        )

    # M6.0/M6.1: policy store status
    store = get("policy_store") or {}
    if store:
        flag = "✅" if store.get("ok") else "❌"
        node_overrides = store.get("overrides_count", 0)
        # The agent-override count may live in the store dict or at top level.
        agent_overrides = store.get(
            "agent_overrides_count", get("policy_agent_overrides_count", 0)
        )
        store_path = store.get("policy_store_path") or store.get("path") or "n/a"
        out.append(
            f"**Policy store {flag}:** node_overrides={node_overrides} "
            f"agent_overrides={agent_overrides} db=`{store_path}`"
        )

    # M6.2: last export/import timestamps + DB mtime
    # NOTE(review): treated as its own section, independent of the
    # policy-store check above, like the other sections — confirm.
    stamp_fields = (
        ("last_export", "policy_last_export_at"),
        ("last_import", "policy_last_import_at"),
        ("db_mtime", "policy_db_mtime"),
    )
    stamps = [
        f"{label}=`{_fmt_ts(get(key))}`"
        for label, key in stamp_fields
        if get(key)  # falsy (missing, None, 0) timestamps are omitted
    ]
    if stamps:
        out.append("**Policy snapshots:** " + " ".join(stamps))

    # M5.1: per-node routed/rejected breakdown, ordered by node id
    per_node = get("nodes") or {}
    if per_node:
        out.append("\n**Per-node stats:**")
        for nid in sorted(per_node):
            counters = per_node[nid]
            routed = counters.get("routed", 0)
            rejected = counters.get("rejected", 0)
            out.append(f" `{nid}`: routed={routed} rejected={rejected}")

    text = "\n".join(out)
    if len(text) <= _MAX_REPLY_LEN:
        return text
    # "…" is one character, so a truncated reply is _MAX_REPLY_LEN - 2 long.
    return text[:_MAX_REPLY_LEN - 3] + "…"
|