feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor
This commit is contained in:
210
services/matrix-bridge-dagi/app/discovery.py
Normal file
210
services/matrix-bridge-dagi/app/discovery.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""
|
||||
discovery — M4.0: Agent discovery helpers for Matrix user rooms.
|
||||
|
||||
Provides formatted replies for `!agents` and `!agents status` commands.
|
||||
These commands are available to all room members (no auth required) and
|
||||
are processed BEFORE routing to the LLM agent.
|
||||
|
||||
Supports:
|
||||
- Mixed rooms: list all agents, default, usage examples
|
||||
- Direct rooms: show single agent mapping
|
||||
- Unknown rooms: "no mapping" notice
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
from typing import Optional
|
||||
|
||||
from .mixed_routing import MixedRoomConfig
|
||||
from .room_mapping import RoomMappingConfig # noqa: F401 — used in type hints
|
||||
|
||||
|
||||
def _fmt_ts(ts: int) -> str:
    """Render a Unix timestamp as a compact UTC string.

    Args:
        ts: Seconds since the Unix epoch.

    Returns:
        The timestamp formatted as ``YYYY-MM-DDTHH:MM:SSZ``. If the value
        cannot be converted for any reason, ``str(ts)`` is returned instead,
        so this helper never raises.
    """
    try:
        moment = datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)
        rendered = moment.strftime("%Y-%m-%dT%H:%M:%SZ")
    except Exception:  # noqa: BLE001
        # Best-effort formatter: show the raw value rather than fail the reply.
        rendered = str(ts)
    return rendered
|
||||
|
||||
# Command prefix that triggers the discovery replies.
DISCOVERY_CMD = "!agents"

# Upper bound on reply length, in characters. Replies are kept short by
# design; bridge_status_reply trims anything that still exceeds this cap.
_MAX_REPLY_LEN = 3500


def is_discovery_message(text: str) -> bool:
    """Tell whether *text* is a ``!agents`` discovery command.

    Surrounding whitespace and letter case are ignored. The message matches
    when it is exactly the command, or the command followed by a space and
    further text (e.g. ``!agents status``).
    """
    normalized = text.strip().lower()
    if normalized == DISCOVERY_CMD:
        return True
    return normalized.startswith(f"{DISCOVERY_CMD} ")
|
||||
|
||||
|
||||
def agents_reply(
    room_id: str,
    room_map: RoomMappingConfig,
    mixed_room_config: Optional[MixedRoomConfig],
) -> str:
    """
    Build the `!agents` discovery reply for a room.

    Resolution order:
      1. Mixed room  → agent list, default agent and usage examples.
      2. Direct room → the single mapped agent.
      3. Otherwise   → 'no mapping' notice.

    Args:
        room_id: Matrix room the command was sent in.
        room_map: Direct room → agent mapping; queried via ``agent_for_room``.
        mixed_room_config: Mixed-room configuration, or None when there is none.

    Returns:
        The formatted reply text.
    """
    # Mixed rooms take precedence over direct mappings.
    mixed_room = None
    if mixed_room_config and mixed_room_config.is_mixed(room_id):
        mixed_room = mixed_room_config.rooms.get(room_id)
    if mixed_room is not None:
        return _mixed_room_reply(room_id, mixed_room)

    # Fall back to the direct mapping, then to the 'unknown room' notice.
    mapped_agent = room_map.agent_for_room(room_id)
    if mapped_agent is None:
        return _unknown_room_reply()
    return _direct_room_reply(mapped_agent)
|
||||
|
||||
|
||||
def _mixed_room_reply(room_id: str, room) -> str:
    """Compose the discovery reply for a mixed (multi-agent) room.

    Args:
        room_id: Room identifier (accepted for symmetry; not used in the text).
        room: Object exposing ``agents`` (sliceable sequence of agent names)
            and ``default_agent``.

    Returns:
        Markdown text listing the agents, the default agent and addressing
        examples for at most the first five agents.
    """
    members = room.agents
    fallback = room.default_agent
    if not fallback:
        # No explicit default: use the first agent, or "?" for an empty room.
        fallback = members[0] if members else "?"
    roster = ", ".join([f"**{name}**" for name in members])

    out = [
        f"🤖 **Agents available in this room:** {roster}",
        f"⭐ **Default:** {fallback}",
        "",
        "**How to address an agent:**",
    ]
    # Examples are limited to five agents to keep the reply short.
    for name in members[:5]:
        out.extend(
            (
                f" • `/{name} <message>` — slash command",
                f" • `@{name} <message>` — mention",
                f" • `{name}: <message>` — colon prefix",
            )
        )
    out.append("")
    out.append(f"_Messages without prefix go to **{fallback}** by default._")
    return "\n".join(out)
|
||||
|
||||
|
||||
def _direct_room_reply(agent_id: str) -> str:
    """Compose the discovery reply for a room mapped to exactly one agent.

    Args:
        agent_id: Name of the agent the room is mapped to.

    Returns:
        Markdown text naming the agent and noting that no prefix is needed.
    """
    body = [
        f"🤖 This room is mapped to agent: **{agent_id}**",
        "",
        f"All messages are forwarded to **{agent_id}** automatically.",
        "No prefix needed — just write your message.",
    ]
    return "\n".join(body)
|
||||
|
||||
|
||||
def _unknown_room_reply() -> str:
    """Compose the notice shown when the room has no agent mapping."""
    notice = "⚠️ This room has no agent mapping."
    hint = "Contact an operator to configure an agent for this room."
    return f"{notice}\n\n{hint}"
|
||||
|
||||
|
||||
# ── Bridge status reply (M4.1) ────────────────────────────────────────────────
|
||||
|
||||
def bridge_status_reply(snapshot: dict) -> str:
    """
    Format a concise bridge health snapshot for `!status` in control room.

    Every snapshot key is optional. Optional sections (node policy, control
    safety, dedupe, policy store, policy snapshots, per-node stats) are left
    out of the reply when their data is missing or empty.

    snapshot keys read:
        node_id, queue_size, queue_max, worker_count    (shown as "?" if absent)
        room_count, mixed_room_count, operators_count   (default 0)
        node_policy (dict): default_node, allowed_nodes, room_overrides
        control_safety (dict): enabled, room_rpm, operator_rpm, cooldown_s
        persistent_dedupe (dict): ok, ttl_h, pruned_rows_last, db_path
        policy_store (dict): ok, overrides_count, agent_overrides_count,
            policy_store_path (or path)
        policy_agent_overrides_count   (fallback for agent_overrides_count)
        policy_last_export_at, policy_last_import_at, policy_db_mtime
        nodes (dict): node id -> dict with routed / rejected counters

    Returns:
        Markdown text of at most ``_MAX_REPLY_LEN`` characters. A longer
        reply is cut and ends with "…".
    """
    node_id = snapshot.get("node_id", "?")
    q_size = snapshot.get("queue_size", "?")
    q_max = snapshot.get("queue_max", "?")
    workers = snapshot.get("worker_count", "?")
    rooms = snapshot.get("room_count", 0)
    mixed = snapshot.get("mixed_room_count", 0)
    ops = snapshot.get("operators_count", 0)

    lines = [
        f"📡 **Bridge status** — node: `{node_id}`",
        "",
        f"**Queue:** {q_size}/{q_max} workers: {workers}",
        f"**Rooms:** {rooms} direct {mixed} mixed ops: {ops} operators",
        "",
    ]

    # M5.0: node policy (only shown when an allow-list is present)
    node_policy = snapshot.get("node_policy") or {}
    allowed_nodes = node_policy.get("allowed_nodes") or []
    if allowed_nodes:
        default_node = node_policy.get("default_node", node_id)
        room_overrides = node_policy.get("room_overrides", 0)
        allowed_str = ", ".join(f"`{n}`" for n in sorted(allowed_nodes))
        lines.append(
            f"**Node policy:** default=`{default_node}` allowed={allowed_str} room_overrides={room_overrides}"
        )

    # Control safety
    safety = snapshot.get("control_safety") or {}
    if safety:
        enabled = "✅" if safety.get("enabled") else "⬜"
        lines.append(
            f"**Control safety {enabled}:** "
            f"room={safety.get('room_rpm', '?')}rpm "
            f"op={safety.get('operator_rpm', '?')}rpm "
            f"cooldown={safety.get('cooldown_s', '?')}s"
        )

    # Persistent dedupe
    dedupe = snapshot.get("persistent_dedupe") or {}
    if dedupe:
        ok_emoji = "✅" if dedupe.get("ok") else "❌"
        pruned = dedupe.get("pruned_rows_last", 0)
        ttl = dedupe.get("ttl_h", "?")
        lines.append(
            f"**Dedupe {ok_emoji}:** ttl={ttl}h pruned_last={pruned} "
            f"db=`{dedupe.get('db_path') or 'n/a'}`"
        )

    # M6.0/M6.1: policy store status
    ps = snapshot.get("policy_store") or {}
    if ps:
        ps_ok = "✅" if ps.get("ok") else "❌"
        ps_node_count = ps.get("overrides_count", 0)
        ps_agent_count = ps.get(
            "agent_overrides_count",
            snapshot.get("policy_agent_overrides_count", 0),
        )
        ps_path = ps.get("policy_store_path") or ps.get("path") or "n/a"
        lines.append(
            f"**Policy store {ps_ok}:** node_overrides={ps_node_count} "
            f"agent_overrides={ps_agent_count} db=`{ps_path}`"
        )

    # M6.2: last export/import timestamps + DB mtime
    # NOTE(review): rendered independently of `policy_store`; the nesting of
    # this section was reconstructed from whitespace-stripped source — confirm.
    stamp_fields = (
        ("last_export", "policy_last_export_at"),
        ("last_import", "policy_last_import_at"),
        ("db_mtime", "policy_db_mtime"),
    )
    snap_parts = []
    for label, key in stamp_fields:
        stamp = snapshot.get(key)
        if stamp:  # missing / None / 0 timestamps are omitted
            snap_parts.append(f"{label}=`{_fmt_ts(stamp)}`")
    if snap_parts:
        lines.append("**Policy snapshots:** " + " ".join(snap_parts))

    # M5.1: per-node routed/rejected breakdown
    node_stats = snapshot.get("nodes") or {}
    if node_stats:
        lines.append("\n**Per-node stats:**")
        for nid in sorted(node_stats):
            # Tolerate a None entry, like every other optional sub-dict here.
            ns = node_stats[nid] or {}
            lines.append(
                f" `{nid}`: routed={ns.get('routed', 0)} rejected={ns.get('rejected', 0)}"
            )

    reply = "\n".join(lines)
    if len(reply) > _MAX_REPLY_LEN:
        # "…" is a single character, so reserve exactly one slot for it; the
        # truncated reply is then exactly _MAX_REPLY_LEN characters long.
        reply = reply[: _MAX_REPLY_LEN - 1] + "…"
    return reply
|
||||
Reference in New Issue
Block a user