New: app/control.py
- ControlConfig: operator_allowlist + control_rooms (frozensets)
- parse_control_config(): validates @user:server + !room:server formats, fail-fast
- parse_command(): parses !verb subcommand [args] [key=value] up to 512 chars
- check_authorization(): AND(is_control_room, is_operator) → (bool, reason)
- Reply helpers: not_implemented, unknown_command, unauthorized, help
- KNOWN_VERBS: runbook, status, help (M3.1+ stubs)
- MAX_CMD_LEN=512, MAX_CMD_TOKENS=20
ingress.py:
- _try_control(): dispatch for control rooms (authorized → audit + reply, unauthorized → audit + optional ⛔)
- join control rooms on startup
- _enqueue_from_sync: control rooms processed first, never forwarded to agents
- on_control_command(sender, verb, subcommand) metric callback
- CONTROL_UNAUTHORIZED_BEHAVIOR: "ignore" | "reply_error"
Audit events:
- matrix.control.command — authorized command (verb, subcommand, args, kwargs)
- matrix.control.unauthorized — rejected by allowlist (reason: not_operator | not_control_room)
- matrix.control.unknown_cmd — authorized but unrecognized verb
Config + main:
- bridge_operator_allowlist, bridge_control_rooms, control_unauthorized_behavior
- matrix_bridge_control_commands_total{sender,verb,subcommand} counter
- /health: control_channel section (enabled, rooms_count, operators_count, behavior)
- /bridge/mappings: control_rooms + control_operators_count
- docker-compose: BRIDGE_OPERATOR_ALLOWLIST, BRIDGE_CONTROL_ROOMS, CONTROL_UNAUTHORIZED_BEHAVIOR
Tests: 40 new, bringing the total to 148 (all passing)
Made-with: Cursor
473 lines · 19 KiB · Python
"""
|
|
matrix-bridge-dagi — Phase M1 scaffold
|
|
Bridges Matrix/Element rooms to DAGI agents via Gateway.
|
|
|
|
M1 scope: 1 room ↔ 1 agent (Sofiia), audit via sofiia-console internal endpoint.
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import time
|
|
from contextlib import asynccontextmanager
|
|
from typing import Any, Dict, Optional
|
|
|
|
from fastapi import FastAPI, Response
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
try:
|
|
import httpx as _httpx
|
|
_HTTPX_OK = True
|
|
except ImportError: # pragma: no cover
|
|
_httpx = None # type: ignore
|
|
_HTTPX_OK = False
|
|
|
|
try:
|
|
from prometheus_client import (
|
|
Counter, Histogram, Gauge,
|
|
generate_latest, CONTENT_TYPE_LATEST,
|
|
CollectorRegistry, REGISTRY,
|
|
)
|
|
_PROM_OK = True
|
|
except ImportError: # pragma: no cover
|
|
_PROM_OK = False
|
|
|
|
from .config import BridgeConfig, load_config
|
|
from .control import ControlConfig, parse_control_config
|
|
from .ingress import MatrixIngressLoop
|
|
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
|
|
from .rate_limit import InMemoryRateLimiter
|
|
from .room_mapping import RoomMappingConfig, parse_room_map
|
|
|
|
# Configure root logging once at import time; `logger` is the logger used
# throughout this module.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("matrix-bridge-dagi")
|
|
|
|
# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
|
|
if _PROM_OK:
    # Metric objects exist only when prometheus_client imported successfully;
    # every use site must therefore be guarded by `_PROM_OK`.

    # Message flow counters
    _messages_received = Counter(
        "matrix_bridge_messages_received_total",
        "Total Matrix messages received",
        labelnames=["room_id", "agent_id"],
    )
    _messages_replied = Counter(
        "matrix_bridge_messages_replied_total",
        "Total agent replies sent to Matrix",
        labelnames=["room_id", "agent_id", "status"],
    )
    _messages_rate_limited = Counter(
        "matrix_bridge_rate_limited_total",
        "Messages dropped by rate limiter",
        labelnames=["room_id", "agent_id", "limit_type"],
    )
    _gateway_errors = Counter(
        "matrix_bridge_gateway_errors_total",
        "Errors by stage (sync, invoke, send, audit)",
        labelnames=["error_type"],
    )

    # Latency histograms (bucket bounds in seconds)
    _invoke_latency = Histogram(
        "matrix_bridge_invoke_duration_seconds",
        "Latency of DAGI Router infer call",
        labelnames=["agent_id"],
        buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
    )
    _send_latency = Histogram(
        "matrix_bridge_send_duration_seconds",
        "Latency of Matrix send_text call",
        labelnames=["agent_id"],
        buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
    )

    # Service and rate-limiter gauges
    _bridge_up = Gauge("matrix_bridge_up", "1 if bridge started successfully")
    _rate_limiter_active_rooms = Gauge(
        "matrix_bridge_rate_limiter_active_rooms",
        "Rooms with activity in the current rate-limit window",
    )
    _rate_limiter_active_senders = Gauge(
        "matrix_bridge_rate_limiter_active_senders",
        "Senders with activity in the current rate-limit window",
    )

    # H2: Queue metrics
    _queue_size = Gauge(
        "matrix_bridge_queue_size",
        "Current number of pending items in the work queue",
    )
    _queue_dropped = Counter(
        "matrix_bridge_queue_dropped_total",
        "Messages dropped because queue was full",
        labelnames=["room_id", "agent_id"],
    )
    _queue_wait = Histogram(
        "matrix_bridge_queue_wait_seconds",
        "Time between enqueue and worker start processing",
        labelnames=["agent_id"],
        buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
    )

    # M2.2: Mixed room routing metrics
    _routed_total = Counter(
        "matrix_bridge_routed_total",
        "Successful message routing by reason",
        labelnames=["agent_id", "reason"],
    )
    _route_rejected_total = Counter(
        "matrix_bridge_route_rejected_total",
        "Messages rejected during routing (unknown agent, bad slash, etc.)",
        labelnames=["room_id", "reason"],
    )
    _active_room_agent_locks = Gauge(
        "matrix_bridge_active_room_agent_locks",
        "Number of room-agent pairs currently holding a concurrency lock",
    )

    # M3.0: Control channel
    _control_commands_total = Counter(
        "matrix_bridge_control_commands_total",
        "Total control commands received from authorized operators",
        labelnames=["sender", "verb", "subcommand"],
    )
|
|
|
|
# ── Startup state ─────────────────────────────────────────────────────────────
|
|
# Monotonic reference point used by /health to compute uptime.
_START_TIME = time.monotonic()

# Configuration state, populated by lifespan() during startup.
_cfg: Optional[BridgeConfig] = None
_config_error: Optional[str] = None  # message of the startup config error, if any
_room_map: Optional[RoomMappingConfig] = None
_mixed_room_config: Optional[MixedRoomConfig] = None
_control_config: Optional[ControlConfig] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None

# Results of the startup connectivity probes (None until probed).
_matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None

# Ingress loop runtime handles.
_ingress_loop: Optional["MatrixIngressLoop"] = None  # for /health queue_size
_ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None
|
|
|
|
|
|
async def _probe_url(url: str, timeout: float = 5.0) -> bool:
    """Send a single GET request to *url* and report whether it answered.

    Args:
        url: URL to request. An empty value is reported as unreachable
            without attempting a request.
        timeout: Timeout in seconds handed to ``httpx.AsyncClient``.

    Returns:
        True when a response with a status code below 400 is received.
        False when *url* is empty, httpx is not installed, or the request
        raises for any reason. This function never raises itself.
    """
    if not url or not _HTTPX_OK:
        return False
    try:
        async with _httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url)
    except Exception as exc:
        # Best-effort probe: any failure means "not reachable", but keep a
        # trace of the cause for troubleshooting instead of dropping it.
        logger.debug("Probe of %s failed: %s", url, exc)
        return False
    return response.status_code < 400
|
|
|
|
# ── Lifespan ──────────────────────────────────────────────────────────────────
|
|
@asynccontextmanager
async def lifespan(app_: Any):
    """Manage bridge startup and shutdown for the FastAPI application.

    Startup (before ``yield``): loads the configuration, parses the regular
    and mixed room maps, creates the in-memory rate limiter, parses the
    operator control channel settings, probes the Matrix homeserver and the
    DAGI gateway, and, when at least one regular or mixed room is mapped,
    starts the Matrix ingress loop as a background task.

    A ``RuntimeError`` or ``ValueError`` raised during startup is stored in
    ``_config_error`` and logged instead of propagating, so the application
    still starts and ``yield`` is always reached.

    Shutdown (after ``yield``): sets the stop event, cancels the ingress
    task and waits up to five seconds for it to finish.

    Args:
        app_: Application instance supplied by the framework; unused here.
    """
    global _cfg, _config_error, _matrix_reachable, _gateway_reachable
    global _room_map, _mixed_room_config, _control_config, _rate_limiter, _ingress_loop
    # These two are assigned below. Without this declaration they would be
    # function locals: the shutdown code would raise UnboundLocalError
    # whenever the ingress loop was never started (config error or no room
    # mappings), and the module-level state would never be updated.
    global _ingress_task, _ingress_stop
    try:
        _cfg = load_config()

        # Parse regular room mapping (M1/M2.0: 1 room → 1 agent)
        _room_map = parse_room_map(
            os.getenv("BRIDGE_ROOM_MAP", ""),
            _cfg.bridge_allowed_agents,
        )

        # Parse mixed room mapping (M2.1: 1 room → N agents)
        if _cfg.bridge_mixed_room_map:
            _mixed_room_config = parse_mixed_room_map(
                _cfg.bridge_mixed_room_map,
                _cfg.bridge_mixed_defaults,
                _cfg.bridge_allowed_agents,
                max_agents_per_room=_cfg.max_agents_per_mixed_room,
            )
            logger.info(
                "✅ Mixed room config: %d rooms, agents=%s",
                _mixed_room_config.total_rooms,
                [a for r in _mixed_room_config.rooms.values() for a in r.agents],
            )
        else:
            _mixed_room_config = None

        # H1: Rate limiter (inmemory, per config)
        _rate_limiter = InMemoryRateLimiter(
            room_rpm=_cfg.rate_limit_room_rpm,
            sender_rpm=_cfg.rate_limit_sender_rpm,
        )
        logger.info(
            "✅ Rate limiter: room_rpm=%d sender_rpm=%d",
            _cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
        )

        # M3.0: Operator control channel
        if _cfg.bridge_operator_allowlist or _cfg.bridge_control_rooms:
            _control_config = parse_control_config(
                _cfg.bridge_operator_allowlist,
                _cfg.bridge_control_rooms,
            )
        else:
            _control_config = None

        mixed_count = _mixed_room_config.total_rooms if _mixed_room_config else 0
        ctrl_rooms = len(_control_config.control_rooms) if _control_config else 0
        ctrl_ops = len(_control_config.operator_allowlist) if _control_config else 0
        logger.info(
            "✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
            "agents=%s mappings=%d mixed_rooms=%d ctrl_rooms=%d ctrl_operators=%d",
            _cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url,
            list(_cfg.bridge_allowed_agents),
            _room_map.total_mappings, mixed_count, ctrl_rooms, ctrl_ops,
        )

        # Connectivity smoke probes (non-blocking failures)
        _matrix_reachable = await _probe_url(
            f"{_cfg.matrix_homeserver_url}/_matrix/client/versions"
        )
        _gateway_reachable = await _probe_url(
            f"{_cfg.dagi_gateway_url}/health"
        )
        if _matrix_reachable:
            logger.info("✅ Matrix homeserver reachable: %s", _cfg.matrix_homeserver_url)
        else:
            logger.warning("⚠️ Matrix homeserver NOT reachable: %s", _cfg.matrix_homeserver_url)
        if _gateway_reachable:
            logger.info("✅ DAGI Gateway reachable: %s", _cfg.dagi_gateway_url)
        else:
            logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
        if _PROM_OK:
            _bridge_up.set(1)

        # Start ingress loop (fire-and-forget asyncio task)
        # NOTE(review): only regular and mixed room mappings are considered
        # here; a deployment with control rooms but no mapped rooms does not
        # start the loop — confirm this is intended.
        _has_rooms = (_room_map and _room_map.total_mappings > 0) or (
            _mixed_room_config and _mixed_room_config.total_rooms > 0
        )
        if _has_rooms:
            _ingress_stop = asyncio.Event()

            # Metric callbacks handed to the ingress loop. Each one is a
            # no-op when prometheus_client is unavailable.
            def _on_msg(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _messages_received.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_gw_error(error_type: str) -> None:
                if _PROM_OK:
                    _gateway_errors.labels(error_type=error_type).inc()

            def _on_replied(room_id: str, agent_id: str, status: str) -> None:
                if _PROM_OK:
                    _messages_replied.labels(
                        room_id=room_id, agent_id=agent_id, status=status
                    ).inc()

            def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
                if _PROM_OK:
                    _messages_rate_limited.labels(
                        room_id=room_id, agent_id=agent_id, limit_type=limit_type
                    ).inc()
                    # The gauges exist only when prometheus_client is
                    # available, so the refresh stays inside the guard.
                    if _rate_limiter is not None:
                        stats = _rate_limiter.stats()
                        _rate_limiter_active_rooms.set(stats["active_rooms"])
                        _rate_limiter_active_senders.set(stats["active_senders"])

            def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _invoke_latency.labels(agent_id=agent_id).observe(duration_s)

            def _on_send_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _send_latency.labels(agent_id=agent_id).observe(duration_s)

            # H2 callbacks
            def _on_queue_dropped(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_queue_size(size: int) -> None:
                if _PROM_OK:
                    _queue_size.set(size)

            def _on_queue_wait(agent_id: str, wait_s: float) -> None:
                if _PROM_OK:
                    _queue_wait.labels(agent_id=agent_id).observe(wait_s)

            # M2.2 callbacks
            def _on_routed(agent_id: str, reason: str) -> None:
                if _PROM_OK:
                    _routed_total.labels(agent_id=agent_id, reason=reason).inc()

            def _on_route_rejected(room_id: str, reason: str) -> None:
                if _PROM_OK:
                    _route_rejected_total.labels(room_id=room_id, reason=reason).inc()

            # M3.0 callbacks
            def _on_control_command(sender: str, verb: str, subcommand: str) -> None:
                if _PROM_OK:
                    _control_commands_total.labels(
                        sender=sender, verb=verb, subcommand=subcommand
                    ).inc()

            ingress = MatrixIngressLoop(
                matrix_homeserver_url=_cfg.matrix_homeserver_url,
                matrix_access_token=_cfg.matrix_access_token,
                matrix_user_id=_cfg.matrix_user_id,
                router_url=_cfg.dagi_gateway_url,
                node_id=_cfg.node_id,
                room_map=_room_map,
                sofiia_console_url=_cfg.sofiia_console_url,
                sofiia_internal_token=_cfg.sofiia_internal_token,
                rate_limiter=_rate_limiter,
                queue_max_events=_cfg.queue_max_events,
                worker_concurrency=_cfg.worker_concurrency,
                queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
                mixed_room_config=_mixed_room_config,
                unknown_agent_behavior=_cfg.unknown_agent_behavior,
                max_slash_len=_cfg.max_slash_len,
                mixed_concurrency_cap=_cfg.mixed_concurrency_cap,
                on_message_received=_on_msg,
                on_message_replied=_on_replied,
                on_gateway_error=_on_gw_error,
                on_rate_limited=_on_rate_limited,
                on_queue_dropped=_on_queue_dropped,
                on_queue_size=_on_queue_size,
                on_invoke_latency=_on_invoke_latency,
                on_send_latency=_on_send_latency,
                on_queue_wait=_on_queue_wait,
                on_routed=_on_routed,
                on_route_rejected=_on_route_rejected,
                control_config=_control_config,
                control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
                on_control_command=_on_control_command,
            )
            logger.info(
                "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
                _cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s,
            )
            _ingress_loop = ingress
            _ingress_task = asyncio.create_task(
                ingress.run(_ingress_stop),
                name="matrix_ingress_loop",
            )
            logger.info("✅ Ingress loop task started")
        else:
            logger.warning("⚠️ No room mappings — ingress loop NOT started")

    except (RuntimeError, ValueError) as exc:
        # Record the failure instead of aborting startup; /health returns
        # this message in its "error" field.
        _config_error = str(exc)
        logger.error("❌ Config error: %s", _config_error)
        if _PROM_OK:
            _bridge_up.set(0)
    yield
    # Shutdown: cancel ingress loop
    if _ingress_stop:
        _ingress_stop.set()
    if _ingress_task and not _ingress_task.done():
        # NOTE(review): the task is cancelled right after the stop event is
        # set, so ingress.run gets no time to react to the event on its own —
        # confirm whether a graceful drain is expected here.
        _ingress_task.cancel()
        try:
            await asyncio.wait_for(_ingress_task, timeout=5.0)
        except (asyncio.CancelledError, asyncio.TimeoutError):
            pass
    logger.info("matrix-bridge-dagi shutting down")
|
|
|
|
# ── App ───────────────────────────────────────────────────────────────────────
|
|
# FastAPI application; startup and shutdown are handled by lifespan().
app = FastAPI(title="matrix-bridge-dagi", version="0.1.0", lifespan=lifespan)

# CORS: any origin and any header, GET and POST methods only.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["GET", "POST"],
    allow_origins=["*"],
)
|
|
|
|
# ── Health ────────────────────────────────────────────────────────────────────
|
|
@app.get("/health")
async def health() -> Dict[str, Any]:
    """Report service health together with a configuration summary.

    Side effect: when prometheus_client is available and an ingress loop
    exists, the active room-agent lock gauge is refreshed from the loop.

    Returns:
        A short payload with ``ok: False`` and an ``error`` message when a
        config error was recorded or the config is not loaded. Otherwise the
        full status document, whose ``ok`` is True only if both
        ``_matrix_reachable`` and ``_gateway_reachable`` are exactly True.
    """
    uptime_seconds = int(time.monotonic() - _START_TIME)

    if _PROM_OK and _ingress_loop is not None:
        _active_room_agent_locks.set(_ingress_loop.active_lock_count)

    # Degraded answer: configuration failed or has not been loaded.
    if _config_error or _cfg is None:
        failure: Dict[str, Any] = {
            "ok": False,
            "service": "matrix-bridge-dagi",
            "version": "0.1.0",
            "build": os.getenv("BUILD_SHA", "dev"),
            "uptime_s": uptime_seconds,
            "error": _config_error or "service not initialised",
        }
        return failure

    report: Dict[str, Any] = {
        "ok": _matrix_reachable is True and _gateway_reachable is True,
        "service": "matrix-bridge-dagi",
        "version": "0.1.0",
        "build": _cfg.build_sha,
        "build_time": _cfg.build_time,
        "env": os.getenv("ENV", "dev"),
        "uptime_s": uptime_seconds,
        "node_id": _cfg.node_id,
        "homeserver": _cfg.matrix_homeserver_url,
        "matrix_reachable": _matrix_reachable,
        "bridge_user": _cfg.matrix_user_id,
        "sofiia_room_id": _cfg.sofiia_room_id,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "gateway": _cfg.dagi_gateway_url,
        "gateway_reachable": _gateway_reachable,
    }

    # Room mapping figures (zero when the corresponding config is absent).
    report["mappings_count"] = _room_map.total_mappings if _room_map else 0
    if _mixed_room_config:
        report["mixed_rooms_count"] = _mixed_room_config.total_rooms
    else:
        report["mixed_rooms_count"] = 0
    if _mixed_room_config:
        report["total_agents_in_mixed_rooms"] = sum(
            len(room.agents) for room in _mixed_room_config.rooms.values()
        )
    else:
        report["total_agents_in_mixed_rooms"] = 0

    report["config_ok"] = True
    report["rate_limiter"] = _rate_limiter.stats() if _rate_limiter else None

    # Work queue section.
    queue_section: Dict[str, Any] = {
        "size": _ingress_loop.queue_size if _ingress_loop else 0,
        "max": _cfg.queue_max_events,
        "workers": _cfg.worker_concurrency,
    }
    report["queue"] = queue_section

    # Mixed room guard rails section.
    guard_rails: Dict[str, Any] = {
        "max_agents_per_room": _cfg.max_agents_per_mixed_room,
        "max_slash_len": _cfg.max_slash_len,
        "unknown_agent_behavior": _cfg.unknown_agent_behavior,
        "concurrency_cap": _cfg.mixed_concurrency_cap,
        "active_room_agent_locks": _ingress_loop.active_lock_count if _ingress_loop else 0,
    }
    report["mixed_guard_rails"] = guard_rails

    # Operator control channel section.
    control_section: Dict[str, Any] = {
        "enabled": _control_config.is_enabled if _control_config else False,
        "control_rooms_count": len(_control_config.control_rooms) if _control_config else 0,
        "operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
        "unauthorized_behavior": _cfg.control_unauthorized_behavior,
    }
    report["control_channel"] = control_section

    return report
|
|
|
|
|
|
# ── Bridge Mappings (read-only ops endpoint) ───────────────────────────────────
|
|
@app.get("/bridge/mappings")
async def bridge_mappings() -> Dict[str, Any]:
    """Summarise the configured room-to-agent mappings for operators.

    Read-only endpoint. Operators are exposed only as a count; the original
    note describes the payload as containing no secrets.

    Returns:
        ``ok: False`` with an error message and empty lists when the config
        or the room map is not loaded. Otherwise the regular mappings, mixed
        rooms, control rooms and the number of control operators.
    """
    if _cfg is None or _room_map is None:
        unavailable: Dict[str, Any] = {
            "ok": False,
            "error": _config_error or "service not initialised",
            "mappings": [],
            "mixed_rooms": [],
        }
        return unavailable

    summary: Dict[str, Any] = {
        "ok": True,
        "total": _room_map.total_mappings,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "mappings": _room_map.as_summary(),
    }

    # Mixed rooms: zero and an empty list when no mixed config is present.
    if _mixed_room_config:
        summary["mixed_rooms_total"] = _mixed_room_config.total_rooms
    else:
        summary["mixed_rooms_total"] = 0
    if _mixed_room_config:
        summary["mixed_rooms"] = _mixed_room_config.as_summary()
    else:
        summary["mixed_rooms"] = []

    # Control channel: room ids are listed in sorted order.
    if _control_config:
        summary["control_rooms"] = sorted(_control_config.control_rooms)
    else:
        summary["control_rooms"] = []
    if _control_config:
        summary["control_operators_count"] = len(_control_config.operator_allowlist)
    else:
        summary["control_operators_count"] = 0

    return summary
|
|
|
|
|
|
# ── Metrics ───────────────────────────────────────────────────────────────────
|
|
@app.get("/metrics")
async def metrics():
    """Serve the default Prometheus registry in text exposition format.

    Returns a plain-text placeholder comment instead when prometheus_client
    could not be imported.
    """
    if _PROM_OK:
        payload = generate_latest(REGISTRY)
        return Response(payload, media_type=CONTENT_TYPE_LATEST)
    return Response("# prometheus_client not available\n", media_type="text/plain")
|