H1 — InMemoryRateLimiter (sliding window, no Redis):
- Per-room: RATE_LIMIT_ROOM_RPM (default 20/min)
- Per-sender: RATE_LIMIT_SENDER_RPM (default 10/min)
- Room checked before sender — sender quota not charged on room block
- Blocked messages: audit matrix.rate_limited + on_rate_limited callback
- reset() for ops/test, stats() exposed in /health
H3 — Extended Prometheus metrics:
- matrix_bridge_rate_limited_total{room_id,agent_id,limit_type}
- matrix_bridge_send_duration_seconds histogram (invoke was already there)
- matrix_bridge_invoke_duration_seconds buckets tuned for LLM latency
- matrix_bridge_rate_limiter_active_rooms/senders gauges
- on_invoke_latency + on_send_latency callbacks wired in ingress loop
16 new tests: rate limiter unit (13) + ingress integration (3)
Total: 65 passed
Made-with: Cursor
323 lines
12 KiB
Python
323 lines
12 KiB
Python
"""
matrix-bridge-dagi — Phase M1 scaffold

Bridges Matrix/Element rooms to DAGI agents via Gateway.

M1 scope: 1 room ↔ 1 agent (Sofiia), audit via sofiia-console internal endpoint.
"""
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import time
|
|
from contextlib import asynccontextmanager
|
|
from typing import Any, Dict, Optional
|
|
|
|
from fastapi import FastAPI, Response
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
try:
|
|
import httpx as _httpx
|
|
_HTTPX_OK = True
|
|
except ImportError: # pragma: no cover
|
|
_httpx = None # type: ignore
|
|
_HTTPX_OK = False
|
|
|
|
try:
|
|
from prometheus_client import (
|
|
Counter, Histogram, Gauge,
|
|
generate_latest, CONTENT_TYPE_LATEST,
|
|
CollectorRegistry, REGISTRY,
|
|
)
|
|
_PROM_OK = True
|
|
except ImportError: # pragma: no cover
|
|
_PROM_OK = False
|
|
|
|
from .config import BridgeConfig, load_config
|
|
from .ingress import MatrixIngressLoop
|
|
from .rate_limit import InMemoryRateLimiter
|
|
from .room_mapping import RoomMappingConfig, parse_room_map
|
|
|
|
# Configure root logging once at import time: INFO level, with timestamp,
# level and logger name in front of every message.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    level=logging.INFO,
)

# Logger used throughout this module.
logger = logging.getLogger("matrix-bridge-dagi")
|
|
|
|
# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
|
|
if _PROM_OK:
    # Metric objects are created only when prometheus_client imported
    # successfully; code in this file checks _PROM_OK before touching them.

    # -- Counters ------------------------------------------------------------
    _messages_received = Counter(
        "matrix_bridge_messages_received_total",
        "Total Matrix messages received",
        labelnames=["room_id", "agent_id"],
    )
    _messages_replied = Counter(
        "matrix_bridge_messages_replied_total",
        "Total agent replies sent to Matrix",
        labelnames=["room_id", "agent_id", "status"],
    )
    _messages_rate_limited = Counter(
        "matrix_bridge_rate_limited_total",
        "Messages dropped by rate limiter",
        labelnames=["room_id", "agent_id", "limit_type"],
    )
    _gateway_errors = Counter(
        "matrix_bridge_gateway_errors_total",
        "Errors by stage (sync, invoke, send, audit)",
        labelnames=["error_type"],
    )

    # -- Latency histograms (seconds) ----------------------------------------
    # Invoke buckets reach 45 s; send buckets top out at 5 s.
    _invoke_latency = Histogram(
        "matrix_bridge_invoke_duration_seconds",
        "Latency of DAGI Router infer call",
        labelnames=["agent_id"],
        buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
    )
    _send_latency = Histogram(
        "matrix_bridge_send_duration_seconds",
        "Latency of Matrix send_text call",
        labelnames=["agent_id"],
        buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
    )

    # -- Gauges --------------------------------------------------------------
    _bridge_up = Gauge(
        "matrix_bridge_up",
        "1 if bridge started successfully",
    )
    _rate_limiter_active_rooms = Gauge(
        "matrix_bridge_rate_limiter_active_rooms",
        "Rooms with activity in the current rate-limit window",
    )
    _rate_limiter_active_senders = Gauge(
        "matrix_bridge_rate_limiter_active_senders",
        "Senders with activity in the current rate-limit window",
    )
|
|
|
|
# ── Startup state ─────────────────────────────────────────────────────────────
|
|
# Monotonic reference point used to compute the uptime reported by /health.
_START_TIME = time.monotonic()

# Configuration-derived state; everything stays None until startup fills it in.
_cfg: Optional[BridgeConfig] = None
_config_error: Optional[str] = None  # text of the startup config failure, if any
_room_map: Optional[RoomMappingConfig] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None

# Connectivity probe results; None means "not probed yet".
_matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None

# Background ingress loop task and the event used to ask it to stop.
_ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None
|
|
|
|
|
|
async def _probe_url(url: str, timeout: float = 5.0) -> bool:
    """Probe *url* with a single GET and report whether it looks reachable.

    Args:
        url: Absolute URL to request. An empty string short-circuits to False.
        timeout: Timeout in seconds handed to the httpx client.

    Returns:
        True when a response arrives with a status code below 400 (so both
        2xx and 3xx count as reachable). False when httpx is not installed,
        *url* is empty, or the request raises an ordinary exception.
    """
    if not _HTTPX_OK or not url:
        return False
    try:
        async with _httpx.AsyncClient(timeout=timeout) as client:
            response = await client.get(url)
    except Exception as exc:
        # Best-effort probe: any failure simply means "not reachable", but
        # keep the reason available at debug level for troubleshooting.
        logger.debug("Probe of %s failed: %s", url, exc)
        return False
    return response.status_code < 400
|
|
|
|
# ── Lifespan ──────────────────────────────────────────────────────────────────
|
|
@asynccontextmanager
async def lifespan(app_: Any):
    """Application lifespan: initialise the bridge on startup, stop it on shutdown.

    Startup loads the config, parses the room map, builds the rate limiter,
    probes the Matrix homeserver and the DAGI Gateway, and starts the ingress
    loop as a background task when at least one room mapping exists. A
    RuntimeError or ValueError raised during startup is stored in
    ``_config_error`` (and the ``matrix_bridge_up`` gauge set to 0) instead of
    propagating, so the app still comes up and /health can report the failure.

    Shutdown sets the stop event, cancels the ingress task and waits up to
    five seconds for it to finish.

    Args:
        app_: The application instance passed in by the framework (unused).
    """
    global _cfg, _config_error, _matrix_reachable, _gateway_reachable
    global _room_map, _rate_limiter
    # Both names are assigned below, so without this declaration they would be
    # function locals: the shutdown code would raise UnboundLocalError whenever
    # the ingress loop was never started, and the module-level handles would
    # never be updated.
    global _ingress_task, _ingress_stop
    try:
        _cfg = load_config()

        # Parse room mapping
        _room_map = parse_room_map(
            os.getenv("BRIDGE_ROOM_MAP", ""),
            _cfg.bridge_allowed_agents,
        )

        # H1: Rate limiter (inmemory, per config)
        _rate_limiter = InMemoryRateLimiter(
            room_rpm=_cfg.rate_limit_room_rpm,
            sender_rpm=_cfg.rate_limit_sender_rpm,
        )
        logger.info(
            "✅ Rate limiter: room_rpm=%d sender_rpm=%d",
            _cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
        )

        logger.info(
            "✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
            "room=%s agents=%s mappings=%d",
            _cfg.node_id, _cfg.build_sha, _cfg.matrix_homeserver_url,
            _cfg.sofiia_room_id, list(_cfg.bridge_allowed_agents),
            _room_map.total_mappings,
        )

        # Connectivity smoke probes (non-blocking failures)
        _matrix_reachable = await _probe_url(
            f"{_cfg.matrix_homeserver_url}/_matrix/client/versions"
        )
        _gateway_reachable = await _probe_url(
            f"{_cfg.dagi_gateway_url}/health"
        )
        if _matrix_reachable:
            logger.info("✅ Matrix homeserver reachable: %s", _cfg.matrix_homeserver_url)
        else:
            logger.warning("⚠️ Matrix homeserver NOT reachable: %s", _cfg.matrix_homeserver_url)
        if _gateway_reachable:
            logger.info("✅ DAGI Gateway reachable: %s", _cfg.dagi_gateway_url)
        else:
            logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
        if _PROM_OK:
            _bridge_up.set(1)

        # Start ingress loop (fire-and-forget asyncio task)
        if _room_map and _room_map.total_mappings > 0:
            _ingress_stop = asyncio.Event()

            # Metric callbacks handed to the ingress loop. Each one is a no-op
            # when prometheus_client is unavailable.
            def _on_msg(room_id: str, agent_id: str) -> None:
                if _PROM_OK:
                    _messages_received.labels(room_id=room_id, agent_id=agent_id).inc()

            def _on_gw_error(error_type: str) -> None:
                if _PROM_OK:
                    _gateway_errors.labels(error_type=error_type).inc()

            def _on_replied(room_id: str, agent_id: str, status: str) -> None:
                if _PROM_OK:
                    _messages_replied.labels(
                        room_id=room_id, agent_id=agent_id, status=status
                    ).inc()

            def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
                if _PROM_OK:
                    _messages_rate_limited.labels(
                        room_id=room_id, agent_id=agent_id, limit_type=limit_type
                    ).inc()
                    # Update active room/sender gauges from limiter stats
                    if _rate_limiter is not None:
                        stats = _rate_limiter.stats()
                        _rate_limiter_active_rooms.set(stats["active_rooms"])
                        _rate_limiter_active_senders.set(stats["active_senders"])

            def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _invoke_latency.labels(agent_id=agent_id).observe(duration_s)

            def _on_send_latency(agent_id: str, duration_s: float) -> None:
                if _PROM_OK:
                    _send_latency.labels(agent_id=agent_id).observe(duration_s)

            ingress = MatrixIngressLoop(
                matrix_homeserver_url=_cfg.matrix_homeserver_url,
                matrix_access_token=_cfg.matrix_access_token,
                matrix_user_id=_cfg.matrix_user_id,
                router_url=_cfg.dagi_gateway_url,
                node_id=_cfg.node_id,
                room_map=_room_map,
                sofiia_console_url=_cfg.sofiia_console_url,
                sofiia_internal_token=_cfg.sofiia_internal_token,
                rate_limiter=_rate_limiter,
                on_message_received=_on_msg,
                on_message_replied=_on_replied,
                on_gateway_error=_on_gw_error,
                on_rate_limited=_on_rate_limited,
                on_invoke_latency=_on_invoke_latency,
                on_send_latency=_on_send_latency,
            )
            _ingress_task = asyncio.create_task(
                ingress.run(_ingress_stop),
                name="matrix_ingress_loop",
            )
            logger.info("✅ Ingress loop task started")
        else:
            logger.warning("⚠️ No room mappings — ingress loop NOT started")

    except (RuntimeError, ValueError) as exc:
        # Keep the process alive so /health can surface the problem.
        _config_error = str(exc)
        logger.error("❌ Config error: %s", _config_error)
        if _PROM_OK:
            _bridge_up.set(0)
    yield
    # Shutdown: cancel ingress loop
    if _ingress_stop:
        _ingress_stop.set()
    if _ingress_task and not _ingress_task.done():
        _ingress_task.cancel()
        try:
            await asyncio.wait_for(_ingress_task, timeout=5.0)
        except (asyncio.CancelledError, asyncio.TimeoutError):
            # Expected outcomes of cancelling the task; nothing left to do.
            pass
    logger.info("matrix-bridge-dagi shutting down")
|
|
|
|
# ── App ───────────────────────────────────────────────────────────────────────
|
|
# ASGI application; startup and shutdown are driven by lifespan().
app = FastAPI(
    lifespan=lifespan,
    title="matrix-bridge-dagi",
    version="0.1.0",
)

# CORS: any origin and any request header are accepted, GET/POST only.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["GET", "POST"],
    allow_origins=["*"],
)
|
|
|
|
# ── Health ────────────────────────────────────────────────────────────────────
|
|
@app.get("/health")
async def health() -> Dict[str, Any]:
    # Health endpoint: reports a short error payload when startup failed or
    # has not run, otherwise the full status snapshot. "ok" is True only when
    # both connectivity probes succeeded.
    uptime_s = int(time.monotonic() - _START_TIME)

    initialised = _cfg is not None and not _config_error
    if not initialised:
        return {
            "ok": False,
            "service": "matrix-bridge-dagi",
            "version": "0.1.0",
            "build": os.getenv("BUILD_SHA", "dev"),
            "uptime_s": uptime_s,
            "error": _config_error or "service not initialised",
        }

    # Optional components fall back to neutral values when absent.
    mappings_count = 0 if not _room_map else _room_map.total_mappings
    limiter_stats = None if not _rate_limiter else _rate_limiter.stats()

    return {
        "ok": _matrix_reachable is True and _gateway_reachable is True,
        "service": "matrix-bridge-dagi",
        "version": "0.1.0",
        "build": _cfg.build_sha,
        "build_time": _cfg.build_time,
        "env": os.getenv("ENV", "dev"),
        "uptime_s": uptime_s,
        "node_id": _cfg.node_id,
        "homeserver": _cfg.matrix_homeserver_url,
        "matrix_reachable": _matrix_reachable,
        "bridge_user": _cfg.matrix_user_id,
        "sofiia_room_id": _cfg.sofiia_room_id,
        "allowed_agents": list(_cfg.bridge_allowed_agents),
        "gateway": _cfg.dagi_gateway_url,
        "gateway_reachable": _gateway_reachable,
        "mappings_count": mappings_count,
        "config_ok": True,
        "rate_limiter": limiter_stats,
    }
|
|
|
|
|
|
# ── Bridge Mappings (read-only ops endpoint) ───────────────────────────────────
|
|
@app.get("/bridge/mappings")
async def bridge_mappings() -> Dict[str, Any]:
    """
    Returns room-to-agent mapping summary.
    Safe for ops visibility — no secrets included.
    """
    # Happy path first: both the config and the room map were loaded.
    if _cfg is not None and _room_map is not None:
        return {
            "ok": True,
            "total": _room_map.total_mappings,
            "allowed_agents": list(_cfg.bridge_allowed_agents),
            "mappings": _room_map.as_summary(),
        }

    # Startup failed or has not run: report why, with an empty mapping list.
    return {
        "ok": False,
        "error": _config_error or "service not initialised",
        "mappings": [],
    }
|
|
|
|
|
|
# ── Metrics ───────────────────────────────────────────────────────────────────
|
|
@app.get("/metrics")
async def metrics():
    # Serve the default registry in Prometheus exposition format; fall back to
    # a plain-text placeholder comment when prometheus_client is not installed.
    if _PROM_OK:
        return Response(generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST)
    return Response("# prometheus_client not available\n", media_type="text/plain")
|