feat(matrix-bridge-dagi): add rate limiting (H1) and metrics (H3)

H1 — InMemoryRateLimiter (sliding window, no Redis):
- Per-room: RATE_LIMIT_ROOM_RPM (default 20/min)
- Per-sender: RATE_LIMIT_SENDER_RPM (default 10/min)
- Room checked before sender — sender quota not charged on room block
- Blocked messages: audit matrix.rate_limited + on_rate_limited callback
- reset() for ops/test, stats() exposed in /health

H3 — Extended Prometheus metrics:
- matrix_bridge_rate_limited_total{room_id,agent_id,limit_type}
- matrix_bridge_send_duration_seconds histogram (invoke was already there)
- matrix_bridge_invoke_duration_seconds buckets tuned for LLM latency
- matrix_bridge_rate_limiter_active_rooms/senders gauges
- on_invoke_latency + on_send_latency callbacks wired in ingress loop

16 new tests: rate limiter unit (13) + ingress integration (3)
Total: 65 passed

Made-with: Cursor
2026-03-05 00:54:14 -08:00
parent 313d777c84
commit a4e95482bc
5 changed files with 607 additions and 40 deletions
--- a/services/matrix-bridge-dagi/app/main.py
+++ b/services/matrix-bridge-dagi/app/main.py
@@ -33,6 +33,7 @@ except ImportError:  # pragma: no cover

 from .config import BridgeConfig, load_config
 from .ingress import MatrixIngressLoop
+from .rate_limit import InMemoryRateLimiter
 from .room_mapping import RoomMappingConfig, parse_room_map

 logging.basicConfig(
@@ -41,7 +42,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger("matrix-bridge-dagi")

-# ── Prometheus metrics ────────────────────────────────────────────────────────
+# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
 if _PROM_OK:
    _messages_received = Counter(
        "matrix_bridge_messages_received_total",
@@ -53,28 +54,49 @@ if _PROM_OK:
        "Total agent replies sent to Matrix",
        ["room_id", "agent_id", "status"],
    )
+    _messages_rate_limited = Counter(
+        "matrix_bridge_rate_limited_total",
+        "Messages dropped by rate limiter",
+        ["room_id", "agent_id", "limit_type"],
+    )
    _gateway_errors = Counter(
        "matrix_bridge_gateway_errors_total",
-        "Errors calling DAGI gateway",
+        "Errors by stage (sync, invoke, send, audit)",
        ["error_type"],
    )
    _invoke_latency = Histogram(
        "matrix_bridge_invoke_duration_seconds",
-        "Duration of DAGI invoke call",
+        "Latency of DAGI Router infer call",
        ["agent_id"],
+        buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
+    )
+    _send_latency = Histogram(
+        "matrix_bridge_send_duration_seconds",
+        "Latency of Matrix send_text call",
+        ["agent_id"],
+        buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
    )
    _bridge_up = Gauge(
        "matrix_bridge_up",
        "1 if bridge started successfully",
    )
+    _rate_limiter_active_rooms = Gauge(
+        "matrix_bridge_rate_limiter_active_rooms",
+        "Rooms with activity in the current rate-limit window",
+    )
+    _rate_limiter_active_senders = Gauge(
+        "matrix_bridge_rate_limiter_active_senders",
+        "Senders with activity in the current rate-limit window",
+    )

 # ── Startup state ─────────────────────────────────────────────────────────────
 _START_TIME = time.monotonic()
 _cfg: Optional[BridgeConfig] = None
 _config_error: Optional[str] = None
-_matrix_reachable: Optional[bool] = None    # probed at startup
-_gateway_reachable: Optional[bool] = None   # probed at startup
+_matrix_reachable: Optional[bool] = None
+_gateway_reachable: Optional[bool] = None
 _room_map: Optional[RoomMappingConfig] = None
+_rate_limiter: Optional[InMemoryRateLimiter] = None
 _ingress_task: Optional[asyncio.Task] = None
 _ingress_stop: Optional[asyncio.Event] = None

@@ -93,7 +115,8 @@ async def _probe_url(url: str, timeout: float = 5.0) -> bool:
 # ── Lifespan ──────────────────────────────────────────────────────────────────
@asynccontextmanager
 async def lifespan(app_: Any):
-    global _cfg, _config_error, _matrix_reachable, _gateway_reachable, _room_map
+    global _cfg, _config_error, _matrix_reachable, _gateway_reachable
+    global _room_map, _rate_limiter
    try:
        _cfg = load_config()

@@ -103,6 +126,16 @@ async def lifespan(app_: Any):
            _cfg.bridge_allowed_agents,
        )

+        # H1: Rate limiter (inmemory, per config)
+        _rate_limiter = InMemoryRateLimiter(
+            room_rpm=_cfg.rate_limit_room_rpm,
+            sender_rpm=_cfg.rate_limit_sender_rpm,
+        )
+        logger.info(
+            "✅ Rate limiter: room_rpm=%d sender_rpm=%d",
+            _cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
+        )
+
        logger.info(
            "✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
            "room=%s agents=%s mappings=%d",
@@ -147,6 +180,25 @@ async def lifespan(app_: Any):
                        room_id=room_id, agent_id=agent_id, status=status
                    ).inc()

+            def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
+                if _PROM_OK:
+                    _messages_rate_limited.labels(
+                        room_id=room_id, agent_id=agent_id, limit_type=limit_type
+                    ).inc()
+                    # Update active room/sender gauges from limiter stats
+                    if _rate_limiter is not None:
+                        stats = _rate_limiter.stats()
+                        _rate_limiter_active_rooms.set(stats["active_rooms"])
+                        _rate_limiter_active_senders.set(stats["active_senders"])
+
+            def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
+                if _PROM_OK:
+                    _invoke_latency.labels(agent_id=agent_id).observe(duration_s)
+
+            def _on_send_latency(agent_id: str, duration_s: float) -> None:
+                if _PROM_OK:
+                    _send_latency.labels(agent_id=agent_id).observe(duration_s)
+
            ingress = MatrixIngressLoop(
                matrix_homeserver_url=_cfg.matrix_homeserver_url,
                matrix_access_token=_cfg.matrix_access_token,
@@ -156,9 +208,13 @@ async def lifespan(app_: Any):
                room_map=_room_map,
                sofiia_console_url=_cfg.sofiia_console_url,
                sofiia_internal_token=_cfg.sofiia_internal_token,
+                rate_limiter=_rate_limiter,
                on_message_received=_on_msg,
                on_message_replied=_on_replied,
                on_gateway_error=_on_gw_error,
+                on_rate_limited=_on_rate_limited,
+                on_invoke_latency=_on_invoke_latency,
+                on_send_latency=_on_send_latency,
            )
            _ingress_task = asyncio.create_task(
                ingress.run(_ingress_stop),
@@ -233,6 +289,7 @@ async def health() -> Dict[str, Any]:
        "gateway_reachable": _gateway_reachable,
        "mappings_count": _room_map.total_mappings if _room_map else 0,
        "config_ok": True,
+        "rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
    }