feat(matrix-bridge-dagi): add backpressure queue with N workers (H2)

Reader + N workers architecture:
  Reader: sync_poll → rate_check → dedupe → queue.put_nowait()
  Workers (WORKER_CONCURRENCY, default 2): queue.get() → invoke → send → audit

Drop policy (queue full):
  - put_nowait() raises QueueFull → dropped immediately (reader never blocks)
  - audit matrix.queue_full + on_queue_dropped callback
  - metric: matrix_bridge_queue_dropped_total{room_id,agent_id}

Graceful shutdown:
  1. stop_event → reader exits loop
  2. queue.join() with QUEUE_DRAIN_TIMEOUT_S (default 5s) → workers finish in-flight
  3. worker tasks cancelled

New config env vars:
  QUEUE_MAX_EVENTS (default 100)
  WORKER_CONCURRENCY (default 2)
  QUEUE_DRAIN_TIMEOUT_S (default 5)

New metrics (H3 additions):
  matrix_bridge_queue_size (gauge)
  matrix_bridge_queue_dropped_total (counter)
  matrix_bridge_queue_wait_seconds (histogram, buckets: 0.01…30s)

/health: queue.size, queue.max, queue.workers
MatrixIngressLoop: queue_size + worker_count properties

6 queue tests: enqueue/process, full-drop-audit, concurrency barrier,
graceful drain, wait metric, rate-limit-before-enqueue
Total: 71 passed

Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 01:07:04 -08:00
parent a4e95482bc
commit a24dae8e18
4 changed files with 831 additions and 129 deletions

View File

@@ -88,6 +88,22 @@ if _PROM_OK:
"matrix_bridge_rate_limiter_active_senders",
"Senders with activity in the current rate-limit window",
)
# H2: Queue metrics
_queue_size = Gauge(
"matrix_bridge_queue_size",
"Current number of pending items in the work queue",
)
_queue_dropped = Counter(
"matrix_bridge_queue_dropped_total",
"Messages dropped because queue was full",
["room_id", "agent_id"],
)
_queue_wait = Histogram(
"matrix_bridge_queue_wait_seconds",
"Time between enqueue and worker start processing",
["agent_id"],
buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
)
# ── Startup state ─────────────────────────────────────────────────────────────
_START_TIME = time.monotonic()
@@ -97,6 +113,7 @@ _matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None
_room_map: Optional[RoomMappingConfig] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size
_ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None
@@ -116,7 +133,7 @@ async def _probe_url(url: str, timeout: float = 5.0) -> bool:
@asynccontextmanager
async def lifespan(app_: Any):
global _cfg, _config_error, _matrix_reachable, _gateway_reachable
global _room_map, _rate_limiter
global _room_map, _rate_limiter, _ingress_loop
try:
_cfg = load_config()
@@ -185,7 +202,6 @@ async def lifespan(app_: Any):
_messages_rate_limited.labels(
room_id=room_id, agent_id=agent_id, limit_type=limit_type
).inc()
# Update active room/sender gauges from limiter stats
if _rate_limiter is not None:
stats = _rate_limiter.stats()
_rate_limiter_active_rooms.set(stats["active_rooms"])
@@ -199,6 +215,19 @@ async def lifespan(app_: Any):
if _PROM_OK:
_send_latency.labels(agent_id=agent_id).observe(duration_s)
# H2 callbacks
def _on_queue_dropped(room_id: str, agent_id: str) -> None:
    """Increment the queue-drop counter for this room/agent pair.

    Does nothing when ``_PROM_OK`` is false (metrics backend unavailable).
    """
    if not _PROM_OK:
        return
    _queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc()
def _on_queue_size(size: int) -> None:
    """Mirror the current queue depth into the queue-size gauge.

    Does nothing when ``_PROM_OK`` is false (metrics backend unavailable).
    """
    if not _PROM_OK:
        return
    _queue_size.set(size)
def _on_queue_wait(agent_id: str, wait_s: float) -> None:
    """Record a queue-wait duration sample for *agent_id* in the histogram.

    Does nothing when ``_PROM_OK`` is false (metrics backend unavailable).
    """
    if not _PROM_OK:
        return
    _queue_wait.labels(agent_id=agent_id).observe(wait_s)
ingress = MatrixIngressLoop(
matrix_homeserver_url=_cfg.matrix_homeserver_url,
matrix_access_token=_cfg.matrix_access_token,
@@ -209,13 +238,24 @@ async def lifespan(app_: Any):
sofiia_console_url=_cfg.sofiia_console_url,
sofiia_internal_token=_cfg.sofiia_internal_token,
rate_limiter=_rate_limiter,
queue_max_events=_cfg.queue_max_events,
worker_concurrency=_cfg.worker_concurrency,
queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
on_message_received=_on_msg,
on_message_replied=_on_replied,
on_gateway_error=_on_gw_error,
on_rate_limited=_on_rate_limited,
on_queue_dropped=_on_queue_dropped,
on_queue_size=_on_queue_size,
on_invoke_latency=_on_invoke_latency,
on_send_latency=_on_send_latency,
on_queue_wait=_on_queue_wait,
)
logger.info(
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
_cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s,
)
_ingress_loop = ingress
_ingress_task = asyncio.create_task(
ingress.run(_ingress_stop),
name="matrix_ingress_loop",
@@ -290,6 +330,11 @@ async def health() -> Dict[str, Any]:
"mappings_count": _room_map.total_mappings if _room_map else 0,
"config_ok": True,
"rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
"queue": {
"size": _ingress_loop.queue_size if _ingress_loop else 0,
"max": _cfg.queue_max_events,
"workers": _cfg.worker_concurrency,
},
}