feat(matrix-bridge-dagi): add backpressure queue with N workers (H2)

Reader + N workers architecture:
  Reader: sync_poll → rate_check → dedupe → queue.put_nowait()
  Workers (WORKER_CONCURRENCY, default 2): queue.get() → invoke → send → audit

Drop policy (queue full):
  - put_nowait() raises QueueFull → dropped immediately (reader never blocks)
  - audit matrix.queue_full + on_queue_dropped callback
  - metric: matrix_bridge_queue_dropped_total{room_id,agent_id}

Graceful shutdown:
  1. stop_event → reader exits loop
  2. queue.join() with QUEUE_DRAIN_TIMEOUT_S (default 5s) → workers finish in-flight
  3. worker tasks cancelled

New config env vars:
  QUEUE_MAX_EVENTS (default 100)
  WORKER_CONCURRENCY (default 2)
  QUEUE_DRAIN_TIMEOUT_S (default 5)

New metrics (H3 additions):
  matrix_bridge_queue_size (gauge)
  matrix_bridge_queue_dropped_total (counter)
  matrix_bridge_queue_wait_seconds (histogram, buckets: 0.01…30s)

/health: queue.size, queue.max, queue.workers
MatrixIngressLoop: queue_size + worker_count properties

6 queue tests: enqueue/process, full-drop-audit, concurrency barrier,
graceful drain, wait metric, rate-limit-before-enqueue
Total: 71 passed

Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 01:07:04 -08:00
parent a4e95482bc
commit a24dae8e18
4 changed files with 831 additions and 129 deletions

View File

@@ -88,6 +88,22 @@ if _PROM_OK:
"matrix_bridge_rate_limiter_active_senders",
"Senders with activity in the current rate-limit window",
)
# H2: Queue metrics
_queue_size = Gauge(
"matrix_bridge_queue_size",
"Current number of pending items in the work queue",
)
_queue_dropped = Counter(
"matrix_bridge_queue_dropped_total",
"Messages dropped because queue was full",
["room_id", "agent_id"],
)
_queue_wait = Histogram(
"matrix_bridge_queue_wait_seconds",
"Time between enqueue and worker start processing",
["agent_id"],
buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
)
# ── Startup state ─────────────────────────────────────────────────────────────
_START_TIME = time.monotonic()
@@ -97,6 +113,7 @@ _matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None
_room_map: Optional[RoomMappingConfig] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size
_ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None
@@ -116,7 +133,7 @@ async def _probe_url(url: str, timeout: float = 5.0) -> bool:
@asynccontextmanager
async def lifespan(app_: Any):
global _cfg, _config_error, _matrix_reachable, _gateway_reachable
global _room_map, _rate_limiter
global _room_map, _rate_limiter, _ingress_loop
try:
_cfg = load_config()
@@ -185,7 +202,6 @@ async def lifespan(app_: Any):
_messages_rate_limited.labels(
room_id=room_id, agent_id=agent_id, limit_type=limit_type
).inc()
# Update active room/sender gauges from limiter stats
if _rate_limiter is not None:
stats = _rate_limiter.stats()
_rate_limiter_active_rooms.set(stats["active_rooms"])
@@ -199,6 +215,19 @@ async def lifespan(app_: Any):
if _PROM_OK:
_send_latency.labels(agent_id=agent_id).observe(duration_s)
# H2 callbacks
def _on_queue_dropped(room_id: str, agent_id: str) -> None:
    """Increment the queue-drop counter for this room/agent pair.

    Does nothing when ``_PROM_OK`` is false (metrics backend unavailable).
    """
    if not _PROM_OK:
        return
    _queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc()
def _on_queue_size(size: int) -> None:
    """Mirror the current queue depth into the queue-size gauge.

    Does nothing when ``_PROM_OK`` is false (metrics backend unavailable).
    """
    if not _PROM_OK:
        return
    _queue_size.set(size)
def _on_queue_wait(agent_id: str, wait_s: float) -> None:
    """Record a queue-wait duration sample for *agent_id* in the histogram.

    Does nothing when ``_PROM_OK`` is false (metrics backend unavailable).
    """
    if not _PROM_OK:
        return
    _queue_wait.labels(agent_id=agent_id).observe(wait_s)
ingress = MatrixIngressLoop(
matrix_homeserver_url=_cfg.matrix_homeserver_url,
matrix_access_token=_cfg.matrix_access_token,
@@ -209,13 +238,24 @@ async def lifespan(app_: Any):
sofiia_console_url=_cfg.sofiia_console_url,
sofiia_internal_token=_cfg.sofiia_internal_token,
rate_limiter=_rate_limiter,
queue_max_events=_cfg.queue_max_events,
worker_concurrency=_cfg.worker_concurrency,
queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
on_message_received=_on_msg,
on_message_replied=_on_replied,
on_gateway_error=_on_gw_error,
on_rate_limited=_on_rate_limited,
on_queue_dropped=_on_queue_dropped,
on_queue_size=_on_queue_size,
on_invoke_latency=_on_invoke_latency,
on_send_latency=_on_send_latency,
on_queue_wait=_on_queue_wait,
)
logger.info(
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
_cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s,
)
_ingress_loop = ingress
_ingress_task = asyncio.create_task(
ingress.run(_ingress_stop),
name="matrix_ingress_loop",
@@ -290,6 +330,11 @@ async def health() -> Dict[str, Any]:
"mappings_count": _room_map.total_mappings if _room_map else 0,
"config_ok": True,
"rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
"queue": {
"size": _ingress_loop.queue_size if _ingress_loop else 0,
"max": _cfg.queue_max_events,
"workers": _cfg.worker_concurrency,
},
}