feat(matrix-bridge-dagi): add backpressure queue with N workers (H2)
Reader + N workers architecture:
Reader: sync_poll → rate_check → dedupe → queue.put_nowait()
Workers (WORKER_CONCURRENCY, default 2): queue.get() → invoke → send → audit
Drop policy (queue full):
- put_nowait() raises QueueFull → dropped immediately (reader never blocks)
- audit matrix.queue_full + on_queue_dropped callback
- metric: matrix_bridge_queue_dropped_total{room_id,agent_id}
Graceful shutdown:
1. stop_event → reader exits loop
2. queue.join() with QUEUE_DRAIN_TIMEOUT_S (default 5s) → workers finish in-flight
3. worker tasks cancelled
New config env vars:
QUEUE_MAX_EVENTS (default 100)
WORKER_CONCURRENCY (default 2)
QUEUE_DRAIN_TIMEOUT_S (default 5)
New metrics (H3 additions):
matrix_bridge_queue_size (gauge)
matrix_bridge_queue_dropped_total (counter)
matrix_bridge_queue_wait_seconds (histogram; buckets: 0.01…30s)
/health: queue.size, queue.max, queue.workers
MatrixIngressLoop: queue_size + worker_count properties
6 queue tests: enqueue/process, full-drop-audit, concurrency barrier,
graceful drain, wait metric, rate-limit-before-enqueue
Total: 71 passed
Made-with: Cursor
This commit is contained in:
@@ -88,6 +88,22 @@ if _PROM_OK:
|
||||
"matrix_bridge_rate_limiter_active_senders",
|
||||
"Senders with activity in the current rate-limit window",
|
||||
)
|
||||
# H2: Queue metrics
|
||||
_queue_size = Gauge(
|
||||
"matrix_bridge_queue_size",
|
||||
"Current number of pending items in the work queue",
|
||||
)
|
||||
_queue_dropped = Counter(
|
||||
"matrix_bridge_queue_dropped_total",
|
||||
"Messages dropped because queue was full",
|
||||
["room_id", "agent_id"],
|
||||
)
|
||||
_queue_wait = Histogram(
|
||||
"matrix_bridge_queue_wait_seconds",
|
||||
"Time between enqueue and worker start processing",
|
||||
["agent_id"],
|
||||
buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
|
||||
)
|
||||
|
||||
# ── Startup state ─────────────────────────────────────────────────────────────
|
||||
_START_TIME = time.monotonic()
|
||||
@@ -97,6 +113,7 @@ _matrix_reachable: Optional[bool] = None
|
||||
_gateway_reachable: Optional[bool] = None
|
||||
_room_map: Optional[RoomMappingConfig] = None
|
||||
_rate_limiter: Optional[InMemoryRateLimiter] = None
|
||||
_ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size
|
||||
_ingress_task: Optional[asyncio.Task] = None
|
||||
_ingress_stop: Optional[asyncio.Event] = None
|
||||
|
||||
@@ -116,7 +133,7 @@ async def _probe_url(url: str, timeout: float = 5.0) -> bool:
|
||||
@asynccontextmanager
|
||||
async def lifespan(app_: Any):
|
||||
global _cfg, _config_error, _matrix_reachable, _gateway_reachable
|
||||
global _room_map, _rate_limiter
|
||||
global _room_map, _rate_limiter, _ingress_loop
|
||||
try:
|
||||
_cfg = load_config()
|
||||
|
||||
@@ -185,7 +202,6 @@ async def lifespan(app_: Any):
|
||||
_messages_rate_limited.labels(
|
||||
room_id=room_id, agent_id=agent_id, limit_type=limit_type
|
||||
).inc()
|
||||
# Update active room/sender gauges from limiter stats
|
||||
if _rate_limiter is not None:
|
||||
stats = _rate_limiter.stats()
|
||||
_rate_limiter_active_rooms.set(stats["active_rooms"])
|
||||
@@ -199,6 +215,19 @@ async def lifespan(app_: Any):
|
||||
if _PROM_OK:
|
||||
_send_latency.labels(agent_id=agent_id).observe(duration_s)
|
||||
|
||||
# H2 callbacks
|
||||
def _on_queue_dropped(room_id: str, agent_id: str) -> None:
|
||||
if _PROM_OK:
|
||||
_queue_dropped.labels(room_id=room_id, agent_id=agent_id).inc()
|
||||
|
||||
def _on_queue_size(size: int) -> None:
|
||||
if _PROM_OK:
|
||||
_queue_size.set(size)
|
||||
|
||||
def _on_queue_wait(agent_id: str, wait_s: float) -> None:
|
||||
if _PROM_OK:
|
||||
_queue_wait.labels(agent_id=agent_id).observe(wait_s)
|
||||
|
||||
ingress = MatrixIngressLoop(
|
||||
matrix_homeserver_url=_cfg.matrix_homeserver_url,
|
||||
matrix_access_token=_cfg.matrix_access_token,
|
||||
@@ -209,13 +238,24 @@ async def lifespan(app_: Any):
|
||||
sofiia_console_url=_cfg.sofiia_console_url,
|
||||
sofiia_internal_token=_cfg.sofiia_internal_token,
|
||||
rate_limiter=_rate_limiter,
|
||||
queue_max_events=_cfg.queue_max_events,
|
||||
worker_concurrency=_cfg.worker_concurrency,
|
||||
queue_drain_timeout_s=_cfg.queue_drain_timeout_s,
|
||||
on_message_received=_on_msg,
|
||||
on_message_replied=_on_replied,
|
||||
on_gateway_error=_on_gw_error,
|
||||
on_rate_limited=_on_rate_limited,
|
||||
on_queue_dropped=_on_queue_dropped,
|
||||
on_queue_size=_on_queue_size,
|
||||
on_invoke_latency=_on_invoke_latency,
|
||||
on_send_latency=_on_send_latency,
|
||||
on_queue_wait=_on_queue_wait,
|
||||
)
|
||||
logger.info(
|
||||
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
|
||||
_cfg.queue_max_events, _cfg.worker_concurrency, _cfg.queue_drain_timeout_s,
|
||||
)
|
||||
_ingress_loop = ingress
|
||||
_ingress_task = asyncio.create_task(
|
||||
ingress.run(_ingress_stop),
|
||||
name="matrix_ingress_loop",
|
||||
@@ -290,6 +330,11 @@ async def health() -> Dict[str, Any]:
|
||||
"mappings_count": _room_map.total_mappings if _room_map else 0,
|
||||
"config_ok": True,
|
||||
"rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
|
||||
"queue": {
|
||||
"size": _ingress_loop.queue_size if _ingress_loop else 0,
|
||||
"max": _cfg.queue_max_events,
|
||||
"workers": _cfg.worker_concurrency,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user