feat(matrix-bridge-dagi): add rate limiting (H1) and metrics (H3)

H1 — InMemoryRateLimiter (sliding window, no Redis):
  - Per-room: RATE_LIMIT_ROOM_RPM (default 20/min)
  - Per-sender: RATE_LIMIT_SENDER_RPM (default 10/min)
  - Room limit is checked before the sender limit, so a message blocked at the room level does not consume the sender's quota
  - Blocked messages: audit matrix.rate_limited + on_rate_limited callback
  - reset() for ops/test, stats() exposed in /health

H3 — Extended Prometheus metrics:
  - matrix_bridge_rate_limited_total{room_id,agent_id,limit_type}
  - matrix_bridge_send_duration_seconds histogram (new; the invoke histogram already existed)
  - matrix_bridge_invoke_duration_seconds buckets tuned for LLM latency
  - matrix_bridge_rate_limiter_active_rooms and matrix_bridge_rate_limiter_active_senders gauges
  - on_invoke_latency + on_send_latency callbacks wired in ingress loop

16 new tests: rate limiter unit (13) + ingress integration (3)
Total: 65 passed

Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 00:54:14 -08:00
parent 313d777c84
commit a4e95482bc
5 changed files with 607 additions and 40 deletions

View File

@@ -33,6 +33,7 @@ except ImportError: # pragma: no cover
from .config import BridgeConfig, load_config
from .ingress import MatrixIngressLoop
from .rate_limit import InMemoryRateLimiter
from .room_mapping import RoomMappingConfig, parse_room_map
logging.basicConfig(
@@ -41,7 +42,7 @@ logging.basicConfig(
)
logger = logging.getLogger("matrix-bridge-dagi")
# ── Prometheus metrics ────────────────────────────────────────────────────────
# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
if _PROM_OK:
_messages_received = Counter(
"matrix_bridge_messages_received_total",
@@ -53,28 +54,49 @@ if _PROM_OK:
"Total agent replies sent to Matrix",
["room_id", "agent_id", "status"],
)
_messages_rate_limited = Counter(
"matrix_bridge_rate_limited_total",
"Messages dropped by rate limiter",
["room_id", "agent_id", "limit_type"],
)
_gateway_errors = Counter(
"matrix_bridge_gateway_errors_total",
"Errors calling DAGI gateway",
"Errors by stage (sync, invoke, send, audit)",
["error_type"],
)
_invoke_latency = Histogram(
"matrix_bridge_invoke_duration_seconds",
"Duration of DAGI invoke call",
"Latency of DAGI Router infer call",
["agent_id"],
buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
)
_send_latency = Histogram(
"matrix_bridge_send_duration_seconds",
"Latency of Matrix send_text call",
["agent_id"],
buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
)
_bridge_up = Gauge(
"matrix_bridge_up",
"1 if bridge started successfully",
)
_rate_limiter_active_rooms = Gauge(
"matrix_bridge_rate_limiter_active_rooms",
"Rooms with activity in the current rate-limit window",
)
_rate_limiter_active_senders = Gauge(
"matrix_bridge_rate_limiter_active_senders",
"Senders with activity in the current rate-limit window",
)
# ── Startup state ─────────────────────────────────────────────────────────────
_START_TIME = time.monotonic()
_cfg: Optional[BridgeConfig] = None
_config_error: Optional[str] = None
_matrix_reachable: Optional[bool] = None # probed at startup
_gateway_reachable: Optional[bool] = None # probed at startup
_matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None
_room_map: Optional[RoomMappingConfig] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None
@@ -93,7 +115,8 @@ async def _probe_url(url: str, timeout: float = 5.0) -> bool:
# ── Lifespan ──────────────────────────────────────────────────────────────────
@asynccontextmanager
async def lifespan(app_: Any):
global _cfg, _config_error, _matrix_reachable, _gateway_reachable, _room_map
global _cfg, _config_error, _matrix_reachable, _gateway_reachable
global _room_map, _rate_limiter
try:
_cfg = load_config()
@@ -103,6 +126,16 @@ async def lifespan(app_: Any):
_cfg.bridge_allowed_agents,
)
# H1: Rate limiter (inmemory, per config)
_rate_limiter = InMemoryRateLimiter(
room_rpm=_cfg.rate_limit_room_rpm,
sender_rpm=_cfg.rate_limit_sender_rpm,
)
logger.info(
"✅ Rate limiter: room_rpm=%d sender_rpm=%d",
_cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
)
logger.info(
"✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
"room=%s agents=%s mappings=%d",
@@ -147,6 +180,25 @@ async def lifespan(app_: Any):
room_id=room_id, agent_id=agent_id, status=status
).inc()
def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
    """Record a message dropped by the rate limiter in Prometheus.

    Increments ``matrix_bridge_rate_limited_total`` for the given label
    set and then refreshes the active-rooms / active-senders gauges from
    the limiter's ``stats()`` snapshot.

    Args:
        room_id: Value for the ``room_id`` label.
        agent_id: Value for the ``agent_id`` label.
        limit_type: Value for the ``limit_type`` label.

    NOTE(review): this callback refreshes the gauges only when a message
    is rate limited; if nothing else updates them they can go stale
    between rate-limit events -- confirm against the rest of the module.
    """
    if not _PROM_OK:
        # The metric objects are only defined when _PROM_OK is true, so
        # nothing below may run without it (counter *and* gauges).
        return

    _messages_rate_limited.labels(
        room_id=room_id, agent_id=agent_id, limit_type=limit_type
    ).inc()

    # Update active room/sender gauges from limiter stats
    if _rate_limiter is not None:
        stats = _rate_limiter.stats()
        _rate_limiter_active_rooms.set(stats["active_rooms"])
        _rate_limiter_active_senders.set(stats["active_senders"])
def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
    """Observe one invoke duration on the per-agent invoke histogram.

    Args:
        agent_id: Value for the ``agent_id`` label.
        duration_s: Measured duration, in seconds, to record.

    Does nothing when Prometheus metrics are disabled.
    """
    if not _PROM_OK:
        return
    histogram = _invoke_latency.labels(agent_id=agent_id)
    histogram.observe(duration_s)
def _on_send_latency(agent_id: str, duration_s: float) -> None:
    """Observe one send duration on the per-agent send histogram.

    Args:
        agent_id: Value for the ``agent_id`` label.
        duration_s: Measured duration, in seconds, to record.

    Does nothing when Prometheus metrics are disabled.
    """
    if not _PROM_OK:
        return
    histogram = _send_latency.labels(agent_id=agent_id)
    histogram.observe(duration_s)
ingress = MatrixIngressLoop(
matrix_homeserver_url=_cfg.matrix_homeserver_url,
matrix_access_token=_cfg.matrix_access_token,
@@ -156,9 +208,13 @@ async def lifespan(app_: Any):
room_map=_room_map,
sofiia_console_url=_cfg.sofiia_console_url,
sofiia_internal_token=_cfg.sofiia_internal_token,
rate_limiter=_rate_limiter,
on_message_received=_on_msg,
on_message_replied=_on_replied,
on_gateway_error=_on_gw_error,
on_rate_limited=_on_rate_limited,
on_invoke_latency=_on_invoke_latency,
on_send_latency=_on_send_latency,
)
_ingress_task = asyncio.create_task(
ingress.run(_ingress_stop),
@@ -233,6 +289,7 @@ async def health() -> Dict[str, Any]:
"gateway_reachable": _gateway_reachable,
"mappings_count": _room_map.total_mappings if _room_map else 0,
"config_ok": True,
"rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
}