feat(matrix-bridge-dagi): add rate limiting (H1) and metrics (H3)
H1 — InMemoryRateLimiter (sliding window, no Redis):
- Per-room: RATE_LIMIT_ROOM_RPM (default 20/min)
- Per-sender: RATE_LIMIT_SENDER_RPM (default 10/min)
- The room limit is checked before the sender limit, so a message blocked by the room limit does not consume the sender's quota
- Blocked messages: audit matrix.rate_limited + on_rate_limited callback
- reset() for ops/test, stats() exposed in /health
H3 — Extended Prometheus metrics:
- matrix_bridge_rate_limited_total{room_id,agent_id,limit_type}
- matrix_bridge_send_duration_seconds histogram (new; the invoke histogram already existed)
- matrix_bridge_invoke_duration_seconds buckets tuned for LLM latency
- matrix_bridge_rate_limiter_active_rooms/senders gauges
- on_invoke_latency + on_send_latency callbacks wired in ingress loop
16 new tests: rate limiter unit (13) + ingress integration (3)
Total: 65 passed
Made-with: Cursor
This commit is contained in:
@@ -33,6 +33,7 @@ except ImportError: # pragma: no cover
|
||||
|
||||
from .config import BridgeConfig, load_config
|
||||
from .ingress import MatrixIngressLoop
|
||||
from .rate_limit import InMemoryRateLimiter
|
||||
from .room_mapping import RoomMappingConfig, parse_room_map
|
||||
|
||||
logging.basicConfig(
|
||||
@@ -41,7 +42,7 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger("matrix-bridge-dagi")
|
||||
|
||||
# ── Prometheus metrics ────────────────────────────────────────────────────────
|
||||
# ── Prometheus metrics (H3) ───────────────────────────────────────────────────
|
||||
if _PROM_OK:
|
||||
_messages_received = Counter(
|
||||
"matrix_bridge_messages_received_total",
|
||||
@@ -53,28 +54,49 @@ if _PROM_OK:
|
||||
"Total agent replies sent to Matrix",
|
||||
["room_id", "agent_id", "status"],
|
||||
)
|
||||
_messages_rate_limited = Counter(
|
||||
"matrix_bridge_rate_limited_total",
|
||||
"Messages dropped by rate limiter",
|
||||
["room_id", "agent_id", "limit_type"],
|
||||
)
|
||||
_gateway_errors = Counter(
|
||||
"matrix_bridge_gateway_errors_total",
|
||||
"Errors calling DAGI gateway",
|
||||
"Errors by stage (sync, invoke, send, audit)",
|
||||
["error_type"],
|
||||
)
|
||||
_invoke_latency = Histogram(
|
||||
"matrix_bridge_invoke_duration_seconds",
|
||||
"Duration of DAGI invoke call",
|
||||
"Latency of DAGI Router infer call",
|
||||
["agent_id"],
|
||||
buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
|
||||
)
|
||||
_send_latency = Histogram(
|
||||
"matrix_bridge_send_duration_seconds",
|
||||
"Latency of Matrix send_text call",
|
||||
["agent_id"],
|
||||
buckets=[0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
|
||||
)
|
||||
_bridge_up = Gauge(
|
||||
"matrix_bridge_up",
|
||||
"1 if bridge started successfully",
|
||||
)
|
||||
_rate_limiter_active_rooms = Gauge(
|
||||
"matrix_bridge_rate_limiter_active_rooms",
|
||||
"Rooms with activity in the current rate-limit window",
|
||||
)
|
||||
_rate_limiter_active_senders = Gauge(
|
||||
"matrix_bridge_rate_limiter_active_senders",
|
||||
"Senders with activity in the current rate-limit window",
|
||||
)
|
||||
|
||||
# ── Startup state ─────────────────────────────────────────────────────────────
|
||||
_START_TIME = time.monotonic()
|
||||
_cfg: Optional[BridgeConfig] = None
|
||||
_config_error: Optional[str] = None
|
||||
_matrix_reachable: Optional[bool] = None # probed at startup
|
||||
_gateway_reachable: Optional[bool] = None # probed at startup
|
||||
_matrix_reachable: Optional[bool] = None
|
||||
_gateway_reachable: Optional[bool] = None
|
||||
_room_map: Optional[RoomMappingConfig] = None
|
||||
_rate_limiter: Optional[InMemoryRateLimiter] = None
|
||||
_ingress_task: Optional[asyncio.Task] = None
|
||||
_ingress_stop: Optional[asyncio.Event] = None
|
||||
|
||||
@@ -93,7 +115,8 @@ async def _probe_url(url: str, timeout: float = 5.0) -> bool:
|
||||
# ── Lifespan ──────────────────────────────────────────────────────────────────
|
||||
@asynccontextmanager
|
||||
async def lifespan(app_: Any):
|
||||
global _cfg, _config_error, _matrix_reachable, _gateway_reachable, _room_map
|
||||
global _cfg, _config_error, _matrix_reachable, _gateway_reachable
|
||||
global _room_map, _rate_limiter
|
||||
try:
|
||||
_cfg = load_config()
|
||||
|
||||
@@ -103,6 +126,16 @@ async def lifespan(app_: Any):
|
||||
_cfg.bridge_allowed_agents,
|
||||
)
|
||||
|
||||
# H1: Rate limiter (inmemory, per config)
|
||||
_rate_limiter = InMemoryRateLimiter(
|
||||
room_rpm=_cfg.rate_limit_room_rpm,
|
||||
sender_rpm=_cfg.rate_limit_sender_rpm,
|
||||
)
|
||||
logger.info(
|
||||
"✅ Rate limiter: room_rpm=%d sender_rpm=%d",
|
||||
_cfg.rate_limit_room_rpm, _cfg.rate_limit_sender_rpm,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"✅ matrix-bridge-dagi started | node=%s build=%s homeserver=%s "
|
||||
"room=%s agents=%s mappings=%d",
|
||||
@@ -147,6 +180,25 @@ async def lifespan(app_: Any):
|
||||
room_id=room_id, agent_id=agent_id, status=status
|
||||
).inc()
|
||||
|
||||
def _on_rate_limited(room_id: str, agent_id: str, limit_type: str) -> None:
    """Record one rate-limited message in the Prometheus metrics.

    Increments the ``matrix_bridge_rate_limited_total`` counter for the
    given label values and, when a rate limiter is configured, refreshes
    the active-rooms / active-senders gauges from ``_rate_limiter.stats()``.
    Does nothing when ``_PROM_OK`` is falsy.
    """
    if not _PROM_OK:
        return

    label_values = {
        "room_id": room_id,
        "agent_id": agent_id,
        "limit_type": limit_type,
    }
    _messages_rate_limited.labels(**label_values).inc()

    # NOTE(review): the original indentation was lost; the gauge refresh is
    # assumed to live under the _PROM_OK guard because the gauge objects are
    # only created when _PROM_OK is set — confirm against the real file.
    # The gauges are refreshed only from this callback, i.e. when a message
    # is actually dropped by the limiter.
    limiter = _rate_limiter
    if limiter is None:
        return
    snapshot = limiter.stats()
    _rate_limiter_active_rooms.set(snapshot["active_rooms"])
    _rate_limiter_active_senders.set(snapshot["active_senders"])
|
||||
|
||||
def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
    """Observe one invoke duration in the per-agent invoke histogram.

    ``duration_s`` is passed straight to the histogram's ``observe``.
    Does nothing when ``_PROM_OK`` is falsy.
    """
    if not _PROM_OK:
        return
    per_agent = _invoke_latency.labels(agent_id=agent_id)
    per_agent.observe(duration_s)
|
||||
|
||||
def _on_send_latency(agent_id: str, duration_s: float) -> None:
    """Observe one send duration in the per-agent send histogram.

    ``duration_s`` is passed straight to the histogram's ``observe``.
    Does nothing when ``_PROM_OK`` is falsy.
    """
    if not _PROM_OK:
        return
    per_agent = _send_latency.labels(agent_id=agent_id)
    per_agent.observe(duration_s)
|
||||
|
||||
ingress = MatrixIngressLoop(
|
||||
matrix_homeserver_url=_cfg.matrix_homeserver_url,
|
||||
matrix_access_token=_cfg.matrix_access_token,
|
||||
@@ -156,9 +208,13 @@ async def lifespan(app_: Any):
|
||||
room_map=_room_map,
|
||||
sofiia_console_url=_cfg.sofiia_console_url,
|
||||
sofiia_internal_token=_cfg.sofiia_internal_token,
|
||||
rate_limiter=_rate_limiter,
|
||||
on_message_received=_on_msg,
|
||||
on_message_replied=_on_replied,
|
||||
on_gateway_error=_on_gw_error,
|
||||
on_rate_limited=_on_rate_limited,
|
||||
on_invoke_latency=_on_invoke_latency,
|
||||
on_send_latency=_on_send_latency,
|
||||
)
|
||||
_ingress_task = asyncio.create_task(
|
||||
ingress.run(_ingress_stop),
|
||||
@@ -233,6 +289,7 @@ async def health() -> Dict[str, Any]:
|
||||
"gateway_reachable": _gateway_reachable,
|
||||
"mappings_count": _room_map.total_mappings if _room_map else 0,
|
||||
"config_ok": True,
|
||||
"rate_limiter": _rate_limiter.stats() if _rate_limiter else None,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user