feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
This commit is contained in:
213
services/matrix-bridge-dagi/app/event_store.py
Normal file
213
services/matrix-bridge-dagi/app/event_store.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
event_store — M2.3: Persistent event deduplication via SQLite.
|
||||
|
||||
Stores processed Matrix event_ids so that bridge restarts do not reprocess
|
||||
events still returned by /sync (within TTL window).
|
||||
|
||||
Schema:
|
||||
processed_events (room_id, event_id, ts, sender_hash)
|
||||
PK: (room_id, event_id)
|
||||
IDX: idx_processed_events_ts (ts)
|
||||
|
||||
Design notes:
|
||||
- Uses aiosqlite for non-blocking async access from the ingress event loop.
|
||||
- Prune is best-effort: failures are logged but do NOT abort processing.
|
||||
- If the DB is unavailable (init error, corruption), EventStore degrades to
|
||||
a no-op: is_processed() returns False, mark_processed() is a no-op.
|
||||
The in-memory LRU dedupe (H1) continues to protect within a single run.
|
||||
- WAL mode is enabled for better concurrent read performance.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
try:
|
||||
import aiosqlite
|
||||
_AIOSQLITE_OK = True
|
||||
except ImportError: # pragma: no cover
|
||||
aiosqlite = None # type: ignore
|
||||
_AIOSQLITE_OK = False
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS processed_events (
|
||||
room_id TEXT NOT NULL,
|
||||
event_id TEXT NOT NULL,
|
||||
ts INTEGER NOT NULL,
|
||||
sender_hash TEXT,
|
||||
PRIMARY KEY (room_id, event_id)
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_processed_events_ts ON processed_events (ts);
|
||||
"""
|
||||
|
||||
|
||||
class EventStore:
    """
    Async SQLite-backed deduplication store for Matrix event_ids.

    Stores (room_id, event_id) pairs so that a restarted bridge does not
    reprocess events still returned by /sync within the TTL window.

    Degraded mode: if aiosqlite is unavailable or the DB cannot be opened,
    every method becomes a safe no-op (is_processed() -> False,
    mark_processed() -> False, prune() -> 0) so event processing never
    blocks on the store; the in-memory LRU dedupe still protects one run.

    Usage:
        store = EventStore("/app/data/matrix_bridge.db", ttl_h=48)
        await store.open()
        ...
        hit = await store.is_processed(room_id, event_id)
        if not hit:
            await store.mark_processed(room_id, event_id, sender_hash)
        ...
        pruned = await store.prune(batch=5000)
        await store.close()
    """

    def __init__(
        self,
        db_path: str,
        ttl_h: int = 48,
        prune_batch: int = 5000,
    ) -> None:
        """
        Args:
            db_path: filesystem path of the SQLite database file.
            ttl_h: retention window in hours; rows older than this are pruned.
            prune_batch: max rows deleted per prune() call (bounds lock time).
        """
        self.db_path = db_path
        self.ttl_h = ttl_h
        self.prune_batch = prune_batch
        # Live connection; None until open() succeeds or after close().
        self._db: Optional["aiosqlite.Connection"] = None
        # True only after a successful open(); gates all DB access.
        self._ok: bool = False
        # Wall-clock time of the last prune() attempt that reached the DB.
        self._last_prune_at: Optional[float] = None
        # Rows deleted by the most recent prune().
        self._pruned_rows_last: int = 0

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    async def open(self) -> bool:
        """
        Open the SQLite connection and apply schema.

        Creates the parent directory if needed and enables WAL mode for
        better concurrent read performance.

        Returns True on success; False on failure (degraded mode).
        """
        if not _AIOSQLITE_OK:
            logger.warning("aiosqlite not available — persistent dedupe disabled")
            return False
        try:
            Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
            self._db = await aiosqlite.connect(self.db_path)
            # WAL mode: better concurrent read, non-blocking writes
            await self._db.execute("PRAGMA journal_mode=WAL")
            await self._db.execute("PRAGMA synchronous=NORMAL")
            await self._db.executescript(_SCHEMA)
            await self._db.commit()
            self._ok = True
            logger.info("EventStore opened: %s (ttl_h=%d)", self.db_path, self.ttl_h)
            return True
        except Exception as exc:
            logger.error("EventStore.open failed — degraded: %s", exc)
            self._ok = False
            return False

    async def close(self) -> None:
        """Close the SQLite connection gracefully (idempotent)."""
        if self._db is not None:
            try:
                await self._db.close()
            except Exception as exc:  # pragma: no cover
                logger.warning("EventStore.close error: %s", exc)
            self._db = None
        self._ok = False

    # ── Core operations ───────────────────────────────────────────────────────

    async def is_processed(self, room_id: str, event_id: str) -> bool:
        """
        Return True if (room_id, event_id) has already been processed.

        Safe to call even when degraded (returns False → no false deduplication).
        """
        if not self._ok or self._db is None:
            return False
        try:
            async with self._db.execute(
                "SELECT 1 FROM processed_events WHERE room_id=? AND event_id=? LIMIT 1",
                (room_id, event_id),
            ) as cursor:
                row = await cursor.fetchone()
                return row is not None
        except Exception as exc:
            logger.warning("EventStore.is_processed error (degraded): %s", exc)
            return False

    async def mark_processed(
        self,
        room_id: str,
        event_id: str,
        sender_hash: str = "",
    ) -> bool:
        """
        Insert (room_id, event_id) as processed.

        Returns True on success, False if already exists or on error.
        Uses INSERT OR IGNORE to avoid duplicates without raising.
        """
        if not self._ok or self._db is None:
            return False
        ts = int(time.time())
        try:
            cursor = await self._db.execute(
                "INSERT OR IGNORE INTO processed_events (room_id, event_id, ts, sender_hash) "
                "VALUES (?, ?, ?, ?)",
                (room_id, event_id, ts, sender_hash or None),
            )
            await self._db.commit()
            # rowcount == 0 means OR IGNORE skipped an existing row; the
            # documented contract is False in that case (fixes a mismatch
            # where True was returned unconditionally).
            return cursor.rowcount > 0
        except Exception as exc:
            logger.warning("EventStore.mark_processed error (degraded): %s", exc)
            return False

    # ── Prune ─────────────────────────────────────────────────────────────────

    async def prune(self, batch: Optional[int] = None) -> int:
        """
        Delete events older than ttl_h.

        Args:
            batch: max rows to delete this call; defaults to self.prune_batch.

        Returns the number of rows deleted (0 on error or degraded).
        Uses LIMIT batch to avoid long locks on large tables.
        """
        if not self._ok or self._db is None:
            return 0

        cutoff = int(time.time()) - self.ttl_h * 3600
        effective_batch = batch or self.prune_batch
        deleted = 0

        try:
            # SQLite DELETE with LIMIT requires compiling with SQLITE_ENABLE_UPDATE_DELETE_LIMIT,
            # which may not be available. Use a subquery approach instead.
            cursor = await self._db.execute(
                "DELETE FROM processed_events "
                "WHERE rowid IN ("
                "  SELECT rowid FROM processed_events WHERE ts < ? LIMIT ?"
                ")",
                (cutoff, effective_batch),
            )
            await self._db.commit()
            # The DELETE cursor's rowcount is the exact number of rows removed;
            # this replaces a fragile post-commit "SELECT changes()" round-trip.
            deleted = max(cursor.rowcount, 0)
            self._last_prune_at = time.time()
            self._pruned_rows_last = deleted
            if deleted:
                logger.info("EventStore pruned %d rows (cutoff=%d)", deleted, cutoff)
        except Exception as exc:
            logger.warning("EventStore.prune error: %s", exc)

        return deleted

    # ── Health / introspection ─────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Return a JSON-serializable snapshot of store state for /health."""
        return {
            "enabled": self._ok,
            "db_path": self.db_path,
            "ttl_h": self.ttl_h,
            "ok": self._ok,
            "last_prune_at": self._last_prune_at,
            "pruned_rows_last": self._pruned_rows_last,
        }
|
||||
Reference in New Issue
Block a user