Files
microdao-daarion/services/matrix-bridge-dagi/app/event_store.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by the DEBUG_INJECT_ENABLED flag, default false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

214 lines
7.7 KiB
Python

"""
event_store — M2.3: Persistent event deduplication via SQLite.
Stores processed Matrix event_ids so that bridge restarts do not reprocess
events still returned by /sync (within TTL window).
Schema:
processed_events (room_id, event_id, ts, sender_hash)
PK: (room_id, event_id)
IDX: idx_processed_events_ts (ts)
Design notes:
- Uses aiosqlite for non-blocking async access from the ingress event loop.
- Prune is best-effort: failures are logged but do NOT abort processing.
- If the DB is unavailable (init error, corruption), EventStore degrades to
a no-op: is_processed() returns False, mark_processed() is a no-op.
The in-memory LRU dedupe (H1) continues to protect within a single run.
- WAL mode is enabled for better concurrent read performance.
"""
from __future__ import annotations
import logging
import time
from pathlib import Path
from typing import Optional, Tuple
try:
import aiosqlite
_AIOSQLITE_OK = True
except ImportError: # pragma: no cover
aiosqlite = None # type: ignore
_AIOSQLITE_OK = False
logger = logging.getLogger(__name__)
_SCHEMA = """
CREATE TABLE IF NOT EXISTS processed_events (
room_id TEXT NOT NULL,
event_id TEXT NOT NULL,
ts INTEGER NOT NULL,
sender_hash TEXT,
PRIMARY KEY (room_id, event_id)
);
CREATE INDEX IF NOT EXISTS idx_processed_events_ts ON processed_events (ts);
"""
class EventStore:
    """
    Async SQLite-backed deduplication store for Matrix event_ids.

    Usage:
        store = EventStore("/app/data/matrix_bridge.db", ttl_h=48)
        await store.open()
        ...
        hit = await store.is_processed(room_id, event_id)
        if not hit:
            await store.mark_processed(room_id, event_id, sender_hash)
        ...
        pruned = await store.prune(batch=5000)
        await store.close()

    Degraded mode: if aiosqlite is missing or open() fails, every operation
    becomes a safe no-op (is_processed → False, mark_processed → False,
    prune → 0) so the in-memory LRU dedupe remains the only protection.
    """

    def __init__(
        self,
        db_path: str,
        ttl_h: int = 48,
        prune_batch: int = 5000,
    ) -> None:
        """
        :param db_path: SQLite database file path (parent dirs created on open()).
        :param ttl_h: retention window in hours; rows older than this are pruned.
        :param prune_batch: default max rows deleted per prune() call
                            (bounds write-lock duration on large tables).
        """
        self.db_path = db_path
        self.ttl_h = ttl_h
        self.prune_batch = prune_batch
        self._db: Optional["aiosqlite.Connection"] = None
        self._ok: bool = False
        # Introspection for health reporting: last prune time + rows removed.
        self._last_prune_at: Optional[float] = None
        self._pruned_rows_last: int = 0

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    async def open(self) -> bool:
        """
        Open the SQLite connection and apply schema.

        Returns True on success; False on failure (degraded mode).
        """
        if not _AIOSQLITE_OK:
            logger.warning("aiosqlite not available — persistent dedupe disabled")
            return False
        try:
            Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
            self._db = await aiosqlite.connect(self.db_path)
            # WAL mode: better concurrent read, non-blocking writes
            await self._db.execute("PRAGMA journal_mode=WAL")
            await self._db.execute("PRAGMA synchronous=NORMAL")
            await self._db.executescript(_SCHEMA)
            await self._db.commit()
            self._ok = True
            logger.info("EventStore opened: %s (ttl_h=%d)", self.db_path, self.ttl_h)
            return True
        except Exception as exc:
            logger.error("EventStore.open failed — degraded: %s", exc)
            # Fix: connect() may have succeeded before the PRAGMA/schema step
            # raised; close the half-open connection instead of leaking it.
            if self._db is not None:
                try:
                    await self._db.close()
                except Exception:  # pragma: no cover
                    pass
                self._db = None
            self._ok = False
            return False

    async def close(self) -> None:
        """Close the SQLite connection gracefully (idempotent)."""
        if self._db is not None:
            try:
                await self._db.close()
            except Exception as exc:  # pragma: no cover
                logger.warning("EventStore.close error: %s", exc)
            self._db = None
        self._ok = False

    # ── Core operations ───────────────────────────────────────────────────────

    async def is_processed(self, room_id: str, event_id: str) -> bool:
        """
        Return True if (room_id, event_id) has already been processed.

        Safe to call even when degraded (returns False → no false deduplication).
        """
        if not self._ok or self._db is None:
            return False
        try:
            async with self._db.execute(
                "SELECT 1 FROM processed_events WHERE room_id=? AND event_id=? LIMIT 1",
                (room_id, event_id),
            ) as cursor:
                row = await cursor.fetchone()
                return row is not None
        except Exception as exc:
            logger.warning("EventStore.is_processed error (degraded): %s", exc)
            return False

    async def mark_processed(
        self,
        room_id: str,
        event_id: str,
        sender_hash: str = "",
    ) -> bool:
        """
        Insert (room_id, event_id) as processed.

        Returns True on success, False if already exists or on error.
        Uses INSERT OR IGNORE to avoid duplicates without raising.
        """
        if not self._ok or self._db is None:
            return False
        ts = int(time.time())
        try:
            cursor = await self._db.execute(
                "INSERT OR IGNORE INTO processed_events (room_id, event_id, ts, sender_hash) "
                "VALUES (?, ?, ?, ?)",
                (room_id, event_id, ts, sender_hash or None),
            )
            await self._db.commit()
            # Fix: OR IGNORE leaves rowcount at 0 when the row already existed;
            # returning True in that case contradicted the documented contract.
            return cursor.rowcount > 0
        except Exception as exc:
            logger.warning("EventStore.mark_processed error (degraded): %s", exc)
            return False

    # ── Prune ─────────────────────────────────────────────────────────────────

    async def prune(self, batch: Optional[int] = None) -> int:
        """
        Delete events older than ttl_h.

        :param batch: max rows to delete this call; defaults to self.prune_batch.
        Returns the number of rows deleted (0 on error or degraded).
        Uses a bounded subquery to avoid long locks on large tables.
        """
        if not self._ok or self._db is None:
            return 0
        cutoff = int(time.time()) - self.ttl_h * 3600
        # Fix: `batch or self.prune_batch` discarded an explicit batch=0;
        # only fall back to the default when batch was not given at all.
        effective_batch = self.prune_batch if batch is None else batch
        deleted = 0
        try:
            # SQLite DELETE with LIMIT requires compiling with
            # SQLITE_ENABLE_UPDATE_DELETE_LIMIT, which may not be available.
            # Use a subquery approach instead.
            cursor = await self._db.execute(
                "DELETE FROM processed_events "
                "WHERE rowid IN ("
                " SELECT rowid FROM processed_events WHERE ts < ? LIMIT ?"
                ")",
                (cutoff, effective_batch),
            )
            await self._db.commit()
            # rowcount comes straight from the DELETE; no post-commit
            # SELECT changes() round-trip needed (rowcount may be -1 if the
            # driver could not determine it — clamp to 0 in that case).
            deleted = max(cursor.rowcount, 0)
            self._last_prune_at = time.time()
            self._pruned_rows_last = deleted
            if deleted:
                logger.info("EventStore pruned %d rows (cutoff=%d)", deleted, cutoff)
        except Exception as exc:
            logger.warning("EventStore.prune error: %s", exc)
        return deleted

    # ── Health / introspection ─────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Return a JSON-serializable snapshot of store state for /health."""
        return {
            "enabled": self._ok,
            "db_path": self.db_path,
            "ttl_h": self.ttl_h,
            "ok": self._ok,
            "last_prune_at": self._last_prune_at,
            "pruned_rows_last": self._pruned_rows_last,
        }