"""
|
|
event_store — M2.3: Persistent event deduplication via SQLite.
|
|
|
|
Stores processed Matrix event_ids so that bridge restarts do not reprocess
|
|
events still returned by /sync (within TTL window).
|
|
|
|
Schema:
|
|
processed_events (room_id, event_id, ts, sender_hash)
|
|
PK: (room_id, event_id)
|
|
IDX: idx_processed_events_ts (ts)
|
|
|
|
Design notes:
|
|
- Uses aiosqlite for non-blocking async access from the ingress event loop.
|
|
- Prune is best-effort: failures are logged but do NOT abort processing.
|
|
- If the DB is unavailable (init error, corruption), EventStore degrades to
|
|
a no-op: is_processed() returns False, mark_processed() is a no-op.
|
|
The in-memory LRU dedupe (H1) continues to protect within a single run.
|
|
- WAL mode is enabled for better concurrent read performance.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Optional, Tuple
|
|
|
|
try:
|
|
import aiosqlite
|
|
_AIOSQLITE_OK = True
|
|
except ImportError: # pragma: no cover
|
|
aiosqlite = None # type: ignore
|
|
_AIOSQLITE_OK = False
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SCHEMA = """
|
|
CREATE TABLE IF NOT EXISTS processed_events (
|
|
room_id TEXT NOT NULL,
|
|
event_id TEXT NOT NULL,
|
|
ts INTEGER NOT NULL,
|
|
sender_hash TEXT,
|
|
PRIMARY KEY (room_id, event_id)
|
|
);
|
|
CREATE INDEX IF NOT EXISTS idx_processed_events_ts ON processed_events (ts);
|
|
"""
|
|
|
|
|
|
class EventStore:
    """
    Async SQLite-backed deduplication store for Matrix event_ids.

    Degrades to a no-op when the DB is unavailable: is_processed() returns
    False and mark_processed()/prune() do nothing (see module docstring).

    Usage:
        store = EventStore("/app/data/matrix_bridge.db", ttl_h=48)
        await store.open()
        ...
        hit = await store.is_processed(room_id, event_id)
        if not hit:
            await store.mark_processed(room_id, event_id, sender_hash)
        ...
        pruned = await store.prune(batch=5000)
        await store.close()
    """

    def __init__(
        self,
        db_path: str,
        ttl_h: int = 48,
        prune_batch: int = 5000,
    ) -> None:
        """
        :param db_path: Filesystem path of the SQLite file (parents created on open()).
        :param ttl_h: Retention window in hours; rows older than this are pruned.
        :param prune_batch: Default max rows deleted per prune() call.
        """
        self.db_path = db_path
        self.ttl_h = ttl_h
        self.prune_batch = prune_batch
        self._db: Optional["aiosqlite.Connection"] = None
        self._ok: bool = False  # False → degraded (no-op) mode
        self._last_prune_at: Optional[float] = None
        self._pruned_rows_last: int = 0

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    async def open(self) -> bool:
        """
        Open the SQLite connection and apply schema.

        Returns True on success; False on failure (degraded mode).
        """
        if not _AIOSQLITE_OK:
            logger.warning("aiosqlite not available — persistent dedupe disabled")
            return False
        try:
            Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
            self._db = await aiosqlite.connect(self.db_path)
            # WAL mode: better concurrent read, non-blocking writes
            await self._db.execute("PRAGMA journal_mode=WAL")
            await self._db.execute("PRAGMA synchronous=NORMAL")
            await self._db.executescript(_SCHEMA)
            await self._db.commit()
            self._ok = True
            logger.info("EventStore opened: %s (ttl_h=%d)", self.db_path, self.ttl_h)
            return True
        except Exception as exc:
            logger.error("EventStore.open failed — degraded: %s", exc)
            # Fix: don't leak a half-initialized connection (e.g. when the
            # schema/PRAGMA step fails after connect() succeeded).
            if self._db is not None:
                try:
                    await self._db.close()
                except Exception:  # pragma: no cover
                    pass
                self._db = None
            self._ok = False
            return False

    async def close(self) -> None:
        """Close the SQLite connection gracefully (safe to call when degraded)."""
        if self._db is not None:
            try:
                await self._db.close()
            except Exception as exc:  # pragma: no cover
                logger.warning("EventStore.close error: %s", exc)
            self._db = None
        self._ok = False

    # ── Core operations ───────────────────────────────────────────────────────

    async def is_processed(self, room_id: str, event_id: str) -> bool:
        """
        Return True if (room_id, event_id) has already been processed.

        Safe to call even when degraded (returns False → no false deduplication).
        """
        if not self._ok or self._db is None:
            return False
        try:
            async with self._db.execute(
                "SELECT 1 FROM processed_events WHERE room_id=? AND event_id=? LIMIT 1",
                (room_id, event_id),
            ) as cursor:
                row = await cursor.fetchone()
                return row is not None
        except Exception as exc:
            logger.warning("EventStore.is_processed error (degraded): %s", exc)
            return False

    async def mark_processed(
        self,
        room_id: str,
        event_id: str,
        sender_hash: str = "",
    ) -> bool:
        """
        Insert (room_id, event_id) as processed.

        Returns True on success, False if already exists or on error.
        Uses INSERT OR IGNORE to avoid duplicates without raising.
        """
        if not self._ok or self._db is None:
            return False
        ts = int(time.time())
        try:
            cursor = await self._db.execute(
                "INSERT OR IGNORE INTO processed_events (room_id, event_id, ts, sender_hash) "
                "VALUES (?, ?, ?, ?)",
                (room_id, event_id, ts, sender_hash or None),
            )
            await self._db.commit()
            # Fix: honor the documented contract. OR IGNORE leaves rowcount at 0
            # when the row already existed, so report False instead of an
            # unconditional True.
            return cursor.rowcount > 0
        except Exception as exc:
            logger.warning("EventStore.mark_processed error (degraded): %s", exc)
            return False

    # ── Prune ─────────────────────────────────────────────────────────────────

    async def prune(self, batch: Optional[int] = None) -> int:
        """
        Delete events older than ttl_h.

        :param batch: Max rows to delete this call. Defaults to self.prune_batch
                      when None; an explicit 0 deletes nothing.
        Returns the number of rows deleted (0 on error or degraded).
        Uses LIMIT batch to avoid long locks on large tables.
        """
        if not self._ok or self._db is None:
            return 0

        cutoff = int(time.time()) - self.ttl_h * 3600
        # Fix: `batch or self.prune_batch` silently promoted an explicit
        # batch=0 to the default; treat only None as "use default".
        effective_batch = self.prune_batch if batch is None else batch
        deleted = 0

        try:
            # SQLite DELETE with LIMIT requires compiling with SQLITE_ENABLE_UPDATE_DELETE_LIMIT,
            # which may not be available. Use a subquery approach instead.
            cursor = await self._db.execute(
                "DELETE FROM processed_events "
                "WHERE rowid IN ("
                " SELECT rowid FROM processed_events WHERE ts < ? LIMIT ?"
                ")",
                (cutoff, effective_batch),
            )
            # Fix: take the count from the DELETE cursor itself. The previous
            # post-commit "SELECT changes()" could report another statement's
            # count if the connection is shared.
            deleted = max(cursor.rowcount, 0)
            await self._db.commit()
            self._last_prune_at = time.time()
            self._pruned_rows_last = deleted
            if deleted:
                logger.info("EventStore pruned %d rows (cutoff=%d)", deleted, cutoff)
        except Exception as exc:
            logger.warning("EventStore.prune error: %s", exc)

        return deleted

    # ── Health / introspection ─────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Return a JSON-safe snapshot of store state for health endpoints."""
        return {
            "enabled": self._ok,
            "db_path": self.db_path,
            "ttl_h": self.ttl_h,
            "ok": self._ok,
            "last_prune_at": self._last_prune_at,
            "pruned_rows_last": self._pruned_rows_last,
        }