Files
microdao-daarion/services/matrix-bridge-dagi/app/policy_store.py
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

1008 lines
39 KiB
Python

"""
policy_store — M6.0: Persistent room-node override store.
SQLite-backed store that allows operators to dynamically set a preferred
node (NODA1, NODA2, …) for any Matrix room without redeploying the bridge.
Resolution layer (in NodePolicy.resolve):
1. explicit node=X kwarg (highest priority)
2. dynamic store override ← this module
3. static BRIDGE_ROOM_NODE_MAP env
4. BRIDGE_DEFAULT_NODE (lowest priority)
All DB operations are synchronous/blocking. Call via asyncio.to_thread
in async contexts to avoid blocking the event loop.
Security:
- operator identity is stored as SHA-256[:16] (no PII verbatim)
- room_id values validated against basic Matrix ID format by callers
- SQLite WAL mode, PRAGMA synchronous=NORMAL for durability+speed
"""
from __future__ import annotations
import datetime
import glob as _glob
import hashlib
import json as _json
import logging
import os as _os
import sqlite3
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Snapshot schema version embedded in exports; import rejects other versions.
POLICY_SNAPSHOT_VERSION = 1
# Import modes: merge = upsert only; replace = upsert + delete rows absent from file.
POLICY_IMPORT_MODE_MERGE = "merge"
POLICY_IMPORT_MODE_REPLACE = "replace"
logger = logging.getLogger(__name__)
# M6.0: room → preferred-node override table.
_DDL = """
CREATE TABLE IF NOT EXISTS room_node_overrides (
room_id TEXT PRIMARY KEY,
node_id TEXT NOT NULL,
updated_at INTEGER NOT NULL,
updated_by_hash TEXT NOT NULL
);
"""
# Index for "most recently updated first" listings.
_IDX_TS = """
CREATE INDEX IF NOT EXISTS idx_rno_updated_at
ON room_node_overrides (updated_at DESC);
"""
# M6.1: Dynamic mixed room agent overrides
_DDL_AGENT = """
CREATE TABLE IF NOT EXISTS room_agent_overrides (
room_id TEXT PRIMARY KEY,
agents_csv TEXT NOT NULL,
default_agent TEXT,
updated_at INTEGER NOT NULL,
updated_by_hash TEXT NOT NULL
);
"""
_IDX_AGENT_TS = """
CREATE INDEX IF NOT EXISTS idx_rao_updated_at
ON room_agent_overrides (updated_at DESC);
"""
# M8.2: HA persistence tables
_DDL_STICKY = """
CREATE TABLE IF NOT EXISTS sticky_node_cache (
key TEXT PRIMARY KEY,
node_id TEXT NOT NULL,
expires_at INTEGER NOT NULL,
updated_at INTEGER NOT NULL
);
"""
_DDL_NODE_HEALTH = """
CREATE TABLE IF NOT EXISTS node_health_state (
node_id TEXT PRIMARY KEY,
ewma_latency_s REAL,
consecutive_failures INTEGER NOT NULL DEFAULT 0,
updated_at INTEGER NOT NULL
);
"""
# M10.2: Policy change history table
_DDL_POLICY_CHANGES = """
CREATE TABLE IF NOT EXISTS policy_changes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
applied_at INTEGER NOT NULL,
verb TEXT NOT NULL DEFAULT '',
mode TEXT NOT NULL DEFAULT '',
source_file TEXT NOT NULL DEFAULT '',
sender_hash TEXT NOT NULL DEFAULT '',
diff_summary TEXT NOT NULL DEFAULT '',
is_destructive INTEGER NOT NULL DEFAULT 0,
node_added INTEGER NOT NULL DEFAULT 0,
node_updated INTEGER NOT NULL DEFAULT 0,
node_deleted INTEGER NOT NULL DEFAULT 0,
agent_added INTEGER NOT NULL DEFAULT 0,
agent_updated INTEGER NOT NULL DEFAULT 0,
agent_deleted INTEGER NOT NULL DEFAULT 0
);
"""
_IDX_POLICY_CHANGES_TS = """
CREATE INDEX IF NOT EXISTS idx_pc_applied_at
ON policy_changes (applied_at DESC);
"""
# Default cap on rows kept in policy_changes (0 = keep all rows).
_POLICY_HISTORY_DEFAULT_LIMIT = 100
# Maximum number of entries returned by list_* (safety cap)
_LIST_HARD_LIMIT = 100
# M9.1: Import diff result dataclass
_SAMPLE_KEYS_MAX = 5
@dataclass
class ImportDiff:
    """
    Result of compute_import_diff — what would change if a snapshot were imported.
    Used to build a preview reply and confirm binding hash (M9.1).
    """
    node_added: int = 0
    node_updated: int = 0
    node_deleted: int = 0
    agent_added: int = 0
    agent_updated: int = 0
    agent_deleted: int = 0
    sample_keys: List[str] = field(default_factory=list)  # capped at _SAMPLE_KEYS_MAX
    is_replace: bool = False

    def total_changes(self) -> int:
        """Sum of all six add/update/delete counters."""
        counters = (
            self.node_added, self.node_updated, self.node_deleted,
            self.agent_added, self.agent_updated, self.agent_deleted,
        )
        return sum(counters)

    def is_destructive(self) -> bool:
        """True if any existing data would be deleted."""
        return bool(self.node_deleted or self.agent_deleted)
# M10.2: Policy change history entry
@dataclass
class PolicyChange:
    """A single recorded policy apply event (import or restore)."""
    id: int
    applied_at: int       # unix timestamp
    verb: str             # e.g. "policy.import", "policy.restore"
    mode: str             # "merge" or "replace"
    source_file: str      # snapshot filename (basename only)
    sender_hash: str      # truncated hash of operator sender_id
    diff_summary: str     # human-readable change summary string
    is_destructive: bool  # True if any deletions occurred
    node_added: int
    node_updated: int
    node_deleted: int
    agent_added: int
    agent_updated: int
    agent_deleted: int

    def when_str(self) -> str:
        """Human-readable UTC timestamp."""
        when = datetime.datetime.fromtimestamp(
            self.applied_at, datetime.timezone.utc
        )
        return when.strftime("%Y-%m-%d %H:%M")

    def changes_short(self) -> str:
        """Compact change summary, e.g. '+2n -1n +1a'."""
        fragments: List[str] = []
        for count, template in (
            (self.node_added, "+{}n"),
            (self.node_updated, "~{}n"),
            (self.node_deleted, "-{}n"),
            (self.agent_added, "+{}a"),
            (self.agent_updated, "~{}a"),
            (self.agent_deleted, "-{}a"),
        ):
            if count:
                fragments.append(template.format(count))
        return " ".join(fragments) or "no changes"
# M10.0: Auto-backup + prune result
# Filename prefix used by write_autobackup-generated snapshots.
_AUTOBACKUP_PREFIX = "policy-autobackup-"
# Glob matched by prune_exports (covers manual exports and autobackups).
_EXPORT_GLOB = "policy-*.json"
# Max basenames shown by PruneResult.sample_filenames.
_PRUNE_SAMPLE_MAX = 5
@dataclass
class PruneResult:
    """Result of prune_exports — what was (or would be) pruned (M10.0)."""
    files_to_delete: List[str]            # basenames of matching expired files
    total_bytes: int                      # approximate bytes freed (or to be freed)
    oldest_mtime: Optional[float] = None  # oldest mtime among files to delete

    @property
    def count(self) -> int:
        """Number of files selected for pruning."""
        return len(self.files_to_delete)

    def sample_filenames(self, n: int = _PRUNE_SAMPLE_MAX) -> List[str]:
        """First *n* basenames, lexicographically sorted for stable output."""
        ordered = sorted(self.files_to_delete)
        return ordered[:n]
def _hash_sender(sender: str) -> str:
"""Partial SHA-256 of sender Matrix ID (non-reversible, no PII stored raw)."""
return hashlib.sha256(sender.encode("utf-8")).hexdigest()[:16]
class PolicyStore:
    """
    Lightweight synchronous SQLite wrapper for room→node overrides.
    Usage pattern (async callers):
    override = await asyncio.to_thread(store.get_override, room_id)
    await asyncio.to_thread(store.set_override, room_id, "NODA2", sender)
    """

    def __init__(self, db_path: str) -> None:
        # Connection is created lazily by open(); all DB methods require it.
        self._db_path = db_path
        self._conn: Optional[sqlite3.Connection] = None
# ── Lifecycle ──────────────────────────────────────────────────────────────
def open(self) -> None:
    """Open (or create) the SQLite DB and apply schema."""
    # Ensure the parent directory exists (first boot on a fresh volume).
    Path(self._db_path).parent.mkdir(parents=True, exist_ok=True)
    self._conn = sqlite3.connect(
        self._db_path,
        check_same_thread=False,
        isolation_level=None,  # autocommit
    )
    # WAL + synchronous=NORMAL: durability/throughput trade-off (see module docstring).
    self._conn.execute("PRAGMA journal_mode=WAL")
    self._conn.execute("PRAGMA synchronous=NORMAL")
    # Schema statements are all CREATE ... IF NOT EXISTS — safe on every boot.
    self._conn.execute(_DDL)
    self._conn.execute(_IDX_TS)
    self._conn.execute(_DDL_AGENT)
    self._conn.execute(_IDX_AGENT_TS)
    # M8.2: HA persistence tables
    self._conn.execute(_DDL_STICKY)
    self._conn.execute(_DDL_NODE_HEALTH)
    # M10.2: Policy change history
    self._conn.execute(_DDL_POLICY_CHANGES)
    self._conn.execute(_IDX_POLICY_CHANGES_TS)
    logger.info("PolicyStore opened: %s", self._db_path)
def close(self) -> None:
    """Close the SQLite connection. Idempotent; close errors are swallowed."""
    conn, self._conn = self._conn, None
    if conn is None:
        return
    try:
        conn.close()
    except Exception:  # noqa: BLE001
        pass
# ── CRUD ───────────────────────────────────────────────────────────────────
def get_override(self, room_id: str) -> Optional[str]:
    """Return the stored node_id for room_id, or None if not set."""
    self._require_open()
    cursor = self._conn.execute(  # type: ignore[union-attr]
        "SELECT node_id FROM room_node_overrides WHERE room_id = ?",
        (room_id,),
    )
    found = cursor.fetchone()
    if found is None:
        return None
    return found[0]
def set_override(self, room_id: str, node_id: str, updated_by: str) -> None:
    """
    Upsert a room→node override.

    updated_by is hashed (SHA-256[:16]) before storage — no raw PII in the DB.
    """
    self._require_open()
    self._conn.execute(  # type: ignore[union-attr]
        """
INSERT INTO room_node_overrides (room_id, node_id, updated_at, updated_by_hash)
VALUES (?, ?, ?, ?)
ON CONFLICT(room_id) DO UPDATE SET
node_id = excluded.node_id,
updated_at = excluded.updated_at,
updated_by_hash = excluded.updated_by_hash
""",
        (room_id, node_id, int(time.time()), _hash_sender(updated_by)),
    )
def delete_override(self, room_id: str) -> bool:
    """Remove override for room_id. Returns True if a row was deleted."""
    self._require_open()
    deleted = self._conn.execute(  # type: ignore[union-attr]
        "DELETE FROM room_node_overrides WHERE room_id = ?",
        (room_id,),
    ).rowcount
    return deleted > 0
def list_overrides(self, limit: int = 10) -> List[Tuple[str, str, int]]:
    """
    Return [(room_id, node_id, updated_at), …] ordered by updated_at DESC.
    Hard-capped at _LIST_HARD_LIMIT regardless of caller's limit.
    """
    self._require_open()
    # Clamp to [1, _LIST_HARD_LIMIT] so a buggy caller can't dump the whole table.
    cap = min(max(1, limit), _LIST_HARD_LIMIT)
    rows = self._conn.execute(  # type: ignore[union-attr]
        """
SELECT room_id, node_id, updated_at
FROM room_node_overrides
ORDER BY updated_at DESC
LIMIT ?
""",
        (cap,),
    ).fetchall()
    return [(r[0], r[1], r[2]) for r in rows]
def count_overrides(self) -> int:
    """Return total number of override rows in the DB."""
    self._require_open()
    result = self._conn.execute(  # type: ignore[union-attr]
        "SELECT COUNT(*) FROM room_node_overrides"
    ).fetchone()
    if result is None:
        return 0
    return int(result[0])
# ── Properties ─────────────────────────────────────────────────────────────
@property
def db_path(self) -> str:
    # Filesystem path of the backing SQLite database file.
    return self._db_path

@property
def is_open(self) -> bool:
    # True once open() has succeeded and close() has not yet been called.
    return self._conn is not None
# ── M6.1: Room agent overrides ─────────────────────────────────────────────
def get_agent_override(
    self, room_id: str
) -> Optional[Tuple[List[str], Optional[str]]]:
    """
    Return (agents_list, default_agent_or_None) for room_id,
    or None if no override exists.
    """
    self._require_open()
    found = self._conn.execute(  # type: ignore[union-attr]
        "SELECT agents_csv, default_agent FROM room_agent_overrides WHERE room_id = ?",
        (room_id,),
    ).fetchone()
    if found is None:
        return None
    csv_value, default_agent = found
    # CSV is stored trimmed, but be defensive about stray whitespace/empties.
    parsed = [part.strip() for part in csv_value.split(",") if part.strip()]
    return parsed, (default_agent or None)
def set_agent_override(
    self,
    room_id: str,
    agents: List[str],
    default_agent: Optional[str],
    updated_by: str,
) -> None:
    """Upsert a room agent override (sorted, deduplicated agents_csv)."""
    self._require_open()
    # Canonical storage form: deduplicated, sorted, comma-joined.
    agents_csv = ",".join(sorted(set(agents)))
    self._conn.execute(  # type: ignore[union-attr]
        """
INSERT INTO room_agent_overrides
(room_id, agents_csv, default_agent, updated_at, updated_by_hash)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT(room_id) DO UPDATE SET
agents_csv = excluded.agents_csv,
default_agent = excluded.default_agent,
updated_at = excluded.updated_at,
updated_by_hash = excluded.updated_by_hash
""",
        (room_id, agents_csv, default_agent, int(time.time()), _hash_sender(updated_by)),
    )
def delete_agent_override(self, room_id: str) -> bool:
    """Remove agent override for room_id. Returns True if deleted."""
    self._require_open()
    affected = self._conn.execute(  # type: ignore[union-attr]
        "DELETE FROM room_agent_overrides WHERE room_id = ?",
        (room_id,),
    ).rowcount
    return affected > 0
def add_agent_to_room(
    self, room_id: str, agent: str, updated_by: str
) -> Tuple[List[str], Optional[str]]:
    """
    Add agent to room override, creating it if it doesn't exist.
    Returns the new (agents, default_agent) state.
    """
    self._require_open()
    current = self.get_agent_override(room_id)
    if current is None:
        # First agent for this room also becomes its default.
        self.set_agent_override(room_id, [agent], agent, updated_by)
        return [agent], agent
    agents, default = current
    if agent in agents:
        # Already present — no write needed; state unchanged.
        return agents, default
    merged = sorted(set(agents) | {agent})
    self.set_agent_override(room_id, merged, default, updated_by)
    return merged, default
def remove_agent_from_room(
    self, room_id: str, agent: str, updated_by: str
) -> Tuple[bool, Optional[str]]:
    """
    Remove agent from room override.
    Returns (removed: bool, error_message_or_None).
    If the last agent is removed, the entire override is deleted.
    """
    self._require_open()
    existing = self.get_agent_override(room_id)
    if not existing:
        return False, "No agent override set for this room"
    agents, default = existing
    if agent not in agents:
        return False, f"Agent `{agent}` not in override list"
    agents = [a for a in agents if a != agent]
    if not agents:
        # Last agent removed — drop the whole override row.
        self.delete_agent_override(room_id)
        return True, None
    # If the removed agent was the default, promote the first remaining agent.
    new_default = default if default != agent else agents[0]
    self.set_agent_override(room_id, agents, new_default, updated_by)
    return True, None
def list_agent_overrides(
    self, limit: int = 10
) -> List[Tuple[str, List[str], Optional[str], int]]:
    """
    Return [(room_id, agents_list, default_agent, updated_at), …]
    ordered by updated_at DESC.
    """
    self._require_open()
    # Same safety clamp as list_overrides: [1, _LIST_HARD_LIMIT].
    cap = min(max(1, limit), _LIST_HARD_LIMIT)
    rows = self._conn.execute(  # type: ignore[union-attr]
        """
SELECT room_id, agents_csv, default_agent, updated_at
FROM room_agent_overrides
ORDER BY updated_at DESC
LIMIT ?
""",
        (cap,),
    ).fetchall()
    # Split agents_csv back into a clean list; empty default becomes None.
    return [
        (r[0], [a.strip() for a in r[1].split(",") if a.strip()], r[2] or None, r[3])
        for r in rows
    ]
def count_agent_overrides(self) -> int:
    """Return total number of agent override rows."""
    self._require_open()
    result = self._conn.execute(  # type: ignore[union-attr]
        "SELECT COUNT(*) FROM room_agent_overrides"
    ).fetchone()
    return int(result[0]) if result is not None else 0
# ── M8.2: HA persistence — sticky node cache ──────────────────────────────
def upsert_sticky(self, key: str, node_id: str, expires_at_unix: int) -> None:
    """Persist a sticky routing entry. Idempotent (upsert by key)."""
    assert self._conn, "Store not open"
    updated_at = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    params = (key, node_id, expires_at_unix, updated_at)
    self._conn.execute(
        """INSERT INTO sticky_node_cache (key, node_id, expires_at, updated_at)
VALUES (?, ?, ?, ?)
ON CONFLICT(key) DO UPDATE SET
node_id=excluded.node_id,
expires_at=excluded.expires_at,
updated_at=excluded.updated_at""",
        params,
    )
def delete_sticky(self, key: str) -> bool:
    """Remove a sticky entry. Returns True if it existed."""
    assert self._conn, "Store not open"
    removed = self._conn.execute(
        "DELETE FROM sticky_node_cache WHERE key=?", (key,)
    ).rowcount
    return removed > 0
def load_sticky_entries(self) -> List[Tuple[str, str, int]]:
    """
    Return all non-expired sticky entries as (key, node_id, expires_at_unix).
    Callers filter by monotonic time; here we compare against unix now.
    """
    assert self._conn, "Store not open"
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    # Strictly greater-than: entries expiring exactly now are excluded.
    rows = self._conn.execute(
        "SELECT key, node_id, expires_at FROM sticky_node_cache WHERE expires_at > ?",
        (now,),
    ).fetchall()
    return [(r[0], r[1], int(r[2])) for r in rows]
def prune_sticky_expired(self) -> int:
    """Remove all expired sticky entries. Returns count removed."""
    assert self._conn, "Store not open"
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    cur = self._conn.execute(
        "DELETE FROM sticky_node_cache WHERE expires_at <= ?", (now,)
    )
    # rowcount reflects rows deleted by the statement above.
    return cur.rowcount
# ── M8.2: HA persistence — node health state ──────────────────────────────
def upsert_node_health(
    self,
    node_id: str,
    ewma_latency_s: Optional[float],
    consecutive_failures: int,
) -> None:
    """Persist node health snapshot. Idempotent (upsert by node_id)."""
    assert self._conn, "Store not open"
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    # ewma_latency_s may be None (no samples yet) — column is nullable REAL.
    self._conn.execute(
        """INSERT INTO node_health_state
(node_id, ewma_latency_s, consecutive_failures, updated_at)
VALUES (?, ?, ?, ?)
ON CONFLICT(node_id) DO UPDATE SET
ewma_latency_s=excluded.ewma_latency_s,
consecutive_failures=excluded.consecutive_failures,
updated_at=excluded.updated_at""",
        (node_id, ewma_latency_s, consecutive_failures, now),
    )
def load_node_health(self, max_age_s: int = 600) -> Optional[Dict[str, Any]]:
    """
    Load node health snapshot if all rows are fresh enough (updated_at >= now - max_age_s).
    Returns None if no rows or snapshot is stale.
    Returns dict: {node_id: {ewma_latency_s, consecutive_failures, updated_at}}
    """
    assert self._conn, "Store not open"
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    cutoff = now - max_age_s
    rows = self._conn.execute(
        """SELECT node_id, ewma_latency_s, consecutive_failures, updated_at
FROM node_health_state""",
    ).fetchall()
    if not rows:
        return None
    result: Dict[str, Any] = {}
    for node_id, ewma, consec, updated_at in rows:
        # All-or-nothing freshness: one stale row invalidates the snapshot,
        # so callers never mix persisted state from different eras.
        if int(updated_at) < cutoff:
            logger.debug(
                "HA: node health snapshot for %s is stale (age=%ds > max=%ds) — ignoring",
                node_id, now - int(updated_at), max_age_s,
            )
            return None  # Any stale node → discard whole snapshot
        result[node_id] = {
            "ewma_latency_s": ewma,
            "consecutive_failures": int(consec),
            "updated_at": int(updated_at),
        }
    return result if result else None
# ── M6.2: Snapshot export / import ────────────────────────────────────────
# ── M10.2: Policy change history ──────────────────────────────────────────
def record_policy_change(
    self,
    verb: str,
    mode: str,
    source_file: str,
    sender_hash: str,
    diff_summary: str,
    is_destructive: bool,
    node_added: int,
    node_updated: int,
    node_deleted: int,
    agent_added: int,
    agent_updated: int,
    agent_deleted: int,
    history_limit: int = _POLICY_HISTORY_DEFAULT_LIMIT,
) -> int:
    """
    Insert a policy apply event into the history table and prune old rows.
    history_limit=0 means keep all rows (no pruning).
    Returns the id of the inserted row.
    """
    self._require_open()
    cur = self._conn.execute(  # type: ignore[union-attr]
        """INSERT INTO policy_changes
(applied_at, verb, mode, source_file, sender_hash,
diff_summary, is_destructive,
node_added, node_updated, node_deleted,
agent_added, agent_updated, agent_deleted)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (
            int(time.time()), verb, mode, source_file, sender_hash,
            diff_summary, int(is_destructive),
            node_added, node_updated, node_deleted,
            agent_added, agent_updated, agent_deleted,
        ),
    )
    row_id: int = cur.lastrowid  # type: ignore[assignment]
    # Prune oldest rows beyond limit
    if history_limit > 0:
        # Keep only the newest `history_limit` ids (AUTOINCREMENT ids grow monotonically).
        self._conn.execute(  # type: ignore[union-attr]
            """DELETE FROM policy_changes
WHERE id NOT IN (
SELECT id FROM policy_changes ORDER BY id DESC LIMIT ?
)""",
            (history_limit,),
        )
    logger.debug(
        "Recorded policy change id=%d verb=%s mode=%s file=%s destr=%s",
        row_id, verb, mode, source_file, is_destructive,
    )
    return row_id
def list_policy_changes(self, limit: int = 10) -> List[PolicyChange]:
    """
    Return the most-recent `limit` policy change records, newest first.
    Hard cap: min(limit, _LIST_HARD_LIMIT).
    """
    self._require_open()
    safe_limit = min(max(1, limit), _LIST_HARD_LIMIT)
    rows = self._conn.execute(  # type: ignore[union-attr]
        """SELECT id, applied_at, verb, mode, source_file, sender_hash,
diff_summary, is_destructive,
node_added, node_updated, node_deleted,
agent_added, agent_updated, agent_deleted
FROM policy_changes
ORDER BY id DESC LIMIT ?""",
        (safe_limit,),
    ).fetchall()
    # SELECT column order matches the PolicyChange field order below.
    return [
        PolicyChange(
            id=r[0], applied_at=r[1], verb=r[2], mode=r[3],
            source_file=r[4], sender_hash=r[5], diff_summary=r[6],
            is_destructive=bool(r[7]),
            node_added=r[8], node_updated=r[9], node_deleted=r[10],
            agent_added=r[11], agent_updated=r[12], agent_deleted=r[13],
        )
        for r in rows
    ]
def get_policy_changes_count(self) -> int:
    """
    Return the total number of recorded policy changes.

    Consistency fix: coerce the COUNT(*) value with int() and fall back to 0
    on a missing row, matching count_overrides / count_agent_overrides.
    """
    self._require_open()
    row = self._conn.execute(  # type: ignore[union-attr]
        "SELECT COUNT(*) FROM policy_changes"
    ).fetchone()
    return int(row[0]) if row else 0
def get_policy_change_by_id(self, change_id: int) -> Optional["PolicyChange"]:
    """Return a single PolicyChange by its DB auto-increment id, or None."""
    self._require_open()
    row = self._conn.execute(  # type: ignore[union-attr]
        """SELECT id, applied_at, verb, mode, source_file, sender_hash,
diff_summary, is_destructive,
node_added, node_updated, node_deleted,
agent_added, agent_updated, agent_deleted
FROM policy_changes WHERE id = ?""",
        (change_id,),
    ).fetchone()
    if row is None:
        return None
    # SELECT column order matches the PolicyChange field order below.
    return PolicyChange(
        id=row[0], applied_at=row[1], verb=row[2], mode=row[3],
        source_file=row[4], sender_hash=row[5], diff_summary=row[6],
        is_destructive=bool(row[7]),
        node_added=row[8], node_updated=row[9], node_deleted=row[10],
        agent_added=row[11], agent_updated=row[12], agent_deleted=row[13],
    )
# ── M10.0: Auto-backup + retention prune ──────────────────────────────────
def write_autobackup(
    self,
    exports_dir: str,
    sender_hash8: str,
    nonce: str,
) -> tuple[str, str]:
    """
    Export all current policy to a timestamped autobackup file.
    Filename: policy-autobackup-<UTC>-<senderhash8>-<nonce>.json
    Returns (file_path, content_hash_prefix[:8]).
    Non-atomic write is acceptable: file is complete before we return.
    """
    self._require_open()
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    filename = f"{_AUTOBACKUP_PREFIX}{ts}-{sender_hash8[:8]}-{nonce}.json"
    file_path = _os.path.join(exports_dir, filename)
    snapshot = self.export_all()
    # sort_keys + ensure_ascii → deterministic bytes → stable content hash.
    content = _json.dumps(snapshot, sort_keys=True, ensure_ascii=True)
    with open(file_path, "w", encoding="utf-8") as fh:
        fh.write(content)
    content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()[:8]
    logger.debug("Auto-backup written: %s hash=%s", filename, content_hash)
    return file_path, content_hash
def prune_exports(
    self,
    exports_dir: str,
    retention_days: int,
    dry_run: bool = True,
) -> PruneResult:
    """
    Remove policy export files older than retention_days.
    Only files matching 'policy-*.json' in exports_dir are considered —
    never recursing into subdirectories.
    dry_run=True: compute stats without deleting.
    dry_run=False: actually delete matching files.
    Returns PruneResult with filenames, total_bytes, oldest_mtime.
    """
    if retention_days <= 0:
        # Retention disabled: never prune anything.
        return PruneResult(files_to_delete=[], total_bytes=0)
    cutoff = time.time() - retention_days * 86400
    pattern = _os.path.join(exports_dir, _EXPORT_GLOB)
    # BUG FIX: normalize the directory once and compare absolute paths on
    # BOTH sides. The old check compared dirname(fpath) (relative when
    # exports_dir is relative) against abspath(exports_dir) (always
    # absolute), so every file was skipped and pruning silently no-oped
    # for relative export directories.
    exports_dir_abs = _os.path.abspath(exports_dir)
    to_delete: List[str] = []
    total_bytes = 0
    oldest_mtime: Optional[float] = None
    for fpath in sorted(_glob.glob(pattern)):
        # Safety: only process files directly in exports_dir (no subdirs)
        if _os.path.abspath(_os.path.dirname(fpath)) != exports_dir_abs:
            continue
        try:
            stat = _os.stat(fpath)
        except OSError:
            # File vanished between glob and stat — skip.
            continue
        if stat.st_mtime < cutoff:
            basename = _os.path.basename(fpath)
            to_delete.append(basename)
            total_bytes += stat.st_size
            if oldest_mtime is None or stat.st_mtime < oldest_mtime:
                oldest_mtime = stat.st_mtime
    if not dry_run:
        for basename in to_delete:
            fpath = _os.path.join(exports_dir, basename)
            try:
                _os.remove(fpath)
                logger.info("Pruned policy export: %s", basename)
            except OSError as exc:
                # Best-effort: report and continue with remaining files.
                logger.warning("Could not prune %s: %s", basename, exc)
    return PruneResult(
        files_to_delete=to_delete,
        total_bytes=total_bytes,
        oldest_mtime=oldest_mtime,
    )
def export_all(self) -> Dict[str, Any]:
    """
    Export all overrides as a JSON-serializable snapshot dict.
    Format (version 1):
    {
    "version": 1,
    "created_at": "<ISO8601>Z",
    "room_node_overrides": [{room_id, node_id, updated_at, updated_by}, ...],
    "room_agent_overrides": [{room_id, agents, default_agent, updated_at, updated_by}, ...]
    }
    """
    self._require_open()
    # ORDER BY room_id makes snapshot output deterministic (stable hashing).
    node_rows = self._conn.execute(  # type: ignore[union-attr]
        "SELECT room_id, node_id, updated_at, updated_by_hash FROM room_node_overrides ORDER BY room_id"
    ).fetchall()
    agent_rows = self._conn.execute(  # type: ignore[union-attr]
        """SELECT room_id, agents_csv, default_agent, updated_at, updated_by_hash
FROM room_agent_overrides ORDER BY room_id"""
    ).fetchall()
    return {
        "version": POLICY_SNAPSHOT_VERSION,
        "created_at": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"),
        "room_node_overrides": [
            {"room_id": r[0], "node_id": r[1], "updated_at": r[2], "updated_by": r[3]}
            for r in node_rows
        ],
        "room_agent_overrides": [
            {
                "room_id": r[0],
                # agents_csv is split back into a clean list for the snapshot.
                "agents": [a.strip() for a in r[1].split(",") if a.strip()],
                "default_agent": r[2] or None,
                "updated_at": r[3],
                "updated_by": r[4],
            }
            for r in agent_rows
        ],
    }
def compute_import_diff(
    self,
    data: Dict[str, Any],
    mode: str = POLICY_IMPORT_MODE_MERGE,
) -> ImportDiff:
    """
    Compute what would change if data were imported (dry-run, M9.1).
    Returns an ImportDiff with counts and up to _SAMPLE_KEYS_MAX changed rooms.
    Non-destructive — never modifies the database.

    Raises ValueError if the snapshot's version field is unsupported.
    """
    if data.get("version") != POLICY_SNAPSHOT_VERSION:
        raise ValueError(f"Unsupported snapshot version: {data.get('version')!r}")
    self._require_open()
    # Current DB state keyed by room_id.
    existing_nodes: Dict[str, str] = {
        r[0]: r[1]
        for r in self._conn.execute(  # type: ignore[union-attr]
            "SELECT room_id, node_id FROM room_node_overrides"
        ).fetchall()
    }
    existing_agents: Dict[str, str] = {
        r[0]: r[1]
        for r in self._conn.execute(  # type: ignore[union-attr]
            "SELECT room_id, agents_csv FROM room_agent_overrides"
        ).fetchall()
    }
    # Snapshot entries keyed by room_id; malformed entries are skipped.
    file_nodes: Dict[str, str] = {
        e["room_id"]: e["node_id"]
        for e in (data.get("room_node_overrides") or [])
        if "room_id" in e and "node_id" in e
    }
    file_agents: Dict[str, Any] = {
        e["room_id"]: e
        for e in (data.get("room_agent_overrides") or [])
        if "room_id" in e and "agents" in e
    }
    # NOTE: "updated" counts rooms present in both file and DB, even if
    # the stored value happens to be identical.
    node_added = sum(1 for r in file_nodes if r not in existing_nodes)
    node_updated = sum(1 for r in file_nodes if r in existing_nodes)
    agent_added = sum(1 for r in file_agents if r not in existing_agents)
    agent_updated = sum(1 for r in file_agents if r in existing_agents)
    node_deleted = 0
    agent_deleted = 0
    if mode == POLICY_IMPORT_MODE_REPLACE:
        # replace mode additionally deletes DB rows absent from the snapshot.
        node_deleted = sum(1 for r in existing_nodes if r not in file_nodes)
        agent_deleted = sum(1 for r in existing_agents if r not in file_agents)
    # Collect up to _SAMPLE_KEYS_MAX affected rooms (deterministic: sorted)
    affected: List[str] = []
    seen: set[str] = set()
    for rid in list(file_nodes) + list(file_agents):
        if rid not in seen:
            affected.append(rid)
            seen.add(rid)
    if mode == POLICY_IMPORT_MODE_REPLACE:
        for rid in list(existing_nodes) + list(existing_agents):
            if rid not in seen and (rid not in file_nodes or rid not in file_agents):
                affected.append(rid)
                seen.add(rid)
    sample_keys = sorted(affected)[:_SAMPLE_KEYS_MAX]
    return ImportDiff(
        node_added=node_added,
        node_updated=node_updated,
        node_deleted=node_deleted,
        agent_added=agent_added,
        agent_updated=agent_updated,
        agent_deleted=agent_deleted,
        sample_keys=sample_keys,
        is_replace=(mode == POLICY_IMPORT_MODE_REPLACE),
    )
def import_snapshot(
    self,
    data: Dict[str, Any],
    mode: str = POLICY_IMPORT_MODE_MERGE,
    dry_run: bool = True,
    imported_by: str = "import",
) -> Dict[str, int]:
    """
    Import a policy snapshot.
    mode=merge: upsert entries from file; never delete existing entries not in file.
    mode=replace: upsert entries from file AND delete entries in DB not present in file.
    dry_run=True: compute stats without modifying DB.
    Returns:
    {
    "node_added": N, "node_updated": N, "node_deleted": N,
    "agent_added": N, "agent_updated": N, "agent_deleted": N,
    }

    Raises ValueError if the snapshot's version field is unsupported.
    """
    # Reject snapshots from a different schema generation up-front.
    if data.get("version") != POLICY_SNAPSHOT_VERSION:
        raise ValueError(f"Unsupported snapshot version: {data.get('version')!r}")
    self._require_open()
    # ── Current DB state ──────────────────────────────────────────────────
    existing_nodes: Dict[str, str] = {
        r[0]: r[1]
        for r in self._conn.execute(  # type: ignore[union-attr]
            "SELECT room_id, node_id FROM room_node_overrides"
        ).fetchall()
    }
    existing_agents: Dict[str, str] = {
        r[0]: r[1]
        for r in self._conn.execute(  # type: ignore[union-attr]
            "SELECT room_id, agents_csv FROM room_agent_overrides"
        ).fetchall()
    }
    # ── Compute deltas ────────────────────────────────────────────────────
    # Malformed snapshot entries (missing keys) are silently skipped.
    file_nodes = {
        e["room_id"]: e["node_id"]
        for e in (data.get("room_node_overrides") or [])
        if "room_id" in e and "node_id" in e
    }
    file_agents = {
        e["room_id"]: e
        for e in (data.get("room_agent_overrides") or [])
        if "room_id" in e and "agents" in e
    }
    node_added = sum(1 for r in file_nodes if r not in existing_nodes)
    node_updated = sum(1 for r in file_nodes if r in existing_nodes)
    agent_added = sum(1 for r in file_agents if r not in existing_agents)
    agent_updated = sum(1 for r in file_agents if r in existing_agents)
    node_deleted = 0
    agent_deleted = 0
    if mode == POLICY_IMPORT_MODE_REPLACE:
        node_deleted = sum(1 for r in existing_nodes if r not in file_nodes)
        agent_deleted = sum(1 for r in existing_agents if r not in file_agents)
    stats = {
        "node_added": node_added,
        "node_updated": node_updated,
        "node_deleted": node_deleted,
        "agent_added": agent_added,
        "agent_updated": agent_updated,
        "agent_deleted": agent_deleted,
    }
    if dry_run:
        return stats
    # ── Apply changes ─────────────────────────────────────────────────────
    # All imported rows share one timestamp and one operator hash.
    now = int(time.time())
    by_hash = _hash_sender(imported_by)
    for entry in (data.get("room_node_overrides") or []):
        rid = entry.get("room_id")
        nid = entry.get("node_id")
        if rid and nid:
            self._conn.execute(  # type: ignore[union-attr]
                """
INSERT INTO room_node_overrides (room_id, node_id, updated_at, updated_by_hash)
VALUES (?, ?, ?, ?)
ON CONFLICT(room_id) DO UPDATE SET
node_id = excluded.node_id,
updated_at = excluded.updated_at,
updated_by_hash = excluded.updated_by_hash
""",
                (rid, nid, now, by_hash),
            )
    for entry in (data.get("room_agent_overrides") or []):
        rid = entry.get("room_id")
        agents = entry.get("agents") or []
        # Missing default falls back to the first listed agent.
        def_agent = entry.get("default_agent") or (agents[0] if agents else None)
        if rid and agents:
            agents_csv = ",".join(sorted(set(agents)))
            self._conn.execute(  # type: ignore[union-attr]
                """
INSERT INTO room_agent_overrides
(room_id, agents_csv, default_agent, updated_at, updated_by_hash)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT(room_id) DO UPDATE SET
agents_csv = excluded.agents_csv,
default_agent = excluded.default_agent,
updated_at = excluded.updated_at,
updated_by_hash = excluded.updated_by_hash
""",
                (rid, agents_csv, def_agent, now, by_hash),
            )
    if mode == POLICY_IMPORT_MODE_REPLACE:
        # replace mode: drop DB rows the snapshot does not mention.
        file_node_rooms = set(file_nodes.keys())
        file_agent_rooms = set(file_agents.keys())
        for room_id in existing_nodes:
            if room_id not in file_node_rooms:
                self._conn.execute(  # type: ignore[union-attr]
                    "DELETE FROM room_node_overrides WHERE room_id = ?", (room_id,)
                )
        for room_id in existing_agents:
            if room_id not in file_agent_rooms:
                self._conn.execute(  # type: ignore[union-attr]
                    "DELETE FROM room_agent_overrides WHERE room_id = ?", (room_id,)
                )
    return stats
# ── Internal ───────────────────────────────────────────────────────────────
def _require_open(self) -> None:
    """Raise RuntimeError unless open() has been called (guard for DB methods)."""
    if self._conn is not None:
        return
    raise RuntimeError("PolicyStore is not open — call open() first")