"""
risk_history_store.py — Storage layer for Risk Score snapshots.

Provides:
    RiskSnapshot             — dataclass for a single point-in-time risk record
    RiskHistoryStore         — abstract base
    MemoryRiskHistoryStore   — in-process (tests + fallback)
    NullRiskHistoryStore     — no-op (disabled)
    PostgresRiskHistoryStore — Postgres primary (psycopg2 sync)
    AutoRiskHistoryStore     — Postgres → Memory fallback

Factory:
    get_risk_history_store() → AutoRiskHistoryStore by default
"""
from __future__ import annotations

import datetime
import json
import logging
import os
import threading
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)


# ─── Data model ───────────────────────────────────────────────────────────────

@dataclass
class RiskSnapshot:
    """A single point-in-time risk record for one service in one env.

    Timestamps are naive ISO-8601 UTC strings; all stores compare them
    lexicographically, which is order-correct only because every producer
    uses the same naive-UTC `isoformat()` shape.
    """

    ts: str                 # ISO-8601 UTC (naive), e.g. "2024-01-01T12:00:00"
    service: str
    env: str
    score: int
    band: str
    components: Dict = field(default_factory=dict)   # per-signal score breakdown
    reasons: List[str] = field(default_factory=list)  # human-readable drivers

    def to_dict(self) -> Dict:
        """Serialize to a plain dict (JSON-safe given JSON-safe components)."""
        return asdict(self)

    @staticmethod
    def from_dict(d: Dict) -> "RiskSnapshot":
        """Build a snapshot from a dict; tolerant of missing optional keys.

        Raises KeyError if `ts`, `service`, or `score` is absent.
        """
        return RiskSnapshot(
            ts=d["ts"],
            service=d["service"],
            env=d.get("env", "prod"),
            score=int(d["score"]),
            band=d.get("band", "low"),
            components=d.get("components", {}),
            reasons=d.get("reasons", []),
        )


# ─── Abstract base ────────────────────────────────────────────────────────────

class RiskHistoryStore(ABC):
    """Interface for persisting and querying RiskSnapshot time series."""

    @abstractmethod
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Persist records; returns number written."""

    @abstractmethod
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env."""

    @abstractmethod
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots in descending time order within last `hours` hours."""

    def get_delta(self, service: str, env: str, hours: int = 24) -> Optional[int]:
        """
        latest.score - closest-to-(now-hours) score.
        Returns None if no baseline is available.
        """
        # Fetch 2x the window so a baseline just older than the cutoff is
        # still in range; cap at 500 to bound the scan.
        series = self.get_series(service, env, hours=hours * 2, limit=500)
        if not series:
            return None
        latest = series[0]
        cutoff_ts = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        # Series is descending, so the first snapshot at-or-before the cutoff
        # is the one closest to it from the older side.
        baseline = None
        for snap in series:
            if snap.ts <= cutoff_ts:
                baseline = snap
                break
        if baseline is None:
            return None
        return latest.score - baseline.score

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Return latest snapshot for each service in env, sorted by score desc.

        Default implementation is unsupported; concrete stores override.
        """
        raise NotImplementedError

    @abstractmethod
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete records older than retention_days; returns count deleted."""


# ─── Memory backend (tests + fallback) ────────────────────────────────────────

class MemoryRiskHistoryStore(RiskHistoryStore):
    """In-process store; thread-safe via a single lock. Data is lost on exit."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # key: (service, env) → list of RiskSnapshot sorted desc by ts
        self._data: Dict = defaultdict(list)

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Append records and keep each key's series sorted desc by ts.

        NOTE(review): unlike the Postgres backend (ON CONFLICT upsert),
        duplicate (ts, service, env) records are kept — confirm callers
        never rely on dedup here.
        """
        with self._lock:
            touched = set()
            for rec in records:
                key = (rec.service, rec.env)
                self._data[key].append(rec)
                touched.add(key)
            # Sort once per touched key, not once per record — identical
            # final ordering (single stable sort), much cheaper for batches.
            for key in touched:
                self._data[key].sort(key=lambda r: r.ts, reverse=True)
            return len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        with self._lock:
            series = self._data.get((service, env), [])
            return series[0] if series else None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            series = self._data.get((service, env), [])
            result = [s for s in series if s.ts >= cutoff]
            return result[:limit]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            latest_per_service: Dict[str, RiskSnapshot] = {}
            for (svc, e), snaps in self._data.items():
                if e != env:
                    continue
                # snaps is desc-sorted, so the first in-window entry is latest.
                recent = [s for s in snaps if s.ts >= cutoff]
                if recent:
                    latest_per_service[svc] = recent[0]
            return sorted(
                [s.to_dict() for s in latest_per_service.values()],
                key=lambda r: -r["score"],
            )[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        deleted = 0
        with self._lock:
            for key in list(self._data.keys()):
                before = len(self._data[key])
                self._data[key] = [s for s in self._data[key] if s.ts >= cutoff]
                deleted += before - len(self._data[key])
        return deleted


# ─── Null backend ──────────────────────────────────────────────────────────────

class NullRiskHistoryStore(RiskHistoryStore):
    """No-op: all writes discarded, all reads return empty."""

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        return 0

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        return None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        return []

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        # BUGFIX: without this override the base class raised
        # NotImplementedError, breaking the "all reads return empty" contract.
        return []

    def cleanup(self, retention_days: int = 90) -> int:
        return 0


# ─── Postgres backend ──────────────────────────────────────────────────────────

class PostgresRiskHistoryStore(RiskHistoryStore):
    """
    Production Postgres backend (psycopg2 sync, per-thread connection).
    Schema created by ops/scripts/migrate_risk_history_postgres.py.
    """

    def __init__(self, dsn: str) -> None:
        self._dsn = dsn
        self._local = threading.local()  # one connection per thread

    def _conn(self):
        """Return this thread's connection, (re)connecting if needed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore  # deferred: optional dependency

            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Upsert each record; a failure on one record does not abort the rest."""
        if not records:
            return 0
        written = 0
        # `with cursor` guarantees the cursor is closed even if an
        # unexpected error escapes (the original leaked it on that path).
        with self._conn().cursor() as cur:
            for rec in records:
                try:
                    cur.execute(
                        """INSERT INTO risk_history
                        (ts, service, env, score, band, components, reasons)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT (ts, service, env) DO UPDATE
                        SET score=EXCLUDED.score, band=EXCLUDED.band,
                        components=EXCLUDED.components, reasons=EXCLUDED.reasons""",
                        (rec.ts, rec.service, rec.env, rec.score, rec.band,
                         json.dumps(rec.components), json.dumps(rec.reasons)),
                    )
                    written += 1
                except Exception as e:
                    logger.warning("risk_history write failed for %s/%s: %s",
                                   rec.service, rec.env, e)
        return written

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        with self._conn().cursor() as cur:
            cur.execute(
                "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
                "WHERE service=%s AND env=%s ORDER BY ts DESC LIMIT 1",
                (service, env),
            )
            row = cur.fetchone()
        if not row:
            return None
        return self._row_to_snap(row)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        with self._conn().cursor() as cur:
            cur.execute(
                "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
                "WHERE service=%s AND env=%s AND ts >= %s ORDER BY ts DESC LIMIT %s",
                (service, env, cutoff, limit),
            )
            rows = cur.fetchall()
        return [self._row_to_snap(r) for r in rows]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        with self._conn().cursor() as cur:
            # Latest snapshot per service in env within window
            # (DISTINCT ON keeps the first row per service under the ORDER BY).
            cur.execute(
                """SELECT DISTINCT ON (service)
                ts, service, env, score, band, components, reasons
                FROM risk_history WHERE env=%s AND ts >= %s
                ORDER BY service, ts DESC""",
                (env, cutoff),
            )
            rows = cur.fetchall()
        snaps = [self._row_to_snap(r).to_dict() for r in rows]
        return sorted(snaps, key=lambda r: -r["score"])[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        with self._conn().cursor() as cur:
            cur.execute("DELETE FROM risk_history WHERE ts < %s", (cutoff,))
            deleted = cur.rowcount
        return deleted

    @staticmethod
    def _row_to_snap(row) -> RiskSnapshot:
        """Convert a DB row tuple into a RiskSnapshot, normalizing types."""
        ts, service, env, score, band, components, reasons = row
        if isinstance(ts, datetime.datetime):
            ts = ts.isoformat()
        # JSON columns may come back as str (plain json) or dict/list (jsonb).
        if isinstance(components, str):
            components = json.loads(components)
        if isinstance(reasons, str):
            reasons = json.loads(reasons)
        return RiskSnapshot(
            ts=ts, service=service, env=env, score=int(score), band=band,
            components=components or {}, reasons=reasons or [],
        )


# ─── Auto backend ─────────────────────────────────────────────────────────────

class AutoRiskHistoryStore(RiskHistoryStore):
    """
    Postgres primary; falls back to MemoryRiskHistoryStore on connection
    failures. Reads are always tried against Postgres first. On failure,
    returns from memory buffer.
    """

    def __init__(self, pg_dsn: str) -> None:
        self._pg = PostgresRiskHistoryStore(pg_dsn)
        self._mem = MemoryRiskHistoryStore()
        self._pg_ok = True  # tracks last-known Postgres health (log throttle)

    def _try_pg(self, method: str, *args, **kwargs):
        """Call a PostgresRiskHistoryStore method; returns (ok, result).

        Logs only on the healthy→unhealthy transition to avoid log spam.
        """
        try:
            result = getattr(self._pg, method)(*args, **kwargs)
            self._pg_ok = True
            return True, result
        except Exception as e:
            if self._pg_ok:
                logger.warning("AutoRiskHistoryStore: Postgres unavailable (%s), using memory", e)
            self._pg_ok = False
            return False, None

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        ok, written = self._try_pg("write_snapshot", records)
        self._mem.write_snapshot(records)  # always keep in-memory buffer
        return written if ok else len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        ok, result = self._try_pg("get_latest", service, env)
        if ok:
            return result
        return self._mem.get_latest(service, env)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        ok, result = self._try_pg("get_series", service, env, hours, limit)
        if ok:
            return result
        return self._mem.get_series(service, env, hours, limit)

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        ok, result = self._try_pg("dashboard_series", env, hours, top_n)
        if ok:
            return result
        return self._mem.dashboard_series(env, hours, top_n)

    def cleanup(self, retention_days: int = 90) -> int:
        ok, count = self._try_pg("cleanup", retention_days)
        self._mem.cleanup(retention_days)
        return count if ok else 0


# ─── Singleton factory ────────────────────────────────────────────────────────

_store: Optional[RiskHistoryStore] = None
_store_lock = threading.Lock()


def get_risk_history_store() -> RiskHistoryStore:
    """Return the process-wide store, creating it on first use (thread-safe)."""
    global _store
    if _store is None:
        # Double-checked locking: the unlocked read is a fast path; the
        # locked re-check prevents two threads from both creating a store.
        with _store_lock:
            if _store is None:
                _store = _create_store()
    return _store


def set_risk_history_store(store: Optional[RiskHistoryStore]) -> None:
    """Override the singleton (tests); pass None to reset to lazy creation."""
    global _store
    with _store_lock:
        _store = store


def _create_store() -> RiskHistoryStore:
    """Build a store from RISK_HISTORY_BACKEND / DATABASE_URL env config."""
    backend = os.getenv("RISK_HISTORY_BACKEND", "auto").lower()
    dsn = (
        os.getenv("RISK_DATABASE_URL")
        or os.getenv("DATABASE_URL")
        or ""
    )
    if backend == "memory":
        logger.info("RiskHistoryStore: in-memory")
        return MemoryRiskHistoryStore()
    if backend == "null":
        logger.info("RiskHistoryStore: null (disabled)")
        return NullRiskHistoryStore()
    if backend == "postgres":
        if dsn:
            logger.info("RiskHistoryStore: postgres dsn=%s…", dsn[:30])
            return PostgresRiskHistoryStore(dsn)
        logger.warning("RISK_HISTORY_BACKEND=postgres but no DATABASE_URL; falling back to memory")
        return MemoryRiskHistoryStore()
    # Default: auto
    if dsn:
        logger.info("RiskHistoryStore: auto (postgres→memory fallback) dsn=%s…", dsn[:30])
        return AutoRiskHistoryStore(pg_dsn=dsn)
    logger.info("RiskHistoryStore: auto — no DATABASE_URL, using memory")
    return MemoryRiskHistoryStore()