New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
410 lines
15 KiB
Python
410 lines
15 KiB
Python
"""
|
|
risk_history_store.py — Storage layer for Risk Score snapshots.
|
|
|
|
Provides:
|
|
RiskSnapshot — dataclass for a single point-in-time risk record
|
|
RiskHistoryStore — abstract base
|
|
MemoryRiskHistoryStore — in-process (tests + fallback)
|
|
NullRiskHistoryStore — no-op (disabled)
|
|
PostgresRiskHistoryStore — Postgres primary (psycopg2 sync)
|
|
AutoRiskHistoryStore — Postgres → Memory fallback
|
|
|
|
Factory: get_risk_history_store() → AutoRiskHistoryStore by default
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import json
|
|
import logging
|
|
import os
|
|
import threading
|
|
from abc import ABC, abstractmethod
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ─── Data model ───────────────────────────────────────────────────────────────
|
|
|
|
@dataclass
class RiskSnapshot:
    """A single point-in-time risk record for one service/env pair."""

    ts: str  # ISO-8601 UTC timestamp
    service: str
    env: str
    score: int
    band: str
    components: Dict = field(default_factory=dict)
    reasons: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict:
        """Plain-dict form suitable for JSON serialization."""
        return asdict(self)

    @staticmethod
    def from_dict(d: Dict) -> "RiskSnapshot":
        """Rebuild a snapshot from a dict, tolerating missing optional keys."""
        return RiskSnapshot(
            ts=d["ts"],
            service=d["service"],
            env=d.get("env", "prod"),
            score=int(d["score"]),
            band=d.get("band", "low"),
            components=d.get("components", {}),
            reasons=d.get("reasons", []),
        )
|
|
|
|
|
|
# ─── Abstract base ────────────────────────────────────────────────────────────
|
|
|
|
class RiskHistoryStore(ABC):
    """Contract for risk-snapshot persistence backends."""

    @abstractmethod
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Persist records; returns number written."""

    @abstractmethod
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env."""

    @abstractmethod
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots in descending time order within last `hours` hours."""

    def get_delta(self, service: str, env: str, hours: int = 24) -> Optional[int]:
        """
        latest.score - closest-to-(now-hours) score.

        Returns None if no baseline is available.
        """
        # Fetch a window twice as wide so a baseline near the cutoff exists.
        snapshots = self.get_series(service, env, hours=hours * 2, limit=500)
        if not snapshots:
            return None
        cutoff_ts = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        # Baseline = first snapshot at or before the cutoff (series is newest-first).
        baseline = next((snap for snap in snapshots if snap.ts <= cutoff_ts), None)
        if baseline is None:
            return None
        return snapshots[0].score - baseline.score

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Return latest snapshot for each service in env, sorted by score desc."""
        raise NotImplementedError

    @abstractmethod
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete records older than retention_days; returns count deleted."""
|
|
|
|
# ─── Memory backend (tests + fallback) ────────────────────────────────────────
|
|
|
|
class MemoryRiskHistoryStore(RiskHistoryStore):
    """In-process backend used by tests and as the Auto fallback buffer.

    Data lives in a dict keyed by (service, env); each value is a list of
    RiskSnapshot kept sorted newest-first by `ts` (ISO-8601 strings compare
    chronologically when formats match). A single lock guards all access,
    so the store is safe to share across threads.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # key: (service, env) → list of RiskSnapshot sorted desc by ts
        self._data: Dict = defaultdict(list)

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Persist records; returns number written.

        Fix: sort each affected series once per batch instead of after every
        single append (the per-record sort made a k-record write O(k·n log n)).
        """
        with self._lock:
            touched = set()
            for rec in records:
                key = (rec.service, rec.env)
                self._data[key].append(rec)
                touched.add(key)
            for key in touched:
                self._data[key].sort(key=lambda r: r.ts, reverse=True)
            return len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env, or None if none stored."""
        with self._lock:
            series = self._data.get((service, env), [])
            return series[0] if series else None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots within the last `hours` hours, newest first, capped at `limit`."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            series = self._data.get((service, env), [])
            result = [s for s in series if s.ts >= cutoff]
            return result[:limit]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in `env`, sorted by score desc."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            latest_per_service: Dict[str, RiskSnapshot] = {}
            for (svc, e), snaps in self._data.items():
                if e != env:
                    continue
                recent = [s for s in snaps if s.ts >= cutoff]
                if recent:
                    # Series is sorted newest-first, so recent[0] is the latest.
                    latest_per_service[svc] = recent[0]
            return sorted(
                [s.to_dict() for s in latest_per_service.values()],
                key=lambda r: -r["score"],
            )[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        """Drop snapshots older than retention_days; returns count deleted."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        deleted = 0
        with self._lock:
            for key in list(self._data.keys()):
                before = len(self._data[key])
                self._data[key] = [s for s in self._data[key] if s.ts >= cutoff]
                deleted += before - len(self._data[key])
        return deleted
|
|
|
|
|
|
# ─── Null backend ──────────────────────────────────────────────────────────────
|
|
|
|
class NullRiskHistoryStore(RiskHistoryStore):
    """No-op backend: all writes discarded, all reads return empty.

    Selected via RISK_HISTORY_BACKEND=null to disable history entirely.
    """

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Discard everything; report zero written."""
        return 0

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        return None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        return []

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        # Bug fix: without this override the base class raises
        # NotImplementedError — a disabled store must be a silent no-op
        # for dashboard reads just like its other read methods.
        return []

    def cleanup(self, retention_days: int = 90) -> int:
        return 0
|
|
|
|
|
|
# ─── Postgres backend ──────────────────────────────────────────────────────────
|
|
|
|
class PostgresRiskHistoryStore(RiskHistoryStore):
    """
    Production Postgres backend (psycopg2 sync, per-thread connection).

    Schema created by ops/scripts/migrate_risk_history_postgres.py.
    Connections are autocommit and cached per thread.  Fix: cursors are
    context-managed (`with conn.cursor() as cur:`) so they are closed even
    when a query raises, instead of leaking on the exception path.
    """

    def __init__(self, dsn: str) -> None:
        self._dsn = dsn
        self._local = threading.local()  # one connection per thread

    def _conn(self):
        """Return this thread's connection, (re)connecting if missing/closed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore  # deferred: optional dependency

            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True
            self._local.conn = conn
        return conn

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Upsert each record; returns the number successfully written.

        Best-effort per record: one bad record is logged and skipped so it
        does not block the rest of the batch (autocommit, so no aborted txn).
        """
        if not records:
            return 0
        written = 0
        with self._conn().cursor() as cur:
            for rec in records:
                try:
                    cur.execute(
                        """INSERT INTO risk_history (ts, service, env, score, band, components, reasons)
                           VALUES (%s, %s, %s, %s, %s, %s, %s)
                           ON CONFLICT (ts, service, env) DO UPDATE
                           SET score=EXCLUDED.score, band=EXCLUDED.band,
                               components=EXCLUDED.components, reasons=EXCLUDED.reasons""",
                        (rec.ts, rec.service, rec.env, rec.score, rec.band,
                         json.dumps(rec.components), json.dumps(rec.reasons)),
                    )
                    written += 1
                except Exception as e:
                    logger.warning("risk_history write failed for %s/%s: %s", rec.service, rec.env, e)
        return written

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env, or None."""
        with self._conn().cursor() as cur:
            cur.execute(
                "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
                "WHERE service=%s AND env=%s ORDER BY ts DESC LIMIT 1",
                (service, env),
            )
            row = cur.fetchone()
        if not row:
            return None
        return self._row_to_snap(row)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots within the last `hours` hours, newest first, capped at `limit`."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        with self._conn().cursor() as cur:
            cur.execute(
                "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
                "WHERE service=%s AND env=%s AND ts >= %s ORDER BY ts DESC LIMIT %s",
                (service, env, cutoff, limit),
            )
            rows = cur.fetchall()
        return [self._row_to_snap(r) for r in rows]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in `env`, sorted by score desc."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        with self._conn().cursor() as cur:
            # DISTINCT ON (service) + ORDER BY service, ts DESC keeps only the
            # newest row per service within the window.
            cur.execute(
                """SELECT DISTINCT ON (service)
                          ts, service, env, score, band, components, reasons
                   FROM risk_history
                   WHERE env=%s AND ts >= %s
                   ORDER BY service, ts DESC""",
                (env, cutoff),
            )
            rows = cur.fetchall()
        snaps = [self._row_to_snap(r).to_dict() for r in rows]
        return sorted(snaps, key=lambda r: -r["score"])[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        """Delete rows older than retention_days; returns count deleted."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        with self._conn().cursor() as cur:
            cur.execute("DELETE FROM risk_history WHERE ts < %s", (cutoff,))
            deleted = cur.rowcount
        return deleted

    @staticmethod
    def _row_to_snap(row) -> RiskSnapshot:
        """Convert a DB row tuple into a RiskSnapshot, normalizing types.

        `ts` may arrive as a datetime (timestamp column) and the JSON columns
        may arrive as strings depending on the column type / driver config.
        """
        ts, service, env, score, band, components, reasons = row
        if isinstance(ts, datetime.datetime):
            ts = ts.isoformat()
        if isinstance(components, str):
            components = json.loads(components)
        if isinstance(reasons, str):
            reasons = json.loads(reasons)
        return RiskSnapshot(
            ts=ts, service=service, env=env,
            score=int(score), band=band,
            components=components or {},
            reasons=reasons or [],
        )
|
|
|
|
|
|
# ─── Auto backend ─────────────────────────────────────────────────────────────
|
|
|
|
class AutoRiskHistoryStore(RiskHistoryStore):
    """
    Postgres primary; falls back to MemoryRiskHistoryStore on connection failures.

    Reads are always tried against Postgres first. On failure, returns from memory buffer.
    """

    def __init__(self, pg_dsn: str) -> None:
        self._pg = PostgresRiskHistoryStore(pg_dsn)
        self._mem = MemoryRiskHistoryStore()
        self._pg_ok = True  # health flag so an outage is logged only once

    def _try_pg(self, method: str, *args, **kwargs):
        """Invoke `method` on the Postgres store.

        Returns (True, result) on success and (False, None) on any failure;
        the warning is emitted only on the first failure after a healthy call.
        """
        try:
            outcome = getattr(self._pg, method)(*args, **kwargs)
        except Exception as exc:
            if self._pg_ok:
                logger.warning("AutoRiskHistoryStore: Postgres unavailable (%s), using memory", exc)
            self._pg_ok = False
            return False, None
        self._pg_ok = True
        return True, outcome

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        ok, written = self._try_pg("write_snapshot", records)
        # The memory buffer is always updated so reads keep working offline.
        self._mem.write_snapshot(records)
        return written if ok else len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        ok, snap = self._try_pg("get_latest", service, env)
        return snap if ok else self._mem.get_latest(service, env)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        ok, series = self._try_pg("get_series", service, env, hours, limit)
        return series if ok else self._mem.get_series(service, env, hours, limit)

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        ok, rows = self._try_pg("dashboard_series", env, hours, top_n)
        return rows if ok else self._mem.dashboard_series(env, hours, top_n)

    def cleanup(self, retention_days: int = 90) -> int:
        ok, count = self._try_pg("cleanup", retention_days)
        # Memory buffer is trimmed regardless; its count is not reported.
        self._mem.cleanup(retention_days)
        return count if ok else 0
|
|
|
|
|
|
# ─── Singleton factory ────────────────────────────────────────────────────────
|
|
|
|
_store: Optional[RiskHistoryStore] = None
_store_lock = threading.Lock()


def get_risk_history_store() -> RiskHistoryStore:
    """Return the process-wide store singleton, creating it on first use."""
    global _store
    if _store is None:
        # Double-checked locking: the unlocked read is a cheap fast path;
        # the locked re-check prevents two threads creating the store.
        with _store_lock:
            if _store is None:
                _store = _create_store()
    return _store


def set_risk_history_store(store: Optional[RiskHistoryStore]) -> None:
    """Override the singleton (or reset it with None) — intended for tests."""
    global _store
    with _store_lock:
        _store = store


def _create_store() -> RiskHistoryStore:
    """Build a backend from RISK_HISTORY_BACKEND and *_DATABASE_URL env vars."""
    backend = os.getenv("RISK_HISTORY_BACKEND", "auto").lower()
    dsn = os.getenv("RISK_DATABASE_URL") or os.getenv("DATABASE_URL") or ""

    if backend == "memory":
        logger.info("RiskHistoryStore: in-memory")
        return MemoryRiskHistoryStore()

    if backend == "null":
        logger.info("RiskHistoryStore: null (disabled)")
        return NullRiskHistoryStore()

    if backend == "postgres":
        if not dsn:
            logger.warning("RISK_HISTORY_BACKEND=postgres but no DATABASE_URL; falling back to memory")
            return MemoryRiskHistoryStore()
        logger.info("RiskHistoryStore: postgres dsn=%s…", dsn[:30])
        return PostgresRiskHistoryStore(dsn)

    # Default: auto — Postgres with memory fallback when a DSN is present.
    if dsn:
        logger.info("RiskHistoryStore: auto (postgres→memory fallback) dsn=%s…", dsn[:30])
        return AutoRiskHistoryStore(pg_dsn=dsn)

    logger.info("RiskHistoryStore: auto — no DATABASE_URL, using memory")
    return MemoryRiskHistoryStore()
|