Files
microdao-daarion/services/router/risk_history_store.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

410 lines
15 KiB
Python

"""
risk_history_store.py — Storage layer for Risk Score snapshots.
Provides:
RiskSnapshot — dataclass for a single point-in-time risk record
RiskHistoryStore — abstract base
MemoryRiskHistoryStore — in-process (tests + fallback)
NullRiskHistoryStore — no-op (disabled)
PostgresRiskHistoryStore — Postgres primary (psycopg2 sync)
AutoRiskHistoryStore — Postgres → Memory fallback
Factory: get_risk_history_store() → AutoRiskHistoryStore by default
"""
from __future__ import annotations
import datetime
import json
import logging
import os
import threading
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Data model ───────────────────────────────────────────────────────────────
@dataclass
class RiskSnapshot:
    """A single point-in-time risk record for one service/environment."""

    ts: str  # ISO-8601 UTC timestamp of the snapshot
    service: str  # service identifier
    env: str  # deployment environment (e.g. "prod")
    score: int  # aggregate risk score
    band: str  # qualitative band derived from the score (e.g. "low")
    components: Dict = field(default_factory=dict)  # per-factor breakdown
    reasons: List[str] = field(default_factory=list)  # contributing reasons

    def to_dict(self) -> Dict:
        """Serialize the snapshot into a plain dict of its fields."""
        return asdict(self)

    @staticmethod
    def from_dict(d: Dict) -> "RiskSnapshot":
        """Build a snapshot from a dict, filling defaults for optional keys."""
        kwargs = dict(
            ts=d["ts"],
            service=d["service"],
            env=d.get("env", "prod"),
            score=int(d["score"]),
            band=d.get("band", "low"),
            components=d.get("components", {}),
            reasons=d.get("reasons", []),
        )
        return RiskSnapshot(**kwargs)
# ─── Abstract base ────────────────────────────────────────────────────────────
class RiskHistoryStore(ABC):
    """Abstract interface for persisting and querying risk-score snapshots."""

    @abstractmethod
    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Persist records; returns number written."""

    @abstractmethod
    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env."""

    @abstractmethod
    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots in descending time order within last `hours` hours."""

    def get_delta(self, service: str, env: str, hours: int = 24) -> Optional[int]:
        """Score change over roughly the last `hours` hours.

        Computed as latest.score minus the score of the snapshot closest to
        (now - hours) on the older side. Returns None when there is no data
        or no baseline snapshot old enough.
        """
        # Fetch a window twice as wide so a baseline older than the cutoff
        # is likely present.
        window = self.get_series(service, env, hours=hours * 2, limit=500)
        if not window:
            return None
        cutoff_ts = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        # Window is newest-first, so the first entry at/below the cutoff is
        # the snapshot closest to the cutoff from the past side.
        baseline = next((snap for snap in window if snap.ts <= cutoff_ts), None)
        if baseline is None:
            return None
        return window[0].score - baseline.score

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Return latest snapshot for each service in env, sorted by score desc."""
        raise NotImplementedError

    @abstractmethod
    def cleanup(self, retention_days: int = 90) -> int:
        """Delete records older than retention_days; returns count deleted."""
# ─── Memory backend (tests + fallback) ────────────────────────────────────────
class MemoryRiskHistoryStore(RiskHistoryStore):
    """Thread-safe in-process store keyed by (service, env).

    Used by tests and as the fallback backend when Postgres is unavailable.
    Each series is kept sorted newest-first by ISO-8601 timestamp string.
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        # key: (service, env) → list of RiskSnapshot sorted desc by ts
        self._data: Dict = defaultdict(list)

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Append `records`; every record is accepted, returns len(records)."""
        with self._lock:
            touched = set()
            for rec in records:
                key = (rec.service, rec.env)
                self._data[key].append(rec)
                touched.add(key)
            # Sort each affected series once after the batch, instead of
            # once per record (the previous per-record sort made a batch
            # write O(k · n log n)).
            for key in touched:
                self._data[key].sort(key=lambda r: r.ts, reverse=True)
        return len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env, or None if none stored."""
        with self._lock:
            series = self._data.get((service, env), [])
            return series[0] if series else None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots newer than `hours` ago, newest first, capped at `limit`.

        NOTE(review): timestamps are compared as ISO strings against a naive
        utcnow() cutoff — assumes writers store naive-UTC ISO ts; confirm.
        """
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            series = self._data.get((service, env), [])
            return [s for s in series if s.ts >= cutoff][:limit]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in `env`, sorted by score desc."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        ).isoformat()
        with self._lock:
            latest_per_service: Dict[str, RiskSnapshot] = {}
            for (svc, e), snaps in self._data.items():
                if e != env:
                    continue
                recent = [s for s in snaps if s.ts >= cutoff]
                if recent:
                    # snaps is sorted newest-first, so recent[0] is latest
                    latest_per_service[svc] = recent[0]
            return sorted(
                [s.to_dict() for s in latest_per_service.values()],
                key=lambda r: -r["score"],
            )[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        """Drop snapshots older than `retention_days`; returns count deleted."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        ).isoformat()
        deleted = 0
        with self._lock:
            for key in list(self._data.keys()):
                before = len(self._data[key])
                self._data[key] = [s for s in self._data[key] if s.ts >= cutoff]
                deleted += before - len(self._data[key])
        return deleted
# ─── Null backend ──────────────────────────────────────────────────────────────
class NullRiskHistoryStore(RiskHistoryStore):
    """No-op: all writes discarded, all reads return empty.

    Selected via RISK_HISTORY_BACKEND=null to disable history entirely.
    """

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Discard records; reports 0 written."""
        return 0

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        return None

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        return []

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        # Override the base implementation, which raises NotImplementedError:
        # a disabled store must answer dashboard queries with "no data",
        # not an error, to honor the "all reads return empty" contract.
        return []

    def cleanup(self, retention_days: int = 90) -> int:
        return 0
# ─── Postgres backend ──────────────────────────────────────────────────────────
class PostgresRiskHistoryStore(RiskHistoryStore):
    """
    Production Postgres backend (psycopg2 sync, per-thread connection).
    Schema created by ops/scripts/migrate_risk_history_postgres.py.
    """

    def __init__(self, dsn: str) -> None:
        self._dsn = dsn
        # One cached connection per thread; psycopg2 connections must not
        # share cursors across threads.
        self._local = threading.local()

    def _conn(self):
        """Return this thread's connection, (re)connecting if closed/missing."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = True  # each statement commits independently
            self._local.conn = conn
        return conn

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        """Upsert records (keyed on ts/service/env); returns count written.

        Per-record failures are logged and skipped rather than aborting
        the batch.
        """
        if not records:
            return 0
        written = 0
        # Context-managed cursor guarantees closure even if an unexpected
        # error escapes the loop (the original leaked the cursor then).
        with self._conn().cursor() as cur:
            for rec in records:
                try:
                    cur.execute(
                        """INSERT INTO risk_history (ts, service, env, score, band, components, reasons)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT (ts, service, env) DO UPDATE
                        SET score=EXCLUDED.score, band=EXCLUDED.band,
                        components=EXCLUDED.components, reasons=EXCLUDED.reasons""",
                        (rec.ts, rec.service, rec.env, rec.score, rec.band,
                         json.dumps(rec.components), json.dumps(rec.reasons)),
                    )
                    written += 1
                except Exception as e:
                    logger.warning("risk_history write failed for %s/%s: %s", rec.service, rec.env, e)
        return written

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        """Most recent snapshot for service/env, or None if no rows."""
        with self._conn().cursor() as cur:
            cur.execute(
                "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
                "WHERE service=%s AND env=%s ORDER BY ts DESC LIMIT 1",
                (service, env),
            )
            row = cur.fetchone()
        if not row:
            return None
        return self._row_to_snap(row)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        """Snapshots within the last `hours`, newest first, capped at `limit`."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        with self._conn().cursor() as cur:
            cur.execute(
                "SELECT ts,service,env,score,band,components,reasons FROM risk_history "
                "WHERE service=%s AND env=%s AND ts >= %s ORDER BY ts DESC LIMIT %s",
                (service, env, cutoff, limit),
            )
            rows = cur.fetchall()
        return [self._row_to_snap(r) for r in rows]

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        """Latest in-window snapshot per service in `env`, highest score first."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours=hours)
        with self._conn().cursor() as cur:
            # DISTINCT ON (service) with "ORDER BY service, ts DESC" keeps
            # only the newest row per service inside the window.
            cur.execute(
                """SELECT DISTINCT ON (service)
                ts, service, env, score, band, components, reasons
                FROM risk_history
                WHERE env=%s AND ts >= %s
                ORDER BY service, ts DESC""",
                (env, cutoff),
            )
            rows = cur.fetchall()
        snaps = [self._row_to_snap(r).to_dict() for r in rows]
        return sorted(snaps, key=lambda r: -r["score"])[:top_n]

    def cleanup(self, retention_days: int = 90) -> int:
        """Delete rows older than `retention_days`; returns rows deleted."""
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=retention_days)
        with self._conn().cursor() as cur:
            cur.execute("DELETE FROM risk_history WHERE ts < %s", (cutoff,))
            deleted = cur.rowcount
        return deleted

    @staticmethod
    def _row_to_snap(row) -> RiskSnapshot:
        """Convert a DB row tuple into a RiskSnapshot, normalizing types."""
        ts, service, env, score, band, components, reasons = row
        if isinstance(ts, datetime.datetime):
            ts = ts.isoformat()  # snapshots store ISO strings, not datetimes
        if isinstance(components, str):
            components = json.loads(components)  # TEXT column vs native json
        if isinstance(reasons, str):
            reasons = json.loads(reasons)
        return RiskSnapshot(
            ts=ts, service=service, env=env,
            score=int(score), band=band,
            components=components or {},
            reasons=reasons or [],
        )
# ─── Auto backend ─────────────────────────────────────────────────────────────
class AutoRiskHistoryStore(RiskHistoryStore):
    """
    Postgres primary; falls back to MemoryRiskHistoryStore on connection failures.
    Reads are always tried against Postgres first. On failure, returns from memory buffer.
    """

    def __init__(self, pg_dsn: str) -> None:
        self._pg = PostgresRiskHistoryStore(pg_dsn)
        self._mem = MemoryRiskHistoryStore()
        self._pg_ok = True  # tracks whether the last Postgres call succeeded

    def _try_pg(self, method: str, *args, **kwargs):
        """Invoke `method` on the Postgres store.

        Returns (True, result) on success, (False, None) on any exception.
        The outage is logged only on the first consecutive failure.
        """
        try:
            outcome = getattr(self._pg, method)(*args, **kwargs)
        except Exception as e:
            if self._pg_ok:
                logger.warning("AutoRiskHistoryStore: Postgres unavailable (%s), using memory", e)
            self._pg_ok = False
            return False, None
        self._pg_ok = True
        return True, outcome

    def write_snapshot(self, records: List[RiskSnapshot]) -> int:
        ok, written = self._try_pg("write_snapshot", records)
        # Mirror every write into memory so reads survive a Postgres outage.
        self._mem.write_snapshot(records)
        return written if ok else len(records)

    def get_latest(self, service: str, env: str) -> Optional[RiskSnapshot]:
        ok, snap = self._try_pg("get_latest", service, env)
        return snap if ok else self._mem.get_latest(service, env)

    def get_series(
        self, service: str, env: str, hours: int = 168, limit: int = 200
    ) -> List[RiskSnapshot]:
        ok, series = self._try_pg("get_series", service, env, hours, limit)
        return series if ok else self._mem.get_series(service, env, hours, limit)

    def dashboard_series(
        self, env: str, hours: int = 24, top_n: int = 10
    ) -> List[Dict]:
        ok, rows = self._try_pg("dashboard_series", env, hours, top_n)
        return rows if ok else self._mem.dashboard_series(env, hours, top_n)

    def cleanup(self, retention_days: int = 90) -> int:
        ok, removed = self._try_pg("cleanup", retention_days)
        self._mem.cleanup(retention_days)
        return removed if ok else 0
# ─── Singleton factory ────────────────────────────────────────────────────────
# Process-wide singleton, guarded by a lock for first-use construction.
_store: Optional[RiskHistoryStore] = None
_store_lock = threading.Lock()


def get_risk_history_store() -> RiskHistoryStore:
    """Return the process-wide store, creating it lazily on first use."""
    global _store
    if _store is not None:  # fast path: already initialized, no lock needed
        return _store
    with _store_lock:
        if _store is None:  # re-check under the lock (double-checked locking)
            _store = _create_store()
        return _store


def set_risk_history_store(store: Optional[RiskHistoryStore]) -> None:
    """Replace the singleton (or clear it with None) — primarily for tests."""
    global _store
    with _store_lock:
        _store = store
def _create_store() -> RiskHistoryStore:
    """Build a store from environment configuration.

    RISK_HISTORY_BACKEND selects the backend: "memory", "null", "postgres",
    or "auto" (default: Postgres with in-memory fallback). The DSN is read
    from RISK_DATABASE_URL, falling back to DATABASE_URL.
    """
    backend = os.getenv("RISK_HISTORY_BACKEND", "auto").lower()
    dsn = (
        os.getenv("RISK_DATABASE_URL")
        or os.getenv("DATABASE_URL")
        or ""
    )
    # Never log a raw DSN prefix: "postgresql://user:password@host/db"[:30]
    # can expose credentials. Log only the part after the credentials.
    dsn_label = dsn.rsplit("@", 1)[-1] if "@" in dsn else dsn[:30]
    if backend == "memory":
        logger.info("RiskHistoryStore: in-memory")
        return MemoryRiskHistoryStore()
    if backend == "null":
        logger.info("RiskHistoryStore: null (disabled)")
        return NullRiskHistoryStore()
    if backend == "postgres":
        if dsn:
            logger.info("RiskHistoryStore: postgres dsn=%s", dsn_label)
            return PostgresRiskHistoryStore(dsn)
        logger.warning("RISK_HISTORY_BACKEND=postgres but no DATABASE_URL; falling back to memory")
        return MemoryRiskHistoryStore()
    # Default: auto (Postgres primary, memory fallback)
    if dsn:
        logger.info("RiskHistoryStore: auto (postgres→memory fallback) dsn=%s", dsn_label)
        return AutoRiskHistoryStore(pg_dsn=dsn)
    logger.info("RiskHistoryStore: auto — no DATABASE_URL, using memory")
    return MemoryRiskHistoryStore()