""" signature_state_store.py — Cooldown tracking per incident signature. Prevents triage from running too frequently for the same failure type. A "signature" is the same one computed by alert_routing.compute_incident_signature. Backends: - MemorySignatureStateStore (tests / single-process) - PostgresSignatureStateStore (production) - AutoSignatureStateStore (Postgres → Memory fallback) Table: incident_signature_state signature text PK, last_triage_at timestamptz, last_alert_at timestamptz, triage_count_24h int, updated_at timestamptz DDL: ops/scripts/migrate_alerts_postgres.py """ from __future__ import annotations import datetime import logging import os import threading import time from abc import ABC, abstractmethod from typing import Dict, List, Optional logger = logging.getLogger(__name__) DEFAULT_COOLDOWN_MINUTES = 15 def _now_dt() -> datetime.datetime: return datetime.datetime.utcnow() def _now_iso() -> str: return datetime.datetime.utcnow().isoformat() # ─── Abstract ───────────────────────────────────────────────────────────────── class SignatureStateStore(ABC): @abstractmethod def should_run_triage( self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES ) -> bool: """Return True if cooldown has passed (triage may proceed).""" @abstractmethod def mark_alert_seen(self, signature: str) -> None: """Record that an alert with this signature was observed. Also updates occurrences_60m rolling bucket.""" @abstractmethod def mark_triage_run(self, signature: str) -> None: """Record that triage was executed for this signature.""" @abstractmethod def get_state(self, signature: str) -> Optional[Dict]: """Return raw state dict or None.""" @abstractmethod def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]: """Return signatures seen in last window_minutes, ordered by occurrences_60m desc.""" # ─── Memory backend ──────────────────────────────────────────────────────────── class MemorySignatureStateStore(SignatureStateStore): BUCKET_MINUTES = 60 # rolling window for occurrences_60m def __init__(self): self._lock = threading.Lock() self._states: Dict[str, Dict] = {} def _update_bucket(self, state: Dict, now: str) -> None: """Update the 60-min rolling occurrence bucket in-place.""" bucket_start = state.get("occurrences_60m_bucket_start") or "" cutoff = (_now_dt() - datetime.timedelta(minutes=self.BUCKET_MINUTES)).isoformat() if bucket_start < cutoff: state["occurrences_60m"] = 1 state["occurrences_60m_bucket_start"] = now else: state["occurrences_60m"] = state.get("occurrences_60m", 0) + 1 def should_run_triage( self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES ) -> bool: with self._lock: state = self._states.get(signature) if state is None: return True last_triage = state.get("last_triage_at") if not last_triage: return True cutoff = (_now_dt() - datetime.timedelta(minutes=cooldown_minutes)).isoformat() return last_triage < cutoff def mark_alert_seen(self, signature: str) -> None: now = _now_iso() with self._lock: if signature not in self._states: self._states[signature] = { "signature": signature, "last_triage_at": None, "last_alert_at": now, "triage_count_24h": 0, "occurrences_60m": 1, "occurrences_60m_bucket_start": now, "updated_at": now, } else: s = self._states[signature] s["last_alert_at"] = now s["updated_at"] = now self._update_bucket(s, now) def mark_triage_run(self, signature: str) -> None: now = _now_iso() cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat() with self._lock: if signature not in self._states: self._states[signature] = { "signature": signature, "last_triage_at": now, "last_alert_at": now, "triage_count_24h": 1, "occurrences_60m": 0, "occurrences_60m_bucket_start": now, "updated_at": now, } else: s = self._states[signature] prev = s.get("last_triage_at") or "" if prev < cutoff_24h: s["triage_count_24h"] = 1 else: s["triage_count_24h"] = s.get("triage_count_24h", 0) + 1 s["last_triage_at"] = now s["updated_at"] = now def get_state(self, signature: str) -> Optional[Dict]: with self._lock: s = self._states.get(signature) return dict(s) if s else None def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]: cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat() with self._lock: active = [ dict(s) for s in self._states.values() if (s.get("last_alert_at") or "") >= cutoff ] return sorted(active, key=lambda x: x.get("occurrences_60m", 0), reverse=True)[:limit] # ─── Postgres backend ────────────────────────────────────────────────────────── class PostgresSignatureStateStore(SignatureStateStore): def __init__(self, dsn: str): self._dsn = dsn self._local = threading.local() def _conn(self): conn = getattr(self._local, "conn", None) if conn is None or conn.closed: import psycopg2 # type: ignore conn = psycopg2.connect(self._dsn) conn.autocommit = True self._local.conn = conn return conn def should_run_triage( self, signature: str, cooldown_minutes: int = DEFAULT_COOLDOWN_MINUTES ) -> bool: cur = self._conn().cursor() cur.execute( "SELECT last_triage_at FROM incident_signature_state WHERE signature=%s", (signature,), ) row = cur.fetchone() cur.close() if not row or row[0] is None: return True cutoff = _now_dt() - datetime.timedelta(minutes=cooldown_minutes) last = row[0] if hasattr(last, "tzinfo") and last.tzinfo: last = last.replace(tzinfo=None) return last < cutoff def mark_alert_seen(self, signature: str) -> None: now = _now_iso() cutoff_60m = (_now_dt() - datetime.timedelta(minutes=60)).isoformat() cur = self._conn().cursor() cur.execute( """INSERT INTO incident_signature_state (signature, last_alert_at, triage_count_24h, updated_at, occurrences_60m, occurrences_60m_bucket_start) VALUES (%s, %s, 0, %s, 1, %s) ON CONFLICT (signature) DO UPDATE SET last_alert_at=EXCLUDED.last_alert_at, updated_at=EXCLUDED.updated_at, occurrences_60m = CASE WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL OR incident_signature_state.occurrences_60m_bucket_start < %s THEN 1 ELSE incident_signature_state.occurrences_60m + 1 END, occurrences_60m_bucket_start = CASE WHEN incident_signature_state.occurrences_60m_bucket_start IS NULL OR incident_signature_state.occurrences_60m_bucket_start < %s THEN EXCLUDED.occurrences_60m_bucket_start ELSE incident_signature_state.occurrences_60m_bucket_start END""", (signature, now, now, now, cutoff_60m, cutoff_60m), ) cur.close() def mark_triage_run(self, signature: str) -> None: now = _now_iso() cutoff_24h = (_now_dt() - datetime.timedelta(hours=24)).isoformat() cur = self._conn().cursor() cur.execute( """INSERT INTO incident_signature_state (signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, occurrences_60m, occurrences_60m_bucket_start) VALUES (%s, %s, %s, 1, %s, 0, %s) ON CONFLICT (signature) DO UPDATE SET last_triage_at=EXCLUDED.last_triage_at, triage_count_24h = CASE WHEN incident_signature_state.last_triage_at IS NULL OR incident_signature_state.last_triage_at < %s THEN 1 ELSE incident_signature_state.triage_count_24h + 1 END, updated_at=EXCLUDED.updated_at""", (signature, now, now, now, now, cutoff_24h), ) cur.close() def get_state(self, signature: str) -> Optional[Dict]: cur = self._conn().cursor() cur.execute( "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, " "occurrences_60m, occurrences_60m_bucket_start " "FROM incident_signature_state WHERE signature=%s", (signature,), ) row = cur.fetchone() cur.close() if not row: return None sig, lta, laa, cnt, upd, occ60, occ_start = row return { "signature": sig, "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta, "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa, "triage_count_24h": cnt, "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd, "occurrences_60m": occ60 or 0, "occurrences_60m_bucket_start": ( occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start ), } def list_active_signatures(self, window_minutes: int = 60, limit: int = 100) -> List[Dict]: cutoff = (_now_dt() - datetime.timedelta(minutes=window_minutes)).isoformat() cur = self._conn().cursor() cur.execute( "SELECT signature, last_triage_at, last_alert_at, triage_count_24h, updated_at, " "occurrences_60m, occurrences_60m_bucket_start " "FROM incident_signature_state " "WHERE last_alert_at >= %s " "ORDER BY occurrences_60m DESC NULLS LAST LIMIT %s", (cutoff, limit), ) rows = [] for row in cur.fetchall(): sig, lta, laa, cnt, upd, occ60, occ_start = row rows.append({ "signature": sig, "last_triage_at": lta.isoformat() if hasattr(lta, "isoformat") else lta, "last_alert_at": laa.isoformat() if hasattr(laa, "isoformat") else laa, "triage_count_24h": cnt, "updated_at": upd.isoformat() if hasattr(upd, "isoformat") else upd, "occurrences_60m": occ60 or 0, "occurrences_60m_bucket_start": ( occ_start.isoformat() if hasattr(occ_start, "isoformat") else occ_start ), }) cur.close() return rows # ─── Auto backend ────────────────────────────────────────────────────────────── class AutoSignatureStateStore(SignatureStateStore): _RECOVERY_S = 300 def __init__(self, pg_dsn: str): self._pg_dsn = pg_dsn self._primary: Optional[PostgresSignatureStateStore] = None self._fallback = MemorySignatureStateStore() self._using_fallback = False self._since: float = 0.0 self._lock = threading.Lock() def _get_primary(self) -> PostgresSignatureStateStore: if self._primary is None: with self._lock: if self._primary is None: self._primary = PostgresSignatureStateStore(self._pg_dsn) return self._primary def _maybe_recover(self): if self._using_fallback and time.monotonic() - self._since >= self._RECOVERY_S: self._using_fallback = False def _delegate(self, method: str, *args, **kwargs): self._maybe_recover() if not self._using_fallback: try: return getattr(self._get_primary(), method)(*args, **kwargs) except Exception as e: logger.warning("AutoSignatureStateStore Postgres failed: %s", e) self._using_fallback = True self._since = time.monotonic() return getattr(self._fallback, method)(*args, **kwargs) def should_run_triage(self, signature, cooldown_minutes=DEFAULT_COOLDOWN_MINUTES): return self._delegate("should_run_triage", signature, cooldown_minutes) def mark_alert_seen(self, signature): self._delegate("mark_alert_seen", signature) def mark_triage_run(self, signature): self._delegate("mark_triage_run", signature) def get_state(self, signature): return self._delegate("get_state", signature) def list_active_signatures(self, window_minutes=60, limit=100): return self._delegate("list_active_signatures", window_minutes, limit) # ─── Singleton ──────────────────────────────────────────────────────────────── _sig_store: Optional[SignatureStateStore] = None _sig_lock = threading.Lock() def get_signature_state_store() -> SignatureStateStore: global _sig_store if _sig_store is None: with _sig_lock: if _sig_store is None: _sig_store = _create_sig_store() return _sig_store def set_signature_state_store(store: Optional[SignatureStateStore]) -> None: global _sig_store with _sig_lock: _sig_store = store def _create_sig_store() -> SignatureStateStore: backend = os.getenv("ALERT_BACKEND", "memory").lower() dsn = os.getenv("DATABASE_URL") or os.getenv("ALERT_DATABASE_URL", "") if backend == "postgres" and dsn: return PostgresSignatureStateStore(dsn) if backend == "auto" and dsn: return AutoSignatureStateStore(dsn) return MemorySignatureStateStore()