""" alert_store.py — Alert ingestion storage with state machine. State machine: new → processing → acked | failed Backends: - MemoryAlertStore (testing / single-process) - PostgresAlertStore (production — uses psycopg2 sync) - AutoAlertStore (Postgres primary → Memory fallback) DDL: ops/scripts/migrate_alerts_postgres.py """ from __future__ import annotations import datetime import hashlib import json import logging import os import threading import time import uuid from abc import ABC, abstractmethod from pathlib import Path from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) # ─── Constants ──────────────────────────────────────────────────────────────── MAX_LOG_SAMPLES = 40 MAX_SUMMARY_CHARS = 1000 MAX_ALERT_JSON_BYTES = 32 * 1024 # 32 KB per alert # Alert status values STATUS_NEW = "new" STATUS_PROCESSING = "processing" STATUS_ACKED = "acked" STATUS_FAILED = "failed" PROCESSING_LOCK_TTL_S = 600 # default 10 min lock def _now_iso() -> str: return datetime.datetime.utcnow().isoformat() def _now_dt() -> datetime.datetime: return datetime.datetime.utcnow() def _generate_alert_ref() -> str: ts = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S") short = uuid.uuid4().hex[:6] return f"alrt_{ts}_{short}" def _compute_dedupe_key(service: str, env: str, kind: str, fingerprint: str = "") -> str: raw = f"{service}|{env}|{kind}|{fingerprint}" return hashlib.sha256(raw.encode()).hexdigest()[:32] def _redact_text(text: str, max_chars: int = 500) -> str: import re _SECRET_PAT = re.compile( r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+', ) redacted = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text or "") return redacted[:max_chars] def _sanitize_alert(alert_data: Dict) -> Dict: """Truncate/redact alert payload for safe storage.""" safe = dict(alert_data) safe["summary"] = _redact_text(safe.get("summary", ""), MAX_SUMMARY_CHARS) safe["title"] = _redact_text(safe.get("title", ""), 300) ev = safe.get("evidence", {}) if isinstance(ev, dict): logs = ev.get("log_samples", []) safe["evidence"] = { **ev, "log_samples": [_redact_text(l, 300) for l in logs[:MAX_LOG_SAMPLES]], } return safe # ─── Abstract interface ──────────────────────────────────────────────────────── class AlertStore(ABC): @abstractmethod def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict: """ Store alert with dedupe. Returns: {accepted, deduped, dedupe_key, alert_ref, occurrences} """ @abstractmethod def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: """List alerts metadata. Supports status_in filter.""" @abstractmethod def get_alert(self, alert_ref: str) -> Optional[Dict]: """Return full alert record.""" @abstractmethod def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]: """Mark alert as acked (status=acked). Legacy compat.""" @abstractmethod def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]: """Lookup by dedupe key (for reuse-open-incident logic).""" # ── State machine methods ────────────────────────────────────────────────── @abstractmethod def claim_next_alerts( self, window_minutes: int = 240, limit: int = 25, owner: str = "loop", lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S, ) -> List[Dict]: """ Atomically move status=new (or failed+expired) → processing. Skips already-processing-and-locked alerts. Returns the claimed alert records. """ @abstractmethod def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]: """status=acked, acked_at=now.""" @abstractmethod def mark_failed( self, alert_ref: str, error: str, retry_after_seconds: int = 300 ) -> Optional[Dict]: """status=failed, lock_until=now+retry, last_error=truncated.""" @abstractmethod def requeue_expired_processing(self) -> int: """processing + lock_until < now → status=new. Returns count reset.""" @abstractmethod def dashboard_counts(self, window_minutes: int = 240) -> Dict: """Return {new, processing, failed, acked} counts for window.""" @abstractmethod def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]: """Return top dedupe_keys by occurrences.""" @abstractmethod def compute_loop_slo(self, window_minutes: int = 240, p95_threshold_s: float = 60.0, failed_rate_threshold_pct: float = 5.0, stuck_minutes: float = 15.0) -> Dict: """Compute alert-loop SLO metrics for the dashboard. Returns: {claim_to_ack_p95_seconds, failed_rate_pct, processing_stuck_count, violations} """ # ─── Memory backend ──────────────────────────────────────────────────────────── class MemoryAlertStore(AlertStore): def __init__(self): self._lock = threading.Lock() self._alerts: Dict[str, Dict] = {} self._dedupe: Dict[str, str] = {} # dedupe_key → alert_ref def _new_record(self, alert_data: Dict, dedupe_key: str, now: str) -> Dict: safe = _sanitize_alert(alert_data) service = alert_data.get("service", "unknown") env = alert_data.get("env", "prod") kind = alert_data.get("kind", "custom") alert_ref = alert_data.get("alert_id") or _generate_alert_ref() return { "alert_ref": alert_ref, "dedupe_key": dedupe_key, "source": safe.get("source", "unknown"), "service": service, "env": env, "severity": safe.get("severity", "P2"), "kind": kind, "title": safe.get("title", ""), "summary": safe.get("summary", ""), "started_at": safe.get("started_at") or now, "labels": safe.get("labels", {}), "metrics": safe.get("metrics", {}), "evidence": safe.get("evidence", {}), "links": safe.get("links", [])[:10], "created_at": now, "last_seen_at": now, "occurrences": 1, # State machine fields "status": STATUS_NEW, "claimed_at": None, # set when claimed "processing_lock_until": None, "processing_owner": None, "last_error": None, "acked_at": None, # Legacy compat "ack_status": "pending", "ack_actor": None, "ack_note": None, "ack_at": None, } def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict: service = alert_data.get("service", "unknown") env = alert_data.get("env", "prod") kind = alert_data.get("kind", "custom") labels = alert_data.get("labels", {}) fingerprint = labels.get("fingerprint", "") dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint) now = _now_iso() with self._lock: existing_ref = self._dedupe.get(dedupe_key) if existing_ref and existing_ref in self._alerts: existing = self._alerts[existing_ref] created_at = existing.get("created_at", "") ttl_cutoff = ( datetime.datetime.utcnow() - datetime.timedelta(minutes=dedupe_ttl_minutes) ).isoformat() if created_at >= ttl_cutoff: existing["occurrences"] = existing.get("occurrences", 1) + 1 existing["last_seen_at"] = now if alert_data.get("metrics"): existing["metrics"] = alert_data["metrics"] # If previously acked/failed, reset to new so it gets picked up again if existing.get("status") in (STATUS_ACKED, STATUS_FAILED): existing["status"] = STATUS_NEW existing["processing_lock_until"] = None existing["last_error"] = None return { "accepted": True, "deduped": True, "dedupe_key": dedupe_key, "alert_ref": existing_ref, "occurrences": existing["occurrences"], } record = self._new_record(alert_data, dedupe_key, now) alert_ref = record["alert_ref"] self._alerts[alert_ref] = record self._dedupe[dedupe_key] = alert_ref return { "accepted": True, "deduped": False, "dedupe_key": dedupe_key, "alert_ref": alert_ref, "occurrences": 1, } def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: filters = filters or {} service = filters.get("service") env = filters.get("env") window = int(filters.get("window_minutes", 240)) status_in = filters.get("status_in") # list of statuses or None (all) cutoff = ( datetime.datetime.utcnow() - datetime.timedelta(minutes=window) ).isoformat() with self._lock: results = [] for a in sorted(self._alerts.values(), key=lambda x: x.get("created_at", ""), reverse=True): if a.get("created_at", "") < cutoff: continue if service and a.get("service") != service: continue if env and a.get("env") != env: continue if status_in and a.get("status", STATUS_NEW) not in status_in: continue results.append({k: v for k, v in a.items() if k not in ("evidence",)}) if len(results) >= limit: break return results def get_alert(self, alert_ref: str) -> Optional[Dict]: with self._lock: return dict(self._alerts[alert_ref]) if alert_ref in self._alerts else None def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]: return self.mark_acked(alert_ref, actor, note) def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]: with self._lock: ref = self._dedupe.get(dedupe_key) if ref and ref in self._alerts: return dict(self._alerts[ref]) return None def claim_next_alerts( self, window_minutes: int = 240, limit: int = 25, owner: str = "loop", lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S, ) -> List[Dict]: now_dt = _now_dt() now_str = now_dt.isoformat() lock_until = (now_dt + datetime.timedelta(seconds=lock_ttl_seconds)).isoformat() cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat() claimed = [] with self._lock: for a in sorted(self._alerts.values(), key=lambda x: x.get("created_at", "")): if len(claimed) >= limit: break if a.get("created_at", "") < cutoff: continue st = a.get("status", STATUS_NEW) lock_exp = a.get("processing_lock_until") # Claimable: new, OR failed/processing with expired/no lock if st == STATUS_ACKED: continue if st in (STATUS_PROCESSING, STATUS_FAILED): if lock_exp and lock_exp > now_str: continue # still locked (retry window not passed) # Claim it a["status"] = STATUS_PROCESSING a["claimed_at"] = now_str a["processing_lock_until"] = lock_until a["processing_owner"] = owner claimed.append(dict(a)) return claimed def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]: now = _now_iso() with self._lock: if alert_ref not in self._alerts: return None rec = self._alerts[alert_ref] rec["status"] = STATUS_ACKED rec["acked_at"] = now rec["ack_status"] = "acked" rec["ack_actor"] = _redact_text(actor, 100) rec["ack_note"] = _redact_text(note, 500) rec["ack_at"] = now rec["processing_lock_until"] = None rec["processing_owner"] = None return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"} def mark_failed( self, alert_ref: str, error: str, retry_after_seconds: int = 300 ) -> Optional[Dict]: now_dt = _now_dt() retry_at = (now_dt + datetime.timedelta(seconds=retry_after_seconds)).isoformat() with self._lock: if alert_ref not in self._alerts: return None rec = self._alerts[alert_ref] rec["status"] = STATUS_FAILED rec["last_error"] = _redact_text(error, 500) rec["processing_lock_until"] = retry_at rec["processing_owner"] = None return {"alert_ref": alert_ref, "status": STATUS_FAILED, "ack_status": "failed", "retry_at": retry_at} def requeue_expired_processing(self) -> int: now_str = _now_iso() count = 0 with self._lock: for a in self._alerts.values(): if a.get("status") == STATUS_PROCESSING: lock_exp = a.get("processing_lock_until") if lock_exp and lock_exp <= now_str: a["status"] = STATUS_NEW a["processing_lock_until"] = None a["processing_owner"] = None count += 1 return count def dashboard_counts(self, window_minutes: int = 240) -> Dict: cutoff = ( _now_dt() - datetime.timedelta(minutes=window_minutes) ).isoformat() counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0} now_str = _now_iso() with self._lock: for a in self._alerts.values(): if a.get("created_at", "") < cutoff: continue st = a.get("status", STATUS_NEW) if st in counts: counts[st] += 1 return counts def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]: cutoff = ( _now_dt() - datetime.timedelta(minutes=window_minutes) ).isoformat() sigs: Dict[str, Dict] = {} with self._lock: for a in self._alerts.values(): if a.get("created_at", "") < cutoff: continue key = a.get("dedupe_key", "") if key not in sigs: sigs[key] = { "signature": key, "service": a.get("service", ""), "kind": a.get("kind", ""), "occurrences": 0, "last_seen": a.get("last_seen_at", ""), } sigs[key]["occurrences"] += a.get("occurrences", 1) if a.get("last_seen_at", "") > sigs[key]["last_seen"]: sigs[key]["last_seen"] = a.get("last_seen_at", "") return sorted(sigs.values(), key=lambda x: x["occurrences"], reverse=True)[:limit] def compute_loop_slo(self, window_minutes: int = 240, p95_threshold_s: float = 60.0, failed_rate_threshold_pct: float = 5.0, stuck_minutes: float = 15.0) -> Dict: now_dt = _now_dt() cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat() stuck_cutoff = (now_dt - datetime.timedelta(minutes=stuck_minutes)).isoformat() durations_s: list = [] acked = 0 failed = 0 stuck = 0 with self._lock: for a in self._alerts.values(): if a.get("created_at", "") < cutoff: continue st = a.get("status", STATUS_NEW) if st == STATUS_ACKED: acked += 1 claimed_at = a.get("claimed_at") acked_at = a.get("acked_at") if claimed_at and acked_at: try: c = datetime.datetime.fromisoformat(claimed_at) k = datetime.datetime.fromisoformat(acked_at) durations_s.append((k - c).total_seconds()) except Exception: pass elif st == STATUS_FAILED: failed += 1 elif st == STATUS_PROCESSING: claimed_at = a.get("claimed_at") or "" if claimed_at and claimed_at < stuck_cutoff: stuck += 1 # P95 p95 = None if durations_s: durations_s.sort() idx = max(0, int(len(durations_s) * 0.95) - 1) p95 = round(durations_s[idx], 1) # Failed rate total_terminal = acked + failed failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1) violations = [] if p95 is not None and p95 > p95_threshold_s: violations.append({ "metric": "claim_to_ack_p95_seconds", "value": p95, "threshold": p95_threshold_s, "message": f"P95 claim→ack latency {p95}s exceeds {p95_threshold_s}s", }) if failed_pct > failed_rate_threshold_pct: violations.append({ "metric": "failed_rate_pct", "value": failed_pct, "threshold": failed_rate_threshold_pct, "message": f"Failed alert rate {failed_pct}% exceeds {failed_rate_threshold_pct}%", }) if stuck > 0: violations.append({ "metric": "processing_stuck_count", "value": stuck, "threshold": 0, "message": f"{stuck} alerts stuck in processing > {stuck_minutes}min", }) return { "claim_to_ack_p95_seconds": p95, "failed_rate_pct": failed_pct, "processing_stuck_count": stuck, "sample_count": len(durations_s), "violations": violations, } # ─── Postgres backend ────────────────────────────────────────────────────────── class PostgresAlertStore(AlertStore): """Production backend via psycopg2 (sync, per-thread connections).""" def __init__(self, dsn: str): self._dsn = dsn self._local = threading.local() def _conn(self): conn = getattr(self._local, "conn", None) if conn is None or conn.closed: import psycopg2 # type: ignore conn = psycopg2.connect(self._dsn) conn.autocommit = False self._local.conn = conn return conn def _commit(self): self._conn().commit() def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict: service = alert_data.get("service", "unknown") env = alert_data.get("env", "prod") kind = alert_data.get("kind", "custom") labels = alert_data.get("labels", {}) fingerprint = labels.get("fingerprint", "") dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint) now = _now_iso() conn = self._conn() cur = conn.cursor() cutoff = ( datetime.datetime.utcnow() - datetime.timedelta(minutes=dedupe_ttl_minutes) ).isoformat() cur.execute( "SELECT alert_ref, occurrences, status FROM alerts " "WHERE dedupe_key=%s AND created_at >= %s LIMIT 1", (dedupe_key, cutoff), ) row = cur.fetchone() if row: existing_ref, occ, existing_status = row new_occ = occ + 1 # Reset to new if previously terminal new_status = STATUS_NEW if existing_status in (STATUS_ACKED, STATUS_FAILED) else existing_status cur.execute( "UPDATE alerts SET occurrences=%s, last_seen_at=%s, metrics=%s, status=%s " "WHERE alert_ref=%s", (new_occ, now, json.dumps(alert_data.get("metrics", {}), default=str), new_status, existing_ref), ) conn.commit() cur.close() return { "accepted": True, "deduped": True, "dedupe_key": dedupe_key, "alert_ref": existing_ref, "occurrences": new_occ, } safe = _sanitize_alert(alert_data) alert_ref = alert_data.get("alert_id") or _generate_alert_ref() cur.execute( """INSERT INTO alerts (alert_ref,dedupe_key,source,service,env,severity,kind, title,summary,started_at,labels,metrics,evidence,links, created_at,last_seen_at,occurrences,status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,1,%s)""", (alert_ref, dedupe_key, safe.get("source", "unknown"), service, env, safe.get("severity", "P2"), kind, safe.get("title", ""), safe.get("summary", ""), safe.get("started_at") or now, json.dumps(safe.get("labels", {}), default=str), json.dumps(safe.get("metrics", {}), default=str), json.dumps(safe.get("evidence", {}), default=str), json.dumps(safe.get("links", [])[:10], default=str), now, now, STATUS_NEW), ) conn.commit() cur.close() return { "accepted": True, "deduped": False, "dedupe_key": dedupe_key, "alert_ref": alert_ref, "occurrences": 1, } def _row_to_dict(self, cur, row) -> Dict: cols = [d[0] for d in cur.description] d: Dict = {} for c, v in zip(cols, row): if isinstance(v, datetime.datetime): d[c] = v.isoformat() elif isinstance(v, str) and c in ("labels", "metrics", "evidence", "links"): try: d[c] = json.loads(v) except Exception: d[c] = v else: d[c] = v return d def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: filters = filters or {} window = int(filters.get("window_minutes", 240)) cutoff = (datetime.datetime.utcnow() - datetime.timedelta(minutes=window)).isoformat() status_in = filters.get("status_in") clauses = ["created_at >= %s"] params: list = [cutoff] if filters.get("service"): clauses.append("service=%s") params.append(filters["service"]) if filters.get("env"): clauses.append("env=%s") params.append(filters["env"]) if status_in: placeholders = ",".join(["%s"] * len(status_in)) clauses.append(f"status IN ({placeholders})") params.extend(status_in) params.append(min(limit, 200)) where = " AND ".join(clauses) cur = self._conn().cursor() cur.execute( f"SELECT alert_ref,dedupe_key,source,service,env,severity,kind," f"title,summary,started_at,labels,metrics,links," f"created_at,last_seen_at,occurrences,status,processing_owner,acked_at,last_error " f"FROM alerts WHERE {where} ORDER BY created_at DESC LIMIT %s", params, ) rows = [self._row_to_dict(cur, r) for r in cur.fetchall()] cur.close() return rows def get_alert(self, alert_ref: str) -> Optional[Dict]: cur = self._conn().cursor() cur.execute( "SELECT alert_ref,dedupe_key,source,service,env,severity,kind," "title,summary,started_at,labels,metrics,evidence,links," "created_at,last_seen_at,occurrences,status,processing_lock_until," "processing_owner,last_error,acked_at,ack_actor,ack_note " "FROM alerts WHERE alert_ref=%s", (alert_ref,), ) row = cur.fetchone() if not row: cur.close() return None result = self._row_to_dict(cur, row) cur.close() return result def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]: return self.mark_acked(alert_ref, actor, note) def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]: cur = self._conn().cursor() cur.execute( "SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary," "started_at,labels,metrics,created_at,last_seen_at,occurrences,status " "FROM alerts WHERE dedupe_key=%s ORDER BY created_at DESC LIMIT 1", (dedupe_key,), ) row = cur.fetchone() if not row: cur.close() return None result = self._row_to_dict(cur, row) cur.close() return result def claim_next_alerts( self, window_minutes: int = 240, limit: int = 25, owner: str = "loop", lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S, ) -> List[Dict]: """Atomic claim via SELECT FOR UPDATE SKIP LOCKED.""" conn = self._conn() now_str = _now_iso() lock_until = ( datetime.datetime.utcnow() + datetime.timedelta(seconds=lock_ttl_seconds) ).isoformat() cutoff = ( datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes) ).isoformat() cur = conn.cursor() try: # Select claimable: new, or failed/processing with expired lock cur.execute( """ SELECT alert_ref FROM alerts WHERE created_at >= %s AND status IN ('new', 'failed', 'processing') AND (processing_lock_until IS NULL OR processing_lock_until <= %s) ORDER BY CASE severity WHEN 'P0' THEN 0 WHEN 'P1' THEN 1 WHEN 'P2' THEN 2 WHEN 'P3' THEN 3 ELSE 4 END, created_at LIMIT %s FOR UPDATE SKIP LOCKED """, (cutoff, now_str, limit), ) refs = [row[0] for row in cur.fetchall()] if not refs: conn.commit() cur.close() return [] placeholders = ",".join(["%s"] * len(refs)) cur.execute( f"""UPDATE alerts SET status='processing', claimed_at=%s, processing_lock_until=%s, processing_owner=%s WHERE alert_ref IN ({placeholders})""", [now_str, lock_until, owner] + refs, ) # Fetch updated rows cur.execute( f"SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary," f"started_at,labels,metrics,created_at,last_seen_at,occurrences," f"status,processing_owner,last_error " f"FROM alerts WHERE alert_ref IN ({placeholders})", refs, ) rows = [self._row_to_dict(cur, r) for r in cur.fetchall()] conn.commit() cur.close() return rows except Exception: conn.rollback() cur.close() raise def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]: now = _now_iso() cur = self._conn().cursor() cur.execute( "UPDATE alerts SET status='acked', acked_at=%s, ack_actor=%s, ack_note=%s, " "processing_lock_until=NULL, processing_owner=NULL " "WHERE alert_ref=%s RETURNING alert_ref", (now, _redact_text(actor, 100), _redact_text(note, 500), alert_ref), ) row = cur.fetchone() self._commit() cur.close() if not row: return None return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"} def mark_failed( self, alert_ref: str, error: str, retry_after_seconds: int = 300 ) -> Optional[Dict]: retry_at = ( datetime.datetime.utcnow() + datetime.timedelta(seconds=retry_after_seconds) ).isoformat() cur = self._conn().cursor() cur.execute( "UPDATE alerts SET status='failed', last_error=%s, " "processing_lock_until=%s, processing_owner=NULL " "WHERE alert_ref=%s RETURNING alert_ref", (_redact_text(error, 500), retry_at, alert_ref), ) row = cur.fetchone() self._commit() cur.close() if not row: return None return {"alert_ref": alert_ref, "status": STATUS_FAILED, "ack_status": "failed", "retry_at": retry_at} def requeue_expired_processing(self) -> int: now = _now_iso() cur = self._conn().cursor() cur.execute( "UPDATE alerts SET status='new', processing_lock_until=NULL, " "processing_owner=NULL " "WHERE status='processing' AND processing_lock_until <= %s", (now,), ) count = cur.rowcount self._commit() cur.close() return count def dashboard_counts(self, window_minutes: int = 240) -> Dict: cutoff = ( datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes) ).isoformat() cur = self._conn().cursor() cur.execute( "SELECT status, COUNT(*) FROM alerts WHERE created_at >= %s GROUP BY status", (cutoff,), ) counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0} for row in cur.fetchall(): st, cnt = row if st in counts: counts[st] = int(cnt) cur.close() return counts def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]: cutoff = ( datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes) ).isoformat() cur = self._conn().cursor() cur.execute( "SELECT dedupe_key, service, kind, SUM(occurrences) AS occ, MAX(last_seen_at) AS ls " "FROM alerts WHERE created_at >= %s " "GROUP BY dedupe_key, service, kind " "ORDER BY occ DESC LIMIT %s", (cutoff, limit), ) rows = [] for row in cur.fetchall(): key, svc, kind, occ, ls = row rows.append({ "signature": key, "service": svc, "kind": kind, "occurrences": int(occ), "last_seen": ls.isoformat() if hasattr(ls, "isoformat") else str(ls), }) cur.close() return rows def compute_loop_slo(self, window_minutes: int = 240, p95_threshold_s: float = 60.0, failed_rate_threshold_pct: float = 5.0, stuck_minutes: float = 15.0) -> Dict: now = datetime.datetime.utcnow() cutoff = (now - datetime.timedelta(minutes=window_minutes)).isoformat() stuck_cutoff = (now - datetime.timedelta(minutes=stuck_minutes)).isoformat() cur = self._conn().cursor() # P95 duration: only for acked with both claimed_at and acked_at cur.execute( "SELECT EXTRACT(EPOCH FROM (acked_at - claimed_at)) " "FROM alerts " "WHERE created_at >= %s AND status='acked' " "AND claimed_at IS NOT NULL AND acked_at IS NOT NULL " "ORDER BY 1", (cutoff,), ) durations = [float(r[0]) for r in cur.fetchall() if r[0] is not None] cur.execute( "SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='acked'", (cutoff,), ) acked = int(cur.fetchone()[0]) cur.execute( "SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='failed'", (cutoff,), ) failed = int(cur.fetchone()[0]) cur.execute( "SELECT COUNT(*) FROM alerts " "WHERE created_at >= %s AND status='processing' AND claimed_at < %s", (cutoff, stuck_cutoff), ) stuck = int(cur.fetchone()[0]) cur.close() p95 = None if durations: idx = max(0, int(len(durations) * 0.95) - 1) p95 = round(durations[idx], 1) total_terminal = acked + failed failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1) violations = [] if p95 is not None and p95 > p95_threshold_s: violations.append({ "metric": "claim_to_ack_p95_seconds", "value": p95, "threshold": p95_threshold_s, "message": f"P95 claim→ack {p95}s > {p95_threshold_s}s", }) if failed_pct > failed_rate_threshold_pct: violations.append({ "metric": "failed_rate_pct", "value": failed_pct, "threshold": failed_rate_threshold_pct, "message": f"Failed rate {failed_pct}% > {failed_rate_threshold_pct}%", }) if stuck > 0: violations.append({ "metric": "processing_stuck_count", "value": stuck, "threshold": 0, "message": f"{stuck} alerts stuck in processing > {stuck_minutes}min", }) return { "claim_to_ack_p95_seconds": p95, "failed_rate_pct": failed_pct, "processing_stuck_count": stuck, "sample_count": len(durations), "violations": violations, } # ─── Auto backend ────────────────────────────────────────────────────────────── class AutoAlertStore(AlertStore): """Postgres primary → MemoryAlertStore fallback, with 5 min recovery.""" _RECOVERY_INTERVAL_S = 300 def __init__(self, pg_dsn: str): self._pg_dsn = pg_dsn self._primary: Optional[PostgresAlertStore] = None self._fallback = MemoryAlertStore() self._using_fallback = False self._fallback_since: float = 0.0 self._init_lock = threading.Lock() def _get_primary(self) -> PostgresAlertStore: if self._primary is None: with self._init_lock: if self._primary is None: self._primary = PostgresAlertStore(self._pg_dsn) return self._primary def _maybe_recover(self) -> None: if self._using_fallback and self._fallback_since > 0: if time.monotonic() - self._fallback_since >= self._RECOVERY_INTERVAL_S: logger.info("AutoAlertStore: attempting Postgres recovery") self._using_fallback = False self._fallback_since = 0.0 def _switch_to_fallback(self, err: Exception) -> None: logger.warning("AutoAlertStore: Postgres failed (%s), using Memory fallback", err) self._using_fallback = True self._fallback_since = time.monotonic() def active_backend(self) -> str: return "memory_fallback" if self._using_fallback else "postgres" def _delegate(self, method: str, *args, **kwargs): self._maybe_recover() if not self._using_fallback: try: return getattr(self._get_primary(), method)(*args, **kwargs) except Exception as e: self._switch_to_fallback(e) return getattr(self._fallback, method)(*args, **kwargs) def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict: return self._delegate("ingest", alert_data, dedupe_ttl_minutes) def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]: return self._delegate("list_alerts", filters, limit) def get_alert(self, alert_ref: str) -> Optional[Dict]: return self._delegate("get_alert", alert_ref) def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]: return self._delegate("mark_acked", alert_ref, actor, note) def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]: return self._delegate("get_by_dedupe_key", dedupe_key) def claim_next_alerts(self, window_minutes=240, limit=25, owner="loop", lock_ttl_seconds=PROCESSING_LOCK_TTL_S) -> List[Dict]: return self._delegate("claim_next_alerts", window_minutes, limit, owner, lock_ttl_seconds) def mark_acked(self, alert_ref, actor, note="") -> Optional[Dict]: return self._delegate("mark_acked", alert_ref, actor, note) def mark_failed(self, alert_ref, error, retry_after_seconds=300) -> Optional[Dict]: return self._delegate("mark_failed", alert_ref, error, retry_after_seconds) def requeue_expired_processing(self) -> int: return self._delegate("requeue_expired_processing") def dashboard_counts(self, window_minutes=240) -> Dict: return self._delegate("dashboard_counts", window_minutes) def top_signatures(self, window_minutes=240, limit=20) -> List[Dict]: return self._delegate("top_signatures", window_minutes, limit) def compute_loop_slo(self, window_minutes=240, p95_threshold_s=60.0, failed_rate_threshold_pct=5.0, stuck_minutes=15.0) -> Dict: return self._delegate("compute_loop_slo", window_minutes, p95_threshold_s, failed_rate_threshold_pct, stuck_minutes) # ─── Singleton ──────────────────────────────────────────────────────────────── _store: Optional[AlertStore] = None _store_lock = threading.Lock() def get_alert_store() -> AlertStore: global _store if _store is None: with _store_lock: if _store is None: _store = _create_alert_store() return _store def set_alert_store(store: Optional[AlertStore]) -> None: global _store with _store_lock: _store = store def _create_alert_store() -> AlertStore: backend = os.getenv("ALERT_BACKEND", "memory").lower() # ALERT_DATABASE_URL takes precedence (service-specific), then DATABASE_URL (shared) dsn = os.getenv("ALERT_DATABASE_URL") or os.getenv("DATABASE_URL", "") if backend == "postgres": if dsn: logger.info("AlertStore: postgres dsn=%s…", dsn[:30]) return PostgresAlertStore(dsn) logger.warning( "ALERT_BACKEND=postgres but no ALERT_DATABASE_URL/DATABASE_URL; falling back to memory" ) if backend == "auto": if dsn: logger.info("AlertStore: auto (postgres→memory fallback) dsn=%s…", dsn[:30]) return AutoAlertStore(dsn) logger.info("AlertStore: auto — no ALERT_DATABASE_URL/DATABASE_URL, using memory") logger.info("AlertStore: memory (in-process)") return MemoryAlertStore()