New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
1032 lines
40 KiB
Python
1032 lines
40 KiB
Python
"""
|
|
alert_store.py — Alert ingestion storage with state machine.
|
|
|
|
State machine: new → processing → acked | failed
|
|
|
|
Backends:
|
|
- MemoryAlertStore (testing / single-process)
|
|
- PostgresAlertStore (production — uses psycopg2 sync)
|
|
- AutoAlertStore (Postgres primary → Memory fallback)
|
|
|
|
DDL: ops/scripts/migrate_alerts_postgres.py
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
import uuid
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── Constants ────────────────────────────────────────────────────────────────

MAX_LOG_SAMPLES = 40  # max evidence.log_samples kept per alert (see _sanitize_alert)
MAX_SUMMARY_CHARS = 1000  # summary truncation limit (see _sanitize_alert)
MAX_ALERT_JSON_BYTES = 32 * 1024  # 32 KB per alert
# NOTE(review): MAX_ALERT_JSON_BYTES is not enforced anywhere in this module —
# confirm whether callers apply it or it is dead config.

# Alert status values (lifecycle: new → processing → acked | failed)
STATUS_NEW = "new"
STATUS_PROCESSING = "processing"
STATUS_ACKED = "acked"
STATUS_FAILED = "failed"

PROCESSING_LOCK_TTL_S = 600  # default 10 min lock while an alert is being processed
|
|
|
def _now_iso() -> str:
|
|
return datetime.datetime.utcnow().isoformat()
|
|
|
|
|
|
def _now_dt() -> datetime.datetime:
|
|
return datetime.datetime.utcnow()
|
|
|
|
|
|
def _generate_alert_ref() -> str:
|
|
ts = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")
|
|
short = uuid.uuid4().hex[:6]
|
|
return f"alrt_{ts}_{short}"
|
|
|
|
|
|
def _compute_dedupe_key(service: str, env: str, kind: str, fingerprint: str = "") -> str:
|
|
raw = f"{service}|{env}|{kind}|{fingerprint}"
|
|
return hashlib.sha256(raw.encode()).hexdigest()[:32]
|
|
|
|
|
|
def _redact_text(text: str, max_chars: int = 500) -> str:
|
|
import re
|
|
_SECRET_PAT = re.compile(
|
|
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
|
|
)
|
|
redacted = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text or "")
|
|
return redacted[:max_chars]
|
|
|
|
|
|
def _sanitize_alert(alert_data: Dict) -> Dict:
    """Return a shallow copy of the alert safe for storage.

    - summary/title are redacted and capped (MAX_SUMMARY_CHARS / 300 chars)
    - evidence.log_samples is capped at MAX_LOG_SAMPLES entries of 300 chars

    Robustness fix: a sloppy producer sending a non-list ``log_samples``
    (single string) or non-string entries previously crashed redaction;
    both are now tolerated (wrapped / stringified).

    Note the copy is shallow — nested dicts (labels, metrics, untouched
    evidence keys) are shared with the input.
    """
    safe = dict(alert_data)
    safe["summary"] = _redact_text(safe.get("summary", ""), MAX_SUMMARY_CHARS)
    safe["title"] = _redact_text(safe.get("title", ""), 300)
    ev = safe.get("evidence", {})
    if isinstance(ev, dict):
        logs = ev.get("log_samples", [])
        if not isinstance(logs, list):
            # Tolerate a single sample passed bare instead of in a list.
            logs = [logs]
        safe["evidence"] = {
            **ev,
            "log_samples": [
                _redact_text(s if isinstance(s, str) else str(s), 300)
                for s in logs[:MAX_LOG_SAMPLES]
            ],
        }
    return safe
|
|
|
|
|
|
# ─── Abstract interface ────────────────────────────────────────────────────────
|
|
|
|
class AlertStore(ABC):
    """Abstract alert store: dedupe-on-ingest plus a claim/ack/fail state machine.

    Status lifecycle: new → processing → acked | failed.  A failed alert
    becomes claimable again once its retry lock (processing_lock_until)
    expires; a deduped repeat of an acked/failed alert is re-opened as new.
    """

    @abstractmethod
    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        """
        Store alert with dedupe.

        Repeats of the same (service, env, kind, fingerprint) signature
        inside the TTL collapse into one record with a bumped occurrences
        counter.

        Returns: {accepted, deduped, dedupe_key, alert_ref, occurrences}
        """

    @abstractmethod
    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """List alerts metadata (evidence omitted), newest first.

        Supported filters: service, env, window_minutes, status_in
        (list of statuses; None means all).
        """

    @abstractmethod
    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        """Return full alert record (including evidence), or None if unknown."""

    @abstractmethod
    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """Mark alert as acked (status=acked). Legacy compat alias for mark_acked."""

    @abstractmethod
    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        """Lookup by dedupe key (for reuse-open-incident logic)."""

    # ── State machine methods ──────────────────────────────────────────────────

    @abstractmethod
    def claim_next_alerts(
        self,
        window_minutes: int = 240,
        limit: int = 25,
        owner: str = "loop",
        lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
    ) -> List[Dict]:
        """
        Atomically move status=new (or failed/processing with expired lock)
        → processing, stamping claimed_at / processing_lock_until / owner.
        Skips already-processing-and-locked alerts.
        Returns the claimed alert records.
        """

    @abstractmethod
    def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """status=acked, acked_at=now; clears the processing lock/owner."""

    @abstractmethod
    def mark_failed(
        self, alert_ref: str, error: str, retry_after_seconds: int = 300
    ) -> Optional[Dict]:
        """status=failed, lock_until=now+retry (acts as a retry backoff),
        last_error=redacted/truncated error text."""

    @abstractmethod
    def requeue_expired_processing(self) -> int:
        """processing + lock_until < now → status=new. Returns count reset."""

    @abstractmethod
    def dashboard_counts(self, window_minutes: int = 240) -> Dict:
        """Return {new, processing, acked, failed} counts for the window."""

    @abstractmethod
    def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
        """Return top dedupe_keys by total occurrences within the window."""

    @abstractmethod
    def compute_loop_slo(self, window_minutes: int = 240,
                         p95_threshold_s: float = 60.0,
                         failed_rate_threshold_pct: float = 5.0,
                         stuck_minutes: float = 15.0) -> Dict:
        """Compute alert-loop SLO metrics for the dashboard.

        Returns: {claim_to_ack_p95_seconds, failed_rate_pct,
        processing_stuck_count, sample_count, violations}
        """
|
|
|
# ─── Memory backend ────────────────────────────────────────────────────────────
|
|
|
|
class MemoryAlertStore(AlertStore):
    """In-process backend for tests and single-process deployments.

    State is two dicts guarded by one lock:
      _alerts: alert_ref → full alert record
      _dedupe: dedupe_key → alert_ref of the record for that signature
    All timestamps are naive-UTC ISO-8601 strings, so window and lock
    checks are plain lexicographic string comparisons.

    Fixes vs. previous revision: removed an unused local in
    dashboard_counts; get_alert no longer double-looks-up the dict;
    compute_loop_slo catches only timestamp-parse errors.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self._alerts: Dict[str, Dict] = {}
        self._dedupe: Dict[str, str] = {}  # dedupe_key → alert_ref

    def _new_record(self, alert_data: Dict, dedupe_key: str, now: str) -> Dict:
        """Build a fresh status=new record from a raw alert payload."""
        safe = _sanitize_alert(alert_data)
        service = alert_data.get("service", "unknown")
        env = alert_data.get("env", "prod")
        kind = alert_data.get("kind", "custom")
        # Caller-supplied alert_id wins; otherwise mint a unique ref.
        alert_ref = alert_data.get("alert_id") or _generate_alert_ref()
        return {
            "alert_ref": alert_ref,
            "dedupe_key": dedupe_key,
            "source": safe.get("source", "unknown"),
            "service": service,
            "env": env,
            "severity": safe.get("severity", "P2"),
            "kind": kind,
            "title": safe.get("title", ""),
            "summary": safe.get("summary", ""),
            "started_at": safe.get("started_at") or now,
            "labels": safe.get("labels", {}),
            "metrics": safe.get("metrics", {}),
            "evidence": safe.get("evidence", {}),
            "links": safe.get("links", [])[:10],
            "created_at": now,
            "last_seen_at": now,
            "occurrences": 1,
            # State machine fields
            "status": STATUS_NEW,
            "claimed_at": None,  # set when claimed
            "processing_lock_until": None,
            "processing_owner": None,
            "last_error": None,
            "acked_at": None,
            # Legacy compat
            "ack_status": "pending",
            "ack_actor": None,
            "ack_note": None,
            "ack_at": None,
        }

    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        """Store an alert, collapsing repeats of the same signature.

        A repeat inside the TTL bumps occurrences/last_seen_at on the
        existing record (re-opening it as new if it was acked/failed);
        otherwise a new record is created.
        """
        service = alert_data.get("service", "unknown")
        env = alert_data.get("env", "prod")
        kind = alert_data.get("kind", "custom")
        labels = alert_data.get("labels", {})
        fingerprint = labels.get("fingerprint", "")
        dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint)

        now = _now_iso()
        with self._lock:
            existing_ref = self._dedupe.get(dedupe_key)
            if existing_ref and existing_ref in self._alerts:
                existing = self._alerts[existing_ref]
                created_at = existing.get("created_at", "")
                ttl_cutoff = (
                    _now_dt() - datetime.timedelta(minutes=dedupe_ttl_minutes)
                ).isoformat()
                if created_at >= ttl_cutoff:
                    existing["occurrences"] = existing.get("occurrences", 1) + 1
                    existing["last_seen_at"] = now
                    if alert_data.get("metrics"):
                        existing["metrics"] = alert_data["metrics"]
                    # If previously acked/failed, reset to new so the loop
                    # picks it up again.
                    if existing.get("status") in (STATUS_ACKED, STATUS_FAILED):
                        existing["status"] = STATUS_NEW
                        existing["processing_lock_until"] = None
                        existing["last_error"] = None
                    return {
                        "accepted": True,
                        "deduped": True,
                        "dedupe_key": dedupe_key,
                        "alert_ref": existing_ref,
                        "occurrences": existing["occurrences"],
                    }

            record = self._new_record(alert_data, dedupe_key, now)
            alert_ref = record["alert_ref"]
            self._alerts[alert_ref] = record
            self._dedupe[dedupe_key] = alert_ref

        return {
            "accepted": True,
            "deduped": False,
            "dedupe_key": dedupe_key,
            "alert_ref": alert_ref,
            "occurrences": 1,
        }

    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Newest-first alert metadata within the window; evidence omitted.

        Supported filters: service, env, window_minutes (default 240),
        status_in (list of statuses; None = all).
        NOTE(review): unlike the Postgres backend this does not cap limit
        at 200 — confirm whether that cap should apply here too.
        """
        filters = filters or {}
        service = filters.get("service")
        env = filters.get("env")
        window = int(filters.get("window_minutes", 240))
        status_in = filters.get("status_in")  # list of statuses or None (all)
        cutoff = (_now_dt() - datetime.timedelta(minutes=window)).isoformat()
        with self._lock:
            results = []
            for a in sorted(self._alerts.values(),
                            key=lambda x: x.get("created_at", ""), reverse=True):
                if a.get("created_at", "") < cutoff:
                    continue
                if service and a.get("service") != service:
                    continue
                if env and a.get("env") != env:
                    continue
                if status_in and a.get("status", STATUS_NEW) not in status_in:
                    continue
                # Strip the (potentially large) evidence payload from listings.
                results.append({k: v for k, v in a.items() if k != "evidence"})
                if len(results) >= limit:
                    break
            return results

    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        """Return a shallow copy of the full record, or None if unknown."""
        with self._lock:
            rec = self._alerts.get(alert_ref)
            return dict(rec) if rec is not None else None

    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """Legacy compat alias for mark_acked."""
        return self.mark_acked(alert_ref, actor, note)

    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        """Return a copy of the record for a signature, or None."""
        with self._lock:
            ref = self._dedupe.get(dedupe_key)
            if ref and ref in self._alerts:
                return dict(self._alerts[ref])
            return None

    def claim_next_alerts(
        self,
        window_minutes: int = 240,
        limit: int = 25,
        owner: str = "loop",
        lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
    ) -> List[Dict]:
        """Claim up to `limit` claimable alerts, oldest first.

        Claimable = status new, or failed/processing whose lock expired.
        NOTE(review): the Postgres backend additionally prioritizes by
        severity; this backend orders purely by created_at — confirm the
        difference is acceptable.
        """
        now_dt = _now_dt()
        now_str = now_dt.isoformat()
        lock_until = (now_dt + datetime.timedelta(seconds=lock_ttl_seconds)).isoformat()
        cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat()

        claimed = []
        with self._lock:
            for a in sorted(self._alerts.values(),
                            key=lambda x: x.get("created_at", "")):
                if len(claimed) >= limit:
                    break
                if a.get("created_at", "") < cutoff:
                    continue
                st = a.get("status", STATUS_NEW)
                lock_exp = a.get("processing_lock_until")

                # Claimable: new, OR failed/processing with expired/no lock
                if st == STATUS_ACKED:
                    continue
                if st in (STATUS_PROCESSING, STATUS_FAILED):
                    if lock_exp and lock_exp > now_str:
                        continue  # still locked (retry window not passed)
                # Claim it
                a["status"] = STATUS_PROCESSING
                a["claimed_at"] = now_str
                a["processing_lock_until"] = lock_until
                a["processing_owner"] = owner
                claimed.append(dict(a))

        return claimed

    def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """status=acked; stamps acked_at and the legacy ack_* fields."""
        now = _now_iso()
        with self._lock:
            if alert_ref not in self._alerts:
                return None
            rec = self._alerts[alert_ref]
            rec["status"] = STATUS_ACKED
            rec["acked_at"] = now
            rec["ack_status"] = "acked"
            rec["ack_actor"] = _redact_text(actor, 100)
            rec["ack_note"] = _redact_text(note, 500)
            rec["ack_at"] = now
            rec["processing_lock_until"] = None
            rec["processing_owner"] = None
            return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"}

    def mark_failed(
        self, alert_ref: str, error: str, retry_after_seconds: int = 300
    ) -> Optional[Dict]:
        """status=failed; the lock doubles as the retry-backoff deadline."""
        now_dt = _now_dt()
        retry_at = (now_dt + datetime.timedelta(seconds=retry_after_seconds)).isoformat()
        with self._lock:
            if alert_ref not in self._alerts:
                return None
            rec = self._alerts[alert_ref]
            rec["status"] = STATUS_FAILED
            rec["last_error"] = _redact_text(error, 500)
            rec["processing_lock_until"] = retry_at
            rec["processing_owner"] = None
            return {"alert_ref": alert_ref, "status": STATUS_FAILED,
                    "ack_status": "failed", "retry_at": retry_at}

    def requeue_expired_processing(self) -> int:
        """Reset processing alerts whose lock expired back to new."""
        now_str = _now_iso()
        count = 0
        with self._lock:
            for a in self._alerts.values():
                if a.get("status") == STATUS_PROCESSING:
                    lock_exp = a.get("processing_lock_until")
                    if lock_exp and lock_exp <= now_str:
                        a["status"] = STATUS_NEW
                        a["processing_lock_until"] = None
                        a["processing_owner"] = None
                        count += 1
        return count

    def dashboard_counts(self, window_minutes: int = 240) -> Dict:
        """Per-status counts for alerts created within the window."""
        cutoff = (
            _now_dt() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0}
        with self._lock:
            for a in self._alerts.values():
                if a.get("created_at", "") < cutoff:
                    continue
                st = a.get("status", STATUS_NEW)
                if st in counts:
                    counts[st] += 1
        return counts

    def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
        """Top dedupe signatures by total occurrences within the window."""
        cutoff = (
            _now_dt() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        sigs: Dict[str, Dict] = {}
        with self._lock:
            for a in self._alerts.values():
                if a.get("created_at", "") < cutoff:
                    continue
                key = a.get("dedupe_key", "")
                if key not in sigs:
                    sigs[key] = {
                        "signature": key,
                        "service": a.get("service", ""),
                        "kind": a.get("kind", ""),
                        "occurrences": 0,
                        "last_seen": a.get("last_seen_at", ""),
                    }
                sigs[key]["occurrences"] += a.get("occurrences", 1)
                if a.get("last_seen_at", "") > sigs[key]["last_seen"]:
                    sigs[key]["last_seen"] = a.get("last_seen_at", "")
        return sorted(sigs.values(), key=lambda x: x["occurrences"], reverse=True)[:limit]

    def compute_loop_slo(self, window_minutes: int = 240,
                         p95_threshold_s: float = 60.0,
                         failed_rate_threshold_pct: float = 5.0,
                         stuck_minutes: float = 15.0) -> Dict:
        """Alert-loop SLO metrics for the dashboard.

        Returns {claim_to_ack_p95_seconds, failed_rate_pct,
        processing_stuck_count, sample_count, violations}.
        """
        now_dt = _now_dt()
        cutoff = (now_dt - datetime.timedelta(minutes=window_minutes)).isoformat()
        stuck_cutoff = (now_dt - datetime.timedelta(minutes=stuck_minutes)).isoformat()

        durations_s: list = []
        acked = 0
        failed = 0
        stuck = 0

        with self._lock:
            for a in self._alerts.values():
                if a.get("created_at", "") < cutoff:
                    continue
                st = a.get("status", STATUS_NEW)
                if st == STATUS_ACKED:
                    acked += 1
                    claimed_at = a.get("claimed_at")
                    acked_at = a.get("acked_at")
                    if claimed_at and acked_at:
                        try:
                            c = datetime.datetime.fromisoformat(claimed_at)
                            k = datetime.datetime.fromisoformat(acked_at)
                            durations_s.append((k - c).total_seconds())
                        except (ValueError, TypeError):
                            # Malformed timestamp — skip this sample.
                            pass
                elif st == STATUS_FAILED:
                    failed += 1
                elif st == STATUS_PROCESSING:
                    claimed_at = a.get("claimed_at") or ""
                    if claimed_at and claimed_at < stuck_cutoff:
                        stuck += 1

        # P95 via nearest-rank on sorted samples (matches the Postgres
        # backend's computation).
        p95 = None
        if durations_s:
            durations_s.sort()
            idx = max(0, int(len(durations_s) * 0.95) - 1)
            p95 = round(durations_s[idx], 1)

        # Failed rate over terminal alerts only (acked + failed).
        total_terminal = acked + failed
        failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1)

        violations = []
        if p95 is not None and p95 > p95_threshold_s:
            violations.append({
                "metric": "claim_to_ack_p95_seconds",
                "value": p95,
                "threshold": p95_threshold_s,
                "message": f"P95 claim→ack latency {p95}s exceeds {p95_threshold_s}s",
            })
        if failed_pct > failed_rate_threshold_pct:
            violations.append({
                "metric": "failed_rate_pct",
                "value": failed_pct,
                "threshold": failed_rate_threshold_pct,
                "message": f"Failed alert rate {failed_pct}% exceeds {failed_rate_threshold_pct}%",
            })
        if stuck > 0:
            violations.append({
                "metric": "processing_stuck_count",
                "value": stuck,
                "threshold": 0,
                "message": f"{stuck} alerts stuck in processing > {stuck_minutes}min",
            })

        return {
            "claim_to_ack_p95_seconds": p95,
            "failed_rate_pct": failed_pct,
            "processing_stuck_count": stuck,
            "sample_count": len(durations_s),
            "violations": violations,
        }
|
|
|
|
|
|
# ─── Postgres backend ──────────────────────────────────────────────────────────
|
|
|
|
class PostgresAlertStore(AlertStore):
    """Production backend via psycopg2 (sync, per-thread connections).

    Each thread gets its own lazily-created connection (threading.local);
    autocommit is off, so every method commits its own work explicitly.
    Schema is created by ops/scripts/migrate_alerts_postgres.py.
    NOTE(review): most methods do not close the cursor / roll back if a
    query raises (only claim_next_alerts does) — a failure can leave an
    open transaction on this thread's connection; confirm acceptable.
    """

    def __init__(self, dsn: str):
        self._dsn = dsn
        self._local = threading.local()  # holds .conn per thread

    def _conn(self):
        """Return this thread's connection, (re)connecting if needed."""
        conn = getattr(self._local, "conn", None)
        if conn is None or conn.closed:
            # Imported lazily so the module loads without psycopg2 installed
            # (memory backend users never touch this path).
            import psycopg2  # type: ignore
            conn = psycopg2.connect(self._dsn)
            conn.autocommit = False
            self._local.conn = conn
        return conn

    def _commit(self):
        """Commit this thread's current transaction."""
        self._conn().commit()

    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        """Store an alert, collapsing repeats of the same signature.

        NOTE(review): the SELECT-then-INSERT is not atomic — two concurrent
        ingests of a new signature can both miss the SELECT and insert
        twice; confirm the DDL has a unique index on dedupe_key or that
        duplicate rows are tolerated downstream.
        """
        service = alert_data.get("service", "unknown")
        env = alert_data.get("env", "prod")
        kind = alert_data.get("kind", "custom")
        labels = alert_data.get("labels", {})
        fingerprint = labels.get("fingerprint", "")
        dedupe_key = _compute_dedupe_key(service, env, kind, fingerprint)
        now = _now_iso()

        conn = self._conn()
        cur = conn.cursor()
        # Dedupe window: only records created within the TTL count.
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=dedupe_ttl_minutes)
        ).isoformat()
        cur.execute(
            "SELECT alert_ref, occurrences, status FROM alerts "
            "WHERE dedupe_key=%s AND created_at >= %s LIMIT 1",
            (dedupe_key, cutoff),
        )
        row = cur.fetchone()
        if row:
            existing_ref, occ, existing_status = row
            new_occ = occ + 1
            # Reset to new if previously terminal
            new_status = STATUS_NEW if existing_status in (STATUS_ACKED, STATUS_FAILED) else existing_status
            cur.execute(
                "UPDATE alerts SET occurrences=%s, last_seen_at=%s, metrics=%s, status=%s "
                "WHERE alert_ref=%s",
                (new_occ, now,
                 json.dumps(alert_data.get("metrics", {}), default=str),
                 new_status, existing_ref),
            )
            conn.commit()
            cur.close()
            return {
                "accepted": True,
                "deduped": True,
                "dedupe_key": dedupe_key,
                "alert_ref": existing_ref,
                "occurrences": new_occ,
            }

        # No live duplicate — insert a fresh status=new record.
        safe = _sanitize_alert(alert_data)
        alert_ref = alert_data.get("alert_id") or _generate_alert_ref()
        cur.execute(
            """INSERT INTO alerts (alert_ref,dedupe_key,source,service,env,severity,kind,
                                   title,summary,started_at,labels,metrics,evidence,links,
                                   created_at,last_seen_at,occurrences,status)
               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,1,%s)""",
            (alert_ref, dedupe_key,
             safe.get("source", "unknown"), service, env,
             safe.get("severity", "P2"), kind,
             safe.get("title", ""), safe.get("summary", ""),
             safe.get("started_at") or now,
             json.dumps(safe.get("labels", {}), default=str),
             json.dumps(safe.get("metrics", {}), default=str),
             json.dumps(safe.get("evidence", {}), default=str),
             json.dumps(safe.get("links", [])[:10], default=str),
             now, now, STATUS_NEW),
        )
        conn.commit()
        cur.close()
        return {
            "accepted": True,
            "deduped": False,
            "dedupe_key": dedupe_key,
            "alert_ref": alert_ref,
            "occurrences": 1,
        }

    def _row_to_dict(self, cur, row) -> Dict:
        """Map a fetched row to a dict keyed by cursor column names.

        datetimes become ISO strings; JSON-ish text columns are parsed
        (left as the raw string if parsing fails).
        """
        cols = [d[0] for d in cur.description]
        d: Dict = {}
        for c, v in zip(cols, row):
            if isinstance(v, datetime.datetime):
                d[c] = v.isoformat()
            elif isinstance(v, str) and c in ("labels", "metrics", "evidence", "links"):
                try:
                    d[c] = json.loads(v)
                except Exception:
                    d[c] = v
            else:
                d[c] = v
        return d

    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        """Newest-first alert metadata within the window; evidence column
        deliberately excluded from the SELECT. Limit is capped at 200.
        """
        filters = filters or {}
        window = int(filters.get("window_minutes", 240))
        cutoff = (datetime.datetime.utcnow() - datetime.timedelta(minutes=window)).isoformat()
        status_in = filters.get("status_in")
        clauses = ["created_at >= %s"]
        params: list = [cutoff]
        if filters.get("service"):
            clauses.append("service=%s")
            params.append(filters["service"])
        if filters.get("env"):
            clauses.append("env=%s")
            params.append(filters["env"])
        if status_in:
            # Expand to one placeholder per status value.
            placeholders = ",".join(["%s"] * len(status_in))
            clauses.append(f"status IN ({placeholders})")
            params.extend(status_in)
        params.append(min(limit, 200))
        where = " AND ".join(clauses)
        cur = self._conn().cursor()
        cur.execute(
            f"SELECT alert_ref,dedupe_key,source,service,env,severity,kind,"
            f"title,summary,started_at,labels,metrics,links,"
            f"created_at,last_seen_at,occurrences,status,processing_owner,acked_at,last_error "
            f"FROM alerts WHERE {where} ORDER BY created_at DESC LIMIT %s",
            params,
        )
        rows = [self._row_to_dict(cur, r) for r in cur.fetchall()]
        cur.close()
        return rows

    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        """Return the full record (including evidence), or None if unknown."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT alert_ref,dedupe_key,source,service,env,severity,kind,"
            "title,summary,started_at,labels,metrics,evidence,links,"
            "created_at,last_seen_at,occurrences,status,processing_lock_until,"
            "processing_owner,last_error,acked_at,ack_actor,ack_note "
            "FROM alerts WHERE alert_ref=%s",
            (alert_ref,),
        )
        row = cur.fetchone()
        if not row:
            cur.close()
            return None
        result = self._row_to_dict(cur, row)
        cur.close()
        return result

    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """Legacy compat alias for mark_acked."""
        return self.mark_acked(alert_ref, actor, note)

    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        """Most recent record for a signature, or None."""
        cur = self._conn().cursor()
        cur.execute(
            "SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary,"
            "started_at,labels,metrics,created_at,last_seen_at,occurrences,status "
            "FROM alerts WHERE dedupe_key=%s ORDER BY created_at DESC LIMIT 1",
            (dedupe_key,),
        )
        row = cur.fetchone()
        if not row:
            cur.close()
            return None
        result = self._row_to_dict(cur, row)
        cur.close()
        return result

    def claim_next_alerts(
        self,
        window_minutes: int = 240,
        limit: int = 25,
        owner: str = "loop",
        lock_ttl_seconds: int = PROCESSING_LOCK_TTL_S,
    ) -> List[Dict]:
        """Atomic claim via SELECT FOR UPDATE SKIP LOCKED.

        The row locks from the SELECT keep concurrent workers from claiming
        the same alerts; SKIP LOCKED makes them take disjoint batches.
        Priority: severity (P0 first) then created_at ascending.
        """
        conn = self._conn()
        now_str = _now_iso()
        lock_until = (
            datetime.datetime.utcnow() + datetime.timedelta(seconds=lock_ttl_seconds)
        ).isoformat()
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()

        cur = conn.cursor()
        try:
            # Select claimable: new, or failed/processing with expired lock
            cur.execute(
                """
                SELECT alert_ref FROM alerts
                WHERE created_at >= %s
                  AND status IN ('new', 'failed', 'processing')
                  AND (processing_lock_until IS NULL OR processing_lock_until <= %s)
                ORDER BY
                  CASE severity WHEN 'P0' THEN 0 WHEN 'P1' THEN 1
                                WHEN 'P2' THEN 2 WHEN 'P3' THEN 3 ELSE 4 END,
                  created_at
                LIMIT %s
                FOR UPDATE SKIP LOCKED
                """,
                (cutoff, now_str, limit),
            )
            refs = [row[0] for row in cur.fetchall()]
            if not refs:
                conn.commit()
                cur.close()
                return []

            placeholders = ",".join(["%s"] * len(refs))
            cur.execute(
                f"""UPDATE alerts SET status='processing',
                        claimed_at=%s, processing_lock_until=%s, processing_owner=%s
                    WHERE alert_ref IN ({placeholders})""",
                [now_str, lock_until, owner] + refs,
            )
            # Fetch updated rows
            cur.execute(
                f"SELECT alert_ref,dedupe_key,service,env,severity,kind,title,summary,"
                f"started_at,labels,metrics,created_at,last_seen_at,occurrences,"
                f"status,processing_owner,last_error "
                f"FROM alerts WHERE alert_ref IN ({placeholders})",
                refs,
            )
            rows = [self._row_to_dict(cur, r) for r in cur.fetchall()]
            conn.commit()
            cur.close()
            return rows
        except Exception:
            # Release the row locks before propagating.
            conn.rollback()
            cur.close()
            raise

    def mark_acked(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        """status=acked; clears the processing lock/owner. None if unknown ref."""
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute(
            "UPDATE alerts SET status='acked', acked_at=%s, ack_actor=%s, ack_note=%s, "
            "processing_lock_until=NULL, processing_owner=NULL "
            "WHERE alert_ref=%s RETURNING alert_ref",
            (now, _redact_text(actor, 100), _redact_text(note, 500), alert_ref),
        )
        row = cur.fetchone()
        self._commit()
        cur.close()
        if not row:
            return None
        return {"alert_ref": alert_ref, "status": STATUS_ACKED, "ack_status": "acked"}

    def mark_failed(
        self, alert_ref: str, error: str, retry_after_seconds: int = 300
    ) -> Optional[Dict]:
        """status=failed; the lock doubles as the retry-backoff deadline."""
        retry_at = (
            datetime.datetime.utcnow() + datetime.timedelta(seconds=retry_after_seconds)
        ).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "UPDATE alerts SET status='failed', last_error=%s, "
            "processing_lock_until=%s, processing_owner=NULL "
            "WHERE alert_ref=%s RETURNING alert_ref",
            (_redact_text(error, 500), retry_at, alert_ref),
        )
        row = cur.fetchone()
        self._commit()
        cur.close()
        if not row:
            return None
        return {"alert_ref": alert_ref, "status": STATUS_FAILED,
                "ack_status": "failed", "retry_at": retry_at}

    def requeue_expired_processing(self) -> int:
        """Reset processing alerts whose lock expired back to new."""
        now = _now_iso()
        cur = self._conn().cursor()
        cur.execute(
            "UPDATE alerts SET status='new', processing_lock_until=NULL, "
            "processing_owner=NULL "
            "WHERE status='processing' AND processing_lock_until <= %s",
            (now,),
        )
        count = cur.rowcount
        self._commit()
        cur.close()
        return count

    def dashboard_counts(self, window_minutes: int = 240) -> Dict:
        """Per-status counts for alerts created within the window."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "SELECT status, COUNT(*) FROM alerts WHERE created_at >= %s GROUP BY status",
            (cutoff,),
        )
        counts = {STATUS_NEW: 0, STATUS_PROCESSING: 0, STATUS_ACKED: 0, STATUS_FAILED: 0}
        for row in cur.fetchall():
            st, cnt = row
            if st in counts:
                counts[st] = int(cnt)
        cur.close()
        return counts

    def top_signatures(self, window_minutes: int = 240, limit: int = 20) -> List[Dict]:
        """Top dedupe signatures by summed occurrences within the window."""
        cutoff = (
            datetime.datetime.utcnow() - datetime.timedelta(minutes=window_minutes)
        ).isoformat()
        cur = self._conn().cursor()
        cur.execute(
            "SELECT dedupe_key, service, kind, SUM(occurrences) AS occ, MAX(last_seen_at) AS ls "
            "FROM alerts WHERE created_at >= %s "
            "GROUP BY dedupe_key, service, kind "
            "ORDER BY occ DESC LIMIT %s",
            (cutoff, limit),
        )
        rows = []
        for row in cur.fetchall():
            key, svc, kind, occ, ls = row
            rows.append({
                "signature": key,
                "service": svc,
                "kind": kind,
                "occurrences": int(occ),
                # last_seen_at may come back as datetime or text depending
                # on the column type — normalize to a string either way.
                "last_seen": ls.isoformat() if hasattr(ls, "isoformat") else str(ls),
            })
        cur.close()
        return rows

    def compute_loop_slo(self, window_minutes: int = 240,
                         p95_threshold_s: float = 60.0,
                         failed_rate_threshold_pct: float = 5.0,
                         stuck_minutes: float = 15.0) -> Dict:
        """Alert-loop SLO metrics for the dashboard (four queries).

        Returns {claim_to_ack_p95_seconds, failed_rate_pct,
        processing_stuck_count, sample_count, violations}.
        """
        now = datetime.datetime.utcnow()
        cutoff = (now - datetime.timedelta(minutes=window_minutes)).isoformat()
        stuck_cutoff = (now - datetime.timedelta(minutes=stuck_minutes)).isoformat()
        cur = self._conn().cursor()

        # P95 duration: only for acked with both claimed_at and acked_at
        cur.execute(
            "SELECT EXTRACT(EPOCH FROM (acked_at - claimed_at)) "
            "FROM alerts "
            "WHERE created_at >= %s AND status='acked' "
            "AND claimed_at IS NOT NULL AND acked_at IS NOT NULL "
            "ORDER BY 1",
            (cutoff,),
        )
        durations = [float(r[0]) for r in cur.fetchall() if r[0] is not None]

        cur.execute(
            "SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='acked'",
            (cutoff,),
        )
        acked = int(cur.fetchone()[0])
        cur.execute(
            "SELECT COUNT(*) FROM alerts WHERE created_at >= %s AND status='failed'",
            (cutoff,),
        )
        failed = int(cur.fetchone()[0])
        cur.execute(
            "SELECT COUNT(*) FROM alerts "
            "WHERE created_at >= %s AND status='processing' AND claimed_at < %s",
            (cutoff, stuck_cutoff),
        )
        stuck = int(cur.fetchone()[0])
        cur.close()

        # Nearest-rank P95 on the pre-sorted samples (matches the memory
        # backend's computation).
        p95 = None
        if durations:
            idx = max(0, int(len(durations) * 0.95) - 1)
            p95 = round(durations[idx], 1)

        # Failed rate over terminal alerts only (acked + failed).
        total_terminal = acked + failed
        failed_pct = round((failed / total_terminal * 100) if total_terminal > 0 else 0.0, 1)

        violations = []
        if p95 is not None and p95 > p95_threshold_s:
            violations.append({
                "metric": "claim_to_ack_p95_seconds", "value": p95,
                "threshold": p95_threshold_s,
                "message": f"P95 claim→ack {p95}s > {p95_threshold_s}s",
            })
        if failed_pct > failed_rate_threshold_pct:
            violations.append({
                "metric": "failed_rate_pct", "value": failed_pct,
                "threshold": failed_rate_threshold_pct,
                "message": f"Failed rate {failed_pct}% > {failed_rate_threshold_pct}%",
            })
        if stuck > 0:
            violations.append({
                "metric": "processing_stuck_count", "value": stuck,
                "threshold": 0,
                "message": f"{stuck} alerts stuck in processing > {stuck_minutes}min",
            })
        return {
            "claim_to_ack_p95_seconds": p95,
            "failed_rate_pct": failed_pct,
            "processing_stuck_count": stuck,
            "sample_count": len(durations),
            "violations": violations,
        }
|
|
|
|
|
|
# ─── Auto backend ──────────────────────────────────────────────────────────────
|
|
|
|
class AutoAlertStore(AlertStore):
    """Postgres primary → MemoryAlertStore fallback, with 5 min recovery.

    Every call goes through _delegate: while healthy it hits Postgres;
    on any exception it flips to the in-memory store and retries Postgres
    after _RECOVERY_INTERVAL_S seconds. Data written to the fallback is
    not replayed to Postgres on recovery.
    """

    _RECOVERY_INTERVAL_S = 300

    def __init__(self, pg_dsn: str):
        self._pg_dsn = pg_dsn
        self._primary: Optional[PostgresAlertStore] = None  # built lazily
        self._fallback = MemoryAlertStore()
        self._using_fallback = False
        self._fallback_since: float = 0.0  # monotonic time of the switch
        self._init_lock = threading.Lock()

    def _get_primary(self) -> PostgresAlertStore:
        """Lazily construct the Postgres store (double-checked locking)."""
        if self._primary is not None:
            return self._primary
        with self._init_lock:
            if self._primary is None:
                self._primary = PostgresAlertStore(self._pg_dsn)
        return self._primary

    def _maybe_recover(self) -> None:
        """Flip back to Postgres once the recovery interval has elapsed."""
        if not (self._using_fallback and self._fallback_since > 0):
            return
        if time.monotonic() - self._fallback_since < self._RECOVERY_INTERVAL_S:
            return
        logger.info("AutoAlertStore: attempting Postgres recovery")
        self._using_fallback = False
        self._fallback_since = 0.0

    def _switch_to_fallback(self, err: Exception) -> None:
        """Record a Postgres failure and route traffic to the memory store."""
        logger.warning("AutoAlertStore: Postgres failed (%s), using Memory fallback", err)
        self._using_fallback = True
        self._fallback_since = time.monotonic()

    def active_backend(self) -> str:
        """Name of the backend currently serving requests."""
        return "memory_fallback" if self._using_fallback else "postgres"

    def _delegate(self, method: str, *args, **kwargs):
        """Invoke `method` on the healthy backend, failing over on error."""
        self._maybe_recover()
        if self._using_fallback:
            return getattr(self._fallback, method)(*args, **kwargs)
        try:
            return getattr(self._get_primary(), method)(*args, **kwargs)
        except Exception as exc:
            self._switch_to_fallback(exc)
            return getattr(self._fallback, method)(*args, **kwargs)

    # ── AlertStore interface: thin pass-throughs ───────────────────────────────

    def ingest(self, alert_data: Dict, dedupe_ttl_minutes: int = 30) -> Dict:
        return self._delegate("ingest", alert_data, dedupe_ttl_minutes)

    def list_alerts(self, filters: Optional[Dict] = None, limit: int = 50) -> List[Dict]:
        return self._delegate("list_alerts", filters, limit)

    def get_alert(self, alert_ref: str) -> Optional[Dict]:
        return self._delegate("get_alert", alert_ref)

    def ack_alert(self, alert_ref: str, actor: str, note: str = "") -> Optional[Dict]:
        # Legacy compat: routed straight to mark_acked, same as the backends.
        return self._delegate("mark_acked", alert_ref, actor, note)

    def get_by_dedupe_key(self, dedupe_key: str) -> Optional[Dict]:
        return self._delegate("get_by_dedupe_key", dedupe_key)

    def claim_next_alerts(self, window_minutes=240, limit=25, owner="loop",
                          lock_ttl_seconds=PROCESSING_LOCK_TTL_S) -> List[Dict]:
        return self._delegate("claim_next_alerts", window_minutes, limit, owner, lock_ttl_seconds)

    def mark_acked(self, alert_ref, actor, note="") -> Optional[Dict]:
        return self._delegate("mark_acked", alert_ref, actor, note)

    def mark_failed(self, alert_ref, error, retry_after_seconds=300) -> Optional[Dict]:
        return self._delegate("mark_failed", alert_ref, error, retry_after_seconds)

    def requeue_expired_processing(self) -> int:
        return self._delegate("requeue_expired_processing")

    def dashboard_counts(self, window_minutes=240) -> Dict:
        return self._delegate("dashboard_counts", window_minutes)

    def top_signatures(self, window_minutes=240, limit=20) -> List[Dict]:
        return self._delegate("top_signatures", window_minutes, limit)

    def compute_loop_slo(self, window_minutes=240, p95_threshold_s=60.0,
                         failed_rate_threshold_pct=5.0, stuck_minutes=15.0) -> Dict:
        return self._delegate("compute_loop_slo", window_minutes, p95_threshold_s,
                              failed_rate_threshold_pct, stuck_minutes)
|
|
|
|
|
|
# ─── Singleton ────────────────────────────────────────────────────────────────

# Process-wide store instance, created lazily by get_alert_store();
# _store_lock guards initialization and replacement.
_store: Optional[AlertStore] = None
_store_lock = threading.Lock()
|
|
|
|
|
|
def get_alert_store() -> AlertStore:
    """Return the process-wide AlertStore, creating it on first use.

    Double-checked locking: the unlocked fast path avoids lock contention
    once the singleton exists.
    """
    global _store
    if _store is not None:
        return _store
    with _store_lock:
        if _store is None:
            _store = _create_alert_store()
    return _store
|
|
|
|
|
|
def set_alert_store(store: Optional[AlertStore]) -> None:
    """Swap in a specific store (or None to force re-creation on next get).

    Primarily for tests that inject a MemoryAlertStore.
    """
    global _store
    with _store_lock:
        _store = store
|
|
|
|
|
|
def _create_alert_store() -> AlertStore:
    """Build the store selected by ALERT_BACKEND (memory | postgres | auto).

    postgres/auto require a DSN; without one they degrade to the memory
    backend. Unknown backend values also fall through to memory.
    """
    backend = os.getenv("ALERT_BACKEND", "memory").lower()
    # ALERT_DATABASE_URL takes precedence (service-specific), then DATABASE_URL (shared)
    dsn = os.getenv("ALERT_DATABASE_URL") or os.getenv("DATABASE_URL", "")

    if backend == "postgres" and dsn:
        logger.info("AlertStore: postgres dsn=%s…", dsn[:30])
        return PostgresAlertStore(dsn)
    if backend == "postgres":
        logger.warning(
            "ALERT_BACKEND=postgres but no ALERT_DATABASE_URL/DATABASE_URL; falling back to memory"
        )

    if backend == "auto" and dsn:
        logger.info("AlertStore: auto (postgres→memory fallback) dsn=%s…", dsn[:30])
        return AutoAlertStore(dsn)
    if backend == "auto":
        logger.info("AlertStore: auto — no ALERT_DATABASE_URL/DATABASE_URL, using memory")

    logger.info("AlertStore: memory (in-process)")
    return MemoryAlertStore()
|