New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
300 lines
11 KiB
Python
300 lines
11 KiB
Python
"""
|
|
Tests for Alert State Machine — MemoryAlertStore state transitions.
|
|
|
|
Covers:
|
|
- claim moves new→processing and locks
|
|
- second claim does not re-claim locked alerts
|
|
- lock expiry allows re-claim (stale processing requeue)
|
|
- mark_failed sets failed + retry lock
|
|
- mark_acked sets acked
|
|
- priority ordering (P0 before P1)
|
|
- requeue_expired_processing
|
|
- dashboard_counts
|
|
- top_signatures
|
|
- SignatureStateStore cooldown
|
|
"""
|
|
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
from unittest.mock import patch

# Make the router service's modules importable as top-level names
# (alert_store, alert_ingest, signature_state_store live there).
ROOT = Path(__file__).resolve().parents[1]
ROUTER = ROOT / "services" / "router"
if str(ROUTER) not in sys.path:
    sys.path.insert(0, str(ROUTER))


|
def _make_alert(service="gw", severity="P1", kind="slo_breach", fp="fp1", ref=None):
|
|
from alert_ingest import ingest_alert
|
|
return {
|
|
"source": "monitor@node1",
|
|
"service": service,
|
|
"env": "prod",
|
|
"severity": severity,
|
|
"kind": kind,
|
|
"title": f"{service} {kind}",
|
|
"summary": f"{service} issue",
|
|
"started_at": datetime.utcnow().isoformat(),
|
|
"labels": {"fingerprint": fp},
|
|
"metrics": {},
|
|
"evidence": {},
|
|
}
|
|
|
|
|
|
def _store_with_alert(alert_data=None):
    """Create a fresh MemoryAlertStore, register it globally, and ingest one alert.

    Returns (store, alert_ref) for the ingested alert. Uses a default
    payload from _make_alert() when alert_data is not supplied.
    """
    from alert_store import MemoryAlertStore, set_alert_store
    from alert_ingest import ingest_alert

    store = MemoryAlertStore()
    set_alert_store(store)
    payload = _make_alert() if alert_data is None else alert_data
    ingested = ingest_alert(store, payload)
    return store, ingested["alert_ref"]


class TestStateMachineClaim:
    """claim_next_alerts state machine: locking, skip rules, and re-claim on expiry."""

    def teardown_method(self):
        # Reset the process-global store so each test starts clean.
        from alert_store import set_alert_store
        set_alert_store(None)

    def test_claim_new_alert(self):
        """Claiming a fresh alert moves it new -> processing and records the owner."""
        store, ref = _store_with_alert()
        claimed = store.claim_next_alerts(limit=5, owner="test_owner")
        assert len(claimed) == 1
        assert claimed[0]["alert_ref"] == ref
        assert claimed[0]["status"] == "processing"
        assert claimed[0]["processing_owner"] == "test_owner"

    def test_claim_sets_lock(self):
        """A claim sets processing_lock_until to a future timestamp."""
        store, ref = _store_with_alert()
        store.claim_next_alerts(limit=5, owner="loop1", lock_ttl_seconds=600)
        rec = store.get_alert(ref)
        assert rec["processing_lock_until"] is not None
        # ISO-8601 strings in a uniform format compare chronologically.
        # (Fixed: removed a redundant method-local `from datetime import
        # datetime` — datetime is already imported at module top.)
        lock = rec["processing_lock_until"]
        assert lock > datetime.utcnow().isoformat()

    def test_second_claim_skips_locked(self):
        """An alert locked by one owner is invisible to a second claimer."""
        store, ref = _store_with_alert()
        store.claim_next_alerts(limit=5, owner="loop1", lock_ttl_seconds=600)
        # Second claim should not get the same alert.
        claimed2 = store.claim_next_alerts(limit=5, owner="loop2", lock_ttl_seconds=600)
        assert len(claimed2) == 0

    def test_expired_lock_allows_reclaim(self):
        """Once the processing lock expires, another owner may re-claim the alert."""
        store, ref = _store_with_alert()
        store.claim_next_alerts(limit=5, owner="loop1", lock_ttl_seconds=600)
        # Manually back-date the lock to simulate expiry.
        with store._lock:
            store._alerts[ref]["processing_lock_until"] = (
                (datetime.utcnow() - timedelta(seconds=10)).isoformat()
            )
        claimed2 = store.claim_next_alerts(limit=5, owner="loop2", lock_ttl_seconds=600)
        assert len(claimed2) == 1
        assert claimed2[0]["processing_owner"] == "loop2"

    def test_acked_alert_not_claimed(self):
        """Acked alerts are terminal and never handed out by claim."""
        store, ref = _store_with_alert()
        store.mark_acked(ref, "test")
        claimed = store.claim_next_alerts(limit=5)
        assert len(claimed) == 0

    def test_failed_alert_retried_after_lock_expires(self):
        """A failed alert is claimable again only after its retry lock passes."""
        store, ref = _store_with_alert()
        store.mark_failed(ref, "processing error", retry_after_seconds=300)
        # Immediately after mark_failed, lock is in future -> not claimable.
        claimed = store.claim_next_alerts(limit=5)
        assert len(claimed) == 0
        # Expire the retry lock.
        with store._lock:
            store._alerts[ref]["processing_lock_until"] = (
                (datetime.utcnow() - timedelta(seconds=10)).isoformat()
            )
        claimed2 = store.claim_next_alerts(limit=5)
        assert len(claimed2) == 1


class TestStateMachineTransitions:
    """mark_acked / mark_failed / requeue transitions and error redaction."""

    def teardown_method(self):
        # Drop the global store between tests.
        from alert_store import set_alert_store
        set_alert_store(None)

    def test_mark_acked(self):
        """Acking clears the processing lock and stamps acked_at."""
        store, ref = _store_with_alert()
        store.claim_next_alerts(limit=5, owner="loop")
        outcome = store.mark_acked(ref, "sofiia", note="incident:inc_001")
        assert outcome["status"] == "acked"
        record = store.get_alert(ref)
        assert record["status"] == "acked"
        assert record["acked_at"] is not None
        assert record["processing_lock_until"] is None

    def test_mark_failed(self):
        """Failing records the error text and schedules a retry."""
        store, ref = _store_with_alert()
        store.claim_next_alerts(limit=5)
        outcome = store.mark_failed(ref, "gateway timeout", retry_after_seconds=300)
        assert outcome["status"] == "failed"
        assert "retry_at" in outcome
        record = store.get_alert(ref)
        assert record["status"] == "failed"
        assert record["last_error"] == "gateway timeout"

    def test_requeue_expired_processing(self):
        """Stale processing alerts return to 'new' with the lock cleared."""
        store, ref = _store_with_alert()
        store.claim_next_alerts(limit=5, lock_ttl_seconds=600)
        # Force the lock into the past so the alert counts as stale.
        stale = (datetime.utcnow() - timedelta(seconds=5)).isoformat()
        with store._lock:
            store._alerts[ref]["processing_lock_until"] = stale
        assert store.requeue_expired_processing() == 1
        record = store.get_alert(ref)
        assert record["status"] == "new"
        assert record["processing_lock_until"] is None

    def test_secret_redacted_in_last_error(self):
        """Secrets embedded in error text never reach stored state verbatim."""
        store, ref = _store_with_alert()
        store.mark_failed(ref, "token=sk-secret123 failed processing")
        record = store.get_alert(ref)
        assert "sk-secret123" not in record["last_error"]
        assert "***" in record["last_error"]


class TestStateMachineDashboard:
    """dashboard_counts / top_signatures / list_alerts status filtering."""

    def teardown_method(self):
        # Reset the process-global store so tests stay independent.
        from alert_store import set_alert_store
        set_alert_store(None)

    def test_dashboard_counts(self):
        """After claim + ack, each of new/processing/acked has >= 1 alert."""
        from alert_store import MemoryAlertStore, set_alert_store
        from alert_ingest import ingest_alert
        store = MemoryAlertStore()
        set_alert_store(store)

        # (Fixed: dropped unused locals a1/a3 — only a2's ref is needed.)
        ingest_alert(store, _make_alert(fp="fp1", ref="a1"))
        a2 = ingest_alert(store, _make_alert(fp="fp2", ref="a2"))
        ingest_alert(store, _make_alert(fp="fp3", ref="a3"))

        store.claim_next_alerts(limit=1, owner="loop")
        store.mark_acked(a2["alert_ref"], "test")

        counts = store.dashboard_counts()
        assert counts["new"] >= 1
        assert counts["processing"] >= 1
        assert counts["acked"] >= 1

    def test_top_signatures(self):
        """Signatures are ranked by occurrence count, most common first."""
        from alert_store import MemoryAlertStore, set_alert_store
        from alert_ingest import ingest_alert
        store = MemoryAlertStore()
        set_alert_store(store)

        # Same signature: 3 occurrences
        for _ in range(3):
            ingest_alert(store, _make_alert(fp="samefp"))
        # Different signature: 1 occurrence
        ingest_alert(store, _make_alert(fp="otherfp"))

        top = store.top_signatures()
        assert len(top) >= 1
        assert top[0]["occurrences"] >= 3  # most common first

    def test_list_alerts_status_filter(self):
        """status_in filters return only alerts in the requested states."""
        from alert_store import MemoryAlertStore, set_alert_store
        from alert_ingest import ingest_alert
        store = MemoryAlertStore()
        set_alert_store(store)

        # (Fixed: dropped unused local r1 — only r2's ref is acked.)
        ingest_alert(store, _make_alert(fp="fp1"))
        r2 = ingest_alert(store, _make_alert(fp="fp2"))
        store.mark_acked(r2["alert_ref"], "test")

        new_only = store.list_alerts({"status_in": ["new"]})
        assert all(a["status"] == "new" for a in new_only)

        acked_only = store.list_alerts({"status_in": ["acked"]})
        assert all(a["status"] == "acked" for a in acked_only)


class TestSignatureStateStore:
    """Per-signature triage cooldown bookkeeping."""

    def setup_method(self):
        from signature_state_store import MemorySignatureStateStore, set_signature_state_store
        self.store = MemorySignatureStateStore()
        set_signature_state_store(self.store)

    def teardown_method(self):
        from signature_state_store import set_signature_state_store
        set_signature_state_store(None)

    def test_first_call_should_triage(self):
        """An unseen signature is always eligible for triage."""
        assert self.store.should_run_triage("sig_abc", cooldown_minutes=15) is True

    def test_after_mark_cooldown_active(self):
        """Right after a triage run, the cooldown blocks another one."""
        self.store.mark_triage_run("sig_abc")
        assert self.store.should_run_triage("sig_abc", cooldown_minutes=15) is False

    def test_after_cooldown_passes_ok(self):
        """Back-dating last_triage_at beyond the cooldown re-enables triage."""
        self.store.mark_triage_run("sig_abc")
        backdated = (datetime.utcnow() - timedelta(minutes=20)).isoformat()
        with self.store._lock:
            self.store._states["sig_abc"]["last_triage_at"] = backdated
        assert self.store.should_run_triage("sig_abc", cooldown_minutes=15) is True

    def test_mark_alert_seen_creates_state(self):
        """Seeing an alert creates state with last_alert_at set but no triage yet."""
        self.store.mark_alert_seen("sig_xyz")
        state = self.store.get_state("sig_xyz")
        assert state is not None
        assert state["last_alert_at"] is not None
        assert state["last_triage_at"] is None

    def test_triage_count_increments(self):
        """Each triage run bumps the rolling 24h counter."""
        for _ in range(3):
            self.store.mark_triage_run("sig_count")
        assert self.store.get_state("sig_count")["triage_count_24h"] == 3

    def test_different_signatures_independent(self):
        """Cooldown on one signature does not affect another."""
        self.store.mark_triage_run("sig_a")
        assert self.store.should_run_triage("sig_b", cooldown_minutes=15) is True


class TestAlertStoreFactory:
    """_create_alert_store backend selection via environment variables."""

    def test_default_is_memory(self):
        """ALERT_BACKEND=memory yields the in-memory store."""
        from alert_store import _create_alert_store, MemoryAlertStore
        with patch.dict(os.environ, {"ALERT_BACKEND": "memory"}, clear=False):
            created = _create_alert_store()
        assert isinstance(created, MemoryAlertStore)

    def test_auto_with_dsn_is_auto(self):
        """ALERT_BACKEND=auto plus a database DSN yields the auto store."""
        from alert_store import _create_alert_store, AutoAlertStore
        overrides = {"ALERT_BACKEND": "auto", "DATABASE_URL": "postgresql://x:x@localhost/test"}
        with patch.dict(os.environ, overrides, clear=False):
            created = _create_alert_store()
        assert isinstance(created, AutoAlertStore)


class TestClaimDedupeAndPriority:
    """Batch claiming drains the queue in order without duplicates."""

    def teardown_method(self):
        # Drop the global store between tests.
        from alert_store import set_alert_store
        set_alert_store(None)

    def test_multiple_new_alerts_claimed_in_order(self):
        """Successive batch claims hand out each alert exactly once."""
        from alert_store import MemoryAlertStore, set_alert_store
        from alert_ingest import ingest_alert
        store = MemoryAlertStore()
        set_alert_store(store)

        for fingerprint in ("fp1", "fp2", "fp3"):
            ingest_alert(store, _make_alert(fp=fingerprint))

        first_batch = store.claim_next_alerts(limit=2)
        assert len(first_batch) == 2
        second_batch = store.claim_next_alerts(limit=10)
        assert len(second_batch) == 1  # only one left