Files
microdao-daarion/services/router/incident_escalation.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

380 lines
13 KiB
Python

"""
incident_escalation.py — Deterministic Incident Escalation Engine.
Actions (exposed via incident_escalation_tool):
evaluate — check active signatures against escalation thresholds
auto_resolve_candidates — find open incidents with no recent alerts
No LLM usage; all logic is policy-driven.
"""
from __future__ import annotations
import datetime
import logging
import os
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Severity ordering ────────────────────────────────────────────────────────
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
def _sev_higher(a: str, b: str) -> bool:
"""Return True if a is more severe (lower P number) than b."""
return _SEV_ORDER.get(a, 99) < _SEV_ORDER.get(b, 99)
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
"""Return next higher severity, or None if already at/above cap."""
idx = _SEV_ORDER.get(current)
if idx is None or idx == 0:
return None
target = _SEV_NAMES[idx - 1]
if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0):
return None # would exceed cap
return target
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _plus_hours(hours: int) -> str:
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
# ─── Policy loading ───────────────────────────────────────────────────────────
# Process-level cache: the policy file is parsed at most once per process.
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy locations, tried in order: CWD-relative path first, then
# the config directory three levels above this file (presumably the repo
# root — confirm against the deployment layout).
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
def load_escalation_policy() -> Dict:
    """Load the escalation policy YAML, caching the result for the process.

    Tries each candidate in _POLICY_PATHS in order; a read/parse failure is
    logged and the next path is tried. Falls back to _builtin_defaults()
    when no file is readable. Subsequent calls return the cached dict.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for path in _POLICY_PATHS:
        if not path.exists():
            continue
        try:
            # Explicit UTF-8: don't depend on the platform default encoding
            # (the original open() call would use locale encoding on Windows).
            with open(path, encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            _POLICY_CACHE = data
            return data
        except Exception as e:
            logger.warning("Failed to load escalation policy from %s: %s", path, e)
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
def _builtin_defaults() -> Dict:
return {
"defaults": {"window_minutes": 60},
"escalation": {
"occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
"triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
"severity_cap": "P0",
"create_followup_on_escalate": True,
"followup": {
"priority": "P1", "due_hours": 24, "owner": "oncall",
"message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
},
},
"auto_resolve": {
"no_alerts_minutes_for_candidate": 60,
"close_allowed_severities": ["P2", "P3"],
"auto_close": False,
"candidate_event_type": "note",
"candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
},
"alert_loop_slo": {
"claim_to_ack_p95_seconds": 60,
"failed_rate_pct": 5,
"processing_stuck_minutes": 15,
},
}
# ─── Escalation thresholds helper ────────────────────────────────────────────
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return the target severity if escalation is needed, else None.

    Escalates P1→P0 or P2→P1 when either the 60-minute occurrence count or
    the 24-hour triage count meets its policy threshold, clamped at the
    policy's severity cap.
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")
    # Escalation rules in priority order (most → least severe).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]
    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m < occ_limit and triage_count_24h < triage_limit:
            return None  # neither threshold reached
        # Clamp the target at the policy cap.
        if _sev_higher(to_sev, cap):
            to_sev = cap
        # Bug fix: after clamping, the target may no longer be a real
        # escalation (e.g. cap == current severity would previously return
        # the current severity, producing no-op P1→P1 escalation events).
        if not _sev_higher(to_sev, current_severity):
            return None
        return to_sev
    return None
# ─── Core evaluate function ───────────────────────────────────────────────────
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """Evaluate active alert signatures against escalation thresholds.

    Args:
        params: request options — "env" (optional environment filter),
            "window_minutes" (signature activity window), "limit".
        alert_store: unused here; kept for interface parity with the tool layer.
        sig_state_store: provides list_active_signatures(window_minutes, limit).
        incident_store: provides list_incidents(query, limit) and
            append_event(incident_id, event_type, message, meta).
        policy: escalation policy dict; loaded from YAML when None.
        dry_run: when True, candidates are reported but no events are written.

    Returns:
        Summary dict: evaluated, escalated, followups_created, candidates,
        recommendations, dry_run.
    """
    if policy is None:
        policy = load_escalation_policy()
    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get(
        "window_minutes", policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))
    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})

    # Pull active signatures.
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )

    def _matching(incidents: List[Dict], signature: str) -> List[Dict]:
        # Incidents tagged with this signature, optionally filtered by env.
        return [
            i for i in incidents
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]

    # Perf fix: these queries were previously executed inside the per-signature
    # loop although they don't depend on the signature — O(signatures) store
    # round-trips. Hoisted; append_event never changes incident status, so the
    # snapshots stay valid for the duration of the evaluation.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None  # fetched lazily, once

    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []
    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)
        evaluated += 1
        matching = _matching(open_incidents, signature)
        if not matching:
            # Also consider incidents currently being mitigated.
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matching(mitigating_incidents, signature)
        if not matching:
            continue
        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")
        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed
        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })
        if dry_run:
            continue
        # Append the escalation decision event. (Message fix: the severities
        # were previously concatenated with no separator — "P2P1".)
        esc_msg = (
            f"Escalated {current_sev}→{target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1
        # Create follow-up event if configured.
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1
        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )
    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """Scan open incidents whose signature has gone quiet.

    An incident is a candidate when no alerts for its signature have been
    seen in the last N minutes. With dry_run=True (the default) nothing is
    modified; otherwise a note event is appended and, when the policy allows
    it, eligible incidents are auto-closed.
    """
    if policy is None:
        policy = load_escalation_policy()
    cfg = policy.get("auto_resolve", {})
    quiet_minutes = int(params.get(
        "no_alerts_minutes",
        cfg.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_wanted = params.get("env")
    max_incidents = int(params.get("limit", 100))
    closable_sevs = cfg.get("close_allowed_severities", ["P2", "P3"])
    may_close = cfg.get("auto_close", False)
    event_type = cfg.get("candidate_event_type", "note")
    msg_template = cfg.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )
    now = datetime.datetime.utcnow()
    cutoff_iso = (now - datetime.timedelta(minutes=quiet_minutes)).isoformat()

    # All currently-open incidents, narrowed to the requested environment.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=max_incidents)
    if env_wanted:
        open_incidents = [inc for inc in open_incidents if inc.get("env") == env_wanted]

    candidates: List[Dict] = []
    closed: List[str] = []
    for inc in open_incidents:
        inc_id = inc["id"]
        sig = inc.get("meta", {}).get("incident_signature")
        if not sig:
            continue
        state = sig_state_store.get_state(sig)
        if not state:
            continue
        last_seen = state.get("last_alert_at") or ""
        # ISO-8601 strings compare chronologically; a recent alert
        # disqualifies the incident.
        if last_seen >= cutoff_iso:
            continue
        severity = inc.get("severity", "P2")
        closable = severity in closable_sevs
        if last_seen:
            quiet_for = (now - datetime.datetime.fromisoformat(last_seen)).total_seconds() / 60
        else:
            quiet_for = quiet_minutes
        candidates.append({
            "incident_id": inc_id,
            "service": inc.get("service"),
            "severity": severity,
            "last_alert_at": last_seen,
            "minutes_without_alerts": round(quiet_for),
            "auto_close_eligible": closable and may_close,
        })
        if dry_run:
            continue
        # Record the candidate observation on the incident.
        note = msg_template.format(no_alerts_minutes=quiet_minutes)
        incident_store.append_event(inc_id, event_type, note, meta={
            "last_alert_at": last_seen,
            "no_alerts_minutes": quiet_minutes,
            "auto_created": True,
        })
        if closable and may_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {quiet_minutes} minutes",
            )
            closed.append(inc_id)
    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": quiet_minutes,
        "dry_run": dry_run,
    }