""" incident_escalation.py — Deterministic Incident Escalation Engine. Actions (exposed via incident_escalation_tool): evaluate — check active signatures against escalation thresholds auto_resolve_candidates — find open incidents with no recent alerts No LLM usage; all logic is policy-driven. """ from __future__ import annotations import datetime import logging import os import yaml from pathlib import Path from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) # ─── Severity ordering ──────────────────────────────────────────────────────── _SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4} _SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"] def _sev_higher(a: str, b: str) -> bool: """Return True if a is more severe (lower P number) than b.""" return _SEV_ORDER.get(a, 99) < _SEV_ORDER.get(b, 99) def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]: """Return next higher severity, or None if already at/above cap.""" idx = _SEV_ORDER.get(current) if idx is None or idx == 0: return None target = _SEV_NAMES[idx - 1] if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0): return None # would exceed cap return target def _now_iso() -> str: return datetime.datetime.utcnow().isoformat() def _plus_hours(hours: int) -> str: return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat() # ─── Policy loading ─────────────────────────────────────────────────────────── _POLICY_CACHE: Optional[Dict] = None _POLICY_PATHS = [ Path("config/incident_escalation_policy.yml"), Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml", ] def load_escalation_policy() -> Dict: global _POLICY_CACHE if _POLICY_CACHE is not None: return _POLICY_CACHE for path in _POLICY_PATHS: if path.exists(): try: with open(path) as f: data = yaml.safe_load(f) or {} _POLICY_CACHE = data return data except Exception as e: logger.warning("Failed to load escalation policy from %s: %s", path, e) 
logger.warning("incident_escalation_policy.yml not found; using defaults") _POLICY_CACHE = _builtin_defaults() return _POLICY_CACHE def _builtin_defaults() -> Dict: return { "defaults": {"window_minutes": 60}, "escalation": { "occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25}, "triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6}, "severity_cap": "P0", "create_followup_on_escalate": True, "followup": { "priority": "P1", "due_hours": 24, "owner": "oncall", "message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}", }, }, "auto_resolve": { "no_alerts_minutes_for_candidate": 60, "close_allowed_severities": ["P2", "P3"], "auto_close": False, "candidate_event_type": "note", "candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes", }, "alert_loop_slo": { "claim_to_ack_p95_seconds": 60, "failed_rate_pct": 5, "processing_stuck_minutes": 15, }, } # ─── Escalation thresholds helper ──────────────────────────────────────────── def _determine_escalation( current_severity: str, occurrences_60m: int, triage_count_24h: int, policy: Dict, ) -> Optional[str]: """Return target severity if escalation is needed, else None.""" esc = policy.get("escalation", {}) occ_thresh = esc.get("occurrences_thresholds", {}) triage_thresh = esc.get("triage_thresholds_24h", {}) cap = esc.get("severity_cap", "P0") # Build escalation rules in priority order (most → least severe) rules = [ ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)), ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)), ] for from_sev, to_sev, occ_limit, triage_limit in rules: if current_severity != from_sev: continue if occurrences_60m >= occ_limit or triage_count_24h >= triage_limit: # Check cap if not _sev_higher(cap, to_sev) and to_sev != cap: # to_sev is more severe than cap — not allowed if _sev_higher(to_sev, cap): return cap return to_sev return None # ─── Core evaluate function 
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """
    Main escalation evaluation. Returns structured summary.

    For every active signature, the first matching open incident (or, failing
    that, the first matching mitigating incident) is checked against the
    policy thresholds. When escalation is due, a "decision" event and, if
    configured, a "followup" event are appended to the incident.

    Args:
        params: Request parameters; reads ``env``, ``window_minutes``, ``limit``.
        alert_store: Accepted for interface compatibility; not used here.
        sig_state_store: Must provide ``list_active_signatures(window_minutes=, limit=)``.
        incident_store: Must provide ``list_incidents(filter, limit=)`` and
            ``append_event(id, type, message, meta=)``.
        policy: Policy dict; loaded via ``load_escalation_policy()`` when None.
        dry_run: When True, candidates are reported but no events are written.

    Returns:
        Dict with ``evaluated``, ``escalated``, ``followups_created``,
        ``candidates``, ``recommendations`` and ``dry_run``.
    """
    if policy is None:
        policy = load_escalation_policy()

    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(
        params.get("window_minutes", (policy.get("defaults") or {}).get("window_minutes", 60))
    )
    limit = int(params.get("limit", 100))

    esc_cfg = policy.get("escalation") or {}
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup") or {}

    # Pull active signatures
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )

    # One snapshot per status per run, fetched on first use, instead of
    # re-querying the incident store for every signature.
    snapshots: Dict[str, List[Dict]] = {}

    def _first_incident(status: str, signature: str) -> Optional[Dict]:
        """Return the first env-matching incident in `status` with this signature."""
        if status not in snapshots:
            snapshots[status] = [
                inc
                for inc in incident_store.list_incidents({"status": status}, limit=200)
                if not env_filter or inc.get("env") == env_filter
            ]
        for inc in snapshots[status]:
            # `meta` may be present but None, hence `or {}`.
            if (inc.get("meta") or {}).get("incident_signature") == signature:
                return inc
        return None

    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []

    for sig_state in active_sigs:
        evaluated += 1
        signature = sig_state.get("signature", "")
        # Treat missing/None counters as zero so threshold comparison is safe.
        occurrences_60m = sig_state.get("occurrences_60m") or 0
        triage_count_24h = sig_state.get("triage_count_24h") or 0

        # Open incidents take precedence; mitigating ones are the fallback.
        incident = _first_incident("open", signature)
        if incident is None:
            incident = _first_incident("mitigating", signature)
        if incident is None:
            continue

        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")

        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })

        if dry_run:
            continue

        # Append escalation decision event.
        # NOTE(review): only events are appended; the incident's severity field
        # is not updated in this function. Confirm the store applies it from
        # the decision event, otherwise the same escalation recurs every run.
        esc_msg = (
            f"Escalated {current_sev} → {target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1

        # Create follow-up event if configured
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1

        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )

    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }


# ─── Auto-resolve candidates ──────────────────────────────────────────────────

def _parse_naive_utc(value: Any) -> Optional[datetime.datetime]:
    """Parse a timestamp into a naive UTC datetime, or return None if unusable.

    Accepts datetime objects and ISO-8601 strings, including a trailing ``Z``
    and explicit offsets; aware values are converted to UTC and made naive.
    """
    if isinstance(value, datetime.datetime):
        parsed = value
    elif isinstance(value, str):
        text = value.strip()
        if text[-1:] in ("Z", "z"):
            # fromisoformat() rejects "Z" before Python 3.11.
            text = text[:-1] + "+00:00"
        try:
            parsed = datetime.datetime.fromisoformat(text)
        except ValueError:
            return None
    else:
        return None
    if parsed.tzinfo is not None:
        parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return parsed


def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """
    Find open incidents where no alerts have been seen in the last N minutes.
    Returns list of candidate incidents.
    By default dry_run=True — no state changes.

    Timestamps are compared as datetimes, so the stored ``last_alert_at`` may
    use a ``T`` or space separator and may carry ``Z`` or a UTC offset. An
    incident whose signature state has no ``last_alert_at`` is a candidate; one
    whose ``last_alert_at`` cannot be parsed is skipped with a warning.

    Args:
        params: Request parameters; reads ``no_alerts_minutes``, ``env``, ``limit``.
        sig_state_store: Must provide ``get_state(signature)``.
        incident_store: Must provide ``list_incidents``, ``append_event`` and
            ``close_incident``.
        policy: Policy dict; loaded via ``load_escalation_policy()`` when None.
        dry_run: When True, nothing is written to the incident store.

    Returns:
        Dict with ``candidates``, ``candidates_count``, ``closed``,
        ``closed_count``, ``no_alerts_minutes`` and ``dry_run``.
    """
    if policy is None:
        policy = load_escalation_policy()

    ar = policy.get("auto_resolve") or {}
    no_alerts_minutes = int(params.get(
        "no_alerts_minutes",
        ar.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_filter = params.get("env")
    limit = int(params.get("limit", 100))
    close_allowed = ar.get("close_allowed_severities", ["P2", "P3"])
    auto_close = ar.get("auto_close", False)
    candidate_event_type = ar.get("candidate_event_type", "note")
    candidate_msg_tmpl = ar.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )

    # Naive UTC, same value the deprecated datetime.utcnow() produced.
    now_dt = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    cutoff_dt = now_dt - datetime.timedelta(minutes=no_alerts_minutes)

    # Pull all open incidents
    all_open = incident_store.list_incidents({"status": "open"}, limit=limit)
    if env_filter:
        all_open = [i for i in all_open if i.get("env") == env_filter]

    candidates: List[Dict] = []
    closed: List[str] = []

    for incident in all_open:
        inc_id = incident["id"]
        signature = (incident.get("meta") or {}).get("incident_signature")
        if not signature:
            continue

        sig_state = sig_state_store.get_state(signature)
        if not sig_state:
            continue

        last_alert = sig_state.get("last_alert_at") or ""
        if last_alert:
            last_alert_dt = _parse_naive_utc(last_alert)
            if last_alert_dt is None:
                # Never flag (or auto-close) on a timestamp we cannot read.
                logging.getLogger(__name__).warning(
                    "Skipping incident %s: unparsable last_alert_at %r",
                    inc_id, last_alert,
                )
                continue
            if last_alert_dt >= cutoff_dt:
                continue  # alert seen recently → not a candidate
            minutes_without_alerts = round(
                (now_dt - last_alert_dt).total_seconds() / 60
            )
        else:
            # No alert recorded at all: report the configured window itself.
            minutes_without_alerts = round(no_alerts_minutes)

        current_sev = incident.get("severity", "P2")
        can_close = current_sev in close_allowed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "severity": current_sev,
            "last_alert_at": last_alert,
            "minutes_without_alerts": minutes_without_alerts,
            "auto_close_eligible": can_close and auto_close,
        })

        if dry_run:
            continue

        # Append candidate note to incident
        msg = candidate_msg_tmpl.format(no_alerts_minutes=no_alerts_minutes)
        incident_store.append_event(inc_id, candidate_event_type, msg, meta={
            "last_alert_at": last_alert,
            "no_alerts_minutes": no_alerts_minutes,
            "auto_created": True,
        })

        if can_close and auto_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {no_alerts_minutes} minutes",
            )
            closed.append(inc_id)

    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": no_alerts_minutes,
        "dry_run": dry_run,
    }