New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
380 lines
13 KiB
Python
380 lines
13 KiB
Python
"""
|
|
incident_escalation.py — Deterministic Incident Escalation Engine.
|
|
|
|
Actions (exposed via incident_escalation_tool):
|
|
evaluate — check active signatures against escalation thresholds
|
|
auto_resolve_candidates — find open incidents with no recent alerts
|
|
|
|
No LLM usage; all logic is policy-driven.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import logging
|
|
import os
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── Severity ordering ────────────────────────────────────────────────────────
|
|
|
|
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
|
|
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
|
|
|
|
|
|
def _sev_higher(a: str, b: str) -> bool:
|
|
"""Return True if a is more severe (lower P number) than b."""
|
|
return _SEV_ORDER.get(a, 99) < _SEV_ORDER.get(b, 99)
|
|
|
|
|
|
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
    """Return next higher severity, or None if already at/above cap."""
    idx = _SEV_ORDER.get(current)
    # Unknown severity, or already maximal (P0): nowhere to escalate to.
    if idx is None or idx == 0:
        return None
    target = _SEV_NAMES[idx - 1]
    # Refuse to cross the configured cap (an unknown cap behaves like "P0").
    exceeds_cap = _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0)
    return None if exceeds_cap else target
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.datetime.utcnow().isoformat()
|
|
|
|
|
|
def _plus_hours(hours: int) -> str:
|
|
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
|
|
|
|
|
|
# ─── Policy loading ───────────────────────────────────────────────────────────

# Process-wide cache for the parsed policy. Populated lazily by
# load_escalation_policy() and never invalidated — restart to pick up edits.
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy file locations, probed in order: CWD-relative first, then
# relative to the repository root (three levels above this module's file).
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
|
|
|
|
|
|
def load_escalation_policy() -> Dict:
    """Load the escalation policy YAML, caching the result for the process.

    Probes each candidate path in ``_POLICY_PATHS``; the first readable file
    is parsed with ``yaml.safe_load`` and cached. If no file can be loaded,
    falls back to ``_builtin_defaults()`` (also cached).

    Returns:
        The policy dict (never None; empty YAML files yield ``{}``).
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for path in _POLICY_PATHS:
        if not path.exists():
            continue
        try:
            # Explicit encoding: the default is locale-dependent (PEP 597).
            with open(path, encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            _POLICY_CACHE = data
            return data
        except Exception as e:
            # Best-effort: a malformed/unreadable file must not break
            # escalation — log and try the next candidate path.
            logger.warning("Failed to load escalation policy from %s: %s", path, e)
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
|
|
|
|
|
|
def _builtin_defaults() -> Dict:
|
|
return {
|
|
"defaults": {"window_minutes": 60},
|
|
"escalation": {
|
|
"occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
|
|
"triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
|
|
"severity_cap": "P0",
|
|
"create_followup_on_escalate": True,
|
|
"followup": {
|
|
"priority": "P1", "due_hours": 24, "owner": "oncall",
|
|
"message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
|
|
},
|
|
},
|
|
"auto_resolve": {
|
|
"no_alerts_minutes_for_candidate": 60,
|
|
"close_allowed_severities": ["P2", "P3"],
|
|
"auto_close": False,
|
|
"candidate_event_type": "note",
|
|
"candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
|
|
},
|
|
"alert_loop_slo": {
|
|
"claim_to_ack_p95_seconds": 60,
|
|
"failed_rate_pct": 5,
|
|
"processing_stuck_minutes": 15,
|
|
},
|
|
}
|
|
|
|
|
|
# ─── Escalation thresholds helper ────────────────────────────────────────────
|
|
|
|
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return target severity if escalation is needed, else None.

    An incident escalates one level (P2→P1 or P1→P0) when its 60-minute
    occurrence count OR its 24-hour triage count reaches the configured
    threshold. The result is clamped to ``escalation.severity_cap``.

    Fix vs. previous version: if the cap-clamped target is not strictly more
    severe than the current severity (e.g. cap="P2" while already at P2),
    return None instead of a no-op "escalation"; an unrecognized cap string
    is likewise never returned as a severity.
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")

    # Escalation rules in priority order (most → least severe).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]

    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m < occ_limit and triage_count_24h < triage_limit:
            return None  # thresholds not reached
        # Clamp the target to the configured cap.
        if _sev_higher(to_sev, cap):
            to_sev = cap
        # A capped target that is not strictly more severe than the current
        # severity would be a no-op (or nonsense for an unknown cap string).
        if not _sev_higher(to_sev, current_severity):
            return None
        return to_sev
    return None
|
|
|
|
|
|
# ─── Core evaluate function ───────────────────────────────────────────────────
|
|
|
|
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """
    Main escalation evaluation. Returns structured summary.

    For each active alert signature, finds the matching open (or mitigating)
    incident and asks ``_determine_escalation`` whether its severity should
    be raised. Unless ``dry_run`` is True, appends a "decision" event per
    escalation and, when configured, a "followup" event.

    Args:
        params: optional keys ``env`` (env filter), ``window_minutes``,
            ``limit``.
        alert_store: accepted for tool-interface symmetry; not used here.
        sig_state_store: must expose ``list_active_signatures(window_minutes,
            limit)``.
        incident_store: must expose ``list_incidents(query, limit)`` and
            ``append_event(incident_id, type, message, meta)``.
        policy: escalation policy dict; loaded via ``load_escalation_policy``
            when None.
        dry_run: when True, report candidates only — no events written.

    Returns:
        Dict with evaluated/escalated/followups_created counts, the
        candidate list, human-readable recommendations, and ``dry_run``.
    """
    if policy is None:
        policy = load_escalation_policy()

    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get("window_minutes",
                                    policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))

    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})

    # Pull active signatures
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )

    # PERF: these store queries are loop-invariant (append_event below does
    # not change incident status/severity), so fetch once instead of once per
    # signature. Mitigating incidents are fetched lazily, at most once, only
    # if some signature has no open match.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None

    def _matching(incidents: List[Dict], signature: str) -> List[Dict]:
        # An incident matches when its recorded signature equals this one
        # and it passes the optional env filter.
        return [
            i for i in incidents
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]

    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []

    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)

        # Find open incident with this signature; fall back to mitigating.
        matching = _matching(open_incidents, signature)
        if not matching:
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matching(mitigating_incidents, signature)

        evaluated += 1
        if not matching:
            continue

        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")

        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })

        if dry_run:
            continue

        # Append escalation decision event
        esc_msg = (
            f"Escalated {current_sev} → {target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1

        # Create follow-up event if configured
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1

        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )

    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
|
|
|
|
|
|
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
|
|
|
|
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """
    Find open incidents where no alerts have been seen in the last N minutes.

    Returns list of candidate incidents. By default dry_run=True — no state
    changes. When ``dry_run`` is False, a candidate note event is appended to
    each incident; incidents whose severity is in
    ``auto_resolve.close_allowed_severities`` are additionally closed when
    the policy enables ``auto_close``.

    Args:
        params: optional keys ``no_alerts_minutes``, ``env``, ``limit``.
        sig_state_store: must expose ``get_state(signature)``.
        incident_store: must expose ``list_incidents``, ``append_event`` and
            (for auto-close) ``close_incident``.
        policy: policy dict; loaded via ``load_escalation_policy`` when None.
        dry_run: when True (default), only report candidates.
    """
    if policy is None:
        policy = load_escalation_policy()

    ar = policy.get("auto_resolve", {})
    no_alerts_minutes = int(params.get(
        "no_alerts_minutes",
        ar.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_filter = params.get("env")
    limit = int(params.get("limit", 100))
    close_allowed = ar.get("close_allowed_severities", ["P2", "P3"])
    auto_close = ar.get("auto_close", False)
    candidate_event_type = ar.get("candidate_event_type", "note")
    candidate_msg_tmpl = ar.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )

    # Naive-UTC "now", matching the naive ISO strings the stores persist.
    # datetime.utcnow() is deprecated since Python 3.12; this is equivalent.
    now_dt = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    no_alert_cutoff = (now_dt - datetime.timedelta(minutes=no_alerts_minutes)).isoformat()

    # Pull all open incidents
    all_open = incident_store.list_incidents({"status": "open"}, limit=limit)
    if env_filter:
        all_open = [i for i in all_open if i.get("env") == env_filter]

    candidates: List[Dict] = []
    closed: List[str] = []

    for incident in all_open:
        inc_id = incident["id"]
        signature = incident.get("meta", {}).get("incident_signature")
        if not signature:
            # Not created from an alert signature — cannot judge alert
            # recency, so skip.
            continue

        sig_state = sig_state_store.get_state(signature)
        if not sig_state:
            continue

        last_alert = sig_state.get("last_alert_at") or ""
        # Lexicographic comparison is safe here: both sides are same-format
        # naive ISO-8601 strings (an empty last_alert sorts as "never seen").
        if last_alert >= no_alert_cutoff:
            continue  # alert seen recently → not a candidate

        current_sev = incident.get("severity", "P2")
        can_close = current_sev in close_allowed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "severity": current_sev,
            "last_alert_at": last_alert,
            "minutes_without_alerts": round(
                (now_dt - datetime.datetime.fromisoformat(last_alert)).total_seconds() / 60
                if last_alert else no_alerts_minutes
            ),
            "auto_close_eligible": can_close and auto_close,
        })

        if dry_run:
            continue

        # Append candidate note to incident
        msg = candidate_msg_tmpl.format(no_alerts_minutes=no_alerts_minutes)
        incident_store.append_event(inc_id, candidate_event_type, msg, meta={
            "last_alert_at": last_alert,
            "no_alerts_minutes": no_alerts_minutes,
            "auto_created": True,
        })

        if can_close and auto_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {no_alerts_minutes} minutes",
            )
            closed.append(inc_id)

    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": no_alerts_minutes,
        "dry_run": dry_run,
    }
|