Files
microdao-daarion/services/router/incident_escalation.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

380 lines
13 KiB
Python

"""
incident_escalation.py — Deterministic Incident Escalation Engine.
Actions (exposed via incident_escalation_tool):
evaluate — check active signatures against escalation thresholds
auto_resolve_candidates — find open incidents with no recent alerts
No LLM usage; all logic is policy-driven.
"""
from __future__ import annotations
import datetime
import logging
import os
import yaml
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ─── Severity ordering ────────────────────────────────────────────────────────
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
def _sev_higher(a: str, b: str) -> bool:
"""Return True if a is more severe (lower P number) than b."""
return _SEV_ORDER.get(a, 99) < _SEV_ORDER.get(b, 99)
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
"""Return next higher severity, or None if already at/above cap."""
idx = _SEV_ORDER.get(current)
if idx is None or idx == 0:
return None
target = _SEV_NAMES[idx - 1]
if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0):
return None # would exceed cap
return target
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _plus_hours(hours: int) -> str:
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
# ─── Policy loading ───────────────────────────────────────────────────────────
# Process-level cache: the policy file is parsed at most once per process.
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy locations, tried in order: CWD-relative path first, then
# the config directory three levels above this file (presumably the repo
# root — confirm against the deployment layout).
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
def load_escalation_policy() -> Dict:
    """Load the escalation policy YAML, caching the result for the process.

    Tries each candidate in _POLICY_PATHS in order; a read/parse failure is
    logged and the next path is tried. Falls back to _builtin_defaults()
    when no file is readable. Subsequent calls return the cached dict.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for path in _POLICY_PATHS:
        if not path.exists():
            continue
        try:
            # Explicit UTF-8: don't depend on the platform default encoding
            # (the original open() call would use locale encoding on Windows).
            with open(path, encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            _POLICY_CACHE = data
            return data
        except Exception as e:
            logger.warning("Failed to load escalation policy from %s: %s", path, e)
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
def _builtin_defaults() -> Dict:
return {
"defaults": {"window_minutes": 60},
"escalation": {
"occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
"triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
"severity_cap": "P0",
"create_followup_on_escalate": True,
"followup": {
"priority": "P1", "due_hours": 24, "owner": "oncall",
"message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
},
},
"auto_resolve": {
"no_alerts_minutes_for_candidate": 60,
"close_allowed_severities": ["P2", "P3"],
"auto_close": False,
"candidate_event_type": "note",
"candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
},
"alert_loop_slo": {
"claim_to_ack_p95_seconds": 60,
"failed_rate_pct": 5,
"processing_stuck_minutes": 15,
},
}
# ─── Escalation thresholds helper ────────────────────────────────────────────
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return the target severity if escalation is needed, else None.

    Escalates P1→P0 or P2→P1 when either the 60-minute occurrence count or
    the 24-hour triage count meets its policy threshold, clamped at the
    policy's severity cap.
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")
    # Escalation rules in priority order (most → least severe).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]
    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m < occ_limit and triage_count_24h < triage_limit:
            return None  # neither threshold reached
        # Clamp the target at the policy cap.
        if _sev_higher(to_sev, cap):
            to_sev = cap
        # Bug fix: after clamping, the target may no longer be a real
        # escalation (e.g. cap == current severity would previously return
        # the current severity, producing no-op P1→P1 escalation events).
        if not _sev_higher(to_sev, current_severity):
            return None
        return to_sev
    return None
# ─── Core evaluate function ───────────────────────────────────────────────────
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """Evaluate active alert signatures against escalation thresholds.

    Args:
        params: request options — "env" (optional environment filter),
            "window_minutes" (signature activity window), "limit".
        alert_store: unused here; kept for interface parity with the tool layer.
        sig_state_store: provides list_active_signatures(window_minutes, limit).
        incident_store: provides list_incidents(query, limit) and
            append_event(incident_id, event_type, message, meta).
        policy: escalation policy dict; loaded from YAML when None.
        dry_run: when True, candidates are reported but no events are written.

    Returns:
        Summary dict: evaluated, escalated, followups_created, candidates,
        recommendations, dry_run.
    """
    if policy is None:
        policy = load_escalation_policy()
    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get(
        "window_minutes", policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))
    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})

    # Pull active signatures.
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )

    def _matching(incidents: List[Dict], signature: str) -> List[Dict]:
        # Incidents tagged with this signature, optionally filtered by env.
        return [
            i for i in incidents
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]

    # Perf fix: these queries were previously executed inside the per-signature
    # loop although they don't depend on the signature — O(signatures) store
    # round-trips. Hoisted; append_event never changes incident status, so the
    # snapshots stay valid for the duration of the evaluation.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None  # fetched lazily, once

    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []
    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)
        evaluated += 1
        matching = _matching(open_incidents, signature)
        if not matching:
            # Also consider incidents currently being mitigated.
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matching(mitigating_incidents, signature)
        if not matching:
            continue
        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")
        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed
        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })
        if dry_run:
            continue
        # Append the escalation decision event. (Message fix: the severities
        # were previously concatenated with no separator — "P2P1".)
        esc_msg = (
            f"Escalated {current_sev}→{target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1
        # Create follow-up event if configured.
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1
        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )
    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """Scan open incidents whose signature has gone quiet.

    An incident is a candidate when no alerts for its signature have been
    seen in the last N minutes. With dry_run=True (the default) nothing is
    modified; otherwise a note event is appended and, when the policy allows
    it, eligible incidents are auto-closed.
    """
    if policy is None:
        policy = load_escalation_policy()
    cfg = policy.get("auto_resolve", {})
    quiet_minutes = int(params.get(
        "no_alerts_minutes",
        cfg.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_wanted = params.get("env")
    max_incidents = int(params.get("limit", 100))
    closable_sevs = cfg.get("close_allowed_severities", ["P2", "P3"])
    may_close = cfg.get("auto_close", False)
    event_type = cfg.get("candidate_event_type", "note")
    msg_template = cfg.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )
    now = datetime.datetime.utcnow()
    cutoff_iso = (now - datetime.timedelta(minutes=quiet_minutes)).isoformat()

    # All currently-open incidents, narrowed to the requested environment.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=max_incidents)
    if env_wanted:
        open_incidents = [inc for inc in open_incidents if inc.get("env") == env_wanted]

    candidates: List[Dict] = []
    closed: List[str] = []
    for inc in open_incidents:
        inc_id = inc["id"]
        sig = inc.get("meta", {}).get("incident_signature")
        if not sig:
            continue
        state = sig_state_store.get_state(sig)
        if not state:
            continue
        last_seen = state.get("last_alert_at") or ""
        # ISO-8601 strings compare chronologically; a recent alert
        # disqualifies the incident.
        if last_seen >= cutoff_iso:
            continue
        severity = inc.get("severity", "P2")
        closable = severity in closable_sevs
        if last_seen:
            quiet_for = (now - datetime.datetime.fromisoformat(last_seen)).total_seconds() / 60
        else:
            quiet_for = quiet_minutes
        candidates.append({
            "incident_id": inc_id,
            "service": inc.get("service"),
            "severity": severity,
            "last_alert_at": last_seen,
            "minutes_without_alerts": round(quiet_for),
            "auto_close_eligible": closable and may_close,
        })
        if dry_run:
            continue
        # Record the candidate observation on the incident.
        note = msg_template.format(no_alerts_minutes=quiet_minutes)
        incident_store.append_event(inc_id, event_type, note, meta={
            "last_alert_at": last_seen,
            "no_alerts_minutes": quiet_minutes,
            "auto_created": True,
        })
        if closable and may_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {quiet_minutes} minutes",
            )
            closed.append(inc_id)
    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": quiet_minutes,
        "dry_run": dry_run,
    }