feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
379
services/router/incident_escalation.py
Normal file
379
services/router/incident_escalation.py
Normal file
@@ -0,0 +1,379 @@
|
||||
"""
|
||||
incident_escalation.py — Deterministic Incident Escalation Engine.
|
||||
|
||||
Actions (exposed via incident_escalation_tool):
|
||||
evaluate — check active signatures against escalation thresholds
|
||||
auto_resolve_candidates — find open incidents with no recent alerts
|
||||
|
||||
No LLM usage; all logic is policy-driven.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)

# ─── Severity ordering ────────────────────────────────────────────────────────

# Numeric rank per severity label; a LOWER rank means MORE severe (P0 worst).
_SEV_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "INFO": 4}
# Labels indexed by rank — _SEV_NAMES[rank] is the inverse of _SEV_ORDER.
_SEV_NAMES = ["P0", "P1", "P2", "P3", "INFO"]
|
||||
|
||||
|
||||
def _sev_higher(a: str, b: str) -> bool:
    """True when severity *a* outranks *b* (smaller P number = more severe).

    Unknown labels rank below everything (sentinel rank 99).
    """
    rank_a = _SEV_ORDER.get(a, 99)
    rank_b = _SEV_ORDER.get(b, 99)
    return rank_a < rank_b
|
||||
|
||||
|
||||
def _escalate_sev(current: str, cap: str = "P0") -> Optional[str]:
    """Return the next-more-severe level, or None when escalation is impossible.

    None is returned when *current* is unknown, already P0, or when the next
    level would outrank *cap*.
    """
    rank = _SEV_ORDER.get(current)
    if not rank:  # unknown label (None) or already most severe (rank 0)
        return None
    target = _SEV_NAMES[rank - 1]
    # An unknown cap defaults to rank 0, i.e. is treated as the severest cap.
    if _SEV_ORDER.get(target, 99) < _SEV_ORDER.get(cap, 0):
        return None  # would exceed cap
    return target
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.datetime.utcnow().isoformat()
|
||||
|
||||
|
||||
def _plus_hours(hours: int) -> str:
|
||||
return (datetime.datetime.utcnow() + datetime.timedelta(hours=hours)).isoformat()
|
||||
|
||||
|
||||
# ─── Policy loading ───────────────────────────────────────────────────────────
|
||||
|
||||
# Process-wide cache: the policy file is parsed at most once per process
# (populated and read by load_escalation_policy()).
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy locations, tried in order: CWD-relative first, then a path
# three levels above this module (presumably the repo root — TODO confirm).
_POLICY_PATHS = [
    Path("config/incident_escalation_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "incident_escalation_policy.yml",
]
|
||||
|
||||
|
||||
def load_escalation_policy() -> Dict:
    """Load the escalation policy, caching the result for the process lifetime.

    Tries each path in _POLICY_PATHS in order; on any read/parse failure the
    error is logged and the next path is attempted. Falls back to
    _builtin_defaults() when no policy file can be loaded.

    Returns:
        The parsed policy mapping (possibly the built-in defaults).
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for path in _POLICY_PATHS:
        if path.exists():
            try:
                # Explicit encoding: config files are UTF-8 regardless of the
                # host locale (open() without encoding= is locale-dependent).
                with open(path, encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}
                _POLICY_CACHE = data
                return data
            except Exception as e:
                logger.warning("Failed to load escalation policy from %s: %s", path, e)
    logger.warning("incident_escalation_policy.yml not found; using defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
|
||||
|
||||
|
||||
def _builtin_defaults() -> Dict:
|
||||
return {
|
||||
"defaults": {"window_minutes": 60},
|
||||
"escalation": {
|
||||
"occurrences_thresholds": {"P2_to_P1": 10, "P1_to_P0": 25},
|
||||
"triage_thresholds_24h": {"P2_to_P1": 3, "P1_to_P0": 6},
|
||||
"severity_cap": "P0",
|
||||
"create_followup_on_escalate": True,
|
||||
"followup": {
|
||||
"priority": "P1", "due_hours": 24, "owner": "oncall",
|
||||
"message_template": "Escalated: occurrences={occurrences_60m}, triages_24h={triage_count_24h}",
|
||||
},
|
||||
},
|
||||
"auto_resolve": {
|
||||
"no_alerts_minutes_for_candidate": 60,
|
||||
"close_allowed_severities": ["P2", "P3"],
|
||||
"auto_close": False,
|
||||
"candidate_event_type": "note",
|
||||
"candidate_message": "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
|
||||
},
|
||||
"alert_loop_slo": {
|
||||
"claim_to_ack_p95_seconds": 60,
|
||||
"failed_rate_pct": 5,
|
||||
"processing_stuck_minutes": 15,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
# ─── Escalation thresholds helper ────────────────────────────────────────────
|
||||
|
||||
def _determine_escalation(
    current_severity: str,
    occurrences_60m: int,
    triage_count_24h: int,
    policy: Dict,
) -> Optional[str]:
    """Return the severity to escalate to, or None when no escalation applies.

    A rule fires when either the 60-minute occurrence count or the 24-hour
    triage count reaches its policy threshold. The target is clamped to the
    policy's severity_cap.

    Bug fix vs. the original: when the cap equalled the current severity
    (e.g. cap="P1" for a P1 incident), the old cap check returned the cap
    itself, causing callers to record a spurious "P1 → P1" escalation event.
    We now return None unless the clamped target strictly outranks the
    current severity. (The original also contained an unreachable inner
    branch, removed here.)

    Args:
        current_severity: Incident severity label ("P0".."P3"/"INFO").
        occurrences_60m: Alert occurrences for the signature in the window.
        triage_count_24h: Triage events for the signature in 24 hours.
        policy: Escalation policy mapping (see _builtin_defaults()).
    """
    esc = policy.get("escalation", {})
    occ_thresh = esc.get("occurrences_thresholds", {})
    triage_thresh = esc.get("triage_thresholds_24h", {})
    cap = esc.get("severity_cap", "P0")

    # Escalation rules in priority order (most → least severe).
    rules = [
        ("P1", "P0", occ_thresh.get("P1_to_P0", 25), triage_thresh.get("P1_to_P0", 6)),
        ("P2", "P1", occ_thresh.get("P2_to_P1", 10), triage_thresh.get("P2_to_P1", 3)),
    ]

    for from_sev, to_sev, occ_limit, triage_limit in rules:
        if current_severity != from_sev:
            continue
        if occurrences_60m >= occ_limit or triage_count_24h >= triage_limit:
            # Clamp the target at the policy cap.
            if _sev_higher(to_sev, cap):
                to_sev = cap
            # Only escalate if the clamped target strictly outranks the
            # current severity; otherwise escalation would be a no-op.
            if not _sev_higher(to_sev, current_severity):
                return None
            return to_sev
    return None
|
||||
|
||||
|
||||
# ─── Core evaluate function ───────────────────────────────────────────────────
|
||||
|
||||
def evaluate_escalations(
    params: Dict,
    alert_store,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = False,
) -> Dict:
    """Evaluate active alert signatures against the escalation thresholds.

    For each active signature, the matching open (or mitigating) incident is
    located and, when the policy thresholds are met, an escalation "decision"
    event — plus an optional "followup" event — is appended to the incident.
    With dry_run=True only the candidate list is produced; nothing is written.

    Args:
        params: Optional filters — "env", "window_minutes", "limit".
        alert_store: Unused here; kept for tool-signature parity.
        sig_state_store: Provides list_active_signatures().
        incident_store: Provides list_incidents() / append_event().
        policy: Escalation policy; loaded via load_escalation_policy() if None.
        dry_run: When True, evaluate only — no incident events are appended.

    Returns:
        Summary dict with evaluated/escalated/followups_created counts,
        candidates, recommendations and the dry_run flag.
    """
    if policy is None:
        policy = load_escalation_policy()

    env_filter = params.get("env")  # "prod" / "staging" / None = any
    window_minutes = int(params.get("window_minutes",
                                    policy.get("defaults", {}).get("window_minutes", 60)))
    limit = int(params.get("limit", 100))

    esc_cfg = policy.get("escalation", {})
    cap = esc_cfg.get("severity_cap", "P0")
    create_followup = esc_cfg.get("create_followup_on_escalate", True)
    followup_cfg = esc_cfg.get("followup", {})

    # Pull active signatures
    active_sigs = sig_state_store.list_active_signatures(
        window_minutes=window_minutes, limit=limit
    )

    # Perf fix: the incident lists are loop-invariant — the original fetched
    # them once per active signature (O(signatures × incidents) store calls).
    # Fetch open incidents once; fetch mitigating ones lazily, at most once.
    open_incidents = incident_store.list_incidents({"status": "open"}, limit=200)
    mitigating_incidents: Optional[List[Dict]] = None

    def _matches(incidents: List[Dict], signature: str) -> List[Dict]:
        # Store ordering is preserved: the first match wins below.
        return [
            i for i in incidents
            if i.get("meta", {}).get("incident_signature") == signature
            and (not env_filter or i.get("env") == env_filter)
        ]

    evaluated = 0
    escalated = 0
    followups_created = 0
    candidates: List[Dict] = []
    recommendations: List[str] = []

    for sig_state in active_sigs:
        signature = sig_state.get("signature", "")
        occurrences_60m = sig_state.get("occurrences_60m", 0)
        triage_count_24h = sig_state.get("triage_count_24h", 0)

        # Find the open incident with this signature; fall back to mitigating.
        matching = _matches(open_incidents, signature)
        if not matching:
            if mitigating_incidents is None:
                mitigating_incidents = incident_store.list_incidents(
                    {"status": "mitigating"}, limit=200
                )
            matching = _matches(mitigating_incidents, signature)

        evaluated += 1
        if not matching:
            continue

        incident = matching[0]
        inc_id = incident["id"]
        current_sev = incident.get("severity", "P2")

        target_sev = _determine_escalation(
            current_sev, occurrences_60m, triage_count_24h, policy
        )
        if not target_sev:
            continue  # no escalation needed

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "signature": signature,
        })

        if dry_run:
            continue

        # Append escalation decision event
        esc_msg = (
            f"Escalated {current_sev} → {target_sev}: "
            f"occurrences_60m={occurrences_60m}, "
            f"triage_count_24h={triage_count_24h}"
        )
        incident_store.append_event(inc_id, "decision", esc_msg, meta={
            "from_severity": current_sev,
            "to_severity": target_sev,
            "occurrences_60m": occurrences_60m,
            "triage_count_24h": triage_count_24h,
            "policy_cap": cap,
            "automated": True,
        })
        escalated += 1

        # Create follow-up event if configured
        if create_followup:
            tmpl = followup_cfg.get(
                "message_template",
                "Escalation follow-up: investigate {occurrences_60m} occurrences"
            )
            followup_msg = tmpl.format(
                occurrences_60m=occurrences_60m,
                triage_count_24h=triage_count_24h,
            )
            due = _plus_hours(int(followup_cfg.get("due_hours", 24)))
            incident_store.append_event(inc_id, "followup", followup_msg, meta={
                "priority": followup_cfg.get("priority", "P1"),
                "due_date": due,
                "owner": followup_cfg.get("owner", "oncall"),
                "auto_created": True,
            })
            followups_created += 1

        recommendations.append(
            f"Incident {inc_id} ({incident.get('service')}) escalated "
            f"{current_sev}→{target_sev}: {esc_msg}"
        )

    return {
        "evaluated": evaluated,
        "escalated": escalated,
        "followups_created": followups_created,
        "candidates": candidates,
        "recommendations": recommendations,
        "dry_run": dry_run,
    }
|
||||
|
||||
|
||||
# ─── Auto-resolve candidates ──────────────────────────────────────────────────
|
||||
|
||||
def find_auto_resolve_candidates(
    params: Dict,
    sig_state_store,
    incident_store,
    policy: Optional[Dict] = None,
    dry_run: bool = True,
) -> Dict:
    """Find open incidents whose signature has seen no alerts for N minutes.

    Each candidate gets a note event appended to its incident (unless
    dry_run), and is closed when policy allows auto_close and its severity is
    in close_allowed_severities. By default dry_run=True — no state changes.

    Args:
        params: Optional overrides — "no_alerts_minutes", "env", "limit".
        sig_state_store: Provides get_state(signature).
        incident_store: Provides list_incidents()/append_event()/close_incident().
        policy: Policy mapping; loaded via load_escalation_policy() if None.
        dry_run: When True, only report candidates — no events, no closes.

    Returns:
        Dict with candidates, closed incident ids, counts, and the settings used.
    """
    if policy is None:
        policy = load_escalation_policy()

    ar = policy.get("auto_resolve", {})
    no_alerts_minutes = int(params.get(
        "no_alerts_minutes",
        ar.get("no_alerts_minutes_for_candidate", 60)
    ))
    env_filter = params.get("env")
    limit = int(params.get("limit", 100))
    close_allowed = ar.get("close_allowed_severities", ["P2", "P3"])
    auto_close = ar.get("auto_close", False)
    candidate_event_type = ar.get("candidate_event_type", "note")
    candidate_msg_tmpl = ar.get(
        "candidate_message",
        "Auto-resolve candidate: no alerts in {no_alerts_minutes} minutes",
    )

    now_dt = datetime.datetime.utcnow()
    # ISO strings sort lexicographically, so string comparison works below.
    no_alert_cutoff = (now_dt - datetime.timedelta(minutes=no_alerts_minutes)).isoformat()

    # Pull all open incidents
    all_open = incident_store.list_incidents({"status": "open"}, limit=limit)
    if env_filter:
        all_open = [i for i in all_open if i.get("env") == env_filter]

    candidates: List[Dict] = []
    closed: List[str] = []

    for incident in all_open:
        inc_id = incident["id"]
        signature = incident.get("meta", {}).get("incident_signature")
        if not signature:
            continue

        sig_state = sig_state_store.get_state(signature)
        if not sig_state:
            continue

        last_alert = sig_state.get("last_alert_at") or ""
        if last_alert >= no_alert_cutoff:
            continue  # alert seen recently → not a candidate

        current_sev = incident.get("severity", "P2")
        can_close = current_sev in close_allowed

        # Robustness fix: the original called fromisoformat() unguarded — a
        # malformed last_alert_at raised ValueError, and a tz-aware timestamp
        # subtracted from naive now_dt raised TypeError, aborting the whole
        # scan. Normalize and fall back to the policy window instead.
        minutes_without = no_alerts_minutes
        if last_alert:
            try:
                last_dt = datetime.datetime.fromisoformat(last_alert)
                if last_dt.tzinfo is not None:
                    last_dt = last_dt.astimezone(datetime.timezone.utc).replace(tzinfo=None)
                minutes_without = round((now_dt - last_dt).total_seconds() / 60)
            except ValueError:
                logger.warning(
                    "Unparseable last_alert_at %r for incident %s", last_alert, inc_id
                )

        candidates.append({
            "incident_id": inc_id,
            "service": incident.get("service"),
            "severity": current_sev,
            "last_alert_at": last_alert,
            "minutes_without_alerts": minutes_without,
            "auto_close_eligible": can_close and auto_close,
        })

        if dry_run:
            continue

        # Append candidate note to incident
        msg = candidate_msg_tmpl.format(no_alerts_minutes=no_alerts_minutes)
        incident_store.append_event(inc_id, candidate_event_type, msg, meta={
            "last_alert_at": last_alert,
            "no_alerts_minutes": no_alerts_minutes,
            "auto_created": True,
        })

        if can_close and auto_close:
            incident_store.close_incident(
                inc_id,
                _now_iso(),
                f"Auto-closed: no alerts for {no_alerts_minutes} minutes",
            )
            closed.append(inc_id)

    return {
        "candidates": candidates,
        "candidates_count": len(candidates),
        "closed": closed,
        "closed_count": len(closed),
        "no_alerts_minutes": no_alerts_minutes,
        "dry_run": dry_run,
    }
|
||||
Reference in New Issue
Block a user