Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout. Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy. Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot. Made-with: Cursor
38 lines
1.3 KiB
YAML
38 lines
1.3 KiB
YAML
# Incident Escalation Policy
# Controls deterministic escalation and auto-resolve candidate logic.
|
|
|
|
# Baseline settings for the policy.
defaults:
  # Window length in minutes.
  # NOTE(review): presumably the look-back behind the "occurrences_60m"
  # counters referenced in the escalation comments — confirm with the consumer.
  window_minutes: 60
|
|
|
|
# Rules for raising incident severity.
# NOTE(review): how the two threshold groups combine (either vs. both) is
# decided by the consumer and is not stated here — confirm.
escalation:
  # Escalate when the same signature storms
  occurrences_thresholds:
    P2_to_P1: 10  # occurrences_60m to escalate P2 → P1
    P1_to_P0: 25  # occurrences_60m to escalate P1 → P0

  # Same severity transitions, keyed on the 24-hour triage count.
  triage_thresholds_24h:
    P2_to_P1: 3  # triage_count_24h to escalate P2 → P1
    P1_to_P0: 6  # triage_count_24h to escalate P1 → P0

  severity_cap: "P0"  # never escalate above this

  # NOTE(review): presumably toggles creation of the follow-up described by
  # `followup` whenever an escalation happens — confirm with the consumer.
  create_followup_on_escalate: true
  followup:
    priority: "P1"
    due_hours: 24
    owner: "oncall"
    # Brace placeholders are presumably substituted by the consumer; their
    # names match the counters named in the threshold comments above.
    message_template: "Escalated due to alert storm: occurrences={occurrences_60m}, triages_24h={triage_count_24h}"
|
|
|
|
# Settings for flagging incidents as auto-resolve candidates.
auto_resolve:
  # Candidates only in MVP — do not auto-close P0/P1
  no_alerts_minutes_for_candidate: 60
  close_allowed_severities: ["P2", "P3"]
  auto_close: false  # set true carefully in staging only
  candidate_event_type: "note"
  # NOTE(review): the placeholder is {no_alerts_minutes}, which differs from
  # the key name no_alerts_minutes_for_candidate — confirm the consumer
  # supplies a variable under that exact name.
  candidate_message: "Auto-resolve candidate: no alerts observed in {no_alerts_minutes} minutes for this signature"
|
|
|
|
# Service-level objectives for the alert processing loop.
alert_loop_slo:
  claim_to_ack_p95_seconds: 60  # p95 latency from claim → ack
  failed_rate_pct: 5            # max % of failed/(acked+failed) in window
  processing_stuck_minutes: 15  # alerts in processing beyond this → stuck
|