---
# alert_routing_policy.yml
# Controls how the alert_triage_graph processes incoming alerts every 5 minutes.
# Key design: llm_mode=off means 0 LLM tokens in steady state.

# Loop-wide settings.
# NOTE(review): nesting of ack_note_prefix / llm_mode / llm_on under `defaults`
# is reconstructed from key order — confirm against the policy loader.
defaults:
  poll_interval_seconds: 300  # 5 min
  max_alerts_per_run: 25
  only_unacked: true

  # Safety valves (avoid runaway incident creation on alert storm)
  max_incidents_per_run: 5
  max_triages_per_run: 5
  dedupe_window_minutes_default: 120
  ack_note_prefix: "alert_triage_loop"

  # LLM gating — off = 0 tokens in steady state.
  # Quoted on purpose: a bare off is a boolean under YAML 1.1 parsers.
  llm_mode: "off"  # off | local | remote
  llm_on:
    triage: false
    postmortem: false

# Routing rules: each entry pairs a `match` filter with the `actions` to apply.
# NOTE(review): evaluation order is not defined in this file. If the consumer
# is first-match-wins, a prod P0/P1 alert of kind "security" hits the first
# rule (cap P1, dedupe 180) before the security rule (cap P0, dedupe 360) —
# confirm that this is intended.
routing:
  # ─── HARD AUTO: prod P0/P1 → create incident + deterministic triage ───
  - match:
      env_in: ["prod"]
      severity_in: ["P0", "P1"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # deterministic | llm
      incident_severity_cap: "P1"
      dedupe_window_minutes: 180
      attach_alert_artifact: true
      ack: true

  # ─── Security alerts: auto incident + (optional) LLM triage ───────────
  - match:
      kind_in: ["security"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # flip to llm once stable
      incident_severity_cap: "P0"
      dedupe_window_minutes: 360
      attach_alert_artifact: true
      ack: true

  # ─── Resource-critical: OOM/crashloop/disk in prod|staging ────────────
  - match:
      kind_in: ["oom", "crashloop", "disk"]
      env_in: ["prod", "staging"]
      severity_in: ["P0", "P1", "P2"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 240
      attach_alert_artifact: true
      ack: true

  # ─── Staging P1: auto incident, no triage (save resources) ────────────
  - match:
      env_in: ["staging"]
      severity_in: ["P1"]
    actions:
      auto_incident: true
      auto_triage: false
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 120
      attach_alert_artifact: true
      ack: true

  # ─── Deploy events: digest-only ───────────────────────────────────────
  - match:
      kind_in: ["deploy"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true

  # ─── Lower severity: digest-only ──────────────────────────────────────
  - match:
      severity_in: ["P2", "P3", "INFO"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true

# ─── Kind normalization (aliases Monitor may use) ───────────────────────
# Each key is a canonical kind; the list holds the aliases folded into it.
# NOTE(review): "oom_kill" is listed under crashloop, not oom — confirm.
kind_map:
  latency: ["latency", "p95_latency", "p99_latency", "slow_response"]
  error_rate: ["error_rate", "5xx_rate", "http_errors"]
  slo_breach: ["slo_breach", "slo", "slo_violation"]
  crashloop: ["crashloop", "restart_loop", "oom_kill"]
  oom: ["oom", "out_of_memory", "memory_pressure"]
  disk: ["disk", "disk_full", "disk_pressure", "pvc_full"]
  security: ["security", "unauthorized", "injection", "brute_force"]

# ─── Per-kind severity caps for incidents created by the loop ───────────
severity_caps:
  deploy: "P2"
  latency: "P1"
  error_rate: "P1"
  slo_breach: "P1"
  security: "P0"

# ─── Signature dedupe settings ──────────────────────────────────────────
signature:
  use_kind: true
  use_fingerprint: true
  use_node_label: false  # true = per-node incidents (noisier)
  normalize_title: true  # strip numbers/timestamps from title before hash