Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
115 lines
4.0 KiB
YAML
115 lines
4.0 KiB
YAML
# alert_routing_policy.yml
|
|
# Controls how the alert_triage_graph processes incoming alerts every 5 minutes.
|
|
# Key design: llm_mode=off means 0 LLM tokens in steady state.
|
|
|
|
# Baseline settings applied to every pass of the alert triage loop.
defaults:
  poll_interval_seconds: 300  # 5 minutes between polls
  max_alerts_per_run: 25
  only_unacked: true

  # Per-run ceilings: keep an alert storm from causing runaway
  # incident creation.
  max_incidents_per_run: 5
  max_triages_per_run: 5
  dedupe_window_minutes_default: 120
  ack_note_prefix: 'alert_triage_loop'

  # LLM gate: with llm_mode 'off', steady state spends 0 LLM tokens.
  # NOTE(review): confirm llm_mode / llm_on belong under `defaults`
  # rather than at the top level.
  llm_mode: 'off'  # off | local | remote
  llm_on:
    triage: false
    postmortem: false
|
|
|
|
routing:
# Hard auto-handling: prod alerts at P0/P1 open an incident and get
# deterministic triage.
- match:
    env_in: ['prod']
    severity_in: ['P0', 'P1']
  actions:
    auto_incident: true
    auto_triage: true
    triage_mode: 'deterministic'  # deterministic | llm
    incident_severity_cap: 'P1'
    dedupe_window_minutes: 180
    attach_alert_artifact: true
    ack: true
|
|
|
|
# Security alerts: open an incident automatically; triage is deterministic
# for now, with LLM triage as an optional later switch.
# NOTE(review): if rules are evaluated first-match-wins, a prod P0/P1
# security alert would be caught by the earlier prod P0/P1 rule (cap P1)
# before reaching this one (cap P0) — confirm the order is intended.
- match:
    kind_in: ['security']
  actions:
    auto_incident: true
    auto_triage: true
    triage_mode: 'deterministic'  # flip to llm once stable
    incident_severity_cap: 'P0'
    dedupe_window_minutes: 360
    attach_alert_artifact: true
    ack: true
|
|
|
|
# Resource-critical kinds (OOM, crashloop, disk) in prod or staging at
# P0-P2: open an incident and run deterministic triage.
- match:
    kind_in: ['oom', 'crashloop', 'disk']
    env_in: ['prod', 'staging']
    severity_in: ['P0', 'P1', 'P2']
  actions:
    auto_incident: true
    auto_triage: true
    triage_mode: 'deterministic'
    incident_severity_cap: 'P1'
    dedupe_window_minutes: 240
    attach_alert_artifact: true
    ack: true
|
|
|
|
# Staging P1: open an incident but skip triage to save resources.
- match:
    env_in: ['staging']
    severity_in: ['P1']
  actions:
    auto_incident: true
    auto_triage: false
    triage_mode: 'deterministic'
    incident_severity_cap: 'P1'
    dedupe_window_minutes: 120
    attach_alert_artifact: true
    ack: true
|
|
|
|
# Deploy events: no incident; they go to the digest only and are acked.
- match:
    kind_in: ['deploy']
  actions:
    auto_incident: false
    digest_only: true
    ack: true
|
|
|
|
# Lower severities (P2, P3, INFO): no incident; digest only, then acked.
- match:
    severity_in: ['P2', 'P3', 'INFO']
  actions:
    auto_incident: false
    digest_only: true
    ack: true
|
|
|
|
# Kind normalization: each key lists the aliases Monitor may use for it.
# NOTE(review): 'oom_kill' is listed under crashloop, not oom — confirm
# that is the intended grouping.
kind_map:
  latency: ['latency', 'p95_latency', 'p99_latency', 'slow_response']
  error_rate: ['error_rate', '5xx_rate', 'http_errors']
  slo_breach: ['slo_breach', 'slo', 'slo_violation']
  crashloop: ['crashloop', 'restart_loop', 'oom_kill']
  oom: ['oom', 'out_of_memory', 'memory_pressure']
  disk: ['disk', 'disk_full', 'disk_pressure', 'pvc_full']
  security: ['security', 'unauthorized', 'injection', 'brute_force']
|
|
|
|
# Per-kind severity cap for incidents that the loop creates.
severity_caps:
  deploy: 'P2'
  latency: 'P1'
  error_rate: 'P1'
  slo_breach: 'P1'
  security: 'P0'
|
|
|
|
# Settings for the signature used in deduplication.
signature:
  use_kind: true
  use_fingerprint: true
  use_node_label: false  # true = per-node incidents (noisier)
  # Strip numbers/timestamps from the title before hashing.
  normalize_title: true
|