Files
microdao-daarion/config/alert_routing_policy.yml
Apple 67225a39fa docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00

115 lines
4.0 KiB
YAML

# alert_routing_policy.yml
# Controls how the alert_triage_graph processes incoming alerts every 5 minutes.
# Key design: llm_mode=off means 0 LLM tokens in steady state.
# Loop-wide defaults for the alert triage run: polling cadence, per-run
# safety limits, the fallback dedupe window, and LLM gating.
defaults:
  poll_interval_seconds: 300  # 5 min
  max_alerts_per_run: 25
  only_unacked: true

  # Safety valves (avoid runaway incident creation on alert storm)
  max_incidents_per_run: 5
  max_triages_per_run: 5

  # Fallback dedupe window; routing rules may set their own
  # `dedupe_window_minutes`.
  dedupe_window_minutes_default: 120
  ack_note_prefix: "alert_triage_loop"

  # LLM gating — off = 0 tokens in steady state.
  # Keep "off" quoted: a bare `off` is read as boolean false by YAML 1.1
  # parsers, not as the string "off".
  llm_mode: "off"  # off | local | remote
  llm_on:
    triage: false
    postmortem: false
# Routing rules. Each entry pairs a `match` selector (env_in / severity_in /
# kind_in) with the `actions` to apply to alerts that satisfy it.
#
# NOTE(review): several rules overlap (e.g. a prod P0 "security" alert
# satisfies both of the first two rules, and a prod P1 "deploy" alert
# satisfies both the first rule and the deploy digest rule). Presumably the
# first matching rule wins — confirm against the alert_triage_graph matcher,
# because under first-match a prod P0 security alert gets the P1 cap and the
# 180-minute window rather than the security rule's P0 cap and 360 minutes.
routing:
  # ─── HARD AUTO: prod P0/P1 → create incident + deterministic triage ─────────
  - match:
      env_in: ["prod"]
      severity_in: ["P0", "P1"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # deterministic | llm
      incident_severity_cap: "P1"
      dedupe_window_minutes: 180
      attach_alert_artifact: true
      ack: true

  # ─── Security alerts: auto incident + (optional) LLM triage ─────────────────
  - match:
      kind_in: ["security"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"  # flip to llm once stable
      incident_severity_cap: "P0"
      dedupe_window_minutes: 360
      attach_alert_artifact: true
      ack: true

  # ─── Resource-critical: OOM/crashloop/disk in prod|staging ──────────────────
  - match:
      kind_in: ["oom", "crashloop", "disk"]
      env_in: ["prod", "staging"]
      severity_in: ["P0", "P1", "P2"]
    actions:
      auto_incident: true
      auto_triage: true
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 240
      attach_alert_artifact: true
      ack: true

  # ─── Staging P1: auto incident, no triage (save resources) ─────────────────
  - match:
      env_in: ["staging"]
      severity_in: ["P1"]
    actions:
      auto_incident: true
      auto_triage: false
      triage_mode: "deterministic"
      incident_severity_cap: "P1"
      dedupe_window_minutes: 120
      attach_alert_artifact: true
      ack: true

  # ─── Deploy events: digest-only ──────────────────────────────────────────────
  - match:
      kind_in: ["deploy"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true

  # ─── Lower severity: digest-only ─────────────────────────────────────────────
  - match:
      severity_in: ["P2", "P3", "INFO"]
    actions:
      auto_incident: false
      digest_only: true
      ack: true
# ─── Kind normalization (aliases Monitor may use) ────────────────────────────
# Each key is a canonical kind; its list holds the alias spellings that
# normalize to it (the canonical name itself is listed first).
#
# NOTE(review): "oom_kill" is listed under `crashloop`, not `oom` — confirm
# this is intentional.
# NOTE(review): `deploy` is used by the routing rules and severity caps but
# has no entry here; presumably unmapped kinds pass through unchanged — verify.
kind_map:
  latency: ["latency", "p95_latency", "p99_latency", "slow_response"]
  error_rate: ["error_rate", "5xx_rate", "http_errors"]
  slo_breach: ["slo_breach", "slo", "slo_violation"]
  crashloop: ["crashloop", "restart_loop", "oom_kill"]
  oom: ["oom", "out_of_memory", "memory_pressure"]
  disk: ["disk", "disk_full", "disk_pressure", "pvc_full"]
  security: ["security", "unauthorized", "injection", "brute_force"]
# ─── Per-kind severity caps for incidents created by the loop ─────────────────
# Maps a canonical kind to the highest severity an auto-created incident of
# that kind may carry.
#
# NOTE(review): `oom`, `crashloop` and `disk` have no entry here; what cap (if
# any) applies to them, and how these caps combine with a rule's own
# `incident_severity_cap`, is not visible in this file — confirm in the
# consumer.
severity_caps:
  deploy: "P2"
  latency: "P1"
  error_rate: "P1"
  slo_breach: "P1"
  security: "P0"
# ─── Signature dedupe settings ────────────────────────────────────────────────
# Selects which alert attributes feed the signature used for deduplication.
signature:
  use_kind: true
  use_fingerprint: true
  use_node_label: false  # true = per-node incidents (noisier)
  normalize_title: true  # strip numbers/timestamps from title before hash