Files
microdao-daarion/config/incident_intelligence_policy.yml
Apple 67225a39fa docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00

89 lines
2.9 KiB
YAML

# Incident Intelligence Policy
# Controls correlation scoring, recurrence detection, digest generation,
# root-cause buckets, auto follow-ups, and the recurrence_watch release gate.
# Correlation scoring settings and the rules used to relate incidents.
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost; confirm against the schema the policy loader expects.
correlation:
  lookback_days: 30
  max_related: 10
  min_score: 20  # discard matches below this
  # Each rule pairs a weight with the match conditions it applies to.
  rules:
    - name: "same_signature"
      weight: 100
      match:
        signature: true
    - name: "same_service_and_kind"
      weight: 60
      match:
        same_service: true
        same_kind: true
    - name: "same_service_time_cluster"
      weight: 40
      match:
        same_service: true
        within_minutes: 180
    - name: "same_kind_cross_service"
      weight: 30
      match:
        same_kind: true
        within_minutes: 120
# Recurrence detection: windows, per-category thresholds, and the fixed
# recommendation text attached to each recurrence level.
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost; confirm against the schema the policy loader expects.
recurrence:
  windows_days: [7, 30]
  thresholds:
    signature:
      warn: 3  # ≥ 3 occurrences in window → warn
      high: 6  # ≥ 6 occurrences in window → high
    kind:
      warn: 5
      high: 10
  top_n: 15  # top N per category
  # Deterministic recommendations per recurrence level.
  # The kind_* messages contain a {kind} placeholder; presumably it is
  # substituted by the consumer — verify before changing the wording.
  recommendations:
    signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
    signature_warn: "Review root cause history; consider adding monitoring threshold"
    kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
    kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"
# Weekly digest generation settings.
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost; confirm against the schema the policy loader expects.
digest:
  weekly_day: "Mon"
  include_closed: true
  include_open: true
  output_dir: "ops/reports/incidents"
  markdown_max_chars: 8000
  top_incidents: 20  # max incidents in weekly listing
# ── Root-Cause Buckets ─────────────────────────────────────────────────────
# Root-cause bucket grouping and the minimum incident counts a bucket needs.
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost; confirm against the schema the policy loader expects.
buckets:
  mode: "service_kind"  # service_kind | signature_prefix
  signature_prefix_len: 12
  top_n: 10
  # Keys are window lengths in days. They are unquoted, so YAML loads them as
  # integers rather than strings.
  min_count:
    7: 3  # bucket must have ≥ 3 incidents in last 7d
    30: 6  # or ≥ 6 in last 30d
  include_statuses: ["open", "mitigating", "resolved", "closed"]
# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
# Policy-driven follow-up creation for recurring buckets.
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost; confirm against the schema the policy loader expects.
autofollowups:
  enabled: true
  only_when_high: true  # only create for HIGH recurrence buckets
  owner: "oncall"
  priority: "P1"
  due_days: 7
  max_followups_per_bucket_per_week: 1  # dedupe by week+bucket_key
  dedupe_key_prefix: "intel_recur"
# ── Release Gate: recurrence_watch ────────────────────────────────────────
# Release gate configuration for the recurrence_watch check.
# NOTE(review): nesting was reconstructed from a copy whose indentation was
# lost; confirm against the schema the policy loader expects.
release_gate:
  recurrence_watch:
    enabled: true
    service_scope: "target_service"  # target_service | all
    windows_days: [7, 30]
    fail_on:
      severity_in: ["P0", "P1"]  # used only in strict mode
      high_recurrence: true
    warn_on:
      warn_recurrence: true