Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
89 lines
2.9 KiB
YAML
89 lines
2.9 KiB
YAML
# Incident Intelligence Policy
# Controls correlation scoring, recurrence detection, and digest generation.

# Correlation scoring: how candidate incidents are matched to one another.
# NOTE(review): indentation was lost in the copy this was restored from; the
# nesting below is reconstructed from key order — confirm against the loader.
correlation:
  lookback_days: 30
  max_related: 10
  min_score: 20  # discard matches below this
  # Each rule has a name, a weight, and a set of match conditions.
  # NOTE(review): presumably rule weights feed the score that is compared
  # against min_score — verify in the consumer.
  rules:
    - name: "same_signature"
      weight: 100
      match:
        signature: true

    - name: "same_service_and_kind"
      weight: 60
      match:
        same_service: true
        same_kind: true

    - name: "same_service_time_cluster"
      weight: 40
      match:
        same_service: true
        within_minutes: 180

    - name: "same_kind_cross_service"
      weight: 30
      match:
        same_kind: true
        within_minutes: 120

# Recurrence detection: occurrence-count thresholds per window.
# NOTE(review): indentation was lost in the copy this was restored from.
# Placing top_n and recommendations under `recurrence` is inferred from key
# order — confirm against the loader.
recurrence:
  windows_days: [7, 30]
  thresholds:
    signature:
      warn: 3  # ≥ 3 occurrences in window → warn
      high: 6  # ≥ 6 occurrences in window → high
    kind:
      warn: 5
      high: 10
  top_n: 15  # top N per category

  # Deterministic recommendations per recurrence level
  # Keys are <category>_<level>; the kind_* messages carry a {kind}
  # placeholder (presumably substituted by the consumer — verify).
  recommendations:
    signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
    signature_warn: "Review root cause history; consider adding monitoring threshold"
    kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
    kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"

# Weekly digest generation settings.
digest:
  weekly_day: "Mon"
  include_closed: true
  include_open: true
  output_dir: "ops/reports/incidents"
  markdown_max_chars: 8000
  top_incidents: 20  # max incidents in weekly listing

# ── Root-Cause Buckets ─────────────────────────────────────────────────────
# NOTE(review): indentation was lost in the copy this was restored from; the
# nesting of min_count is reconstructed from key order — confirm.
buckets:
  mode: "service_kind"  # service_kind | signature_prefix
  signature_prefix_len: 12
  top_n: 10
  # Keys are unquoted, so YAML loads them as integers (window length in days
  # per the inline comments). NOTE(review): confirm the consumer looks them up
  # as ints, not strings.
  min_count:
    7: 3  # bucket must have ≥ 3 incidents in last 7d
    30: 6  # or ≥ 6 in last 30d
  include_statuses: ["open", "mitigating", "resolved", "closed"]

# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
# Defaults applied to automatically created follow-up items.
autofollowups:
  enabled: true
  only_when_high: true  # only create for HIGH recurrence buckets
  owner: "oncall"
  priority: "P1"
  due_days: 7
  max_followups_per_bucket_per_week: 1  # dedupe by week+bucket_key
  dedupe_key_prefix: "intel_recur"

# ── Release Gate: recurrence_watch ────────────────────────────────────────
# NOTE(review): indentation was lost in the copy this was restored from; the
# fail_on / warn_on nesting is reconstructed from key order — confirm.
release_gate:
  recurrence_watch:
    enabled: true
    service_scope: "target_service"  # target_service | all
    windows_days: [7, 30]
    # Conditions that fail the gate.
    fail_on:
      severity_in: ["P0", "P1"]  # used only in strict mode
      high_recurrence: true
    # Conditions that only produce a warning.
    warn_on:
      warn_recurrence: true