# Incident Intelligence Policy
# Controls correlation scoring, recurrence detection, and digest generation.
#
# NOTE(review): nesting below was reconstructed from a whitespace-collapsed
# copy of this file; confirm section placement against the consumer's schema.

correlation:
  lookback_days: 30
  max_related: 10
  min_score: 20  # discard matches below this
  rules:
    - name: "same_signature"
      weight: 100
      match:
        signature: true
    - name: "same_service_and_kind"
      weight: 60
      match:
        same_service: true
        same_kind: true
    - name: "same_service_time_cluster"
      weight: 40
      match:
        same_service: true
        within_minutes: 180
    - name: "same_kind_cross_service"
      weight: 30
      match:
        same_kind: true
        within_minutes: 120

recurrence:
  windows_days: [7, 30]
  thresholds:
    signature:
      warn: 3  # ≥ 3 occurrences in window → warn
      high: 6  # ≥ 6 occurrences in window → high
    kind:
      warn: 5
      high: 10
  top_n: 15  # top N per category

  # Deterministic recommendations per recurrence level
  # NOTE(review): "{kind}" looks like a placeholder filled in by the consumer — confirm.
  recommendations:
    signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
    signature_warn: "Review root cause history; consider adding monitoring threshold"
    kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
    kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"

digest:
  weekly_day: "Mon"
  include_closed: true
  include_open: true
  output_dir: "ops/reports/incidents"
  markdown_max_chars: 8000
  top_incidents: 20  # max incidents in weekly listing

# ── Root-Cause Buckets ─────────────────────────────────────────────────────
buckets:
  mode: "service_kind"  # service_kind | signature_prefix
  signature_prefix_len: 12
  top_n: 10
  # Keys are window lengths in days; they are unquoted, so they load as integers.
  min_count:
    7: 3  # bucket must have ≥ 3 incidents in last 7d
    30: 6  # or ≥ 6 in last 30d
  include_statuses: ["open", "mitigating", "resolved", "closed"]

# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
autofollowups:
  enabled: true
  only_when_high: true  # only create for HIGH recurrence buckets
  owner: "oncall"
  priority: "P1"
  due_days: 7
  max_followups_per_bucket_per_week: 1  # dedupe by week+bucket_key
  dedupe_key_prefix: "intel_recur"

# ── Release Gate: recurrence_watch ────────────────────────────────────────
release_gate:
  recurrence_watch:
    enabled: true
    service_scope: "target_service"  # target_service | all
    windows_days: [7, 30]
    fail_on:
      severity_in: ["P0", "P1"]  # used only in strict mode
      high_recurrence: true
    warn_on:
      warn_recurrence: true