docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
This commit is contained in:
88
config/incident_intelligence_policy.yml
Normal file
88
config/incident_intelligence_policy.yml
Normal file
@@ -0,0 +1,88 @@
|
||||
# Incident Intelligence Policy
# Controls correlation scoring, recurrence detection, digest generation,
# root-cause buckets, auto follow-ups, and the recurrence_watch release gate.
|
||||
|
||||
# Correlation: weighted rules used to relate an incident to earlier ones.
# NOTE(review): nesting was reconstructed from a flattened rendering of this
# file — confirm against the consumer's schema.
correlation:
  lookback_days: 30
  max_related: 10
  min_score: 20  # discard matches below this
  rules:
    # Rules are listed from highest to lowest weight.
    - name: "same_signature"
      weight: 100
      match:
        signature: true

    - name: "same_service_and_kind"
      weight: 60
      match:
        same_service: true
        same_kind: true

    - name: "same_service_time_cluster"
      weight: 40
      match:
        same_service: true
        within_minutes: 180

    - name: "same_kind_cross_service"
      weight: 30
      match:
        same_kind: true
        within_minutes: 120
|
||||
|
||||
# Recurrence: occurrence-count thresholds per look-back window.
# NOTE(review): `top_n` and `recommendations` are placed directly under
# `recurrence` based on the flattened source order — confirm with the consumer.
recurrence:
  windows_days: [7, 30]
  thresholds:
    signature:
      warn: 3  # ≥ 3 occurrences in window → warn
      high: 6  # ≥ 6 occurrences in window → high
    kind:
      warn: 5
      high: 10
  top_n: 15  # top N per category

  # Deterministic recommendations per recurrence level
  # `{kind}` is presumably a placeholder filled in by the consumer — confirm.
  recommendations:
    signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
    signature_warn: "Review root cause history; consider adding monitoring threshold"
    kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
    kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"
|
||||
|
||||
# Digest: settings for the weekly incident digest.
digest:
  weekly_day: "Mon"
  include_closed: true
  include_open: true
  output_dir: "ops/reports/incidents"
  markdown_max_chars: 8000
  top_incidents: 20  # max incidents in weekly listing
|
||||
|
||||
# ── Root-Cause Buckets ─────────────────────────────────────────────────────
buckets:
  mode: "service_kind"  # service_kind | signature_prefix
  signature_prefix_len: 12
  top_n: 10
  # Keys are window sizes in days. They are plain integers, so YAML loaders
  # yield int keys, not strings.
  # NOTE(review): confirm the consumer looks them up as ints.
  min_count:
    7: 3   # bucket must have ≥ 3 incidents in last 7d
    30: 6  # or ≥ 6 in last 30d
  include_statuses: ["open", "mitigating", "resolved", "closed"]
|
||||
|
||||
# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
autofollowups:
  enabled: true
  only_when_high: true  # only create for HIGH recurrence buckets
  owner: "oncall"
  priority: "P1"
  due_days: 7
  max_followups_per_bucket_per_week: 1  # dedupe by week+bucket_key
  dedupe_key_prefix: "intel_recur"
|
||||
|
||||
# ── Release Gate: recurrence_watch ────────────────────────────────────────
# NOTE(review): nesting of `fail_on` / `warn_on` children was reconstructed
# from a flattened rendering — confirm against the gate implementation.
release_gate:
  recurrence_watch:
    enabled: true
    service_scope: "target_service"  # target_service | all
    windows_days: [7, 30]
    fail_on:
      severity_in: ["P0", "P1"]  # used only in strict mode
      high_recurrence: true
    warn_on:
      warn_recurrence: true
|
||||
Reference in New Issue
Block a user