docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions
--- a/config/incident_intelligence_policy.yml
+++ b/config/incident_intelligence_policy.yml
@@ -0,0 +1,88 @@
+# Incident Intelligence Policy
+# Controls correlation scoring, recurrence detection, and digest generation.
+
+correlation:
+  lookback_days: 30      # NOTE(review): presumably how far back candidates are searched — confirm
+  max_related: 10        # NOTE(review): presumably a cap on related incidents returned — confirm
+  min_score: 20          # discard matches below this
+  rules:                 # every weight below (100/60/40/30) exceeds min_score; how weights combine is not defined here
+    - name: "same_signature"
+      weight: 100
+      match:
+        signature: true  # NOTE(review): sibling rules use same_* keys (same_service, same_kind) — confirm this spelling
+
+    - name: "same_service_and_kind"
+      weight: 60
+      match:
+        same_service: true
+        same_kind: true
+
+    - name: "same_service_time_cluster"
+      weight: 40
+      match:
+        same_service: true
+        within_minutes: 180  # 180 min = 3 h
+
+    - name: "same_kind_cross_service"
+      weight: 30
+      match:                 # no service condition listed; NOTE(review): confirm the consumer enforces "cross_service"
+        same_kind: true
+        within_minutes: 120  # 120 min = 2 h
+
+recurrence:
+  windows_days: [7, 30]  # same values as release_gate.recurrence_watch.windows_days and the buckets.min_count keys
+  thresholds:
+    signature:
+      warn: 3     # ≥ 3 occurrences in window → warn
+      high: 6     # ≥ 6 occurrences in window → high
+    kind:
+      warn: 5     # NOTE(review): presumably the same "≥ N in window" semantics as signature — confirm
+      high: 10
+  top_n: 15        # top N per category
+
+  # Deterministic recommendations per recurrence level
+  recommendations:  # keys are <category>_<level>, matching thresholds above; kind_* texts carry a {kind} placeholder
+    signature_high: "Create permanent fix: add regression test + SLO guard for this failure type"
+    signature_warn: "Review root cause history; consider adding monitoring threshold"
+    kind_high: "Systemic issue with kind={kind}: review architecture / add circuit breaker"
+    kind_warn: "Recurring kind={kind}: validate if alert thresholds are tuned correctly"
+
+digest:
+  weekly_day: "Mon"      # NOTE(review): abbreviated day name — confirm the format the consumer expects
+  include_closed: true
+  include_open: true
+  output_dir: "ops/reports/incidents"  # relative path; NOTE(review): presumably resolved from repo root — confirm
+  markdown_max_chars: 8000             # NOTE(review): presumably a size cap on the rendered digest — confirm
+  top_incidents: 20      # max incidents in weekly listing
+
+# ── Root-Cause Buckets ─────────────────────────────────────────────────────
+buckets:
+  mode: "service_kind"         # service_kind | signature_prefix
+  signature_prefix_len: 12     # NOTE(review): presumably only read when mode is signature_prefix — confirm
+  top_n: 10
+  min_count:                   # keys are window lengths in days; unquoted, so YAML loads them as integers, not strings
+    7: 3                       # bucket must have ≥ 3 incidents in last 7d
+    30: 6                      # or ≥ 6 in last 30d
+  include_statuses: ["open", "mitigating", "resolved", "closed"]  # all four statuses listed are counted
+
+# ── Auto Follow-ups (policy-driven, no LLM) ───────────────────────────────
+autofollowups:
+  enabled: true
+  only_when_high: true         # only create for HIGH recurrence buckets
+  owner: "oncall"              # NOTE(review): presumably the assignee set on created follow-ups — confirm
+  priority: "P1"
+  due_days: 7                  # NOTE(review): presumably days from creation until the follow-up is due — confirm
+  max_followups_per_bucket_per_week: 1   # dedupe by week+bucket_key
+  dedupe_key_prefix: "intel_recur"       # NOTE(review): presumably prepended to the week+bucket_key dedupe key — confirm
+
+# ── Release Gate: recurrence_watch ────────────────────────────────────────
+release_gate:
+  recurrence_watch:
+    enabled: true
+    service_scope: "target_service"  # target_service | all
+    windows_days: [7, 30]            # same values as recurrence.windows_days
+    fail_on:                         # NOTE(review): "strict mode" is not defined in this file — confirm where it is set
+      severity_in: ["P0", "P1"]   # used only in strict mode
+      high_recurrence: true
+    warn_on:
+      warn_recurrence: true