docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions
--- a/config/risk_policy.yml
+++ b/config/risk_policy.yml
@@ -0,0 +1,89 @@
+# Service Risk Index Policy — DAARION.city
+#
+# Controls how Risk Scores are computed, classified, and gated.
+# All scoring is deterministic: no LLM required.
+
+defaults:
+  window_hours: 24
+  recurrence_windows_days: [7, 30]
+  slo_window_minutes: 60
+
+thresholds:
+  bands:
+    low_max: 20
+    medium_max: 50
+    high_max: 80
+  risk_watch:          # defaults, overridable per service below
+    warn_at: 50        # score >= warn_at → recommendations
+    fail_at: 80        # score >= fail_at → gate fails (strict mode only)
+
+weights:
+  open_incidents:
+    P0: 50
+    P1: 25
+    P2: 10
+    P3: 5
+  recurrence:
+    signature_warn_7d: 10
+    signature_high_7d: 20
+    kind_warn_7d: 8
+    kind_high_7d: 15
+    signature_high_30d: 10
+    kind_high_30d: 8
+  followups:
+    overdue_P0: 20
+    overdue_P1: 12
+    overdue_other: 6
+  slo:
+    violation: 10       # per active violation
+  alerts_loop:
+    slo_violation: 10   # per alert-loop SLO violation
+  escalation:
+    escalations_24h:
+      warn: 5           # score added if escalations_24h >= 1
+      high: 12          # score added if escalations_24h >= 3
+
+# Per-service overrides of the thresholds.risk_watch defaults (e.g. a lower or higher fail_at)
+service_overrides:
+  gateway:
+    risk_watch:
+      fail_at: 75       # gateway is critical: fail earlier
+  router:
+    risk_watch:
+      fail_at: 80       # same value as the thresholds.risk_watch default
+
+# Services treated as P0 (always subject to strict risk_watch in staging)
+p0_services:
+  - gateway
+  - router
+
+# ─── History & Snapshotting ────────────────────────────────────────────────────
+history:
+  snapshot_interval_minutes: 60
+  retention_days: 90
+  max_services_per_run: 50
+
+# ─── Trend analysis ───────────────────────────────────────────────────────────
+trend:
+  delta_windows_hours: [24, 168]    # 24h and 7d
+  volatility_window_hours: 168      # stddev computed over last 7d
+  regression_threshold:
+    delta_24h_warn: 10              # score rose >= 10 points in 24h → warn
+    delta_24h_fail: 20              # score rose >= 20 points in 24h → fail (strict)
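+    delta_7d_warn: 15               # 7d counterpart of delta_24h_warn (>= 15 points)
+    delta_7d_fail: 30               # 7d counterpart of delta_24h_fail (>= 30 points)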
+
+# ─── Daily Digest ─────────────────────────────────────────────────────────────
+digest:
+  daily_hour_utc: 9                 # generate at 09:00 UTC
+  output_dir: "ops/reports/risk"
+  markdown_max_chars: 8000
+  top_n: 10
+
+# ─── Risk Delta release gate ──────────────────────────────────────────────────
+release_gate:
+  risk_delta_watch:
+    enabled: true
+    default_warn_delta_24h: 10
+    default_fail_delta_24h: 20
+    p0_services_strict: true