docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout. Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy. Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot. Made-with: Cursor
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions
--- a/config/risk_attribution_policy.yml
+++ b/config/risk_attribution_policy.yml
@@ -0,0 +1,80 @@
+# Risk Attribution Policy — DAARION.city
+#
+# Deterministic attribution: maps a risk spike to its likely causes.
+# LLM enrichment is off by default (defaults.llm_mode: "off"); it is intended
+# to run locally, and only when the regression triggers in llm_triggers fire.
+
+defaults:
+  lookback_hours: 24
+  max_causes: 5
+  llm_mode: "off"           # off | local | remote
+  llm_max_chars_in: 3500
+  llm_max_chars_out: 800
+
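+# LLM enrichment triggers — enrichment runs only if ALL conditions are met.
+# NOTE(review): risk_delta_fail (>= 20) implies risk_delta_warn (>= 10);
+# confirm which delta threshold the ALL-conditions check actually applies.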
+llm_triggers:
+  risk_delta_warn: 10       # delta_24h >= 10
+  risk_delta_fail: 20       # delta_24h >= 20 (fail-level)
+  band_in: ["high", "critical"]
+
+# Per-cause scoring weights (additive)
+weights:
+  deploy: 30
+  dependency: 25
+  drift: 25
+  incident_storm: 20
+  slo_violation: 15
+  followups_overdue: 10
+  alert_loop_degraded: 10
+
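+# Per-signal detection config.
+# NOTE(review): the SLO signal is keyed `slo` here but `slo_violation` under
+# weights, and followups_overdue / alert_loop_degraded have weights but no
+# entry here — confirm this matches what the consumer expects.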
+signals:
+  deploy:
+    # Alert kinds that indicate a deploy event
+    kinds: ["deploy", "deployment", "rollout", "canary"]
+
+  dependency:
+    # Release gate names whose fail/warn counts as a dependency signal
+    release_gate_names: ["dependency_scan", "deps"]
+
+  drift:
+    release_gate_names: ["drift", "config_drift"]
+
+  incident_storm:
+    thresholds:
+      # occurrences in last 60min across all alert signatures for the service
+      occurrences_60m_warn: 10
+      # escalations (Escalated events) in last 24h
+      escalations_24h_warn: 2
+
+  slo:
+    require_active_violation: true
+
+# Confidence bands (minimum score to reach that band)
+output:
+  confidence_bands:
+    high: 60      # score >= 60 → high confidence
+    medium: 35    # score >= 35 → medium
+    # below 35     → low
+
+# Change Timeline config
+timeline:
+  enabled: true
+  lookback_hours: 24
+  max_items: 30
+  include_types: ["deploy","dependency","drift","incident","slo","followup","alert_loop","release_gate"]
+  time_bucket_minutes: 5     # coalesce same-type events within 5-min windows
+
+# Evidence linking
+evidence_linking:
+  enabled: true
+  max_refs_per_cause: 10
+
+# LLM local endpoint config (only used when llm_mode=local)
+llm_local:
+  endpoint: "http://localhost:11434/api/generate"
+  model: "llama3"
+  timeout_seconds: 15
+  # Hardening guards
+  model_allowlist: ["qwen2.5-coder:3b", "llama3.1:8b-instruct", "phi3:mini", "llama3"]
+  max_calls_per_digest: 3
+  per_day_dedupe: true       # key: risk_enrich:{YYYY-MM-DD}:{service}:{env}