Files
microdao-daarion/config/risk_policy.yml
Apple 67225a39fa docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00

90 lines
2.9 KiB
YAML

# Service Risk Index Policy — DAARION.city
#
# Controls how Risk Scores are computed, classified, and gated.
# All scoring is deterministic: no LLM required.
# Default look-back windows; the unit of each value is given by its key suffix.
defaults:
  window_hours: 24
  recurrence_windows_days: [7, 30]
  slo_window_minutes: 60
# Score cut-offs used to classify and gate a service's risk score.
thresholds:
  # Upper bound of each named band.
  # NOTE(review): whether the bounds are inclusive is not visible here — confirm.
  bands:
    low_max: 20
    medium_max: 50
    high_max: 80
  risk_watch:  # defaults, overridable per service below
    warn_at: 50  # score >= warn_at → recommendations
    fail_at: 80  # score >= fail_at → gate fails (strict mode only)
# Score contribution of each risk signal.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation; `escalation` is assumed to belong under `weights` — confirm
# against the code that loads this file.
weights:
  # Keyed by incident severity.
  open_incidents:
    P0: 50
    P1: 25
    P2: 10
    P3: 5
  # Key pattern: <signature|kind>_<warn|high>_<7d|30d>.
  recurrence:
    signature_warn_7d: 10
    signature_high_7d: 20
    kind_warn_7d: 8
    kind_high_7d: 15
    signature_high_30d: 10
    kind_high_30d: 8
  # Overdue follow-ups, keyed by priority.
  followups:
    overdue_P0: 20
    overdue_P1: 12
    overdue_other: 6
  slo:
    violation: 10  # per active violation
  alerts_loop:
    slo_violation: 10  # per alert-loop SLO violation
  escalation:
    escalations_24h:
      warn: 5  # score added if escalations_24h >= 1
      high: 12  # score added if escalations_24h >= 3
# Per-service risk gate overrides (lower/higher fail_at).
# Each service gets its own nested `risk_watch` mapping so the per-service
# values do not collide as duplicate keys.
service_overrides:
  gateway:
    risk_watch:
      fail_at: 75  # gateway is critical: fail earlier
  router:
    risk_watch:
      fail_at: 80
# P0-tier services; strict risk_watch always applies to them in staging.
p0_services:
  - gateway
  - router
# ─── History & Snapshotting ────────────────────────────────────────────────────
# NOTE(review): nesting reconstructed; `max_services_per_run` is assumed to be
# a history setting rather than a top-level key — confirm against the loader.
history:
  snapshot_interval_minutes: 60
  retention_days: 90
  max_services_per_run: 50
# ─── Trend analysis ───────────────────────────────────────────────────────────
trend:
  delta_windows_hours: [24, 168]  # 24h and 7d
  volatility_window_hours: 168  # stddev computed over last 7d
  # Score increases over a window that count as a regression.
  regression_threshold:
    delta_24h_warn: 10  # score rose >= 10 points in 24h → warn
    delta_24h_fail: 20  # score rose >= 20 points in 24h → fail (strict)
    delta_7d_warn: 15
    delta_7d_fail: 30
# ─── Daily Digest ─────────────────────────────────────────────────────────────
digest:
  daily_hour_utc: 9  # generate at 09:00 UTC
  # NOTE(review): relative path; the base directory it resolves against is not
  # visible in this file — confirm.
  output_dir: "ops/reports/risk"
  markdown_max_chars: 8000
  top_n: 10
# ─── Risk Delta release gate ──────────────────────────────────────────────────
# NOTE(review): nesting reconstructed; `p0_services_strict` is assumed to be a
# `risk_delta_watch` setting — confirm against the gate implementation.
release_gate:
  risk_delta_watch:
    enabled: true
    default_warn_delta_24h: 10
    default_fail_delta_24h: 20
    p0_services_strict: true