docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout.
Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy.
Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot.
Made-with: Cursor
This commit is contained in:
89
config/risk_policy.yml
Normal file
89
config/risk_policy.yml
Normal file
@@ -0,0 +1,89 @@
|
||||
# Service Risk Index Policy — DAARION.city
#
# Controls how Risk Scores are computed, classified, and gated.
# All scoring is deterministic: no LLM required.
|
||||
|
||||
# Default time windows used by the risk policy.
# NOTE(review): which computation consumes each window is not visible in this
# file — confirm against the scoring code.
defaults:
  window_hours: 24
  # Presumably pairs with the *_7d / *_30d keys under weights.recurrence — confirm.
  recurrence_windows_days: [7, 30]
  slo_window_minutes: 60
|
||||
|
||||
# Score thresholds: band boundaries plus the default risk_watch gate levels.
thresholds:
  # Upper bound of each band.
  # NOTE(review): how scores above high_max are classified is not defined
  # here — confirm with the consumer.
  bands:
    low_max: 20
    medium_max: 50
    high_max: 80
  risk_watch:  # defaults, overridable per service under service_overrides
    warn_at: 50  # score >= warn_at → recommendations
    fail_at: 80  # score >= fail_at → gate fails (strict mode only)
|
||||
|
||||
# Score contributions per risk signal.
# NOTE(review): how the individual weights are combined into the final score
# is not defined in this file — confirm against the scoring code.
weights:
  # Weight per incident priority (P0 is weighted highest).
  open_incidents:
    P0: 50
    P1: 25
    P2: 10
    P3: 5
  # Weights for recurring incidents, split by signature/kind and by the
  # 7-day / 30-day windows.
  # NOTE(review): the warn/high trigger counts are not defined here.
  recurrence:
    signature_warn_7d: 10
    signature_high_7d: 20
    kind_warn_7d: 8
    kind_high_7d: 15
    signature_high_30d: 10
    kind_high_30d: 8
  # Weights for overdue follow-ups, keyed by priority.
  followups:
    overdue_P0: 20
    overdue_P1: 12
    overdue_other: 6
  slo:
    violation: 10  # per active violation
  alerts_loop:
    slo_violation: 10  # per alert-loop SLO violation
  escalation:
    escalations_24h:
      warn: 5  # score added if escalations_24h >= 1
      high: 12  # score added if escalations_24h >= 3
|
||||
|
||||
# Per-service risk gate overrides (lower/higher fail_at).
# Each entry overrides keys of thresholds.risk_watch for that service only.
service_overrides:
  gateway:
    risk_watch:
      fail_at: 75  # gateway is critical: fail earlier
  router:
    risk_watch:
      fail_at: 80  # same value as the default thresholds.risk_watch.fail_at
|
||||
|
||||
# Services treated as P0 (always subject to strict risk_watch in staging)
p0_services:
  - gateway
  - router
|
||||
|
||||
# ─── History & Snapshotting ────────────────────────────────────────────────────
# Settings for periodic risk-score snapshots.
# NOTE(review): snapshot storage and scheduling are not defined in this file.
history:
  snapshot_interval_minutes: 60
  retention_days: 90
  max_services_per_run: 50
|
||||
|
||||
# ─── Trend analysis ───────────────────────────────────────────────────────────
# Windows and thresholds for detecting score regressions over time.
trend:
  delta_windows_hours: [24, 168]  # 24h and 7d
  volatility_window_hours: 168  # stddev computed over last 7d
  regression_threshold:
    delta_24h_warn: 10  # score rose >= 10 points in 24h → warn
    delta_24h_fail: 20  # score rose >= 20 points in 24h → fail (strict)
    # NOTE(review): presumably the 7d analogues of the 24h thresholds above — confirm.
    delta_7d_warn: 15
    delta_7d_fail: 30
|
||||
|
||||
# ─── Daily Digest ─────────────────────────────────────────────────────────────
# Settings for the daily risk digest report.
digest:
  daily_hour_utc: 9  # generate at 09:00 UTC
  output_dir: "ops/reports/risk"
  markdown_max_chars: 8000
  # NOTE(review): presumably the number of top-ranked services listed — confirm.
  top_n: 10
|
||||
|
||||
# ─── Risk Delta release gate ──────────────────────────────────────────────────
# Release-gate check on the 24h risk-score delta.
release_gate:
  risk_delta_watch:
    enabled: true
    # These defaults carry the same values as
    # trend.regression_threshold.delta_24h_warn / delta_24h_fail.
    # NOTE(review): whether the two are meant to stay in sync is not stated — confirm.
    default_warn_delta_24h: 10
    default_fail_delta_24h: 20
    # NOTE(review): presumably enforces strict (failing) mode for p0_services — confirm.
    p0_services_strict: true
|
||||
Reference in New Issue
Block a user