# NOTE: the text below is commit-message and file-viewer metadata that was
# captured together with this file. It is kept as comments so that it cannot be
# parsed as YAML.
#
# Config policies (16 files): alert_routing, architecture_pressure, backlog,
#   cost_weights, data_governance, incident_escalation, incident_intelligence,
#   network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
#   release_gate, risk_attribution, risk_policy, slo_policy, tool_limits,
#   tools_rollout
# Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
#   deployments/incidents logs, runbooks for
#   alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts
#   (alert_triage, audit_cleanup, migrate_*, governance, schedule),
#   task_registry, voice alerts/ha/latency/policy
# Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
#   NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
#   supervisor, tools, voice, opencode, release, risk, aistalk, spacebot
# Made-with: Cursor
#
# Viewer metadata: 90 lines, 2.9 KiB, YAML

# Service Risk Index Policy — DAARION.city
#
# Controls how Risk Scores are computed, classified, and gated.
# All scoring is deterministic: no LLM required.

# Default time windows used by the risk policy.
# NOTE(review): which computation consumes each window is not visible in this
# file — confirm against the scoring code.
defaults:
  window_hours: 24
  # NOTE(review): presumably pairs with the *_7d / *_30d keys under
  # weights.recurrence — confirm.
  recurrence_windows_days: [7, 30]
  slo_window_minutes: 60

# Score thresholds: band boundaries plus the default risk_watch gate levels.
thresholds:
  # NOTE(review): presumably the upper score bound of each band; whether the
  # bound is inclusive is not shown in this file — confirm.
  bands:
    low_max: 20
    medium_max: 50
    high_max: 80
  # warn_at / fail_at currently equal bands.medium_max / bands.high_max.
  risk_watch:  # defaults, overridable per service below
    warn_at: 50  # score >= warn_at → recommendations
    fail_at: 80  # score >= fail_at → gate fails (strict mode only)

# Score contribution of each risk signal.
# NOTE(review): how the contributions are combined (summed, capped, ...) is not
# defined in this file — confirm against the scoring code.
weights:
  # One weight per severity key, P0 through P3.
  open_incidents:
    P0: 50
    P1: 25
    P2: 10
    P3: 5
  # Weights for the 7-day and 30-day recurrence signals.
  # NOTE(review): "signature" vs "kind" and the warn/high cut-offs are defined
  # elsewhere — confirm.
  recurrence:
    signature_warn_7d: 10
    signature_high_7d: 20
    kind_warn_7d: 8
    kind_high_7d: 15
    signature_high_30d: 10
    kind_high_30d: 8
  followups:
    overdue_P0: 20
    overdue_P1: 12
    overdue_other: 6
  slo:
    violation: 10  # per active violation
  alerts_loop:
    slo_violation: 10  # per alert-loop SLO violation
  escalation:
    escalations_24h:
      warn: 5  # score added if escalations_24h >= 1
      high: 12  # score added if escalations_24h >= 3

# Per-service risk gate overrides (lower/higher fail_at)
# Each entry is keyed by service name and lists only the risk_watch keys it
# sets for that service.
service_overrides:
  gateway:
    risk_watch:
      fail_at: 75  # gateway is critical: fail earlier
  router:
    risk_watch:
      fail_at: 80  # same value as thresholds.risk_watch.fail_at

# Services treated as P0 (always subject to strict risk_watch in staging)
p0_services:
  - gateway
  - router

# ─── History & Snapshotting ────────────────────────────────────────────────────
# Snapshot cadence, retention and per-run size limit for risk score history.
history:
  snapshot_interval_minutes: 60
  retention_days: 90
  max_services_per_run: 50

# ─── Trend analysis ───────────────────────────────────────────────────────────
# Windows and thresholds for detecting score regressions over time.
trend:
  delta_windows_hours: [24, 168]  # 24h and 7d
  volatility_window_hours: 168  # stddev computed over last 7d
  regression_threshold:
    delta_24h_warn: 10  # score rose >= 10 points in 24h → warn
    delta_24h_fail: 20  # score rose >= 20 points in 24h → fail (strict)
    # NOTE(review): presumably the 7d counterparts of the 24h thresholds
    # above — confirm.
    delta_7d_warn: 15
    delta_7d_fail: 30

# ─── Daily Digest ─────────────────────────────────────────────────────────────
# Schedule, destination and size limits for the daily risk digest.
digest:
  daily_hour_utc: 9  # generate at 09:00 UTC
  # NOTE(review): relative path; the base directory it resolves against is not
  # shown in this file — confirm.
  output_dir: "ops/reports/risk"
  markdown_max_chars: 8000
  top_n: 10

# ─── Risk Delta release gate ──────────────────────────────────────────────────
# Release-gate settings for the 24h risk delta check.
release_gate:
  risk_delta_watch:
    enabled: true
    # Same values as trend.regression_threshold.delta_24h_warn / delta_24h_fail.
    # NOTE(review): whether the two pairs must stay in sync is not stated here.
    default_warn_delta_24h: 10
    default_fail_delta_24h: 20
    # NOTE(review): placed under risk_delta_watch with its sibling keys; the
    # original nesting level should be confirmed against the consumer.
    p0_services_strict: true