docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
This commit is contained in:
80
config/risk_attribution_policy.yml
Normal file
80
config/risk_attribution_policy.yml
Normal file
@@ -0,0 +1,80 @@
|
||||
# Risk Attribution Policy — DAARION.city
#
# Deterministic attribution: risk spike → likely causes.
# LLM enrichment is OFF by default; local only on regression triggers.
|
||||
|
||||
# Default values. Children are indented so they stay scoped to `defaults`
# (`lookback_hours` is also a key under `timeline`; a flat layout would make
# the two collide as duplicate keys).
defaults:
  lookback_hours: 24
  max_causes: 5
  # Keep this value quoted: YAML 1.1 loaders read a bare `off` as boolean
  # false, which would no longer match the string modes listed here.
  llm_mode: "off"  # off | local | remote
  # NOTE(review): presumably character caps on LLM input/output — confirm
  # against the consumer.
  llm_max_chars_in: 3500
  llm_max_chars_out: 800
|
||||
|
||||
# LLM enrichment triggers — only if ALL conditions are met
# NOTE(review): both a warn and a fail delta threshold are listed; any delta
# that meets risk_delta_fail also meets risk_delta_warn, so confirm how the
# consumer combines the two thresholds with band_in.
llm_triggers:
  risk_delta_warn: 10  # delta_24h >= 10
  risk_delta_fail: 20  # delta_24h >= 20 (fail-level)
  band_in: ["high", "critical"]
|
||||
|
||||
# Per-cause scoring weights (additive)
# NOTE(review): the cause names `slo_violation`, `followups_overdue` and
# `alert_loop_degraded` differ from the `slo` key under `signals` and from the
# `slo` / `followup` / `alert_loop` entries in `timeline.include_types` —
# confirm the consumer maps between these spellings.
weights:
  deploy: 30
  dependency: 25
  drift: 25
  incident_storm: 20
  slo_violation: 15
  followups_overdue: 10
  alert_loop_degraded: 10
|
||||
|
||||
# Per-signal detection config
# Each signal is its own nested mapping; `release_gate_names` appears under
# both `dependency` and `drift`, so the nesting is required to keep them apart.
signals:
  deploy:
    # Alert kinds that indicate a deploy event
    kinds: ["deploy", "deployment", "rollout", "canary"]

  dependency:
    # Release gate names whose fail/warn counts as a dependency signal
    release_gate_names: ["dependency_scan", "deps"]

  drift:
    # NOTE(review): presumably the same fail/warn semantics as `dependency`,
    # applied to drift gates — confirm.
    release_gate_names: ["drift", "config_drift"]

  incident_storm:
    thresholds:
      # occurrences in last 60min across all alert signatures for the service
      occurrences_60m_warn: 10
      # escalations (Escalated events) in last 24h
      escalations_24h_warn: 2

  # NOTE(review): keyed `slo` here but `slo_violation` under `weights` —
  # confirm the consumer expects both spellings.
  slo:
    require_active_violation: true
|
||||
|
||||
# Confidence bands (minimum score to reach that band)
output:
  confidence_bands:
    high: 60  # score >= 60 → high confidence
    medium: 35  # score >= 35 → medium
    # below 35 → low
|
||||
|
||||
# Change Timeline config
# Children are indented under `timeline`; `enabled` and `lookback_hours` also
# exist in other sections, so they must not sit at the top level.
timeline:
  enabled: true
  lookback_hours: 24
  max_items: 30
  # Block sequence instead of one long flow list; the parsed value is the
  # same list of strings in the same order.
  include_types:
    - "deploy"
    - "dependency"
    - "drift"
    - "incident"
    - "slo"
    - "followup"
    - "alert_loop"
    - "release_gate"
  time_bucket_minutes: 5  # coalesce same-type events within 5-min windows
|
||||
|
||||
# Evidence linking
# `enabled` is scoped to this section (the `timeline` section has its own).
evidence_linking:
  enabled: true
  max_refs_per_cause: 10
|
||||
|
||||
# LLM local endpoint config (only used when llm_mode=local)
llm_local:
  endpoint: "http://localhost:11434/api/generate"
  model: "llama3"  # this value is also present in model_allowlist
  timeout_seconds: 15
  # Hardening guards
  # Entries stay quoted because several contain `:`; the block sequence parses
  # to the same list as the original flow list.
  model_allowlist:
    - "qwen2.5-coder:3b"
    - "llama3.1:8b-instruct"
    - "phi3:mini"
    - "llama3"
  max_calls_per_digest: 3
  per_day_dedupe: true  # key: risk_enrich:{YYYY-MM-DD}:{service}:{env}
|
||||
Reference in New Issue
Block a user