Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
81 lines
2.2 KiB
YAML
81 lines
2.2 KiB
YAML
# Risk Attribution Policy — DAARION.city
#
# Deterministic attribution: risk spike → likely causes.
# LLM enrichment is OFF by default; local only on regression triggers.
|
|
|
|
# Default settings for attribution.
defaults:
  lookback_hours: 24
  max_causes: 5
  # Allowed values: off | local | remote.
  # Kept quoted so YAML 1.1 loaders do not read it as boolean false.
  llm_mode: 'off'
  llm_max_chars_in: 3500
  llm_max_chars_out: 800
|
|
|
|
# Triggers for LLM enrichment — applied only if ALL conditions are met.
# NOTE(review): both a warn and a fail delta are listed; confirm how the
# consumer combines them under the "ALL conditions" rule.
llm_triggers:
  # delta_24h >= 10
  risk_delta_warn: 10
  # delta_24h >= 20 (fail-level)
  risk_delta_fail: 20
  band_in:
    - 'high'
    - 'critical'
|
|
|
|
# Scoring weight for each cause; weights are additive.
weights:
  deploy: 30
  dependency: 25
  drift: 25
  incident_storm: 20
  slo_violation: 15
  followups_overdue: 10
  alert_loop_degraded: 10
|
|
|
|
# Detection settings, one entry per signal.
signals:
  deploy:
    # Alert kinds that indicate a deploy event.
    kinds:
      - 'deploy'
      - 'deployment'
      - 'rollout'
      - 'canary'

  dependency:
    # Release gates whose fail/warn result counts as a dependency signal.
    release_gate_names:
      - 'dependency_scan'
      - 'deps'

  drift:
    release_gate_names:
      - 'drift'
      - 'config_drift'

  incident_storm:
    thresholds:
      # Occurrences in the last 60 min across all alert signatures
      # for the service.
      occurrences_60m_warn: 10
      # Escalations (Escalated events) in the last 24 h.
      escalations_24h_warn: 2

  # NOTE(review): the corresponding key under `weights` is `slo_violation`,
  # not `slo` — confirm the consumer maps the two names.
  slo:
    require_active_violation: true
|
|
|
|
# Confidence bands: each value is the minimum score to reach that band.
output:
  confidence_bands:
    # score >= 60 → high confidence
    high: 60
    # score >= 35 → medium; below 35 → low
    medium: 35
|
|
|
|
# Settings for the Change Timeline.
timeline:
  enabled: true
  lookback_hours: 24
  max_items: 30
  include_types:
    - 'deploy'
    - 'dependency'
    - 'drift'
    - 'incident'
    - 'slo'
    - 'followup'
    - 'alert_loop'
    - 'release_gate'
  # Same-type events within a 5-minute window are coalesced.
  time_bucket_minutes: 5
|
|
|
|
# Settings for evidence linking.
evidence_linking:
  enabled: true
  max_refs_per_cause: 10
|
|
|
|
# Local LLM endpoint settings; used only when llm_mode=local.
llm_local:
  endpoint: 'http://localhost:11434/api/generate'
  model: 'llama3'
  timeout_seconds: 15

  # Hardening guards
  model_allowlist:
    - 'qwen2.5-coder:3b'
    - 'llama3.1:8b-instruct'
    - 'phi3:mini'
    - 'llama3'
  max_calls_per_digest: 3
  # Dedupe key: risk_enrich:{YYYY-MM-DD}:{service}:{env}
  per_day_dedupe: true
|