New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
711 lines
25 KiB
Python
711 lines
25 KiB
Python
"""
|
|
risk_engine.py — Service Risk Index Engine (deterministic, no LLM).
|
|
|
|
Provides:
|
|
compute_service_risk(service, env, ...) -> RiskReport
|
|
compute_risk_dashboard(env, top_n, ...) -> Dashboard
|
|
compute_trend(series) -> TrendReport
|
|
enrich_risk_report_with_trend(report, history_store, policy) -> report (mutated)
|
|
snapshot_all_services(env, compute_fn, history_store, policy) -> SnapshotResult
|
|
|
|
All inputs come from existing stores and tools.
|
|
The engine never calls external services directly — callers inject store references.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import logging
|
|
import math
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── Policy ───────────────────────────────────────────────────────────────────

# Process-wide cache of the parsed policy dict; populated lazily by
# load_risk_policy() and cleared by _reload_policy().
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy file locations, tried in order: relative to the current
# working directory first, then relative to this module's package root
# (three levels up from this file).
_POLICY_SEARCH_PATHS = [
    Path("config/risk_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_policy.yml",
]
|
|
|
|
|
|
def load_risk_policy() -> Dict:
    """Load and cache the risk policy.

    Tries each candidate in ``_POLICY_SEARCH_PATHS`` in order; the first
    file that exists and parses wins and is cached process-wide. A file
    that exists but fails to parse logs a warning and falls through to the
    next candidate. When nothing loads, built-in defaults are cached and
    returned instead.

    Returns:
        The policy dict (never None; possibly the built-in defaults).
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for p in _POLICY_SEARCH_PATHS:
        if p.exists():
            try:
                # Explicit encoding: the YAML config must not depend on the
                # platform's default locale encoding.
                with p.open(encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}
                _POLICY_CACHE = data
                return data
            except Exception as e:
                # Non-fatal: a broken candidate falls through to the next path.
                logger.warning("Failed to load risk_policy from %s: %s", p, e)
    logger.warning("risk_policy.yml not found; using built-in defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
|
|
|
|
|
|
def _builtin_defaults() -> Dict:
|
|
return {
|
|
"defaults": {"window_hours": 24, "recurrence_windows_days": [7, 30],
|
|
"slo_window_minutes": 60},
|
|
"thresholds": {
|
|
"bands": {"low_max": 20, "medium_max": 50, "high_max": 80},
|
|
"risk_watch": {"warn_at": 50, "fail_at": 80},
|
|
},
|
|
"weights": {
|
|
"open_incidents": {"P0": 50, "P1": 25, "P2": 10, "P3": 5},
|
|
"recurrence": {
|
|
"signature_warn_7d": 10, "signature_high_7d": 20,
|
|
"kind_warn_7d": 8, "kind_high_7d": 15,
|
|
"signature_high_30d": 10, "kind_high_30d": 8,
|
|
},
|
|
"followups": {"overdue_P0": 20, "overdue_P1": 12, "overdue_other": 6},
|
|
"slo": {"violation": 10},
|
|
"alerts_loop": {"slo_violation": 10},
|
|
"escalation": {"escalations_24h": {"warn": 5, "high": 12}},
|
|
},
|
|
"service_overrides": {},
|
|
"p0_services": ["gateway", "router"],
|
|
}
|
|
|
|
|
|
def _reload_policy() -> None:
|
|
global _POLICY_CACHE
|
|
_POLICY_CACHE = None
|
|
|
|
|
|
# ─── Band classification ──────────────────────────────────────────────────────
|
|
|
|
def score_to_band(score: int, policy: Dict) -> str:
    """Map a numeric risk score to its band name via policy thresholds.

    Bands use inclusive upper bounds: score <= low_max -> "low",
    <= medium_max -> "medium", <= high_max -> "high", otherwise "critical".
    Missing thresholds fall back to 20 / 50 / 80.
    """
    cfg = policy.get("thresholds", {}).get("bands", {})
    ladder = (
        (int(cfg.get("low_max", 20)), "low"),
        (int(cfg.get("medium_max", 50)), "medium"),
        (int(cfg.get("high_max", 80)), "high"),
    )
    for upper_bound, band_name in ladder:
        if score <= upper_bound:
            return band_name
    return "critical"
|
|
|
|
|
|
def get_service_thresholds(service: str, policy: Dict) -> Dict:
    """Resolve warn/fail risk-watch thresholds for one service.

    Per-service values under policy["service_overrides"][service]["risk_watch"]
    win over the global policy["thresholds"]["risk_watch"]; hard-coded
    fallbacks are warn_at=50, fail_at=80.
    """
    global_rw = policy.get("thresholds", {}).get("risk_watch", {})
    svc_rw = policy.get("service_overrides", {}).get(service, {}).get("risk_watch", {})

    def _resolve(key: str, fallback: int) -> int:
        # Override → global → built-in fallback, coerced to int.
        return int(svc_rw.get(key, global_rw.get(key, fallback)))

    return {"warn_at": _resolve("warn_at", 50), "fail_at": _resolve("fail_at", 80)}
|
|
|
|
|
|
# ─── Individual scoring components ───────────────────────────────────────────
|
|
|
|
def _score_open_incidents(
|
|
open_incidents: List[Dict],
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score open incidents by severity."""
|
|
w = weights.get("open_incidents", {})
|
|
counts: Dict[str, int] = {"P0": 0, "P1": 0, "P2": 0, "P3": 0}
|
|
points = 0
|
|
for inc in open_incidents:
|
|
sev = inc.get("severity", "P3")
|
|
if sev in counts:
|
|
counts[sev] += 1
|
|
pts = int(w.get(sev, 0))
|
|
points += pts
|
|
|
|
reasons = []
|
|
if counts["P0"]:
|
|
reasons.append(f"Open P0 incident(s): {counts['P0']}")
|
|
if counts["P1"]:
|
|
reasons.append(f"Open P1 incident(s): {counts['P1']}")
|
|
if counts["P2"]:
|
|
reasons.append(f"Open P2 incident(s): {counts['P2']}")
|
|
|
|
return points, {**counts, "points": points}, reasons
|
|
|
|
|
|
def _score_recurrence(
|
|
recurrence_data: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score from recurrence detection stats."""
|
|
w = weights.get("recurrence", {})
|
|
high_rec = recurrence_data.get("high_recurrence", {})
|
|
warn_rec = recurrence_data.get("warn_recurrence", {})
|
|
|
|
high_sigs_7d = len(high_rec.get("signatures", []))
|
|
high_kinds_7d = len(high_rec.get("kinds", []))
|
|
warn_sigs_7d = len(warn_rec.get("signatures", []))
|
|
warn_kinds_7d = len(warn_rec.get("kinds", []))
|
|
|
|
# Note: 30d data comes from separate call; keep it optional
|
|
high_sigs_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("signatures", []))
|
|
high_kinds_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("kinds", []))
|
|
|
|
points = (
|
|
high_sigs_7d * int(w.get("signature_high_7d", 20))
|
|
+ warn_sigs_7d * int(w.get("signature_warn_7d", 10))
|
|
+ high_kinds_7d * int(w.get("kind_high_7d", 15))
|
|
+ warn_kinds_7d * int(w.get("kind_warn_7d", 8))
|
|
+ high_sigs_30d * int(w.get("signature_high_30d", 10))
|
|
+ high_kinds_30d * int(w.get("kind_high_30d", 8))
|
|
)
|
|
|
|
component = {
|
|
"high_signatures_7d": high_sigs_7d,
|
|
"warn_signatures_7d": warn_sigs_7d,
|
|
"high_kinds_7d": high_kinds_7d,
|
|
"warn_kinds_7d": warn_kinds_7d,
|
|
"high_signatures_30d": high_sigs_30d,
|
|
"high_kinds_30d": high_kinds_30d,
|
|
"points": points,
|
|
}
|
|
reasons = []
|
|
if high_sigs_7d:
|
|
reasons.append(f"High recurrence signatures (7d): {high_sigs_7d}")
|
|
if high_kinds_7d:
|
|
reasons.append(f"High recurrence kinds (7d): {high_kinds_7d}")
|
|
if warn_sigs_7d:
|
|
reasons.append(f"Warn recurrence signatures (7d): {warn_sigs_7d}")
|
|
return points, component, reasons
|
|
|
|
|
|
def _score_followups(
|
|
followups_data: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score overdue follow-ups by priority."""
|
|
w = weights.get("followups", {})
|
|
overdue = followups_data.get("overdue_followups", [])
|
|
counts: Dict[str, int] = {"P0": 0, "P1": 0, "other": 0}
|
|
points = 0
|
|
|
|
for fu in overdue:
|
|
prio = fu.get("priority", "other")
|
|
if prio == "P0":
|
|
counts["P0"] += 1
|
|
points += int(w.get("overdue_P0", 20))
|
|
elif prio == "P1":
|
|
counts["P1"] += 1
|
|
points += int(w.get("overdue_P1", 12))
|
|
else:
|
|
counts["other"] += 1
|
|
points += int(w.get("overdue_other", 6))
|
|
|
|
reasons = []
|
|
if counts["P0"]:
|
|
reasons.append(f"Overdue follow-ups (P0): {counts['P0']}")
|
|
if counts["P1"]:
|
|
reasons.append(f"Overdue follow-ups (P1): {counts['P1']}")
|
|
if counts["other"]:
|
|
reasons.append(f"Overdue follow-ups (other): {counts['other']}")
|
|
|
|
return points, {**counts, "points": points}, reasons
|
|
|
|
|
|
def _score_slo(
|
|
slo_data: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score SLO violations."""
|
|
w = weights.get("slo", {})
|
|
violations = slo_data.get("violations", [])
|
|
skipped = slo_data.get("skipped", False)
|
|
|
|
if skipped:
|
|
return 0, {"violations": 0, "skipped": True, "points": 0}, []
|
|
|
|
count = len(violations)
|
|
points = count * int(w.get("violation", 10))
|
|
reasons = []
|
|
if count:
|
|
reasons.append(f"Active SLO violation(s) in window: {count}")
|
|
return points, {"violations": count, "skipped": False, "points": points}, reasons
|
|
|
|
|
|
def _score_alerts_loop(
|
|
loop_slo: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score alert-loop SLO violations (self-monitoring)."""
|
|
w = weights.get("alerts_loop", {})
|
|
violations = loop_slo.get("violations", [])
|
|
count = len(violations)
|
|
points = count * int(w.get("slo_violation", 10))
|
|
reasons = []
|
|
if count:
|
|
reasons.append(f"Alert-loop SLO violation(s): {count}")
|
|
return points, {"violations": count, "points": points}, reasons
|
|
|
|
|
|
def _score_escalations(
|
|
escalation_count: int,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score escalations in last 24h."""
|
|
esc_w = weights.get("escalation", {}).get("escalations_24h", {})
|
|
warn_pts = int(esc_w.get("warn", 5))
|
|
high_pts = int(esc_w.get("high", 12))
|
|
|
|
if escalation_count >= 3:
|
|
points = high_pts
|
|
elif escalation_count >= 1:
|
|
points = warn_pts
|
|
else:
|
|
points = 0
|
|
|
|
reasons = []
|
|
if escalation_count:
|
|
reasons.append(f"Escalations in last 24h: {escalation_count}")
|
|
|
|
return points, {"count_24h": escalation_count, "points": points}, reasons
|
|
|
|
|
|
# ─── Main scoring function ────────────────────────────────────────────────────
|
|
|
|
def compute_service_risk(
    service: str,
    env: str = "prod",
    *,
    open_incidents: Optional[List[Dict]] = None,
    recurrence_7d: Optional[Dict] = None,
    recurrence_30d: Optional[Dict] = None,
    followups_data: Optional[Dict] = None,
    slo_data: Optional[Dict] = None,
    alerts_loop_slo: Optional[Dict] = None,
    escalation_count_24h: int = 0,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute the deterministic risk score for one service.

    Callers fetch all signal dicts from their stores/tools and inject them
    here; every argument defaults to an empty/safe value so missing data
    never crashes the engine.

    Returns a RiskReport dict: score, band, service thresholds, a
    per-component breakdown, human-readable reasons, deterministic
    recommendations, and an `updated_at` UTC timestamp.
    """
    policy = policy if policy is not None else load_risk_policy()
    weights = policy.get("weights", _builtin_defaults()["weights"])

    # Fold the optional 30d recurrence stats into the 7d payload under
    # dedicated *_30d keys so _score_recurrence sees a single dict.
    recurrence = dict(recurrence_7d or {})
    if recurrence_30d:
        recurrence["high_recurrence_30d"] = recurrence_30d.get("high_recurrence", {})
        recurrence["warn_recurrence_30d"] = recurrence_30d.get("warn_recurrence", {})

    # Each scorer returns (points, component dict, reasons). Insertion
    # order here fixes both the components layout and the reasons order.
    scored = {
        "open_incidents": _score_open_incidents(open_incidents or [], weights),
        "recurrence": _score_recurrence(recurrence, weights),
        "followups": _score_followups(followups_data or {}, weights),
        "slo": _score_slo(slo_data or {}, weights),
        "alerts_loop": _score_alerts_loop(alerts_loop_slo or {}, weights),
        "escalations": _score_escalations(escalation_count_24h, weights),
    }

    total = max(0, sum(pts for pts, _, _ in scored.values()))
    band = score_to_band(total, policy)
    components = {name: comp for name, (_, comp, _) in scored.items()}
    all_reasons = [reason for _, _, rs in scored.values() for reason in rs]

    # Deterministic recommendations derived from band + component counts.
    recs = _build_recommendations(
        band,
        components["open_incidents"],
        components["recurrence"],
        components["followups"],
        components["slo"],
    )

    return {
        "service": service,
        "env": env,
        "score": total,
        "band": band,
        "thresholds": get_service_thresholds(service, policy),
        "components": components,
        "reasons": all_reasons,
        "recommendations": recs,
        "updated_at": datetime.datetime.utcnow().isoformat(),
    }
|
|
|
|
|
|
def _build_recommendations(
|
|
band: str,
|
|
comp_inc: Dict,
|
|
comp_rec: Dict,
|
|
comp_fu: Dict,
|
|
comp_slo: Dict,
|
|
) -> List[str]:
|
|
recs = []
|
|
if comp_inc.get("P0", 0) or comp_inc.get("P1", 0):
|
|
recs.append("Prioritize open P0/P1 incidents before deploying.")
|
|
if comp_rec.get("high_signatures_7d", 0) or comp_rec.get("high_kinds_7d", 0):
|
|
recs.append("Investigate recurring failure patterns (high recurrence buckets).")
|
|
if comp_fu.get("P0", 0) or comp_fu.get("P1", 0):
|
|
recs.append("Prioritize follow-up closure for recurring bucket(s).")
|
|
if comp_slo.get("violations", 0):
|
|
recs.append("Avoid risky deploys until SLO violation clears.")
|
|
if band in ("high", "critical"):
|
|
recs.append("Service is high-risk — coordinate with oncall before release.")
|
|
return recs[:6]
|
|
|
|
|
|
# ─── Dashboard ────────────────────────────────────────────────────────────────
|
|
|
|
# ─── Trend computation ────────────────────────────────────────────────────────
|
|
|
|
def compute_trend(
    series: List,  # List[RiskSnapshot] — most-recent first
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute trend metrics from a list of RiskSnapshot objects (or dicts).

    `series` must be sorted most-recent-first; elements may be dataclass-like
    (with .score/.ts attributes) or plain dicts ("score"/"ts" keys).

    Returns:
        delta_24h, delta_7d, slope_per_day, volatility, regression{warn, fail}
    """
    if policy is None:
        policy = load_risk_policy()

    # Regression thresholds: score *increases* at or beyond these deltas
    # flip the warn/fail flags in the returned "regression" dict.
    trend_cfg = policy.get("trend", {})
    reg = trend_cfg.get("regression_threshold", {})
    warn_24h = int(reg.get("delta_24h_warn", 10))
    fail_24h = int(reg.get("delta_24h_fail", 20))
    warn_7d = int(reg.get("delta_7d_warn", 15))
    fail_7d = int(reg.get("delta_7d_fail", 30))

    if not series:
        return _empty_trend()

    # Normalise: accept both RiskSnapshot dataclasses and plain dicts
    def _score(s) -> int:
        return int(s.score if hasattr(s, "score") else s["score"])

    def _ts(s) -> str:
        return s.ts if hasattr(s, "ts") else s["ts"]

    now = datetime.datetime.utcnow()
    latest_score = _score(series[0])

    # ── delta_24h ─────────────────────────────────────────────────────────────
    # Baseline = newest snapshot at least 24h old. Timestamps compare as ISO
    # strings, which is only valid while every writer uses the same naive-UTC
    # format — NOTE(review): confirm no producer mixes tz-suffixed formats.
    cutoff_24h = (now - datetime.timedelta(hours=24)).isoformat()
    base_24h = _find_baseline(series, cutoff_24h, _ts)
    delta_24h = (latest_score - _score(base_24h)) if base_24h is not None else None

    # ── delta_7d ──────────────────────────────────────────────────────────────
    cutoff_7d = (now - datetime.timedelta(hours=168)).isoformat()
    base_7d = _find_baseline(series, cutoff_7d, _ts)
    delta_7d = (latest_score - _score(base_7d)) if base_7d is not None else None

    # ── slope (simple linear regression over all available points) ────────────
    # Requires at least two points; slope is computed per hour then scaled.
    slope_per_day: Optional[float] = None
    if len(series) >= 2:
        # xs = age in hours from oldest point
        pairs = [(now - _parse_ts(_ts(s))).total_seconds() / 3600.0 for s in series]
        hours_from_oldest = [max(pairs) - p for p in pairs]  # 0=oldest, max=newest
        scores = [_score(s) for s in series]
        slope_per_day = _linear_slope(hours_from_oldest, scores) * 24  # per day

    # ── volatility (stddev of daily last-score-per-day over 7d) ──────────────
    # Population (not sample) standard deviation over one point per day;
    # needs at least two distinct days to be defined.
    volatility: Optional[float] = None
    daily_scores = _daily_latest_scores(series, days=7, _ts_fn=_ts, _score_fn=_score)
    if len(daily_scores) >= 2:
        mean = sum(daily_scores) / len(daily_scores)
        variance = sum((x - mean) ** 2 for x in daily_scores) / len(daily_scores)
        volatility = round(math.sqrt(variance), 2)

    # ── regression flags ──────────────────────────────────────────────────────
    # A flag trips when either window's delta meets its threshold; windows
    # with no baseline (delta is None) never trip a flag.
    reg_warn = (
        (delta_24h is not None and delta_24h >= warn_24h)
        or (delta_7d is not None and delta_7d >= warn_7d)
    )
    reg_fail = (
        (delta_24h is not None and delta_24h >= fail_24h)
        or (delta_7d is not None and delta_7d >= fail_7d)
    )

    return {
        "delta_24h": delta_24h,
        "delta_7d": delta_7d,
        "slope_per_day": round(slope_per_day, 2) if slope_per_day is not None else None,
        "volatility": volatility,
        "regression": {"warn": reg_warn, "fail": reg_fail},
    }
|
|
|
|
|
|
def _empty_trend() -> Dict:
|
|
return {
|
|
"delta_24h": None, "delta_7d": None,
|
|
"slope_per_day": None, "volatility": None,
|
|
"regression": {"warn": False, "fail": False},
|
|
}
|
|
|
|
|
|
def _find_baseline(series, cutoff_iso: str, ts_fn):
|
|
"""Return the first element whose ts <= cutoff (series is newest-first)."""
|
|
for s in series:
|
|
if ts_fn(s) <= cutoff_iso:
|
|
return s
|
|
return None
|
|
|
|
|
|
def _parse_ts(ts_str: str) -> datetime.datetime:
|
|
ts_str = ts_str.rstrip("Z")
|
|
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
|
|
try:
|
|
return datetime.datetime.strptime(ts_str, fmt)
|
|
except ValueError:
|
|
continue
|
|
return datetime.datetime.utcnow()
|
|
|
|
|
|
def _linear_slope(xs: List[float], ys: List[float]) -> float:
|
|
"""Simple least-squares slope (score per hour)."""
|
|
n = len(xs)
|
|
if n < 2:
|
|
return 0.0
|
|
x_mean = sum(xs) / n
|
|
y_mean = sum(ys) / n
|
|
num = sum((xs[i] - x_mean) * (ys[i] - y_mean) for i in range(n))
|
|
den = sum((xs[i] - x_mean) ** 2 for i in range(n))
|
|
return num / den if den != 0 else 0.0
|
|
|
|
|
|
def _daily_latest_scores(series, days: int, _ts_fn, _score_fn) -> List[float]:
|
|
"""Collect the latest score for each calendar day over last `days` days."""
|
|
now = datetime.datetime.utcnow()
|
|
day_scores: Dict[str, int] = {}
|
|
cutoff = (now - datetime.timedelta(days=days)).isoformat()
|
|
for s in series:
|
|
ts = _ts_fn(s)
|
|
if ts < cutoff:
|
|
break
|
|
day_key = ts[:10] # YYYY-MM-DD
|
|
if day_key not in day_scores: # series is newest-first, so first = latest
|
|
day_scores[day_key] = _score_fn(s)
|
|
return list(day_scores.values())
|
|
|
|
|
|
def enrich_risk_report_with_trend(
    report: Dict,
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Add a `trend` key to `report` in place and return it.

    Pulls the snapshot series for the report's service/env from
    `history_store` over the policy's volatility window, then runs
    compute_trend(). Non-fatal: any failure logs a warning and sets
    `trend` to None.
    """
    try:
        svc = report.get("service", "")
        environment = report.get("env", "prod")
        active_policy = policy if policy is not None else load_risk_policy()
        window_hours = int(
            active_policy.get("trend", {}).get("volatility_window_hours", 168)
        )
        series = history_store.get_series(svc, environment, hours=window_hours, limit=500)
        report["trend"] = compute_trend(series, policy=active_policy)
    except Exception as e:
        logger.warning(
            "enrich_risk_report_with_trend failed for %s: %s", report.get("service"), e
        )
        report["trend"] = None
    return report
|
|
|
|
|
|
def enrich_risk_report_with_attribution(
    report: Dict,
    *,
    alert_store=None,
    incident_store=None,
    attr_policy: Optional[Dict] = None,
) -> Dict:
    """
    Add an `attribution` key to `report` in place and return it.

    Fetches raw signals from the injected stores, runs deterministic
    attribution, then optionally applies bounded LLM enrichment (applied
    when policy llm_mode != 'off' and triggers are met; off by default).
    Non-fatal: any failure logs a warning and sets `attribution` to None.
    """
    try:
        # Local imports keep the attribution stack optional at module load.
        from risk_attribution import (
            compute_attribution, fetch_signals_from_stores, load_attribution_policy,
        )
        from llm_enrichment import maybe_enrich_attribution

        policy = attr_policy if attr_policy is not None else load_attribution_policy()
        svc = report.get("service", "")
        environment = report.get("env", "prod")
        lookback = int((policy.get("defaults") or {}).get("lookback_hours", 24))

        # Fetch raw signals from the injected stores.
        signals = fetch_signals_from_stores(
            svc,
            environment,
            window_hours=lookback,
            alert_store=alert_store,
            incident_store=incident_store,
            policy=policy,
        )

        attribution = compute_attribution(
            svc,
            environment,
            risk_report=report,
            **signals,
            policy=policy,
        )

        # Bounded LLM pass; disabled by default via policy.
        attribution["llm_enrichment"] = maybe_enrich_attribution(
            attribution, report, policy
        )

        report["attribution"] = attribution
    except Exception as e:
        logger.warning("enrich_risk_report_with_attribution failed for %s: %s",
                       report.get("service"), e)
        report["attribution"] = None
    return report
|
|
|
|
|
|
# ─── Snapshot writer ──────────────────────────────────────────────────────────
|
|
|
|
def snapshot_all_services(
    env: str,
    compute_fn,  # Callable[[str, str], Dict] — returns RiskReport for (service, env)
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
    known_services: Optional[List[str]] = None,
) -> Dict:
    """
    Compute and persist a RiskSnapshot for every known service.

    `compute_fn(service, env)` must return a RiskReport dict. The service
    list is capped by policy history.max_services_per_run (default 50).
    Per-service failures are logged and counted, never raised.

    Returns {written, skipped, errors, services, env, ts}.
    """
    active_policy = policy if policy is not None else load_risk_policy()

    from risk_history_store import RiskSnapshot

    cap = int(active_policy.get("history", {}).get("max_services_per_run", 50))
    targets = (known_services or [])[:cap]

    written = 0
    skipped = 0  # reserved in the result shape; nothing currently marks a skip
    errors = 0
    snapped: List[str] = []

    for svc in targets:
        try:
            report = compute_fn(svc, env)
            snapshot = RiskSnapshot(
                ts=datetime.datetime.utcnow().isoformat(),
                service=svc,
                env=env,
                score=int(report.get("score", 0)),
                band=report.get("band", "low"),
                components=report.get("components", {}),
                reasons=report.get("reasons", []),
            )
            history_store.write_snapshot([snapshot])
            written += 1
            snapped.append(svc)
        except Exception as e:
            logger.warning("snapshot_all_services: error for %s/%s: %s", svc, env, e)
            errors += 1

    return {
        "written": written,
        "skipped": skipped,
        "errors": errors,
        "services": snapped,
        "env": env,
        "ts": datetime.datetime.utcnow().isoformat(),
    }
|
|
|
|
|
|
def compute_risk_dashboard(
    env: str = "prod",
    top_n: int = 10,
    *,
    service_reports: Optional[List[Dict]] = None,
    history_store=None,  # Optional[RiskHistoryStore] — if provided, enrich with trend
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Build risk dashboard from a list of pre-computed service reports.
    Sorts by score desc and returns summary.
    If history_store is provided, each report is enriched with trend data.

    NOTE(review): every aggregate below (band_counts, total_services,
    critical_p0_services, top_regressions, improving_services) is computed
    over the top_n slice only, not the full input list — confirm this is
    the intended semantics for "total_services".
    """
    if policy is None:
        policy = load_risk_policy()

    # Highest-risk first; truncate to top_n before any further aggregation.
    reports = sorted(
        service_reports or [],
        key=lambda r: -r.get("score", 0),
    )[:top_n]

    # Enrich with trend if history_store provided (mutates each report in place).
    if history_store is not None:
        for r in reports:
            enrich_risk_report_with_trend(r, history_store, policy)

    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in reports:
        b = r.get("band", "low")
        # .get fallback keeps this robust to bands outside the four seeded keys.
        band_counts[b] = band_counts.get(b, 0) + 1

    # Business-critical (P0) services currently in a risky band.
    p0_services = set(policy.get("p0_services", []))
    critical_p0 = [r for r in reports if r["service"] in p0_services
                   and r["band"] in ("high", "critical")]

    # Top regressions (highest delta_24h, trend present)
    top_regressions = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_24h") is not None
         and r["trend"]["delta_24h"] > 0],
        key=lambda r: -r["trend"]["delta_24h"],
    )[:5]

    # Improving services (most negative delta_7d)
    improving = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_7d") is not None
         and r["trend"]["delta_7d"] < 0],
        key=lambda r: r["trend"]["delta_7d"],
    )[:5]

    # Top regression summaries (with top-2 causes if attribution available)
    top_regression_summaries = []
    for r in top_regressions:
        entry: Dict = {
            "service": r["service"],
            "delta_24h": r["trend"]["delta_24h"],
        }
        attr = r.get("attribution")
        if attr and attr.get("causes"):
            entry["causes"] = attr["causes"][:2]
            entry["attribution_summary"] = attr.get("summary", "")
        top_regression_summaries.append(entry)

    now_iso = datetime.datetime.utcnow().isoformat()
    return {
        "env": env,
        "generated_at": now_iso,
        "history_updated_at": now_iso,
        "total_services": len(reports),
        "band_counts": band_counts,
        "critical_p0_services": [r["service"] for r in critical_p0],
        "top_regressions": top_regression_summaries,
        "improving_services": [{"service": r["service"], "delta_7d": r["trend"]["delta_7d"]}
                               for r in improving],
        "services": reports,
    }
|