"""
risk_engine.py — Service Risk Index Engine (deterministic, no LLM).

Provides:
    compute_service_risk(service, env, ...) -> RiskReport
    compute_risk_dashboard(env, top_n, ...) -> Dashboard
    compute_trend(series) -> TrendReport
    enrich_risk_report_with_trend(report, history_store, policy) -> report (mutated)
    snapshot_all_services(env, compute_fn, history_store, policy) -> SnapshotResult

All inputs come from existing stores and tools.
The engine never calls external services directly — callers inject store references.
"""
from __future__ import annotations

import datetime
import logging
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import yaml
except ImportError:
    # Without PyYAML the policy file cannot be read; load_risk_policy() then
    # serves the built-in defaults instead of making the module unimportable.
    yaml = None

logger = logging.getLogger(__name__)


def _utcnow() -> datetime.datetime:
    """Return the current UTC time as a *naive* datetime.

    Drop-in replacement for the deprecated ``datetime.utcnow()``: the value is
    naive so that ``isoformat()`` output and string comparisons against stored
    timestamps keep the exact shape they had before.
    """
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)


# ─── Policy ───────────────────────────────────────────────────────────────────

_POLICY_CACHE: Optional[Dict] = None
_POLICY_SEARCH_PATHS = [
    Path("config/risk_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_policy.yml",
]


def load_risk_policy() -> Dict:
    """Return the risk policy, loading and caching it on first use.

    The first readable file in ``_POLICY_SEARCH_PATHS`` wins. A file that fails
    to parse, or whose top-level YAML value is not a mapping, is logged and
    skipped. If nothing usable is found, the built-in defaults are cached and
    returned. Call ``_reload_policy()`` to drop the cache.
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE

    if yaml is None:
        logger.warning("PyYAML is not installed; risk_policy.yml cannot be read")
    else:
        for p in _POLICY_SEARCH_PATHS:
            if not p.exists():
                continue
            try:
                with open(p, encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}
            except Exception as e:
                # Best-effort: a broken file must not take the engine down.
                logger.warning("Failed to load risk_policy from %s: %s", p, e)
                continue
            if not isinstance(data, dict):
                # Every consumer calls policy.get(...); a list/scalar would crash later.
                logger.warning(
                    "Failed to load risk_policy from %s: %s",
                    p,
                    "top-level YAML value is not a mapping",
                )
                continue
            _POLICY_CACHE = data
            return data

    logger.warning("risk_policy.yml not found; using built-in defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE


def _builtin_defaults() -> Dict:
    """Return a fresh copy of the policy used when no policy file is available."""
    return {
        "defaults": {"window_hours": 24, "recurrence_windows_days": [7, 30], "slo_window_minutes": 60},
        "thresholds": {
            "bands": {"low_max": 20, "medium_max": 50, "high_max": 80},
            "risk_watch": {"warn_at": 50, "fail_at": 80},
        },
        "weights": {
            "open_incidents": {"P0": 50, "P1": 25, "P2": 10, "P3": 5},
            "recurrence": {
                "signature_warn_7d": 10,
                "signature_high_7d": 20,
                "kind_warn_7d": 8,
                "kind_high_7d": 15,
                "signature_high_30d": 10,
                "kind_high_30d": 8,
            },
            "followups": {"overdue_P0": 20, "overdue_P1": 12, "overdue_other": 6},
            "slo": {"violation": 10},
            "alerts_loop": {"slo_violation": 10},
            "escalation": {"escalations_24h": {"warn": 5, "high": 12}},
        },
        "service_overrides": {},
        "p0_services": ["gateway", "router"],
    }


def _reload_policy() -> None:
    """Drop the cached policy so the next load_risk_policy() re-reads it."""
    global _POLICY_CACHE
    _POLICY_CACHE = None


# ─── Band classification ──────────────────────────────────────────────────────

def score_to_band(score: int, policy: Dict) -> str:
    """Map a score to "low" / "medium" / "high" / "critical".

    Band upper bounds are inclusive and come from ``policy.thresholds.bands``
    (defaults: 20 / 50 / 80). Anything above ``high_max`` is "critical".
    """
    bands = policy.get("thresholds", {}).get("bands", {})
    low_max = int(bands.get("low_max", 20))
    medium_max = int(bands.get("medium_max", 50))
    high_max = int(bands.get("high_max", 80))
    if score <= low_max:
        return "low"
    if score <= medium_max:
        return "medium"
    if score <= high_max:
        return "high"
    return "critical"


def get_service_thresholds(service: str, policy: Dict) -> Dict:
    """Return ``{"warn_at", "fail_at"}`` for a service.

    Per-service values under ``policy.service_overrides[service].risk_watch``
    take precedence over ``policy.thresholds.risk_watch``; the final fallbacks
    are 50 and 80.
    """
    overrides = policy.get("service_overrides", {}).get(service, {})
    defaults = policy.get("thresholds", {}).get("risk_watch", {})
    ov_rw = overrides.get("risk_watch", {})
    return {
        "warn_at": int(ov_rw.get("warn_at", defaults.get("warn_at", 50))),
        "fail_at": int(ov_rw.get("fail_at", defaults.get("fail_at", 80))),
    }


# ─── Individual scoring components ───────────────────────────────────────────
# Every scorer returns (points, component_dict, reasons).

def _score_open_incidents(
    open_incidents: List[Dict],
    weights: Dict,
) -> Tuple[int, Dict, List[str]]:
    """Score open incidents by severity.

    An incident without a "severity" key is treated as P3. Only P0–P2 produce
    a reason line; P3 is counted and scored but not reported as a reason.
    """
    w = weights.get("open_incidents", {})
    counts: Dict[str, int] = {"P0": 0, "P1": 0, "P2": 0, "P3": 0}
    points = 0
    for inc in open_incidents:
        sev = inc.get("severity", "P3")
        if sev in counts:
            counts[sev] += 1
        pts = int(w.get(sev, 0))
        points += pts

    reasons = []
    if counts["P0"]:
        reasons.append(f"Open P0 incident(s): {counts['P0']}")
    if counts["P1"]:
        reasons.append(f"Open P1 incident(s): {counts['P1']}")
    if counts["P2"]:
        reasons.append(f"Open P2 incident(s): {counts['P2']}")
    return points, {**counts, "points": points}, reasons


def _score_recurrence(
    recurrence_data: Dict,
    weights: Dict,
) -> Tuple[int, Dict, List[str]]:
    """Score from recurrence detection stats.

    Each signature/kind listed under ``high_recurrence`` / ``warn_recurrence``
    (7d) and ``high_recurrence_30d`` contributes its configured weight.
    """
    w = weights.get("recurrence", {})
    high_rec = recurrence_data.get("high_recurrence", {})
    warn_rec = recurrence_data.get("warn_recurrence", {})
    high_sigs_7d = len(high_rec.get("signatures", []))
    high_kinds_7d = len(high_rec.get("kinds", []))
    warn_sigs_7d = len(warn_rec.get("signatures", []))
    warn_kinds_7d = len(warn_rec.get("kinds", []))

    # Note: 30d data comes from separate call; keep it optional
    high_rec_30d = recurrence_data.get("high_recurrence_30d", {})
    high_sigs_30d = len(high_rec_30d.get("signatures", []))
    high_kinds_30d = len(high_rec_30d.get("kinds", []))

    points = (
        high_sigs_7d * int(w.get("signature_high_7d", 20))
        + warn_sigs_7d * int(w.get("signature_warn_7d", 10))
        + high_kinds_7d * int(w.get("kind_high_7d", 15))
        + warn_kinds_7d * int(w.get("kind_warn_7d", 8))
        + high_sigs_30d * int(w.get("signature_high_30d", 10))
        + high_kinds_30d * int(w.get("kind_high_30d", 8))
    )

    component = {
        "high_signatures_7d": high_sigs_7d,
        "warn_signatures_7d": warn_sigs_7d,
        "high_kinds_7d": high_kinds_7d,
        "warn_kinds_7d": warn_kinds_7d,
        "high_signatures_30d": high_sigs_30d,
        "high_kinds_30d": high_kinds_30d,
        "points": points,
    }
    reasons = []
    if high_sigs_7d:
        reasons.append(f"High recurrence signatures (7d): {high_sigs_7d}")
    if high_kinds_7d:
        reasons.append(f"High recurrence kinds (7d): {high_kinds_7d}")
    if warn_sigs_7d:
        reasons.append(f"Warn recurrence signatures (7d): {warn_sigs_7d}")
    return points, component, reasons


def _score_followups(
    followups_data: Dict,
    weights: Dict,
) -> Tuple[int, Dict, List[str]]:
    """Score overdue follow-ups by priority (P0, P1, everything else)."""
    w = weights.get("followups", {})
    overdue = followups_data.get("overdue_followups", [])
    counts: Dict[str, int] = {"P0": 0, "P1": 0, "other": 0}
    points = 0
    for fu in overdue:
        prio = fu.get("priority", "other")
        if prio == "P0":
            counts["P0"] += 1
            points += int(w.get("overdue_P0", 20))
        elif prio == "P1":
            counts["P1"] += 1
            points += int(w.get("overdue_P1", 12))
        else:
            counts["other"] += 1
            points += int(w.get("overdue_other", 6))

    reasons = []
    if counts["P0"]:
        reasons.append(f"Overdue follow-ups (P0): {counts['P0']}")
    if counts["P1"]:
        reasons.append(f"Overdue follow-ups (P1): {counts['P1']}")
    if counts["other"]:
        reasons.append(f"Overdue follow-ups (other): {counts['other']}")
    return points, {**counts, "points": points}, reasons


def _score_slo(
    slo_data: Dict,
    weights: Dict,
) -> Tuple[int, Dict, List[str]]:
    """Score SLO violations; a ``skipped`` evaluation contributes nothing."""
    w = weights.get("slo", {})
    violations = slo_data.get("violations", [])
    skipped = slo_data.get("skipped", False)
    if skipped:
        return 0, {"violations": 0, "skipped": True, "points": 0}, []

    count = len(violations)
    points = count * int(w.get("violation", 10))
    reasons = []
    if count:
        reasons.append(f"Active SLO violation(s) in window: {count}")
    return points, {"violations": count, "skipped": False, "points": points}, reasons


def _score_alerts_loop(
    loop_slo: Dict,
    weights: Dict,
) -> Tuple[int, Dict, List[str]]:
    """Score alert-loop SLO violations (self-monitoring)."""
    w = weights.get("alerts_loop", {})
    violations = loop_slo.get("violations", [])
    count = len(violations)
    points = count * int(w.get("slo_violation", 10))
    reasons = []
    if count:
        reasons.append(f"Alert-loop SLO violation(s): {count}")
    return points, {"violations": count, "points": points}, reasons


def _score_escalations(
    escalation_count: int,
    weights: Dict,
) -> Tuple[int, Dict, List[str]]:
    """Score escalations in last 24h.

    1–2 escalations earn the "warn" weight, 3 or more the "high" weight. The
    score is tiered, not per-escalation. A missing count (``None``) is treated
    as 0 so absent data cannot crash the engine.
    """
    escalation_count = int(escalation_count or 0)
    esc_w = weights.get("escalation", {}).get("escalations_24h", {})
    warn_pts = int(esc_w.get("warn", 5))
    high_pts = int(esc_w.get("high", 12))

    if escalation_count >= 3:
        points = high_pts
    elif escalation_count >= 1:
        points = warn_pts
    else:
        points = 0

    reasons = []
    if escalation_count:
        reasons.append(f"Escalations in last 24h: {escalation_count}")
    return points, {"count_24h": escalation_count, "points": points}, reasons


# ─── Main scoring function ────────────────────────────────────────────────────

def compute_service_risk(
    service: str,
    env: str = "prod",
    *,
    open_incidents: Optional[List[Dict]] = None,
    recurrence_7d: Optional[Dict] = None,
    recurrence_30d: Optional[Dict] = None,
    followups_data: Optional[Dict] = None,
    slo_data: Optional[Dict] = None,
    alerts_loop_slo: Optional[Dict] = None,
    escalation_count_24h: int = 0,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute risk score for a service.

    Accepts pre-fetched data dicts (callers are responsible for fetching from stores/tools).
    All args default to empty/safe values so the engine never crashes due to missing data.

    Returns a report dict with keys: service, env, score, band, thresholds,
    components, reasons, recommendations, updated_at (naive-UTC ISO string).
    """
    if policy is None:
        policy = load_risk_policy()

    weights = policy.get("weights", _builtin_defaults()["weights"])

    # ── Compute each component ────────────────────────────────────────────────
    open_incs = open_incidents or []
    pts_inc, comp_inc, reasons_inc = _score_open_incidents(open_incs, weights)

    # Merge 7d + 30d recurrence into a single dict
    rec_merged = dict(recurrence_7d or {})
    if recurrence_30d:
        rec_merged["high_recurrence_30d"] = recurrence_30d.get("high_recurrence", {})
        rec_merged["warn_recurrence_30d"] = recurrence_30d.get("warn_recurrence", {})
    pts_rec, comp_rec, reasons_rec = _score_recurrence(rec_merged, weights)

    pts_fu, comp_fu, reasons_fu = _score_followups(followups_data or {}, weights)
    pts_slo, comp_slo, reasons_slo = _score_slo(slo_data or {}, weights)
    pts_loop, comp_loop, reasons_loop = _score_alerts_loop(alerts_loop_slo or {}, weights)
    pts_esc, comp_esc, reasons_esc = _score_escalations(escalation_count_24h or 0, weights)

    # Clamp at 0 in case a policy configures negative weights.
    total = max(0, pts_inc + pts_rec + pts_fu + pts_slo + pts_loop + pts_esc)
    band = score_to_band(total, policy)
    svc_thresholds = get_service_thresholds(service, policy)

    all_reasons = reasons_inc + reasons_rec + reasons_fu + reasons_slo + reasons_loop + reasons_esc

    # Deterministic recommendations
    recs = _build_recommendations(band, comp_inc, comp_rec, comp_fu, comp_slo)

    return {
        "service": service,
        "env": env,
        "score": total,
        "band": band,
        "thresholds": svc_thresholds,
        "components": {
            "open_incidents": comp_inc,
            "recurrence": comp_rec,
            "followups": comp_fu,
            "slo": comp_slo,
            "alerts_loop": comp_loop,
            "escalations": comp_esc,
        },
        "reasons": all_reasons,
        "recommendations": recs,
        "updated_at": _utcnow().isoformat(),
    }


def _build_recommendations(
    band: str,
    comp_inc: Dict,
    comp_rec: Dict,
    comp_fu: Dict,
    comp_slo: Dict,
) -> List[str]:
    """Derive fixed-text recommendations from the band and component counts (max 6)."""
    recs = []
    if comp_inc.get("P0", 0) or comp_inc.get("P1", 0):
        recs.append("Prioritize open P0/P1 incidents before deploying.")
    if comp_rec.get("high_signatures_7d", 0) or comp_rec.get("high_kinds_7d", 0):
        recs.append("Investigate recurring failure patterns (high recurrence buckets).")
    if comp_fu.get("P0", 0) or comp_fu.get("P1", 0):
        recs.append("Prioritize follow-up closure for recurring bucket(s).")
    if comp_slo.get("violations", 0):
        recs.append("Avoid risky deploys until SLO violation clears.")
    if band in ("high", "critical"):
        recs.append("Service is high-risk — coordinate with oncall before release.")
    return recs[:6]


# ─── Trend computation ────────────────────────────────────────────────────────

def compute_trend(
    series: List,  # List[RiskSnapshot] — most-recent first
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute trend metrics from a list of RiskSnapshot objects (or dicts).

    `series` must be ordered most-recent first; each element exposes `score`
    and `ts` (ISO-8601 string) as attributes or dict keys.

    Returns:
        delta_24h, delta_7d, slope_per_day, volatility, regression{warn, fail}
        Metrics that cannot be computed from the available points are None.
    """
    if policy is None:
        policy = load_risk_policy()

    trend_cfg = policy.get("trend", {})
    reg = trend_cfg.get("regression_threshold", {})
    warn_24h = int(reg.get("delta_24h_warn", 10))
    fail_24h = int(reg.get("delta_24h_fail", 20))
    warn_7d = int(reg.get("delta_7d_warn", 15))
    fail_7d = int(reg.get("delta_7d_fail", 30))

    if not series:
        return _empty_trend()

    # Normalise: accept both RiskSnapshot dataclasses and plain dicts
    def _score(s) -> int:
        return int(s.score if hasattr(s, "score") else s["score"])

    def _ts(s) -> str:
        return s.ts if hasattr(s, "ts") else s["ts"]

    now = _utcnow()
    latest_score = _score(series[0])

    # ── delta_24h ─────────────────────────────────────────────────────────────
    cutoff_24h = (now - datetime.timedelta(hours=24)).isoformat()
    base_24h = _find_baseline(series, cutoff_24h, _ts)
    delta_24h = (latest_score - _score(base_24h)) if base_24h is not None else None

    # ── delta_7d ──────────────────────────────────────────────────────────────
    cutoff_7d = (now - datetime.timedelta(hours=168)).isoformat()
    base_7d = _find_baseline(series, cutoff_7d, _ts)
    delta_7d = (latest_score - _score(base_7d)) if base_7d is not None else None

    # ── slope (simple linear regression over all available points) ────────────
    slope_per_day: Optional[float] = None
    if len(series) >= 2:
        # xs = age in hours from oldest point
        ages = [(now - _parse_ts(_ts(s))).total_seconds() / 3600.0 for s in series]
        oldest_age = max(ages)
        hours_from_oldest = [oldest_age - age for age in ages]  # 0=oldest, max=newest
        scores = [_score(s) for s in series]
        slope_per_day = _linear_slope(hours_from_oldest, scores) * 24  # per day

    # ── volatility (stddev of daily last-score-per-day over 7d) ──────────────
    volatility: Optional[float] = None
    daily_scores = _daily_latest_scores(series, days=7, _ts_fn=_ts, _score_fn=_score)
    if len(daily_scores) >= 2:
        mean = sum(daily_scores) / len(daily_scores)
        # Population variance (divide by N, not N-1).
        variance = sum((x - mean) ** 2 for x in daily_scores) / len(daily_scores)
        volatility = round(math.sqrt(variance), 2)

    # ── regression flags ──────────────────────────────────────────────────────
    reg_warn = (
        (delta_24h is not None and delta_24h >= warn_24h)
        or (delta_7d is not None and delta_7d >= warn_7d)
    )
    reg_fail = (
        (delta_24h is not None and delta_24h >= fail_24h)
        or (delta_7d is not None and delta_7d >= fail_7d)
    )

    return {
        "delta_24h": delta_24h,
        "delta_7d": delta_7d,
        "slope_per_day": round(slope_per_day, 2) if slope_per_day is not None else None,
        "volatility": volatility,
        "regression": {"warn": reg_warn, "fail": reg_fail},
    }


def _empty_trend() -> Dict:
    """Trend payload returned when there is no history at all."""
    return {
        "delta_24h": None,
        "delta_7d": None,
        "slope_per_day": None,
        "volatility": None,
        "regression": {"warn": False, "fail": False},
    }


def _find_baseline(series, cutoff_iso: str, ts_fn):
    """Return the first element whose ts <= cutoff (series is newest-first).

    Timestamps are compared as ISO strings, so they must sort lexicographically.
    """
    for s in series:
        if ts_fn(s) <= cutoff_iso:
            return s
    return None


def _parse_ts(ts_str: str) -> datetime.datetime:
    """Parse an ISO-8601 timestamp into a naive UTC datetime.

    The historical formats are tried first; anything else (UTC offsets such as
    "+00:00", a space separator, ...) goes through ``fromisoformat``.
    Offset-aware values are converted to UTC and made naive. An unparseable
    string falls back to the current time, as before.
    """
    ts_str = ts_str.rstrip("Z")
    for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
        try:
            return datetime.datetime.strptime(ts_str, fmt)
        except ValueError:
            continue
    try:
        parsed = datetime.datetime.fromisoformat(ts_str)
    except ValueError:
        return _utcnow()
    if parsed.tzinfo is not None:
        # Callers subtract this from a naive UTC "now"; mixing aware and naive raises.
        parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return parsed


def _linear_slope(xs: List[float], ys: List[float]) -> float:
    """Simple least-squares slope (score per hour); 0.0 when undefined."""
    n = len(xs)
    if n < 2:
        return 0.0
    x_mean = sum(xs) / n
    y_mean = sum(ys) / n
    num = sum((x - x_mean) * (y - y_mean) for x, y in zip(xs, ys))
    den = sum((x - x_mean) ** 2 for x in xs)
    return num / den if den != 0 else 0.0


def _daily_latest_scores(series, days: int, _ts_fn, _score_fn) -> List[float]:
    """Collect the latest score for each calendar day over last `days` days."""
    now = _utcnow()
    day_scores: Dict[str, int] = {}
    cutoff = (now - datetime.timedelta(days=days)).isoformat()
    for s in series:
        ts = _ts_fn(s)
        if ts < cutoff:
            # Newest-first ordering: everything after this point is older still.
            break
        day_key = ts[:10]  # YYYY-MM-DD
        if day_key not in day_scores:  # series is newest-first, so first = latest
            day_scores[day_key] = _score_fn(s)
    return list(day_scores.values())


def enrich_risk_report_with_trend(
    report: Dict,
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Mutates `report` in-place to add a `trend` key.
    Non-fatal: on any error, adds `trend: null`.
    """
    try:
        service = report.get("service", "")
        env = report.get("env", "prod")
        if policy is None:
            policy = load_risk_policy()
        trend_cfg = policy.get("trend", {})
        vol_hours = int(trend_cfg.get("volatility_window_hours", 168))
        # NOTE(review): compute_trend needs a snapshot at least 168h old to produce
        # delta_7d. If get_series(hours=168) only returns newer snapshots, delta_7d
        # will always be None — confirm against the store's semantics.
        series = history_store.get_series(service, env, hours=vol_hours, limit=500)
        report["trend"] = compute_trend(series, policy=policy)
    except Exception as e:
        logger.warning("enrich_risk_report_with_trend failed for %s: %s", report.get("service"), e)
        report["trend"] = None
    return report


def enrich_risk_report_with_attribution(
    report: Dict,
    *,
    alert_store=None,
    incident_store=None,
    attr_policy: Optional[Dict] = None,
) -> Dict:
    """
    Mutates `report` in-place to add an `attribution` key.
    Non-fatal: on any error, adds `attribution: null`.
    LLM enrichment is applied if policy.llm_mode != 'off' and triggers met.
    """
    try:
        # Imported lazily inside the try so a missing module degrades to
        # `attribution: None` instead of breaking the caller.
        from risk_attribution import (
            compute_attribution,
            fetch_signals_from_stores,
            load_attribution_policy,
        )
        from llm_enrichment import maybe_enrich_attribution

        if attr_policy is None:
            attr_policy = load_attribution_policy()

        service = report.get("service", "")
        env = report.get("env", "prod")

        # Fetch raw signals
        signals = fetch_signals_from_stores(
            service,
            env,
            window_hours=int((attr_policy.get("defaults") or {}).get("lookback_hours", 24)),
            alert_store=alert_store,
            incident_store=incident_store,
            policy=attr_policy,
        )

        attribution = compute_attribution(
            service,
            env,
            risk_report=report,
            **signals,
            policy=attr_policy,
        )

        # Optionally enrich with LLM (bounded, off by default)
        attribution["llm_enrichment"] = maybe_enrich_attribution(
            attribution, report, attr_policy
        )

        report["attribution"] = attribution
    except Exception as e:
        logger.warning("enrich_risk_report_with_attribution failed for %s: %s",
                       report.get("service"), e)
        report["attribution"] = None
    return report


# ─── Snapshot writer ──────────────────────────────────────────────────────────

def snapshot_all_services(
    env: str,
    compute_fn,  # Callable[[str, str], Dict] — returns RiskReport for (service, env)
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
    known_services: Optional[List[str]] = None,
) -> Dict:
    """
    Compute and persist a RiskSnapshot for every known service.
    `compute_fn(service, env)` must return a RiskReport dict.
    Returns {written, skipped, errors, services}.
    Non-fatal per service.

    At most `policy.history.max_services_per_run` (default 50) services are
    processed; services dropped by that cap are counted in `skipped`.
    """
    if policy is None:
        policy = load_risk_policy()

    from risk_history_store import RiskSnapshot

    max_services = int(policy.get("history", {}).get("max_services_per_run", 50))
    all_services = known_services or []
    services = all_services[:max_services]

    written = errors = 0
    # Services beyond the per-run cap are not attempted at all.
    skipped = len(all_services) - len(services)
    snapped: List[str] = []

    for svc in services:
        try:
            report = compute_fn(svc, env)
            snap = RiskSnapshot(
                ts=_utcnow().isoformat(),
                service=svc,
                env=env,
                score=int(report.get("score", 0)),
                band=report.get("band", "low"),
                components=report.get("components", {}),
                reasons=report.get("reasons", []),
            )
            history_store.write_snapshot([snap])
            written += 1
            snapped.append(svc)
        except Exception as e:
            logger.warning("snapshot_all_services: error for %s/%s: %s", svc, env, e)
            errors += 1

    return {
        "written": written,
        "skipped": skipped,
        "errors": errors,
        "services": snapped,
        "env": env,
        "ts": _utcnow().isoformat(),
    }


# ─── Dashboard ────────────────────────────────────────────────────────────────

def compute_risk_dashboard(
    env: str = "prod",
    top_n: int = 10,
    *,
    service_reports: Optional[List[Dict]] = None,
    history_store=None,  # Optional[RiskHistoryStore] — if provided, enrich with trend
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Build risk dashboard from a list of pre-computed service reports.
    Sorts by score desc and returns summary.
    If history_store is provided, each report is enriched with trend data.

    Only the `top_n` highest-scoring reports are kept; `band_counts` and
    `total_services` describe that truncated list. Reports missing "service"
    or "band" are tolerated (treated as "" and "low").
    """
    if policy is None:
        policy = load_risk_policy()

    reports = sorted(
        service_reports or [],
        key=lambda r: -r.get("score", 0),
    )[:top_n]

    # Enrich with trend if history_store provided
    if history_store is not None:
        for r in reports:
            enrich_risk_report_with_trend(r, history_store, policy)

    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in reports:
        b = r.get("band", "low")
        band_counts[b] = band_counts.get(b, 0) + 1

    p0_services = set(policy.get("p0_services", []))
    critical_p0 = [
        r for r in reports
        if r.get("service", "") in p0_services
        and r.get("band", "low") in ("high", "critical")
    ]

    # Top regressions (highest delta_24h, trend present)
    top_regressions = sorted(
        [
            r for r in reports
            if (r.get("trend") or {}).get("delta_24h") is not None
            and r["trend"]["delta_24h"] > 0
        ],
        key=lambda r: -r["trend"]["delta_24h"],
    )[:5]

    # Improving services (most negative delta_7d)
    improving = sorted(
        [
            r for r in reports
            if (r.get("trend") or {}).get("delta_7d") is not None
            and r["trend"]["delta_7d"] < 0
        ],
        key=lambda r: r["trend"]["delta_7d"],
    )[:5]

    # Top regression summaries (with top-2 causes if attribution available)
    top_regression_summaries = []
    for r in top_regressions:
        entry: Dict = {
            "service": r.get("service", ""),
            "delta_24h": r["trend"]["delta_24h"],
        }
        attr = r.get("attribution")
        if attr and attr.get("causes"):
            entry["causes"] = attr["causes"][:2]
            entry["attribution_summary"] = attr.get("summary", "")
        top_regression_summaries.append(entry)

    now_iso = _utcnow().isoformat()
    return {
        "env": env,
        "generated_at": now_iso,
        "history_updated_at": now_iso,
        "total_services": len(reports),
        "band_counts": band_counts,
        "critical_p0_services": [r.get("service", "") for r in critical_p0],
        "top_regressions": top_regression_summaries,
        "improving_services": [
            {"service": r.get("service", ""), "delta_7d": r["trend"]["delta_7d"]}
            for r in improving
        ],
        "services": reports,
    }