New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
711 lines
25 KiB
Python
711 lines
25 KiB
Python
"""
|
|
risk_engine.py — Service Risk Index Engine (deterministic, no LLM).
|
|
|
|
Provides:
|
|
compute_service_risk(service, env, ...) -> RiskReport
|
|
compute_risk_dashboard(env, top_n, ...) -> Dashboard
|
|
compute_trend(series) -> TrendReport
|
|
enrich_risk_report_with_trend(report, history_store, policy) -> report (mutated)
|
|
snapshot_all_services(env, compute_fn, history_store, policy) -> SnapshotResult
|
|
|
|
All inputs come from existing stores and tools.
|
|
The engine never calls external services directly — callers inject store references.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import logging
|
|
import math
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── Policy ───────────────────────────────────────────────────────────────────

# Process-wide cache of the parsed policy dict; populated lazily by
# load_risk_policy() and cleared by _reload_policy().
_POLICY_CACHE: Optional[Dict] = None
# Candidate policy file locations, tried in order: relative to the current
# working directory first, then relative to this module's package root
# (three levels up from this file).
_POLICY_SEARCH_PATHS = [
    Path("config/risk_policy.yml"),
    Path(__file__).resolve().parent.parent.parent / "config" / "risk_policy.yml",
]
|
|
|
|
|
|
def load_risk_policy() -> Dict:
    """Load and cache the risk policy.

    Tries each candidate in ``_POLICY_SEARCH_PATHS`` in order; the first
    file that exists and parses wins and is cached process-wide. A file
    that exists but fails to parse logs a warning and falls through to the
    next candidate. When nothing loads, built-in defaults are cached and
    returned instead.

    Returns:
        The policy dict (never None; possibly the built-in defaults).
    """
    global _POLICY_CACHE
    if _POLICY_CACHE is not None:
        return _POLICY_CACHE
    for p in _POLICY_SEARCH_PATHS:
        if p.exists():
            try:
                # Explicit encoding: the YAML config must not depend on the
                # platform's default locale encoding.
                with p.open(encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}
                _POLICY_CACHE = data
                return data
            except Exception as e:
                # Non-fatal: a broken candidate falls through to the next path.
                logger.warning("Failed to load risk_policy from %s: %s", p, e)
    logger.warning("risk_policy.yml not found; using built-in defaults")
    _POLICY_CACHE = _builtin_defaults()
    return _POLICY_CACHE
|
|
|
|
|
|
def _builtin_defaults() -> Dict:
|
|
return {
|
|
"defaults": {"window_hours": 24, "recurrence_windows_days": [7, 30],
|
|
"slo_window_minutes": 60},
|
|
"thresholds": {
|
|
"bands": {"low_max": 20, "medium_max": 50, "high_max": 80},
|
|
"risk_watch": {"warn_at": 50, "fail_at": 80},
|
|
},
|
|
"weights": {
|
|
"open_incidents": {"P0": 50, "P1": 25, "P2": 10, "P3": 5},
|
|
"recurrence": {
|
|
"signature_warn_7d": 10, "signature_high_7d": 20,
|
|
"kind_warn_7d": 8, "kind_high_7d": 15,
|
|
"signature_high_30d": 10, "kind_high_30d": 8,
|
|
},
|
|
"followups": {"overdue_P0": 20, "overdue_P1": 12, "overdue_other": 6},
|
|
"slo": {"violation": 10},
|
|
"alerts_loop": {"slo_violation": 10},
|
|
"escalation": {"escalations_24h": {"warn": 5, "high": 12}},
|
|
},
|
|
"service_overrides": {},
|
|
"p0_services": ["gateway", "router"],
|
|
}
|
|
|
|
|
|
def _reload_policy() -> None:
|
|
global _POLICY_CACHE
|
|
_POLICY_CACHE = None
|
|
|
|
|
|
# ─── Band classification ──────────────────────────────────────────────────────
|
|
|
|
def score_to_band(score: int, policy: Dict) -> str:
    """Map a numeric risk score to its band name via policy thresholds.

    Bands use inclusive upper bounds: score <= low_max -> "low",
    <= medium_max -> "medium", <= high_max -> "high", otherwise "critical".
    Missing thresholds fall back to 20 / 50 / 80.
    """
    cfg = policy.get("thresholds", {}).get("bands", {})
    ladder = (
        (int(cfg.get("low_max", 20)), "low"),
        (int(cfg.get("medium_max", 50)), "medium"),
        (int(cfg.get("high_max", 80)), "high"),
    )
    for upper_bound, band_name in ladder:
        if score <= upper_bound:
            return band_name
    return "critical"
|
|
|
|
|
|
def get_service_thresholds(service: str, policy: Dict) -> Dict:
    """Resolve warn/fail risk-watch thresholds for one service.

    Per-service values under policy["service_overrides"][service]["risk_watch"]
    win over the global policy["thresholds"]["risk_watch"]; hard-coded
    fallbacks are warn_at=50, fail_at=80.
    """
    global_rw = policy.get("thresholds", {}).get("risk_watch", {})
    svc_rw = policy.get("service_overrides", {}).get(service, {}).get("risk_watch", {})

    def _resolve(key: str, fallback: int) -> int:
        # Override → global → built-in fallback, coerced to int.
        return int(svc_rw.get(key, global_rw.get(key, fallback)))

    return {"warn_at": _resolve("warn_at", 50), "fail_at": _resolve("fail_at", 80)}
|
|
|
|
|
|
# ─── Individual scoring components ───────────────────────────────────────────
|
|
|
|
def _score_open_incidents(
|
|
open_incidents: List[Dict],
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score open incidents by severity."""
|
|
w = weights.get("open_incidents", {})
|
|
counts: Dict[str, int] = {"P0": 0, "P1": 0, "P2": 0, "P3": 0}
|
|
points = 0
|
|
for inc in open_incidents:
|
|
sev = inc.get("severity", "P3")
|
|
if sev in counts:
|
|
counts[sev] += 1
|
|
pts = int(w.get(sev, 0))
|
|
points += pts
|
|
|
|
reasons = []
|
|
if counts["P0"]:
|
|
reasons.append(f"Open P0 incident(s): {counts['P0']}")
|
|
if counts["P1"]:
|
|
reasons.append(f"Open P1 incident(s): {counts['P1']}")
|
|
if counts["P2"]:
|
|
reasons.append(f"Open P2 incident(s): {counts['P2']}")
|
|
|
|
return points, {**counts, "points": points}, reasons
|
|
|
|
|
|
def _score_recurrence(
|
|
recurrence_data: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score from recurrence detection stats."""
|
|
w = weights.get("recurrence", {})
|
|
high_rec = recurrence_data.get("high_recurrence", {})
|
|
warn_rec = recurrence_data.get("warn_recurrence", {})
|
|
|
|
high_sigs_7d = len(high_rec.get("signatures", []))
|
|
high_kinds_7d = len(high_rec.get("kinds", []))
|
|
warn_sigs_7d = len(warn_rec.get("signatures", []))
|
|
warn_kinds_7d = len(warn_rec.get("kinds", []))
|
|
|
|
# Note: 30d data comes from separate call; keep it optional
|
|
high_sigs_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("signatures", []))
|
|
high_kinds_30d = len(recurrence_data.get("high_recurrence_30d", {}).get("kinds", []))
|
|
|
|
points = (
|
|
high_sigs_7d * int(w.get("signature_high_7d", 20))
|
|
+ warn_sigs_7d * int(w.get("signature_warn_7d", 10))
|
|
+ high_kinds_7d * int(w.get("kind_high_7d", 15))
|
|
+ warn_kinds_7d * int(w.get("kind_warn_7d", 8))
|
|
+ high_sigs_30d * int(w.get("signature_high_30d", 10))
|
|
+ high_kinds_30d * int(w.get("kind_high_30d", 8))
|
|
)
|
|
|
|
component = {
|
|
"high_signatures_7d": high_sigs_7d,
|
|
"warn_signatures_7d": warn_sigs_7d,
|
|
"high_kinds_7d": high_kinds_7d,
|
|
"warn_kinds_7d": warn_kinds_7d,
|
|
"high_signatures_30d": high_sigs_30d,
|
|
"high_kinds_30d": high_kinds_30d,
|
|
"points": points,
|
|
}
|
|
reasons = []
|
|
if high_sigs_7d:
|
|
reasons.append(f"High recurrence signatures (7d): {high_sigs_7d}")
|
|
if high_kinds_7d:
|
|
reasons.append(f"High recurrence kinds (7d): {high_kinds_7d}")
|
|
if warn_sigs_7d:
|
|
reasons.append(f"Warn recurrence signatures (7d): {warn_sigs_7d}")
|
|
return points, component, reasons
|
|
|
|
|
|
def _score_followups(
|
|
followups_data: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score overdue follow-ups by priority."""
|
|
w = weights.get("followups", {})
|
|
overdue = followups_data.get("overdue_followups", [])
|
|
counts: Dict[str, int] = {"P0": 0, "P1": 0, "other": 0}
|
|
points = 0
|
|
|
|
for fu in overdue:
|
|
prio = fu.get("priority", "other")
|
|
if prio == "P0":
|
|
counts["P0"] += 1
|
|
points += int(w.get("overdue_P0", 20))
|
|
elif prio == "P1":
|
|
counts["P1"] += 1
|
|
points += int(w.get("overdue_P1", 12))
|
|
else:
|
|
counts["other"] += 1
|
|
points += int(w.get("overdue_other", 6))
|
|
|
|
reasons = []
|
|
if counts["P0"]:
|
|
reasons.append(f"Overdue follow-ups (P0): {counts['P0']}")
|
|
if counts["P1"]:
|
|
reasons.append(f"Overdue follow-ups (P1): {counts['P1']}")
|
|
if counts["other"]:
|
|
reasons.append(f"Overdue follow-ups (other): {counts['other']}")
|
|
|
|
return points, {**counts, "points": points}, reasons
|
|
|
|
|
|
def _score_slo(
|
|
slo_data: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score SLO violations."""
|
|
w = weights.get("slo", {})
|
|
violations = slo_data.get("violations", [])
|
|
skipped = slo_data.get("skipped", False)
|
|
|
|
if skipped:
|
|
return 0, {"violations": 0, "skipped": True, "points": 0}, []
|
|
|
|
count = len(violations)
|
|
points = count * int(w.get("violation", 10))
|
|
reasons = []
|
|
if count:
|
|
reasons.append(f"Active SLO violation(s) in window: {count}")
|
|
return points, {"violations": count, "skipped": False, "points": points}, reasons
|
|
|
|
|
|
def _score_alerts_loop(
|
|
loop_slo: Dict,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score alert-loop SLO violations (self-monitoring)."""
|
|
w = weights.get("alerts_loop", {})
|
|
violations = loop_slo.get("violations", [])
|
|
count = len(violations)
|
|
points = count * int(w.get("slo_violation", 10))
|
|
reasons = []
|
|
if count:
|
|
reasons.append(f"Alert-loop SLO violation(s): {count}")
|
|
return points, {"violations": count, "points": points}, reasons
|
|
|
|
|
|
def _score_escalations(
|
|
escalation_count: int,
|
|
weights: Dict,
|
|
) -> Tuple[int, Dict, List[str]]:
|
|
"""Score escalations in last 24h."""
|
|
esc_w = weights.get("escalation", {}).get("escalations_24h", {})
|
|
warn_pts = int(esc_w.get("warn", 5))
|
|
high_pts = int(esc_w.get("high", 12))
|
|
|
|
if escalation_count >= 3:
|
|
points = high_pts
|
|
elif escalation_count >= 1:
|
|
points = warn_pts
|
|
else:
|
|
points = 0
|
|
|
|
reasons = []
|
|
if escalation_count:
|
|
reasons.append(f"Escalations in last 24h: {escalation_count}")
|
|
|
|
return points, {"count_24h": escalation_count, "points": points}, reasons
|
|
|
|
|
|
# ─── Main scoring function ────────────────────────────────────────────────────
|
|
|
|
def compute_service_risk(
    service: str,
    env: str = "prod",
    *,
    open_incidents: Optional[List[Dict]] = None,
    recurrence_7d: Optional[Dict] = None,
    recurrence_30d: Optional[Dict] = None,
    followups_data: Optional[Dict] = None,
    slo_data: Optional[Dict] = None,
    alerts_loop_slo: Optional[Dict] = None,
    escalation_count_24h: int = 0,
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute the deterministic risk score for one service.

    Callers fetch all signal dicts from their stores/tools and inject them
    here; every argument defaults to an empty/safe value so missing data
    never crashes the engine.

    Returns a RiskReport dict: score, band, service thresholds, a
    per-component breakdown, human-readable reasons, deterministic
    recommendations, and an `updated_at` UTC timestamp.
    """
    policy = policy if policy is not None else load_risk_policy()
    weights = policy.get("weights", _builtin_defaults()["weights"])

    # Fold the optional 30d recurrence stats into the 7d payload under
    # dedicated *_30d keys so _score_recurrence sees a single dict.
    recurrence = dict(recurrence_7d or {})
    if recurrence_30d:
        recurrence["high_recurrence_30d"] = recurrence_30d.get("high_recurrence", {})
        recurrence["warn_recurrence_30d"] = recurrence_30d.get("warn_recurrence", {})

    # Each scorer returns (points, component dict, reasons). Insertion
    # order here fixes both the components layout and the reasons order.
    scored = {
        "open_incidents": _score_open_incidents(open_incidents or [], weights),
        "recurrence": _score_recurrence(recurrence, weights),
        "followups": _score_followups(followups_data or {}, weights),
        "slo": _score_slo(slo_data or {}, weights),
        "alerts_loop": _score_alerts_loop(alerts_loop_slo or {}, weights),
        "escalations": _score_escalations(escalation_count_24h, weights),
    }

    total = max(0, sum(pts for pts, _, _ in scored.values()))
    band = score_to_band(total, policy)
    components = {name: comp for name, (_, comp, _) in scored.items()}
    all_reasons = [reason for _, _, rs in scored.values() for reason in rs]

    # Deterministic recommendations derived from band + component counts.
    recs = _build_recommendations(
        band,
        components["open_incidents"],
        components["recurrence"],
        components["followups"],
        components["slo"],
    )

    return {
        "service": service,
        "env": env,
        "score": total,
        "band": band,
        "thresholds": get_service_thresholds(service, policy),
        "components": components,
        "reasons": all_reasons,
        "recommendations": recs,
        "updated_at": datetime.datetime.utcnow().isoformat(),
    }
|
|
|
|
|
|
def _build_recommendations(
|
|
band: str,
|
|
comp_inc: Dict,
|
|
comp_rec: Dict,
|
|
comp_fu: Dict,
|
|
comp_slo: Dict,
|
|
) -> List[str]:
|
|
recs = []
|
|
if comp_inc.get("P0", 0) or comp_inc.get("P1", 0):
|
|
recs.append("Prioritize open P0/P1 incidents before deploying.")
|
|
if comp_rec.get("high_signatures_7d", 0) or comp_rec.get("high_kinds_7d", 0):
|
|
recs.append("Investigate recurring failure patterns (high recurrence buckets).")
|
|
if comp_fu.get("P0", 0) or comp_fu.get("P1", 0):
|
|
recs.append("Prioritize follow-up closure for recurring bucket(s).")
|
|
if comp_slo.get("violations", 0):
|
|
recs.append("Avoid risky deploys until SLO violation clears.")
|
|
if band in ("high", "critical"):
|
|
recs.append("Service is high-risk — coordinate with oncall before release.")
|
|
return recs[:6]
|
|
|
|
|
|
# ─── Dashboard ────────────────────────────────────────────────────────────────
|
|
|
|
# ─── Trend computation ────────────────────────────────────────────────────────
|
|
|
|
def compute_trend(
    series: List,  # List[RiskSnapshot] — most-recent first
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Compute trend metrics from a list of RiskSnapshot objects (or dicts).

    `series` must be sorted most-recent-first; elements may be dataclass-like
    (with .score/.ts attributes) or plain dicts ("score"/"ts" keys).

    Returns:
        delta_24h, delta_7d, slope_per_day, volatility, regression{warn, fail}
    """
    if policy is None:
        policy = load_risk_policy()

    # Regression thresholds: score *increases* at or beyond these deltas
    # flip the warn/fail flags in the returned "regression" dict.
    trend_cfg = policy.get("trend", {})
    reg = trend_cfg.get("regression_threshold", {})
    warn_24h = int(reg.get("delta_24h_warn", 10))
    fail_24h = int(reg.get("delta_24h_fail", 20))
    warn_7d = int(reg.get("delta_7d_warn", 15))
    fail_7d = int(reg.get("delta_7d_fail", 30))

    if not series:
        return _empty_trend()

    # Normalise: accept both RiskSnapshot dataclasses and plain dicts
    def _score(s) -> int:
        return int(s.score if hasattr(s, "score") else s["score"])

    def _ts(s) -> str:
        return s.ts if hasattr(s, "ts") else s["ts"]

    now = datetime.datetime.utcnow()
    latest_score = _score(series[0])

    # ── delta_24h ─────────────────────────────────────────────────────────────
    # Baseline = newest snapshot at least 24h old. Timestamps compare as ISO
    # strings, which is only valid while every writer uses the same naive-UTC
    # format — NOTE(review): confirm no producer mixes tz-suffixed formats.
    cutoff_24h = (now - datetime.timedelta(hours=24)).isoformat()
    base_24h = _find_baseline(series, cutoff_24h, _ts)
    delta_24h = (latest_score - _score(base_24h)) if base_24h is not None else None

    # ── delta_7d ──────────────────────────────────────────────────────────────
    cutoff_7d = (now - datetime.timedelta(hours=168)).isoformat()
    base_7d = _find_baseline(series, cutoff_7d, _ts)
    delta_7d = (latest_score - _score(base_7d)) if base_7d is not None else None

    # ── slope (simple linear regression over all available points) ────────────
    # Requires at least two points; slope is computed per hour then scaled.
    slope_per_day: Optional[float] = None
    if len(series) >= 2:
        # xs = age in hours from oldest point
        pairs = [(now - _parse_ts(_ts(s))).total_seconds() / 3600.0 for s in series]
        hours_from_oldest = [max(pairs) - p for p in pairs]  # 0=oldest, max=newest
        scores = [_score(s) for s in series]
        slope_per_day = _linear_slope(hours_from_oldest, scores) * 24  # per day

    # ── volatility (stddev of daily last-score-per-day over 7d) ──────────────
    # Population (not sample) standard deviation over one point per day;
    # needs at least two distinct days to be defined.
    volatility: Optional[float] = None
    daily_scores = _daily_latest_scores(series, days=7, _ts_fn=_ts, _score_fn=_score)
    if len(daily_scores) >= 2:
        mean = sum(daily_scores) / len(daily_scores)
        variance = sum((x - mean) ** 2 for x in daily_scores) / len(daily_scores)
        volatility = round(math.sqrt(variance), 2)

    # ── regression flags ──────────────────────────────────────────────────────
    # A flag trips when either window's delta meets its threshold; windows
    # with no baseline (delta is None) never trip a flag.
    reg_warn = (
        (delta_24h is not None and delta_24h >= warn_24h)
        or (delta_7d is not None and delta_7d >= warn_7d)
    )
    reg_fail = (
        (delta_24h is not None and delta_24h >= fail_24h)
        or (delta_7d is not None and delta_7d >= fail_7d)
    )

    return {
        "delta_24h": delta_24h,
        "delta_7d": delta_7d,
        "slope_per_day": round(slope_per_day, 2) if slope_per_day is not None else None,
        "volatility": volatility,
        "regression": {"warn": reg_warn, "fail": reg_fail},
    }
|
|
|
|
|
|
def _empty_trend() -> Dict:
|
|
return {
|
|
"delta_24h": None, "delta_7d": None,
|
|
"slope_per_day": None, "volatility": None,
|
|
"regression": {"warn": False, "fail": False},
|
|
}
|
|
|
|
|
|
def _find_baseline(series, cutoff_iso: str, ts_fn):
|
|
"""Return the first element whose ts <= cutoff (series is newest-first)."""
|
|
for s in series:
|
|
if ts_fn(s) <= cutoff_iso:
|
|
return s
|
|
return None
|
|
|
|
|
|
def _parse_ts(ts_str: str) -> datetime.datetime:
|
|
ts_str = ts_str.rstrip("Z")
|
|
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d"):
|
|
try:
|
|
return datetime.datetime.strptime(ts_str, fmt)
|
|
except ValueError:
|
|
continue
|
|
return datetime.datetime.utcnow()
|
|
|
|
|
|
def _linear_slope(xs: List[float], ys: List[float]) -> float:
|
|
"""Simple least-squares slope (score per hour)."""
|
|
n = len(xs)
|
|
if n < 2:
|
|
return 0.0
|
|
x_mean = sum(xs) / n
|
|
y_mean = sum(ys) / n
|
|
num = sum((xs[i] - x_mean) * (ys[i] - y_mean) for i in range(n))
|
|
den = sum((xs[i] - x_mean) ** 2 for i in range(n))
|
|
return num / den if den != 0 else 0.0
|
|
|
|
|
|
def _daily_latest_scores(series, days: int, _ts_fn, _score_fn) -> List[float]:
|
|
"""Collect the latest score for each calendar day over last `days` days."""
|
|
now = datetime.datetime.utcnow()
|
|
day_scores: Dict[str, int] = {}
|
|
cutoff = (now - datetime.timedelta(days=days)).isoformat()
|
|
for s in series:
|
|
ts = _ts_fn(s)
|
|
if ts < cutoff:
|
|
break
|
|
day_key = ts[:10] # YYYY-MM-DD
|
|
if day_key not in day_scores: # series is newest-first, so first = latest
|
|
day_scores[day_key] = _score_fn(s)
|
|
return list(day_scores.values())
|
|
|
|
|
|
def enrich_risk_report_with_trend(
    report: Dict,
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Add a `trend` key to `report` in place and return it.

    Pulls the snapshot series for the report's service/env from
    `history_store` over the policy's volatility window, then runs
    compute_trend(). Non-fatal: any failure logs a warning and sets
    `trend` to None.
    """
    try:
        svc = report.get("service", "")
        environment = report.get("env", "prod")
        active_policy = policy if policy is not None else load_risk_policy()
        window_hours = int(
            active_policy.get("trend", {}).get("volatility_window_hours", 168)
        )
        series = history_store.get_series(svc, environment, hours=window_hours, limit=500)
        report["trend"] = compute_trend(series, policy=active_policy)
    except Exception as e:
        logger.warning(
            "enrich_risk_report_with_trend failed for %s: %s", report.get("service"), e
        )
        report["trend"] = None
    return report
|
|
|
|
|
|
def enrich_risk_report_with_attribution(
    report: Dict,
    *,
    alert_store=None,
    incident_store=None,
    attr_policy: Optional[Dict] = None,
) -> Dict:
    """
    Add an `attribution` key to `report` in place and return it.

    Fetches raw signals from the injected stores, runs deterministic
    attribution, then optionally applies bounded LLM enrichment (applied
    when policy llm_mode != 'off' and triggers are met; off by default).
    Non-fatal: any failure logs a warning and sets `attribution` to None.
    """
    try:
        # Local imports keep the attribution stack optional at module load.
        from risk_attribution import (
            compute_attribution, fetch_signals_from_stores, load_attribution_policy,
        )
        from llm_enrichment import maybe_enrich_attribution

        policy = attr_policy if attr_policy is not None else load_attribution_policy()
        svc = report.get("service", "")
        environment = report.get("env", "prod")
        lookback = int((policy.get("defaults") or {}).get("lookback_hours", 24))

        # Fetch raw signals from the injected stores.
        signals = fetch_signals_from_stores(
            svc,
            environment,
            window_hours=lookback,
            alert_store=alert_store,
            incident_store=incident_store,
            policy=policy,
        )

        attribution = compute_attribution(
            svc,
            environment,
            risk_report=report,
            **signals,
            policy=policy,
        )

        # Bounded LLM pass; disabled by default via policy.
        attribution["llm_enrichment"] = maybe_enrich_attribution(
            attribution, report, policy
        )

        report["attribution"] = attribution
    except Exception as e:
        logger.warning("enrich_risk_report_with_attribution failed for %s: %s",
                       report.get("service"), e)
        report["attribution"] = None
    return report
|
|
|
|
|
|
# ─── Snapshot writer ──────────────────────────────────────────────────────────
|
|
|
|
def snapshot_all_services(
    env: str,
    compute_fn,  # Callable[[str, str], Dict] — returns RiskReport for (service, env)
    history_store,  # RiskHistoryStore
    policy: Optional[Dict] = None,
    known_services: Optional[List[str]] = None,
) -> Dict:
    """
    Compute and persist a RiskSnapshot for every known service.

    `compute_fn(service, env)` must return a RiskReport dict. The service
    list is capped by policy history.max_services_per_run (default 50).
    Per-service failures are logged and counted, never raised.

    Returns {written, skipped, errors, services, env, ts}.
    """
    active_policy = policy if policy is not None else load_risk_policy()

    from risk_history_store import RiskSnapshot

    cap = int(active_policy.get("history", {}).get("max_services_per_run", 50))
    targets = (known_services or [])[:cap]

    written = 0
    skipped = 0  # reserved in the result shape; nothing currently marks a skip
    errors = 0
    snapped: List[str] = []

    for svc in targets:
        try:
            report = compute_fn(svc, env)
            snapshot = RiskSnapshot(
                ts=datetime.datetime.utcnow().isoformat(),
                service=svc,
                env=env,
                score=int(report.get("score", 0)),
                band=report.get("band", "low"),
                components=report.get("components", {}),
                reasons=report.get("reasons", []),
            )
            history_store.write_snapshot([snapshot])
            written += 1
            snapped.append(svc)
        except Exception as e:
            logger.warning("snapshot_all_services: error for %s/%s: %s", svc, env, e)
            errors += 1

    return {
        "written": written,
        "skipped": skipped,
        "errors": errors,
        "services": snapped,
        "env": env,
        "ts": datetime.datetime.utcnow().isoformat(),
    }
|
|
|
|
|
|
def compute_risk_dashboard(
    env: str = "prod",
    top_n: int = 10,
    *,
    service_reports: Optional[List[Dict]] = None,
    history_store=None,  # Optional[RiskHistoryStore] — if provided, enrich with trend
    policy: Optional[Dict] = None,
) -> Dict:
    """
    Build risk dashboard from a list of pre-computed service reports.
    Sorts by score desc and returns summary.
    If history_store is provided, each report is enriched with trend data.

    NOTE(review): every aggregate below (band_counts, total_services,
    critical_p0_services, top_regressions, improving_services) is computed
    over the top_n slice only, not the full input list — confirm this is
    the intended semantics for "total_services".
    """
    if policy is None:
        policy = load_risk_policy()

    # Highest-risk first; truncate to top_n before any further aggregation.
    reports = sorted(
        service_reports or [],
        key=lambda r: -r.get("score", 0),
    )[:top_n]

    # Enrich with trend if history_store provided (mutates each report in place).
    if history_store is not None:
        for r in reports:
            enrich_risk_report_with_trend(r, history_store, policy)

    band_counts: Dict[str, int] = {"critical": 0, "high": 0, "medium": 0, "low": 0}
    for r in reports:
        b = r.get("band", "low")
        # .get fallback keeps this robust to bands outside the four seeded keys.
        band_counts[b] = band_counts.get(b, 0) + 1

    # Business-critical (P0) services currently in a risky band.
    p0_services = set(policy.get("p0_services", []))
    critical_p0 = [r for r in reports if r["service"] in p0_services
                   and r["band"] in ("high", "critical")]

    # Top regressions (highest delta_24h, trend present)
    top_regressions = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_24h") is not None
         and r["trend"]["delta_24h"] > 0],
        key=lambda r: -r["trend"]["delta_24h"],
    )[:5]

    # Improving services (most negative delta_7d)
    improving = sorted(
        [r for r in reports if (r.get("trend") or {}).get("delta_7d") is not None
         and r["trend"]["delta_7d"] < 0],
        key=lambda r: r["trend"]["delta_7d"],
    )[:5]

    # Top regression summaries (with top-2 causes if attribution available)
    top_regression_summaries = []
    for r in top_regressions:
        entry: Dict = {
            "service": r["service"],
            "delta_24h": r["trend"]["delta_24h"],
        }
        attr = r.get("attribution")
        if attr and attr.get("causes"):
            entry["causes"] = attr["causes"][:2]
            entry["attribution_summary"] = attr.get("summary", "")
        top_regression_summaries.append(entry)

    now_iso = datetime.datetime.utcnow().isoformat()
    return {
        "env": env,
        "generated_at": now_iso,
        "history_updated_at": now_iso,
        "total_services": len(reports),
        "band_counts": band_counts,
        "critical_p0_services": [r["service"] for r in critical_p0],
        "top_regressions": top_regression_summaries,
        "improving_services": [{"service": r["service"], "delta_7d": r["trend"]["delta_7d"]}
                               for r in improving],
        "services": reports,
    }
|