# NOTE(review): the following block is scraped page metadata / release notes,
# not Python source. Kept verbatim as comments so the file parses; move it to
# a CHANGELOG entry.
#
# New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
# 852 lines / 34 KiB / Python
"""
|
|
alert_triage_graph — Deterministic alert → incident → triage loop.
|
|
|
|
Runs every 5 min (via scheduler or cron). Zero LLM tokens in steady state
|
|
(llm_mode=off). Routing decisions driven entirely by alert_routing_policy.yml.
|
|
|
|
Node sequence:
|
|
load_policy
|
|
→ list_alerts
|
|
→ for_each_alert (process loop)
|
|
→ decide_action (policy match)
|
|
→ alert_to_incident (if auto_incident)
|
|
→ run_deterministic_triage (if auto_triage, no LLM)
|
|
→ ack_alert
|
|
→ build_digest
|
|
→ END
|
|
|
|
All tool calls via GatewayClient (RBAC/audit enforced by gateway).
|
|
LLM is only invoked if policy.llm_mode != off AND rule.triage_mode == llm.
|
|
"""
|
|
from __future__ import annotations

import datetime
import logging
import textwrap  # NOTE(review): not referenced in this module — confirm before removing
from typing import Any, Dict, List, Optional, TypedDict

from langgraph.graph import StateGraph, END

from ..alert_routing import (
    load_policy, match_alert, is_llm_allowed, compute_incident_signature
)
from ..config import settings
from ..gateway_client import GatewayClient

logger = logging.getLogger(__name__)

# Default triage cooldown (minutes) used when the policy does not provide
# defaults.triage_cooldown_minutes. (Moved out of the import block — it was
# previously interleaved between import statements, violating PEP 8 grouping.)
COOLDOWN_DEFAULT_MINUTES = 15

# Hard ceiling on digest size (chars) so the markdown stays chat/UI friendly.
MAX_DIGEST_CHARS = 3800
MAX_ALERTS_HARD_CAP = 50  # safety cap regardless of policy
|
|
|
|
|
|
# ─── State ────────────────────────────────────────────────────────────────────
|
|
|
|
class AlertTriageState(TypedDict, total=False):
    """Graph state threaded through every node (total=False: all keys optional)."""

    # Input
    workspace_id: str
    user_id: str
    agent_id: str
    policy_profile: str  # "default" (reserved for future multi-profile support)
    dry_run: bool  # if True: no writes, no acks

    # Policy (populated by load_policy_node; caps are clamped per-run budgets)
    policy: Dict
    max_alerts: int
    max_incidents: int
    max_triages: int

    # Runtime (accumulators filled by process_alerts_node)
    alerts: List[Dict]
    processed: int
    created_incidents: List[Dict]
    updated_incidents: List[Dict]
    skipped_alerts: List[Dict]
    errors: List[Dict]
    triage_runs: int

    # Post-process results (escalation / auto-resolve evaluation payloads)
    escalation_result: Dict
    autoresolve_result: Dict

    # Output (markdown digest + machine-readable counters)
    digest_md: str
    result_summary: Dict
|
|
|
|
|
|
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
|
|
def _now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Kept naive (no tzinfo / "+00:00" suffix) so downstream formatting such as
    build_digest_node's ``ts[:19]Z`` slice keeps producing identical text.
    ``datetime.utcnow()`` is deprecated since Python 3.12, so the naive value
    is derived from an aware "now" instead.
    """
    return (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )
|
|
|
|
|
|
def _truncate(text: str, max_chars: int = 200) -> str:
    """Clip *text* to *max_chars* characters, appending an ellipsis when clipped."""
    return text if len(text) <= max_chars else f"{text[:max_chars]}…"
|
|
|
|
|
|
def _alert_line(alert: Dict) -> str:
    """One-line alert summary: "[sev] service/kind: title (alert_ref)".

    NOTE(review): a second `_alert_line` is defined later in this module
    (without the trailing alert_ref); at import time that later definition
    wins, so this version is effectively dead code. Consolidate the two.
    """
    svc = alert.get("service", "?")
    sev = alert.get("severity", "?")
    kind = alert.get("kind", "?")
    title = _truncate(alert.get("title", ""), 80)
    ref = alert.get("alert_ref", "?")
    return f"[{sev}] {svc}/{kind}: {title} ({ref})"
|
|
|
|
|
|
async def _call_tool(
    gw: GatewayClient,
    tool: str,
    action: str,
    params: Dict,
    run_id: str,
    node: str,
    agent_id: str,
    workspace_id: str,
) -> Dict:
    """Invoke one gateway tool action; return its data payload, or {} on failure."""
    outcome = await gw.call_tool(
        tool_name=tool,
        action=action,
        params=params,
        run_id=run_id,
        node=node,
        agent_id=agent_id,
        workspace_id=workspace_id,
    )
    # Failures are logged and swallowed — callers treat {} as "no data".
    if not outcome.success:
        logger.warning("Tool %s.%s failed: %s", tool, action, outcome.error_message)
        return {}
    return outcome.data or {}
|
|
|
|
|
|
# ─── Nodes ────────────────────────────────────────────────────────────────────
|
|
|
|
async def load_policy_node(state: AlertTriageState) -> AlertTriageState:
    """Load alert routing policy. Never fails — falls back to built-in defaults."""
    policy = load_policy()
    defaults = policy.get("defaults", {})
    configured_max = int(defaults.get("max_alerts_per_run", 20))

    new_state = dict(state)
    new_state.update(
        policy=policy,
        # Policy may ask for more, but never exceed the module hard cap.
        max_alerts=min(configured_max, MAX_ALERTS_HARD_CAP),
        max_incidents=int(defaults.get("max_incidents_per_run", 5)),
        max_triages=int(defaults.get("max_triages_per_run", 5)),
        # Reset all per-run accumulators.
        created_incidents=[],
        updated_incidents=[],
        skipped_alerts=[],
        errors=[],
        triage_runs=0,
        processed=0,
    )
    return new_state
|
|
|
|
|
|
async def list_alerts_node(state: AlertTriageState) -> AlertTriageState:
    """
    Atomically claim a batch of new/failed alerts for processing.

    Uses alert_ingest_tool.claim (SELECT FOR UPDATE SKIP LOCKED in Postgres).
    On any gateway error, returns an empty batch and records the error
    (non-fatal: the next scheduled run retries the claim).
    """
    max_alerts = state.get("max_alerts", 20)

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")

    try:
        async with GatewayClient() as gw:
            data = await _call_tool(
                gw, "alert_ingest_tool", "claim",
                {
                    "window_minutes": 240,
                    "limit": max_alerts,
                    # Owner tag lets the store requeue stale claims from dead runs.
                    "owner": f"supervisor:{run_id[:12]}",
                    "lock_ttl_seconds": 600,
                },
                run_id=run_id, node="claim_alerts",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.error("claim_alerts_node failed: %s", e)
        # Append (not overwrite) so errors recorded by earlier nodes survive.
        errors = list(state.get("errors", []))
        errors.append({"node": "claim_alerts", "error": str(e)})
        return {**state, "alerts": [], "errors": errors}

    claimed = data.get("alerts", [])
    requeued = data.get("requeued_stale", 0)
    if requeued:
        logger.info("Requeued %d stale-processing alerts", requeued)

    # Defensive slice: the tool should honor "limit", but never trust it blindly.
    return {**state, "alerts": claimed[:max_alerts]}
|
|
|
|
|
|
async def process_alerts_node(state: AlertTriageState) -> AlertTriageState:
    """
    Main loop: for each alert → match policy → create/update incident → triage.
    Deterministic by default (0 LLM tokens unless policy.llm_mode != off).

    Per-alert failures are recorded in `errors` and the alert is marked
    "failed" via alert_ingest_tool so the next scheduled run retries it.
    """
    policy = state.get("policy", {})
    defaults = policy.get("defaults", {})
    alerts = state.get("alerts", [])
    dry_run = state.get("dry_run", False)

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")

    # Copy accumulators so the incoming state dict is never mutated in place.
    created_incidents: List[Dict] = list(state.get("created_incidents", []))
    updated_incidents: List[Dict] = list(state.get("updated_incidents", []))
    skipped_alerts: List[Dict] = list(state.get("skipped_alerts", []))
    errors: List[Dict] = list(state.get("errors", []))

    max_incidents = state.get("max_incidents", 5)
    max_triages = state.get("max_triages", 5)
    triage_runs = state.get("triage_runs", 0)
    processed = 0

    ack_prefix = defaults.get("ack_note_prefix", "alert_triage_loop")

    async with GatewayClient() as gw:
        for alert in alerts:
            alert_ref = alert.get("alert_ref", "?")
            try:
                actions = match_alert(alert, policy)
                incident_id = None
                triage_run_id = None

                # ── Digest-only: ack immediately, no incident ─────────────────
                if not actions.get("auto_incident", False):
                    if actions.get("digest_only"):
                        skipped_alerts.append({
                            "alert_ref": alert_ref,
                            "service": alert.get("service"),
                            "severity": alert.get("severity"),
                            "reason": "digest_only (policy)",
                        })
                        if not dry_run and actions.get("ack", True):
                            await _call_tool(
                                gw, "alert_ingest_tool", "ack",
                                {"alert_ref": alert_ref, "actor": agent_id,
                                 "note": f"{ack_prefix}:digest_only"},
                                run_id=run_id, node="ack_digest",
                                agent_id=agent_id, workspace_id=workspace_id,
                            )
                    processed += 1
                    continue

                # ── Auto incident creation ─────────────────────────────────────
                # Per-run budget: created + updated both count toward the cap.
                if len(created_incidents) + len(updated_incidents) >= max_incidents:
                    skipped_alerts.append({
                        "alert_ref": alert_ref,
                        "reason": "max_incidents_per_run reached",
                    })
                    # Don't ack — leave as processing; next run picks it up
                    processed += 1
                    continue

                if not dry_run:
                    inc_result = await _call_tool(
                        gw, "oncall_tool", "alert_to_incident",
                        {
                            "alert_ref": alert_ref,
                            "incident_severity_cap": actions.get("incident_severity_cap", "P1"),
                            "dedupe_window_minutes": int(actions.get("dedupe_window_minutes", 120)),
                            "attach_artifact": actions.get("attach_alert_artifact", True),
                        },
                        run_id=run_id, node="alert_to_incident",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )
                    if inc_result:
                        incident_id = inc_result.get("incident_id")
                        incident_signature = inc_result.get("incident_signature", "")
                        if inc_result.get("created"):
                            created_incidents.append({
                                "incident_id": incident_id,
                                "alert_ref": alert_ref,
                                "service": alert.get("service"),
                                "severity": inc_result.get("severity"),
                                "signature": incident_signature,
                            })
                        else:
                            # Deduped: alert was attached to an existing incident.
                            updated_incidents.append({
                                "incident_id": incident_id,
                                "alert_ref": alert_ref,
                                "note": inc_result.get("note", "attached"),
                            })
                    else:
                        # incident creation failed — mark alert as failed
                        await _call_tool(
                            gw, "alert_ingest_tool", "fail",
                            {"alert_ref": alert_ref,
                             "error": "alert_to_incident returned empty",
                             "retry_after_seconds": 300},
                            run_id=run_id, node="fail_alert",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                        errors.append({
                            "node": "alert_to_incident",
                            "alert_ref": alert_ref,
                            "error": "empty response",
                        })
                        processed += 1
                        continue
                else:
                    # Dry run: synthesize an incident id from the policy signature.
                    sig = compute_incident_signature(alert, policy)
                    incident_id = f"dry_run_inc_{sig[:8]}"
                    incident_signature = sig
                    created_incidents.append({
                        "incident_id": incident_id,
                        "alert_ref": alert_ref,
                        "service": alert.get("service"),
                        "dry_run": True,
                    })

                # ── Cooldown check before triage ──────────────────────────────
                # incident_signature is always bound here: both the live-success
                # and dry-run branches above set it, and the failure branch
                # `continue`s before reaching this point.
                cooldown_ok = True
                cooldown_minutes = int(
                    policy.get("defaults", {}).get("triage_cooldown_minutes",
                                                   COOLDOWN_DEFAULT_MINUTES)
                )
                if (
                    incident_id
                    and actions.get("auto_triage", False)
                    and incident_signature
                    and not dry_run
                ):
                    sig_check = await _call_tool(
                        gw, "oncall_tool", "signature_should_triage",
                        {"signature": incident_signature,
                         "cooldown_minutes": cooldown_minutes},
                        run_id=run_id, node="cooldown_check",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )
                    # Fail open: if the check errors ({}), triage proceeds.
                    cooldown_ok = sig_check.get("should_triage", True)

                    if not cooldown_ok:
                        # Cooldown active: append soft event but don't triage
                        await _call_tool(
                            gw, "oncall_tool", "incident_append_event",
                            {"incident_id": incident_id,
                             "type": "note",
                             "message": f"Alert observed during triage cooldown "
                                        f"(signature={incident_signature[:8]}, "
                                        f"cooldown={cooldown_minutes}min)",
                             "meta": {"alert_ref": alert_ref,
                                      "cooldown_active": True}},
                            run_id=run_id, node="cooldown_event",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )

                # ── Deterministic triage ──────────────────────────────────────
                if (
                    incident_id
                    and actions.get("auto_triage", False)
                    and triage_runs < max_triages
                    and cooldown_ok
                ):
                    triage_mode = actions.get("triage_mode", "deterministic")
                    # Policy gate: downgrade LLM triage when llm_mode is off.
                    if triage_mode == "llm" and not is_llm_allowed("triage", policy):
                        triage_mode = "deterministic"
                        logger.info("llm_mode=off → deterministic triage for %s", alert_ref)

                    if triage_mode == "deterministic" and not dry_run:
                        try:
                            triage_run_id = await _run_deterministic_triage(
                                gw, incident_id, alert, agent_id, workspace_id, run_id
                            )
                            triage_runs += 1
                            # Mark signature cooldown
                            await _call_tool(
                                gw, "oncall_tool", "signature_mark_triage",
                                {"signature": incident_signature},
                                run_id=run_id, node="mark_triage",
                                agent_id=agent_id, workspace_id=workspace_id,
                            )
                        except Exception as te:
                            # Triage failure is non-fatal: the incident exists,
                            # only the auto-generated report is missing.
                            logger.warning("Triage failed for %s: %s", incident_id, te)
                            errors.append({
                                "node": "triage",
                                "incident_id": incident_id,
                                "error": str(te),
                            })

                # ── Ack alert (success) ────────────────────────────────────────
                if not dry_run and actions.get("ack", True):
                    note_parts = [ack_prefix]
                    if incident_id:
                        note_parts.append(f"incident:{incident_id}")
                    if triage_run_id:
                        note_parts.append(f"triage:{triage_run_id}")
                    await _call_tool(
                        gw, "alert_ingest_tool", "ack",
                        {"alert_ref": alert_ref, "actor": agent_id,
                         "note": "|".join(note_parts)},
                        run_id=run_id, node="ack_alert",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )

                processed += 1

            except Exception as e:
                logger.error("Error processing alert %s: %s", alert_ref, e)
                errors.append({
                    "node": "process_alerts",
                    "alert_ref": alert_ref,
                    "error": str(e),
                })
                # Mark alert as failed so it retries next run
                try:
                    # Fresh client: the outer `gw` may be in a broken state
                    # after the exception.
                    async with GatewayClient() as gw2:
                        await _call_tool(
                            gw2, "alert_ingest_tool", "fail",
                            {"alert_ref": alert_ref, "error": str(e)[:200],
                             "retry_after_seconds": 300},
                            run_id=run_id, node="fail_on_error",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                except Exception:
                    pass  # non-fatal fail-marking
                processed += 1

    return {
        **state,
        "processed": state.get("processed", 0) + processed,
        "created_incidents": created_incidents,
        "updated_incidents": updated_incidents,
        "skipped_alerts": skipped_alerts,
        "errors": errors,
        "triage_runs": triage_runs,
    }
|
|
|
|
|
|
async def _run_deterministic_triage(
    gw: GatewayClient,
    incident_id: str,
    alert: Dict,
    agent_id: str,
    workspace_id: str,
    run_id: str,
) -> Optional[str]:
    """
    Run deterministic triage for an incident:
      1. service_overview (observability)
      2. health check (oncall)
      3. KB runbook snippets
      4. Compile and attach triage report artifact

    Returns the generated triage_id. No LLM involvement — the report is
    assembled purely from the _build_* rule tables and tool responses.
    Tool failures degrade gracefully: _call_tool returns {} on error.
    """
    import json, base64, hashlib  # local import — only needed for triage runs

    service = alert.get("service", "unknown")
    env = alert.get("env", "prod")
    now = datetime.datetime.utcnow()
    # Look back one hour of telemetry for the overview call.
    time_from = (now - datetime.timedelta(hours=1)).isoformat()
    time_to = now.isoformat()
    # Short, collision-resistant id derived from incident id + timestamp.
    triage_id = f"tri_{hashlib.sha256(f'{incident_id}{now}'.encode()).hexdigest()[:8]}"

    # 1. Service overview
    overview_data = await _call_tool(
        gw, "observability_tool", "service_overview",
        {"service": service, "env": env,
         "time_range": {"from": time_from, "to": time_to}},
        run_id=run_id, node="triage_overview",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # 2. Health check
    health_data = await _call_tool(
        gw, "oncall_tool", "service_health",
        {"service": service, "env": env},
        run_id=run_id, node="triage_health",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # 3. KB runbooks
    kb_data = await _call_tool(
        gw, "kb_tool", "snippets",
        {"query": f"{service} {alert.get('kind', '')} {alert.get('title', '')}",
         "limit": 3},
        run_id=run_id, node="triage_kb",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # 4. Compile deterministic report
    report = {
        "triage_id": triage_id,
        "incident_id": incident_id,
        "service": service,
        "env": env,
        "mode": "deterministic",
        "alert_ref": alert.get("alert_ref", ""),
        "generated_at": now.isoformat(),
        "summary": (
            f"Auto-triage for {service} {alert.get('kind','?')} "
            f"(severity={alert.get('severity','?')})"
        ),
        "suspected_root_causes": _build_root_causes(alert, overview_data, health_data),
        "impact_assessment": _build_impact(alert, overview_data),
        "mitigations_now": _build_mitigations(alert, health_data, kb_data),
        "next_checks": _build_next_checks(alert, overview_data),
        "references": {
            "metrics": overview_data.get("metrics", {}),
            "health": health_data,
            "runbook_snippets": (kb_data.get("snippets") or [])[:3],
        },
    }

    # Attach as incident artifact (default=str handles datetime values in refs)
    content = json.dumps(report, indent=2, default=str).encode()
    content_b64 = base64.b64encode(content).decode()

    await _call_tool(
        gw, "oncall_tool", "incident_attach_artifact",
        {
            "incident_id": incident_id,
            "kind": "triage_report",
            "format": "json",
            "content_base64": content_b64,
            "filename": f"triage_{triage_id}.json",
        },
        run_id=run_id, node="attach_triage",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # Append timeline event
    await _call_tool(
        gw, "oncall_tool", "incident_append_event",
        {
            "incident_id": incident_id,
            "type": "note",
            "message": f"Deterministic triage completed (triage_id={triage_id})",
            "meta": {"triage_id": triage_id, "mode": "deterministic"},
        },
        run_id=run_id, node="triage_event",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    return triage_id
|
|
|
|
|
|
def _build_root_causes(alert: Dict, overview: Dict, health: Dict) -> List[Dict]:
    """Derive a ranked list of suspected root causes from rule tables.

    Entries: the alert-kind lookup (rank 1), optional metric hints from the
    alert payload, and an optional failed-health-check entry.

    Fix: ranks are now assigned sequentially (len(causes) + 1) for every
    appended entry. The previous version hard-coded ranks 2/3 for the metric
    hints, which could collide with the health entry's computed rank (e.g.
    error-rate hint and health entry both getting rank 3 when the latency
    hint was absent).
    """
    causes: List[Dict] = []
    kind = alert.get("kind", "custom")

    kind_cause_map = {
        "slo_breach": "SLO breach detected (latency/error rate exceeded threshold)",
        "latency": "High latency observed — possible overload or downstream dependency degradation",
        "error_rate": "Elevated error rate — check recent deployments and upstream dependencies",
        "crashloop": "Container crash-looping — OOM or unhandled exception in startup",
        "oom": "Out-of-memory condition — memory leak or insufficient limits",
        "disk": "Disk/PVC capacity pressure — check log rotation and data retention",
        "deploy": "Recent deployment may have introduced regression",
        "security": "Security event detected — unauthorized access or injection attempt",
    }
    description = kind_cause_map.get(kind, f"Alert kind '{kind}' triggered on {alert.get('service', '?')}")
    causes.append({"rank": 1, "cause": description, "evidence": [_alert_line(alert)]})

    # Add metric-based hints from the alert payload itself.
    alert_metrics = alert.get("metrics", {})
    if alert_metrics.get("latency_p95_ms", 0) > 500:
        causes.append({
            "rank": len(causes) + 1,
            "cause": f"High p95 latency: {alert_metrics['latency_p95_ms']}ms",
            "evidence": ["From alert metrics"],
        })
    if alert_metrics.get("error_rate_pct", 0) > 1.0:
        causes.append({
            "rank": len(causes) + 1,
            "cause": f"Elevated error rate: {alert_metrics['error_rate_pct']}%",
            "evidence": ["From alert metrics"],
        })

    if health and not health.get("healthy", True):
        causes.append({
            "rank": len(causes) + 1,
            "cause": f"Service health check failed: {health.get('status', 'unknown')}",
            "evidence": [str(health.get("details", ""))[:200]],
        })
    return causes
|
|
|
|
|
|
def _build_impact(alert: Dict, overview: Dict) -> str:
    """Map the alert's severity to a one-line impact statement."""
    sev = alert.get("severity", "P2")
    svc = alert.get("service", "unknown")
    env = alert.get("env", "prod")
    by_severity = {
        "P0": f"CRITICAL: {svc} is fully degraded in {env}. Immediate action required.",
        "P1": f"HIGH: {svc} in {env} is significantly impaired. Users affected.",
        "P2": f"MEDIUM: {svc} in {env} is partially degraded. Monitoring required.",
        "P3": f"LOW: Minor degradation in {svc} ({env}). No immediate user impact.",
    }
    # Unknown severities fall back to a generic statement.
    fallback = f"{svc} affected in {env}"
    return by_severity.get(sev, fallback)
|
|
|
|
|
|
def _build_mitigations(alert: Dict, health: Dict, kb: Dict) -> List[str]:
    """Suggest immediate mitigations for the alert kind, plus up to two runbook links."""
    kind = alert.get("kind", "custom")

    playbook = {
        "slo_breach": ["Check recent deployments and rollback if needed",
                       "Scale service if under load", "Review error budget"],
        "latency": ["Check downstream dependency health",
                    "Review connection pool settings", "Check for resource contention"],
        "error_rate": ["Review application logs for exceptions",
                       "Check recent config changes", "Verify upstream dependencies"],
        "crashloop": ["Check pod logs: kubectl logs <pod> --previous",
                      "Review resource limits", "Check liveness probe configuration"],
        "oom": ["Increase memory limits", "Check for memory leaks",
                "Review heap dumps if available"],
        "disk": ["Run log rotation", "Check data retention policies",
                 "Delete old artifacts / compact audit logs"],
        "deploy": ["Review deployment diff", "Run smoke tests",
                   "Consider rollback if metrics degraded"],
        "security": ["Block suspicious IPs", "Rotate affected credentials",
                     "Audit access logs", "Notify security team"],
    }
    # Copy so the playbook table is never mutated by callers.
    steps = list(playbook.get(kind, ["Investigate logs and metrics"]))

    # At most two KB runbook references are appended.
    for snippet in (kb.get("snippets") or [])[:2]:
        steps.append(f"See runbook: {snippet.get('path', 'KB')}")

    return steps
|
|
|
|
|
|
def _build_next_checks(alert: Dict, overview: Dict) -> List[str]:
    """Standard follow-up checklist, personalized with the alert's service name."""
    service_name = alert.get("service", "unknown")
    checks = [
        f"Monitor {service_name} error rate and latency for next 15 min",
        "Check incident_triage_graph for deeper analysis",
        "Verify SLO status with observability_tool.slo_snapshot",
        "If not resolved in 30 min → escalate to P0",
    ]
    return checks
|
|
|
|
|
|
def _alert_line(alert: Dict) -> str:
    """Short single-line summary of an alert for evidence lists.

    NOTE(review): this shadows an earlier `_alert_line` defined above (which
    also appends the alert_ref) — consolidate the two definitions.
    """
    sev = alert.get('severity', '?')
    svc = alert.get('service', '?')
    kind = alert.get('kind', '?')
    title = alert.get('title', '')[:80]
    return f"[{sev}] {svc} {kind}: {title}"
|
|
|
|
|
|
async def build_digest_node(state: AlertTriageState) -> AlertTriageState:
    """Build short markdown digest for CTO/UI (max 3800 chars).

    Pure formatting over counters/lists already in state — no tool calls.
    Produces `digest_md` (markdown) and `result_summary` (machine-readable).
    """
    created = state.get("created_incidents", [])
    updated = state.get("updated_incidents", [])
    skipped = state.get("skipped_alerts", [])
    errors = state.get("errors", [])
    processed = state.get("processed", 0)
    alerts = state.get("alerts", [])  # NOTE(review): read but unused below
    dry_run = state.get("dry_run", False)
    triage_runs = state.get("triage_runs", 0)

    ts = _now_iso()
    dry_tag = " **[DRY RUN]**" if dry_run else ""
    # Header + one-line counter summary.
    lines = [
        f"## Alert Triage Digest{dry_tag} — {ts[:19]}Z",
        "",
        f"**Processed:** {processed} alerts | "
        f"**New incidents:** {len(created)} | "
        f"**Updated:** {len(updated)} | "
        f"**Skipped/Digest:** {len(skipped)} | "
        f"**Triages run:** {triage_runs} | "
        f"**Errors:** {len(errors)}",
        "",
    ]

    if created:
        lines.append("### 🆕 Created Incidents")
        for item in created[:10]:
            sev = item.get("severity", "?")
            svc = item.get("service", "?")
            inc_id = item.get("incident_id", "?")
            ref = item.get("alert_ref", "?")
            sig = (item.get("signature") or "")[:8]
            lines.append(f"- `{inc_id}` [{sev}] {svc} (alert: {ref}, sig: {sig})")
        lines.append("")

    if updated:
        lines.append("### 🔄 Updated Incidents (alert attached)")
        for item in updated[:10]:
            inc_id = item.get("incident_id", "?")
            ref = item.get("alert_ref", "?")
            lines.append(f"- `{inc_id}` ← alert `{ref}` ({item.get('note', '')})")
        lines.append("")

    if skipped:
        lines.append("### ⏭ Skipped / Digest-only")
        for item in skipped[:15]:
            svc = item.get("service", "?")
            sev = item.get("severity", "?")
            reason = item.get("reason", "policy")
            ref = item.get("alert_ref", "?")
            lines.append(f"- [{sev}] {svc} `{ref}` — {reason}")
        if len(skipped) > 15:
            lines.append(f"- … and {len(skipped) - 15} more")
        lines.append("")

    if errors:
        lines.append("### ⚠️ Errors (non-fatal)")
        for e in errors[:5]:
            lines.append(f"- `{e.get('node','?')}`: {str(e.get('error','?'))[:120]}")
        lines.append("")

    # ── Escalation results ─────────────────────────────────────────────────────
    escalation = state.get("escalation_result") or {}
    esc_count = escalation.get("escalated", 0)
    esc_candidates = escalation.get("candidates", [])

    if esc_count > 0:
        lines.append(f"### ⬆️ Escalated Incidents ({esc_count})")
        for c in esc_candidates[:5]:
            # Only render rows whose severity actually changed.
            if c.get("from_severity") != c.get("to_severity"):
                lines.append(
                    f"- `{c.get('incident_id','?')}` {c.get('service','?')}: "
                    f"{c.get('from_severity')} → {c.get('to_severity')} "
                    f"(occ_60m={c.get('occurrences_60m',0)}, "
                    f"triage_24h={c.get('triage_count_24h',0)})"
                )
        lines.append("")

    # ── Auto-resolve candidates ────────────────────────────────────────────────
    ar = state.get("autoresolve_result") or {}
    ar_count = ar.get("candidates_count", 0)
    if ar_count > 0:
        lines.append(f"### 🟡 Auto-resolve Candidates ({ar_count})")
        for c in (ar.get("candidates") or [])[:5]:
            lines.append(
                f"- `{c.get('incident_id','?')}` [{c.get('severity','?')}] "
                f"{c.get('service','?')}: no alerts for "
                f"{c.get('minutes_without_alerts', '?')}min"
            )
        lines.append("")

    if not created and not updated and not skipped and not errors:
        lines.append("_No alerts to process in this window._")

    digest_md = "\n".join(lines)

    # Truncate if over limit
    if len(digest_md) > MAX_DIGEST_CHARS:
        digest_md = digest_md[:MAX_DIGEST_CHARS - 50] + "\n\n… *(digest truncated)*"

    result_summary = {
        "processed": processed,
        "created_incidents": len(created),
        "updated_incidents": len(updated),
        "skipped": len(skipped),
        "triage_runs": triage_runs,
        "escalated": esc_count,
        "autoresolve_candidates": ar_count,
        "errors": len(errors),
    }

    return {
        **state,
        "digest_md": digest_md,
        "result_summary": result_summary,
    }
|
|
|
|
|
|
async def post_process_escalation_node(state: AlertTriageState) -> AlertTriageState:
    """
    After processing alerts: call incident_escalation_tool.evaluate.
    Only runs if at least 1 alert was processed. Non-fatal.
    """
    if not state.get("processed", 0):
        # Nothing processed this run → nothing to evaluate.
        return {**state, "escalation_result": {}}

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")
    dry_run = state.get("dry_run", False)

    result: Dict = {}
    try:
        async with GatewayClient() as gw:
            result = await _call_tool(
                gw, "incident_escalation_tool", "evaluate",
                {
                    "window_minutes": 60,
                    "limit": 50,
                    "dry_run": dry_run,
                },
                run_id=run_id, node="post_escalation",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.warning("post_process_escalation_node failed (non-fatal): %s", e)
        result = {}

    return {**state, "escalation_result": result}
|
|
|
|
|
|
async def post_process_autoresolve_node(state: AlertTriageState) -> AlertTriageState:
    """
    After processing alerts: find auto-resolve candidates.
    Always dry_run=True (candidate-only, no actual close unless policy says otherwise).
    Non-fatal.
    """
    if not state.get("processed", 0):
        # Nothing processed this run → nothing to inspect.
        return {**state, "autoresolve_result": {}}

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")

    result: Dict = {}
    try:
        async with GatewayClient() as gw:
            result = await _call_tool(
                gw, "incident_escalation_tool", "auto_resolve_candidates",
                {
                    "no_alerts_minutes": 60,
                    "limit": 50,
                    "dry_run": True,  # always candidate-only in loop
                },
                run_id=run_id, node="post_autoresolve",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.warning("post_process_autoresolve_node failed (non-fatal): %s", e)
        result = {}

    return {**state, "autoresolve_result": result}
|
|
|
|
|
|
# ─── Graph builder ─────────────────────────────────────────────────────────────
|
|
|
|
def build_alert_triage_graph():
    """
    Build the alert_triage LangGraph.

    LLM usage: ZERO in steady state (llm_mode=off in policy).
    All nodes are deterministic Python + gateway tool calls.

    Flow: load_policy → claim_alerts → process_alerts
          → post_escalation → post_autoresolve → build_digest
    """
    # Linear pipeline declared once; wiring derived from the order below.
    pipeline = [
        ("load_policy", load_policy_node),
        ("list_alerts", list_alerts_node),
        ("process_alerts", process_alerts_node),
        ("post_escalation", post_process_escalation_node),
        ("post_autoresolve", post_process_autoresolve_node),
        ("build_digest", build_digest_node),
    ]

    workflow = StateGraph(AlertTriageState)
    for node_name, node_fn in pipeline:
        workflow.add_node(node_name, node_fn)

    workflow.set_entry_point(pipeline[0][0])
    for (src, _), (dst, _) in zip(pipeline, pipeline[1:]):
        workflow.add_edge(src, dst)
    workflow.add_edge(pipeline[-1][0], END)

    return workflow.compile()
|