""" alert_triage_graph — Deterministic alert → incident → triage loop. Runs every 5 min (via scheduler or cron). Zero LLM tokens in steady state (llm_mode=off). Routing decisions driven entirely by alert_routing_policy.yml. Node sequence: load_policy → list_alerts → for_each_alert (process loop) → decide_action (policy match) → alert_to_incident (if auto_incident) → run_deterministic_triage (if auto_triage, no LLM) → ack_alert → build_digest → END All tool calls via GatewayClient (RBAC/audit enforced by gateway). LLM is only invoked if policy.llm_mode != off AND rule.triage_mode == llm. """ from __future__ import annotations import datetime import logging import textwrap from typing import Any, Dict, List, Optional, TypedDict from langgraph.graph import StateGraph, END from ..alert_routing import ( load_policy, match_alert, is_llm_allowed, compute_incident_signature ) COOLDOWN_DEFAULT_MINUTES = 15 from ..config import settings from ..gateway_client import GatewayClient logger = logging.getLogger(__name__) MAX_DIGEST_CHARS = 3800 MAX_ALERTS_HARD_CAP = 50 # safety cap regardless of policy # ─── State ──────────────────────────────────────────────────────────────────── class AlertTriageState(TypedDict, total=False): # Input workspace_id: str user_id: str agent_id: str policy_profile: str # "default" (reserved for future multi-profile support) dry_run: bool # if True: no writes, no acks # Policy policy: Dict max_alerts: int max_incidents: int max_triages: int # Runtime alerts: List[Dict] processed: int created_incidents: List[Dict] updated_incidents: List[Dict] skipped_alerts: List[Dict] errors: List[Dict] triage_runs: int # Post-process results escalation_result: Dict autoresolve_result: Dict # Output digest_md: str result_summary: Dict # ─── Helpers ────────────────────────────────────────────────────────────────── def _now_iso() -> str: return datetime.datetime.utcnow().isoformat() def _truncate(text: str, max_chars: int = 200) -> str: if len(text) <= max_chars: return text return text[:max_chars] + "…" def _alert_line(alert: Dict) -> str: svc = alert.get("service", "?") sev = alert.get("severity", "?") kind = alert.get("kind", "?") title = _truncate(alert.get("title", ""), 80) ref = alert.get("alert_ref", "?") return f"[{sev}] {svc}/{kind}: {title} ({ref})" async def _call_tool( gw: GatewayClient, tool: str, action: str, params: Dict, run_id: str, node: str, agent_id: str, workspace_id: str, ) -> Dict: """Call a tool via gateway, return data dict or empty dict on error.""" result = await gw.call_tool( tool_name=tool, action=action, params=params, run_id=run_id, node=node, agent_id=agent_id, workspace_id=workspace_id, ) if result.success: return result.data or {} logger.warning("Tool %s.%s failed: %s", tool, action, result.error_message) return {} # ─── Nodes ──────────────────────────────────────────────────────────────────── async def load_policy_node(state: AlertTriageState) -> AlertTriageState: """Load alert routing policy. Never fails — falls back to built-in defaults.""" policy = load_policy() defaults = policy.get("defaults", {}) return { **state, "policy": policy, "max_alerts": min( int(defaults.get("max_alerts_per_run", 20)), MAX_ALERTS_HARD_CAP, ), "max_incidents": int(defaults.get("max_incidents_per_run", 5)), "max_triages": int(defaults.get("max_triages_per_run", 5)), "created_incidents": [], "updated_incidents": [], "skipped_alerts": [], "errors": [], "triage_runs": 0, "processed": 0, } async def list_alerts_node(state: AlertTriageState) -> AlertTriageState: """ Atomically claim a batch of new/failed alerts for processing. Uses alert_ingest_tool.claim (SELECT FOR UPDATE SKIP LOCKED in Postgres). Falls back to list with status_in=new,failed if claim not available. """ policy = state.get("policy", {}) max_alerts = state.get("max_alerts", 20) agent_id = state.get("agent_id", "sofiia") workspace_id = state.get("workspace_id", "default") run_id = state.get("_run_id", "unknown") try: async with GatewayClient() as gw: data = await _call_tool( gw, "alert_ingest_tool", "claim", { "window_minutes": 240, "limit": max_alerts, "owner": f"supervisor:{run_id[:12]}", "lock_ttl_seconds": 600, }, run_id=run_id, node="claim_alerts", agent_id=agent_id, workspace_id=workspace_id, ) except Exception as e: logger.error("claim_alerts_node failed: %s", e) return {**state, "alerts": [], "errors": [{"node": "claim_alerts", "error": str(e)}]} claimed = data.get("alerts", []) requeued = data.get("requeued_stale", 0) if requeued: logger.info("Requeued %d stale-processing alerts", requeued) return {**state, "alerts": claimed[:max_alerts]} async def process_alerts_node(state: AlertTriageState) -> AlertTriageState: """ Main loop: for each alert → match policy → create/update incident → triage. Deterministic by default (0 LLM tokens unless policy.llm_mode != off). """ policy = state.get("policy", {}) defaults = policy.get("defaults", {}) alerts = state.get("alerts", []) dry_run = state.get("dry_run", False) agent_id = state.get("agent_id", "sofiia") workspace_id = state.get("workspace_id", "default") run_id = state.get("_run_id", "unknown") created_incidents: List[Dict] = list(state.get("created_incidents", [])) updated_incidents: List[Dict] = list(state.get("updated_incidents", [])) skipped_alerts: List[Dict] = list(state.get("skipped_alerts", [])) errors: List[Dict] = list(state.get("errors", [])) max_incidents = state.get("max_incidents", 5) max_triages = state.get("max_triages", 5) triage_runs = state.get("triage_runs", 0) processed = 0 ack_prefix = defaults.get("ack_note_prefix", "alert_triage_loop") async with GatewayClient() as gw: for alert in alerts: alert_ref = alert.get("alert_ref", "?") try: actions = match_alert(alert, policy) incident_id = None triage_run_id = None # ── Digest-only: ack immediately, no incident ───────────────── if not actions.get("auto_incident", False): if actions.get("digest_only"): skipped_alerts.append({ "alert_ref": alert_ref, "service": alert.get("service"), "severity": alert.get("severity"), "reason": "digest_only (policy)", }) if not dry_run and actions.get("ack", True): await _call_tool( gw, "alert_ingest_tool", "ack", {"alert_ref": alert_ref, "actor": agent_id, "note": f"{ack_prefix}:digest_only"}, run_id=run_id, node="ack_digest", agent_id=agent_id, workspace_id=workspace_id, ) processed += 1 continue # ── Auto incident creation ───────────────────────────────────── if len(created_incidents) + len(updated_incidents) >= max_incidents: skipped_alerts.append({ "alert_ref": alert_ref, "reason": "max_incidents_per_run reached", }) # Don't ack — leave as processing; next run picks it up processed += 1 continue if not dry_run: inc_result = await _call_tool( gw, "oncall_tool", "alert_to_incident", { "alert_ref": alert_ref, "incident_severity_cap": actions.get("incident_severity_cap", "P1"), "dedupe_window_minutes": int(actions.get("dedupe_window_minutes", 120)), "attach_artifact": actions.get("attach_alert_artifact", True), }, run_id=run_id, node="alert_to_incident", agent_id=agent_id, workspace_id=workspace_id, ) if inc_result: incident_id = inc_result.get("incident_id") incident_signature = inc_result.get("incident_signature", "") if inc_result.get("created"): created_incidents.append({ "incident_id": incident_id, "alert_ref": alert_ref, "service": alert.get("service"), "severity": inc_result.get("severity"), "signature": incident_signature, }) else: updated_incidents.append({ "incident_id": incident_id, "alert_ref": alert_ref, "note": inc_result.get("note", "attached"), }) else: # incident creation failed — mark alert as failed await _call_tool( gw, "alert_ingest_tool", "fail", {"alert_ref": alert_ref, "error": "alert_to_incident returned empty", "retry_after_seconds": 300}, run_id=run_id, node="fail_alert", agent_id=agent_id, workspace_id=workspace_id, ) errors.append({ "node": "alert_to_incident", "alert_ref": alert_ref, "error": "empty response", }) processed += 1 continue else: sig = compute_incident_signature(alert, policy) incident_id = f"dry_run_inc_{sig[:8]}" incident_signature = sig created_incidents.append({ "incident_id": incident_id, "alert_ref": alert_ref, "service": alert.get("service"), "dry_run": True, }) # ── Cooldown check before triage ────────────────────────────── cooldown_ok = True cooldown_minutes = int( policy.get("defaults", {}).get("triage_cooldown_minutes", COOLDOWN_DEFAULT_MINUTES) ) if ( incident_id and actions.get("auto_triage", False) and incident_signature and not dry_run ): sig_check = await _call_tool( gw, "oncall_tool", "signature_should_triage", {"signature": incident_signature, "cooldown_minutes": cooldown_minutes}, run_id=run_id, node="cooldown_check", agent_id=agent_id, workspace_id=workspace_id, ) cooldown_ok = sig_check.get("should_triage", True) if not cooldown_ok: # Cooldown active: append soft event but don't triage await _call_tool( gw, "oncall_tool", "incident_append_event", {"incident_id": incident_id, "type": "note", "message": f"Alert observed during triage cooldown " f"(signature={incident_signature[:8]}, " f"cooldown={cooldown_minutes}min)", "meta": {"alert_ref": alert_ref, "cooldown_active": True}}, run_id=run_id, node="cooldown_event", agent_id=agent_id, workspace_id=workspace_id, ) # ── Deterministic triage ────────────────────────────────────── if ( incident_id and actions.get("auto_triage", False) and triage_runs < max_triages and cooldown_ok ): triage_mode = actions.get("triage_mode", "deterministic") if triage_mode == "llm" and not is_llm_allowed("triage", policy): triage_mode = "deterministic" logger.info("llm_mode=off → deterministic triage for %s", alert_ref) if triage_mode == "deterministic" and not dry_run: try: triage_run_id = await _run_deterministic_triage( gw, incident_id, alert, agent_id, workspace_id, run_id ) triage_runs += 1 # Mark signature cooldown await _call_tool( gw, "oncall_tool", "signature_mark_triage", {"signature": incident_signature}, run_id=run_id, node="mark_triage", agent_id=agent_id, workspace_id=workspace_id, ) except Exception as te: logger.warning("Triage failed for %s: %s", incident_id, te) errors.append({ "node": "triage", "incident_id": incident_id, "error": str(te), }) # ── Ack alert (success) ──────────────────────────────────────── if not dry_run and actions.get("ack", True): note_parts = [ack_prefix] if incident_id: note_parts.append(f"incident:{incident_id}") if triage_run_id: note_parts.append(f"triage:{triage_run_id}") await _call_tool( gw, "alert_ingest_tool", "ack", {"alert_ref": alert_ref, "actor": agent_id, "note": "|".join(note_parts)}, run_id=run_id, node="ack_alert", agent_id=agent_id, workspace_id=workspace_id, ) processed += 1 except Exception as e: logger.error("Error processing alert %s: %s", alert_ref, e) errors.append({ "node": "process_alerts", "alert_ref": alert_ref, "error": str(e), }) # Mark alert as failed so it retries next run try: async with GatewayClient() as gw2: await _call_tool( gw2, "alert_ingest_tool", "fail", {"alert_ref": alert_ref, "error": str(e)[:200], "retry_after_seconds": 300}, run_id=run_id, node="fail_on_error", agent_id=agent_id, workspace_id=workspace_id, ) except Exception: pass # non-fatal fail-marking processed += 1 return { **state, "processed": state.get("processed", 0) + processed, "created_incidents": created_incidents, "updated_incidents": updated_incidents, "skipped_alerts": skipped_alerts, "errors": errors, "triage_runs": triage_runs, } async def _run_deterministic_triage( gw: GatewayClient, incident_id: str, alert: Dict, agent_id: str, workspace_id: str, run_id: str, ) -> Optional[str]: """ Run deterministic triage for an incident: 1. service_overview (observability) 2. health check (oncall) 3. KB runbook snippets 4. Compile and attach triage report artifact """ import json, base64, hashlib service = alert.get("service", "unknown") env = alert.get("env", "prod") now = datetime.datetime.utcnow() time_from = (now - datetime.timedelta(hours=1)).isoformat() time_to = now.isoformat() triage_id = f"tri_{hashlib.sha256(f'{incident_id}{now}'.encode()).hexdigest()[:8]}" # 1. Service overview overview_data = await _call_tool( gw, "observability_tool", "service_overview", {"service": service, "env": env, "time_range": {"from": time_from, "to": time_to}}, run_id=run_id, node="triage_overview", agent_id=agent_id, workspace_id=workspace_id, ) # 2. Health check health_data = await _call_tool( gw, "oncall_tool", "service_health", {"service": service, "env": env}, run_id=run_id, node="triage_health", agent_id=agent_id, workspace_id=workspace_id, ) # 3. KB runbooks kb_data = await _call_tool( gw, "kb_tool", "snippets", {"query": f"{service} {alert.get('kind', '')} {alert.get('title', '')}", "limit": 3}, run_id=run_id, node="triage_kb", agent_id=agent_id, workspace_id=workspace_id, ) # 4. Compile deterministic report report = { "triage_id": triage_id, "incident_id": incident_id, "service": service, "env": env, "mode": "deterministic", "alert_ref": alert.get("alert_ref", ""), "generated_at": now.isoformat(), "summary": ( f"Auto-triage for {service} {alert.get('kind','?')} " f"(severity={alert.get('severity','?')})" ), "suspected_root_causes": _build_root_causes(alert, overview_data, health_data), "impact_assessment": _build_impact(alert, overview_data), "mitigations_now": _build_mitigations(alert, health_data, kb_data), "next_checks": _build_next_checks(alert, overview_data), "references": { "metrics": overview_data.get("metrics", {}), "health": health_data, "runbook_snippets": (kb_data.get("snippets") or [])[:3], }, } # Attach as incident artifact content = json.dumps(report, indent=2, default=str).encode() content_b64 = base64.b64encode(content).decode() await _call_tool( gw, "oncall_tool", "incident_attach_artifact", { "incident_id": incident_id, "kind": "triage_report", "format": "json", "content_base64": content_b64, "filename": f"triage_{triage_id}.json", }, run_id=run_id, node="attach_triage", agent_id=agent_id, workspace_id=workspace_id, ) # Append timeline event await _call_tool( gw, "oncall_tool", "incident_append_event", { "incident_id": incident_id, "type": "note", "message": f"Deterministic triage completed (triage_id={triage_id})", "meta": {"triage_id": triage_id, "mode": "deterministic"}, }, run_id=run_id, node="triage_event", agent_id=agent_id, workspace_id=workspace_id, ) return triage_id def _build_root_causes(alert: Dict, overview: Dict, health: Dict) -> List[Dict]: causes = [] kind = alert.get("kind", "custom") metrics = (overview.get("metrics") or {}) kind_cause_map = { "slo_breach": "SLO breach detected (latency/error rate exceeded threshold)", "latency": "High latency observed — possible overload or downstream dependency degradation", "error_rate": "Elevated error rate — check recent deployments and upstream dependencies", "crashloop": "Container crash-looping — OOM or unhandled exception in startup", "oom": "Out-of-memory condition — memory leak or insufficient limits", "disk": "Disk/PVC capacity pressure — check log rotation and data retention", "deploy": "Recent deployment may have introduced regression", "security": "Security event detected — unauthorized access or injection attempt", } description = kind_cause_map.get(kind, f"Alert kind '{kind}' triggered on {alert.get('service', '?')}") causes.append({"rank": 1, "cause": description, "evidence": [_alert_line(alert)]}) # Add metric-based hints alert_metrics = alert.get("metrics", {}) if alert_metrics.get("latency_p95_ms", 0) > 500: causes.append({ "rank": 2, "cause": f"High p95 latency: {alert_metrics['latency_p95_ms']}ms", "evidence": ["From alert metrics"], }) if alert_metrics.get("error_rate_pct", 0) > 1.0: causes.append({ "rank": 3, "cause": f"Elevated error rate: {alert_metrics['error_rate_pct']}%", "evidence": ["From alert metrics"], }) if health and not health.get("healthy", True): causes.append({ "rank": len(causes) + 1, "cause": f"Service health check failed: {health.get('status', 'unknown')}", "evidence": [str(health.get("details", ""))[:200]], }) return causes def _build_impact(alert: Dict, overview: Dict) -> str: sev = alert.get("severity", "P2") svc = alert.get("service", "unknown") env = alert.get("env", "prod") impact_map = { "P0": f"CRITICAL: {svc} is fully degraded in {env}. Immediate action required.", "P1": f"HIGH: {svc} in {env} is significantly impaired. Users affected.", "P2": f"MEDIUM: {svc} in {env} is partially degraded. Monitoring required.", "P3": f"LOW: Minor degradation in {svc} ({env}). No immediate user impact.", } return impact_map.get(sev, f"{svc} affected in {env}") def _build_mitigations(alert: Dict, health: Dict, kb: Dict) -> List[str]: mitigations = [] kind = alert.get("kind", "custom") kind_mitigations = { "slo_breach": ["Check recent deployments and rollback if needed", "Scale service if under load", "Review error budget"], "latency": ["Check downstream dependency health", "Review connection pool settings", "Check for resource contention"], "error_rate": ["Review application logs for exceptions", "Check recent config changes", "Verify upstream dependencies"], "crashloop": ["Check pod logs: kubectl logs --previous", "Review resource limits", "Check liveness probe configuration"], "oom": ["Increase memory limits", "Check for memory leaks", "Review heap dumps if available"], "disk": ["Run log rotation", "Check data retention policies", "Delete old artifacts / compact audit logs"], "deploy": ["Review deployment diff", "Run smoke tests", "Consider rollback if metrics degraded"], "security": ["Block suspicious IPs", "Rotate affected credentials", "Audit access logs", "Notify security team"], } mitigations.extend(kind_mitigations.get(kind, ["Investigate logs and metrics"])) snippets = (kb.get("snippets") or [])[:2] for s in snippets: ref = s.get("path", "KB") mitigations.append(f"See runbook: {ref}") return mitigations def _build_next_checks(alert: Dict, overview: Dict) -> List[str]: svc = alert.get("service", "unknown") return [ f"Monitor {svc} error rate and latency for next 15 min", "Check incident_triage_graph for deeper analysis", "Verify SLO status with observability_tool.slo_snapshot", "If not resolved in 30 min → escalate to P0", ] def _alert_line(alert: Dict) -> str: """Short single-line summary of an alert for evidence lists.""" return ( f"[{alert.get('severity','?')}] {alert.get('service','?')} " f"{alert.get('kind','?')}: {alert.get('title','')[:80]}" ) async def build_digest_node(state: AlertTriageState) -> AlertTriageState: """Build short markdown digest for CTO/UI (max 3800 chars).""" created = state.get("created_incidents", []) updated = state.get("updated_incidents", []) skipped = state.get("skipped_alerts", []) errors = state.get("errors", []) processed = state.get("processed", 0) alerts = state.get("alerts", []) dry_run = state.get("dry_run", False) triage_runs = state.get("triage_runs", 0) ts = _now_iso() dry_tag = " **[DRY RUN]**" if dry_run else "" lines = [ f"## Alert Triage Digest{dry_tag} — {ts[:19]}Z", "", f"**Processed:** {processed} alerts | " f"**New incidents:** {len(created)} | " f"**Updated:** {len(updated)} | " f"**Skipped/Digest:** {len(skipped)} | " f"**Triages run:** {triage_runs} | " f"**Errors:** {len(errors)}", "", ] if created: lines.append("### 🆕 Created Incidents") for item in created[:10]: sev = item.get("severity", "?") svc = item.get("service", "?") inc_id = item.get("incident_id", "?") ref = item.get("alert_ref", "?") sig = (item.get("signature") or "")[:8] lines.append(f"- `{inc_id}` [{sev}] {svc} (alert: {ref}, sig: {sig})") lines.append("") if updated: lines.append("### 🔄 Updated Incidents (alert attached)") for item in updated[:10]: inc_id = item.get("incident_id", "?") ref = item.get("alert_ref", "?") lines.append(f"- `{inc_id}` ← alert `{ref}` ({item.get('note', '')})") lines.append("") if skipped: lines.append("### ⏭ Skipped / Digest-only") for item in skipped[:15]: svc = item.get("service", "?") sev = item.get("severity", "?") reason = item.get("reason", "policy") ref = item.get("alert_ref", "?") lines.append(f"- [{sev}] {svc} `{ref}` — {reason}") if len(skipped) > 15: lines.append(f"- … and {len(skipped) - 15} more") lines.append("") if errors: lines.append("### ⚠️ Errors (non-fatal)") for e in errors[:5]: lines.append(f"- `{e.get('node','?')}`: {str(e.get('error','?'))[:120]}") lines.append("") # ── Escalation results ───────────────────────────────────────────────────── escalation = state.get("escalation_result") or {} esc_count = escalation.get("escalated", 0) esc_candidates = escalation.get("candidates", []) if esc_count > 0: lines.append(f"### ⬆️ Escalated Incidents ({esc_count})") for c in esc_candidates[:5]: if c.get("from_severity") != c.get("to_severity"): lines.append( f"- `{c.get('incident_id','?')}` {c.get('service','?')}: " f"{c.get('from_severity')} → {c.get('to_severity')} " f"(occ_60m={c.get('occurrences_60m',0)}, " f"triage_24h={c.get('triage_count_24h',0)})" ) lines.append("") # ── Auto-resolve candidates ──────────────────────────────────────────────── ar = state.get("autoresolve_result") or {} ar_count = ar.get("candidates_count", 0) if ar_count > 0: lines.append(f"### 🟡 Auto-resolve Candidates ({ar_count})") for c in (ar.get("candidates") or [])[:5]: lines.append( f"- `{c.get('incident_id','?')}` [{c.get('severity','?')}] " f"{c.get('service','?')}: no alerts for " f"{c.get('minutes_without_alerts', '?')}min" ) lines.append("") if not created and not updated and not skipped and not errors: lines.append("_No alerts to process in this window._") digest_md = "\n".join(lines) # Truncate if over limit if len(digest_md) > MAX_DIGEST_CHARS: digest_md = digest_md[:MAX_DIGEST_CHARS - 50] + "\n\n… *(digest truncated)*" result_summary = { "processed": processed, "created_incidents": len(created), "updated_incidents": len(updated), "skipped": len(skipped), "triage_runs": triage_runs, "escalated": esc_count, "autoresolve_candidates": ar_count, "errors": len(errors), } return { **state, "digest_md": digest_md, "result_summary": result_summary, } async def post_process_escalation_node(state: AlertTriageState) -> AlertTriageState: """ After processing alerts: call incident_escalation_tool.evaluate. Only runs if at least 1 alert was processed. Non-fatal. """ processed = state.get("processed", 0) if processed == 0: return {**state, "escalation_result": {}} agent_id = state.get("agent_id", "sofiia") workspace_id = state.get("workspace_id", "default") run_id = state.get("_run_id", "unknown") dry_run = state.get("dry_run", False) try: async with GatewayClient() as gw: result = await _call_tool( gw, "incident_escalation_tool", "evaluate", { "window_minutes": 60, "limit": 50, "dry_run": dry_run, }, run_id=run_id, node="post_escalation", agent_id=agent_id, workspace_id=workspace_id, ) except Exception as e: logger.warning("post_process_escalation_node failed (non-fatal): %s", e) result = {} return {**state, "escalation_result": result} async def post_process_autoresolve_node(state: AlertTriageState) -> AlertTriageState: """ After processing alerts: find auto-resolve candidates. Always dry_run=True (candidate-only, no actual close unless policy says otherwise). Non-fatal. """ processed = state.get("processed", 0) if processed == 0: return {**state, "autoresolve_result": {}} agent_id = state.get("agent_id", "sofiia") workspace_id = state.get("workspace_id", "default") run_id = state.get("_run_id", "unknown") try: async with GatewayClient() as gw: result = await _call_tool( gw, "incident_escalation_tool", "auto_resolve_candidates", { "no_alerts_minutes": 60, "limit": 50, "dry_run": True, # always candidate-only in loop }, run_id=run_id, node="post_autoresolve", agent_id=agent_id, workspace_id=workspace_id, ) except Exception as e: logger.warning("post_process_autoresolve_node failed (non-fatal): %s", e) result = {} return {**state, "autoresolve_result": result} # ─── Graph builder ───────────────────────────────────────────────────────────── def build_alert_triage_graph(): """ Build the alert_triage LangGraph. LLM usage: ZERO in steady state (llm_mode=off in policy). All nodes are deterministic Python + gateway tool calls. Flow: load_policy → claim_alerts → process_alerts → post_escalation → post_autoresolve → build_digest """ workflow = StateGraph(AlertTriageState) workflow.add_node("load_policy", load_policy_node) workflow.add_node("list_alerts", list_alerts_node) workflow.add_node("process_alerts", process_alerts_node) workflow.add_node("post_escalation", post_process_escalation_node) workflow.add_node("post_autoresolve", post_process_autoresolve_node) workflow.add_node("build_digest", build_digest_node) workflow.set_entry_point("load_policy") workflow.add_edge("load_policy", "list_alerts") workflow.add_edge("list_alerts", "process_alerts") workflow.add_edge("process_alerts", "post_escalation") workflow.add_edge("post_escalation", "post_autoresolve") workflow.add_edge("post_autoresolve", "build_digest") workflow.add_edge("build_digest", END) return workflow.compile()