Files
microdao-daarion/services/sofiia-supervisor/app/graphs/alert_triage_graph.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

852 lines
34 KiB
Python

"""
alert_triage_graph — Deterministic alert → incident → triage loop.
Runs every 5 min (via scheduler or cron). Zero LLM tokens in steady state
(llm_mode=off). Routing decisions driven entirely by alert_routing_policy.yml.
Node sequence:
load_policy
→ list_alerts
→ for_each_alert (process loop)
→ decide_action (policy match)
→ alert_to_incident (if auto_incident)
→ run_deterministic_triage (if auto_triage, no LLM)
→ ack_alert
→ build_digest
→ END
All tool calls via GatewayClient (RBAC/audit enforced by gateway).
LLM is only invoked if policy.llm_mode != off AND rule.triage_mode == llm.
"""
from __future__ import annotations
import datetime
import logging
import textwrap
from typing import Any, Dict, List, Optional, TypedDict
from langgraph.graph import StateGraph, END
from ..alert_routing import (
load_policy, match_alert, is_llm_allowed, compute_incident_signature
)
COOLDOWN_DEFAULT_MINUTES = 15
from ..config import settings
from ..gateway_client import GatewayClient
logger = logging.getLogger(__name__)
MAX_DIGEST_CHARS = 3800
MAX_ALERTS_HARD_CAP = 50 # safety cap regardless of policy
# ─── State ────────────────────────────────────────────────────────────────────
class AlertTriageState(TypedDict, total=False):
    """Graph state threaded through every node of alert_triage_graph.

    total=False: every key is optional. Nodes read with .get() defaults and
    return updated *copies* of the state rather than mutating in place.

    NOTE(review): several nodes also read a "_run_id" key (e.g. via
    state.get("_run_id", "unknown")) that is not declared here — presumably
    injected by the graph runner; confirm and declare it if so.
    """
    # ── Input (supplied by the caller/scheduler) ──
    workspace_id: str
    user_id: str
    agent_id: str
    policy_profile: str  # "default" (reserved for future multi-profile support)
    dry_run: bool  # if True: no writes, no acks
    # ── Policy (populated by load_policy_node) ──
    policy: Dict
    max_alerts: int      # per-run claim budget, clamped by MAX_ALERTS_HARD_CAP
    max_incidents: int   # cap on incidents created/updated per run
    max_triages: int     # cap on deterministic triage runs per run
    # ── Runtime accumulators (filled by list_alerts/process_alerts) ──
    alerts: List[Dict]
    processed: int
    created_incidents: List[Dict]
    updated_incidents: List[Dict]
    skipped_alerts: List[Dict]
    errors: List[Dict]
    triage_runs: int
    # ── Post-process results (escalation / auto-resolve passes) ──
    escalation_result: Dict
    autoresolve_result: Dict
    # ── Output (build_digest_node) ──
    digest_md: str
    result_summary: Dict
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _now_iso() -> str:
return datetime.datetime.utcnow().isoformat()
def _truncate(text: str, max_chars: int = 200) -> str:
if len(text) <= max_chars:
return text
return text[:max_chars] + ""
def _alert_line(alert: Dict) -> str:
    """Format an alert as a one-line summary: "[sev] service/kind: title (ref)".

    NOTE(review): this function is redefined later in this module with a
    slightly different output format (no "/" separator, no alert_ref). The
    later definition wins at import time, so this version is effectively
    dead code — confirm which format is intended and delete the other.
    """
    svc = alert.get("service", "?")
    sev = alert.get("severity", "?")
    kind = alert.get("kind", "?")
    # Title capped at 80 chars to keep digest lines short.
    title = _truncate(alert.get("title", ""), 80)
    ref = alert.get("alert_ref", "?")
    return f"[{sev}] {svc}/{kind}: {title} ({ref})"
async def _call_tool(
gw: GatewayClient,
tool: str,
action: str,
params: Dict,
run_id: str,
node: str,
agent_id: str,
workspace_id: str,
) -> Dict:
"""Call a tool via gateway, return data dict or empty dict on error."""
result = await gw.call_tool(
tool_name=tool,
action=action,
params=params,
run_id=run_id,
node=node,
agent_id=agent_id,
workspace_id=workspace_id,
)
if result.success:
return result.data or {}
logger.warning("Tool %s.%s failed: %s", tool, action, result.error_message)
return {}
# ─── Nodes ────────────────────────────────────────────────────────────────────
async def load_policy_node(state: AlertTriageState) -> AlertTriageState:
    """Load the alert routing policy and seed per-run budgets/accumulators.

    Never fails — load_policy() falls back to built-in defaults. The
    per-run alert budget is additionally clamped by MAX_ALERTS_HARD_CAP
    regardless of what the policy file asks for.
    """
    policy = load_policy()
    defaults = policy.get("defaults", {})
    alerts_budget = min(
        int(defaults.get("max_alerts_per_run", 20)),
        MAX_ALERTS_HARD_CAP,
    )
    seeded: AlertTriageState = {
        **state,
        "policy": policy,
        "max_alerts": alerts_budget,
        "max_incidents": int(defaults.get("max_incidents_per_run", 5)),
        "max_triages": int(defaults.get("max_triages_per_run", 5)),
        "created_incidents": [],
        "updated_incidents": [],
        "skipped_alerts": [],
        "errors": [],
        "triage_runs": 0,
        "processed": 0,
    }
    return seeded
async def list_alerts_node(state: AlertTriageState) -> AlertTriageState:
    """
    Atomically claim a batch of new/failed alerts for processing.

    Uses alert_ingest_tool.claim (SELECT FOR UPDATE SKIP LOCKED in Postgres).
    On failure the node degrades to an empty batch and *appends* the error to
    the state's error list so the digest can surface it; the next scheduled
    run retries.

    Fix vs. previous revision: the error path replaced the whole "errors"
    list, discarding errors accumulated by earlier nodes — it now appends.
    Also removed the unused local `policy`.
    """
    max_alerts = state.get("max_alerts", 20)
    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")
    try:
        async with GatewayClient() as gw:
            data = await _call_tool(
                gw, "alert_ingest_tool", "claim",
                {
                    "window_minutes": 240,
                    "limit": max_alerts,
                    "owner": f"supervisor:{run_id[:12]}",
                    "lock_ttl_seconds": 600,
                },
                run_id=run_id, node="claim_alerts",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.error("claim_alerts_node failed: %s", e)
        errors = list(state.get("errors", []))
        errors.append({"node": "claim_alerts", "error": str(e)})
        return {**state, "alerts": [], "errors": errors}
    claimed = data.get("alerts", [])
    requeued = data.get("requeued_stale", 0)
    if requeued:
        logger.info("Requeued %d stale-processing alerts", requeued)
    # Defensive cap: the claim tool should already honour `limit`.
    return {**state, "alerts": claimed[:max_alerts]}
async def process_alerts_node(state: AlertTriageState) -> AlertTriageState:
    """
    Main loop: for each alert → match policy → create/update incident → triage.
    Deterministic by default (0 LLM tokens unless policy.llm_mode != off).

    Per-alert outcomes:
      * not auto_incident → optionally recorded as digest-only skip, acked,
        and counted as processed (no incident is created).
      * incident budget exhausted → skipped WITHOUT ack so the next run
        re-claims the alert.
      * auto_incident → oncall_tool.alert_to_incident creates or dedup-attaches;
        on empty tool response the alert is marked failed for retry.
      * auto_triage (and cooldown clear, and triage budget left) → zero-LLM
        deterministic triage via _run_deterministic_triage.
      * any unexpected exception → alert marked failed (best-effort) so it
        retries on the next run; error recorded for the digest.

    NOTE(review): indentation was reconstructed for this block. The
    digest-only section is read as: every non-auto_incident alert is acked
    and skipped (the ack note says "digest_only" even when the rule did not
    set digest_only) — confirm against the original file.
    """
    policy = state.get("policy", {})
    defaults = policy.get("defaults", {})
    alerts = state.get("alerts", [])
    dry_run = state.get("dry_run", False)
    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")
    # Copy accumulators so the incoming state is never mutated in place.
    created_incidents: List[Dict] = list(state.get("created_incidents", []))
    updated_incidents: List[Dict] = list(state.get("updated_incidents", []))
    skipped_alerts: List[Dict] = list(state.get("skipped_alerts", []))
    errors: List[Dict] = list(state.get("errors", []))
    max_incidents = state.get("max_incidents", 5)
    max_triages = state.get("max_triages", 5)
    triage_runs = state.get("triage_runs", 0)
    processed = 0
    ack_prefix = defaults.get("ack_note_prefix", "alert_triage_loop")
    async with GatewayClient() as gw:
        for alert in alerts:
            alert_ref = alert.get("alert_ref", "?")
            try:
                actions = match_alert(alert, policy)
                incident_id = None
                triage_run_id = None
                # ── Digest-only: ack immediately, no incident ─────────────────
                if not actions.get("auto_incident", False):
                    if actions.get("digest_only"):
                        skipped_alerts.append({
                            "alert_ref": alert_ref,
                            "service": alert.get("service"),
                            "severity": alert.get("severity"),
                            "reason": "digest_only (policy)",
                        })
                    if not dry_run and actions.get("ack", True):
                        await _call_tool(
                            gw, "alert_ingest_tool", "ack",
                            {"alert_ref": alert_ref, "actor": agent_id,
                             "note": f"{ack_prefix}:digest_only"},
                            run_id=run_id, node="ack_digest",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                    processed += 1
                    continue
                # ── Auto incident creation ─────────────────────────────────────
                if len(created_incidents) + len(updated_incidents) >= max_incidents:
                    skipped_alerts.append({
                        "alert_ref": alert_ref,
                        "reason": "max_incidents_per_run reached",
                    })
                    # Don't ack — leave as processing; next run picks it up
                    processed += 1
                    continue
                if not dry_run:
                    inc_result = await _call_tool(
                        gw, "oncall_tool", "alert_to_incident",
                        {
                            "alert_ref": alert_ref,
                            "incident_severity_cap": actions.get("incident_severity_cap", "P1"),
                            "dedupe_window_minutes": int(actions.get("dedupe_window_minutes", 120)),
                            "attach_artifact": actions.get("attach_alert_artifact", True),
                        },
                        run_id=run_id, node="alert_to_incident",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )
                    if inc_result:
                        incident_id = inc_result.get("incident_id")
                        incident_signature = inc_result.get("incident_signature", "")
                        if inc_result.get("created"):
                            created_incidents.append({
                                "incident_id": incident_id,
                                "alert_ref": alert_ref,
                                "service": alert.get("service"),
                                "severity": inc_result.get("severity"),
                                "signature": incident_signature,
                            })
                        else:
                            # Deduped: alert attached to an existing incident.
                            updated_incidents.append({
                                "incident_id": incident_id,
                                "alert_ref": alert_ref,
                                "note": inc_result.get("note", "attached"),
                            })
                    else:
                        # incident creation failed — mark alert as failed
                        await _call_tool(
                            gw, "alert_ingest_tool", "fail",
                            {"alert_ref": alert_ref,
                             "error": "alert_to_incident returned empty",
                             "retry_after_seconds": 300},
                            run_id=run_id, node="fail_alert",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                        errors.append({
                            "node": "alert_to_incident",
                            "alert_ref": alert_ref,
                            "error": "empty response",
                        })
                        processed += 1
                        continue
                else:
                    # Dry run: synthesize an incident id from the signature,
                    # never call the gateway.
                    sig = compute_incident_signature(alert, policy)
                    incident_id = f"dry_run_inc_{sig[:8]}"
                    incident_signature = sig
                    created_incidents.append({
                        "incident_id": incident_id,
                        "alert_ref": alert_ref,
                        "service": alert.get("service"),
                        "dry_run": True,
                    })
                # ── Cooldown check before triage ──────────────────────────────
                # Every path reaching here has bound incident_signature
                # (gateway result, or dry-run computation; empty result paths
                # `continue` above).
                cooldown_ok = True
                cooldown_minutes = int(
                    policy.get("defaults", {}).get("triage_cooldown_minutes",
                                                   COOLDOWN_DEFAULT_MINUTES)
                )
                if (
                    incident_id
                    and actions.get("auto_triage", False)
                    and incident_signature
                    and not dry_run
                ):
                    sig_check = await _call_tool(
                        gw, "oncall_tool", "signature_should_triage",
                        {"signature": incident_signature,
                         "cooldown_minutes": cooldown_minutes},
                        run_id=run_id, node="cooldown_check",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )
                    # Default True: if the check tool errored, we triage anyway.
                    cooldown_ok = sig_check.get("should_triage", True)
                    if not cooldown_ok:
                        # Cooldown active: append soft event but don't triage
                        await _call_tool(
                            gw, "oncall_tool", "incident_append_event",
                            {"incident_id": incident_id,
                             "type": "note",
                             "message": f"Alert observed during triage cooldown "
                                        f"(signature={incident_signature[:8]}, "
                                        f"cooldown={cooldown_minutes}min)",
                             "meta": {"alert_ref": alert_ref,
                                      "cooldown_active": True}},
                            run_id=run_id, node="cooldown_event",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                # ── Deterministic triage ──────────────────────────────────────
                if (
                    incident_id
                    and actions.get("auto_triage", False)
                    and triage_runs < max_triages
                    and cooldown_ok
                ):
                    triage_mode = actions.get("triage_mode", "deterministic")
                    # Policy-level LLM kill switch downgrades llm triage.
                    if triage_mode == "llm" and not is_llm_allowed("triage", policy):
                        triage_mode = "deterministic"
                        logger.info("llm_mode=off → deterministic triage for %s", alert_ref)
                    if triage_mode == "deterministic" and not dry_run:
                        try:
                            triage_run_id = await _run_deterministic_triage(
                                gw, incident_id, alert, agent_id, workspace_id, run_id
                            )
                            triage_runs += 1
                            # Mark signature cooldown
                            # NOTE(review): incident_signature may be "" here
                            # when the gateway returned no signature — confirm
                            # the tool tolerates an empty signature.
                            await _call_tool(
                                gw, "oncall_tool", "signature_mark_triage",
                                {"signature": incident_signature},
                                run_id=run_id, node="mark_triage",
                                agent_id=agent_id, workspace_id=workspace_id,
                            )
                        except Exception as te:
                            logger.warning("Triage failed for %s: %s", incident_id, te)
                            errors.append({
                                "node": "triage",
                                "incident_id": incident_id,
                                "error": str(te),
                            })
                # ── Ack alert (success) ────────────────────────────────────────
                if not dry_run and actions.get("ack", True):
                    note_parts = [ack_prefix]
                    if incident_id:
                        note_parts.append(f"incident:{incident_id}")
                    if triage_run_id:
                        note_parts.append(f"triage:{triage_run_id}")
                    await _call_tool(
                        gw, "alert_ingest_tool", "ack",
                        {"alert_ref": alert_ref, "actor": agent_id,
                         "note": "|".join(note_parts)},
                        run_id=run_id, node="ack_alert",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )
                processed += 1
            except Exception as e:
                logger.error("Error processing alert %s: %s", alert_ref, e)
                errors.append({
                    "node": "process_alerts",
                    "alert_ref": alert_ref,
                    "error": str(e),
                })
                # Mark alert as failed so it retries next run
                # (fresh client: the outer `gw` may be the thing that broke).
                try:
                    async with GatewayClient() as gw2:
                        await _call_tool(
                            gw2, "alert_ingest_tool", "fail",
                            {"alert_ref": alert_ref, "error": str(e)[:200],
                             "retry_after_seconds": 300},
                            run_id=run_id, node="fail_on_error",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                except Exception:
                    pass  # non-fatal fail-marking
                processed += 1
    return {
        **state,
        "processed": state.get("processed", 0) + processed,
        "created_incidents": created_incidents,
        "updated_incidents": updated_incidents,
        "skipped_alerts": skipped_alerts,
        "errors": errors,
        "triage_runs": triage_runs,
    }
async def _run_deterministic_triage(
    gw: GatewayClient,
    incident_id: str,
    alert: Dict,
    agent_id: str,
    workspace_id: str,
    run_id: str,
) -> Optional[str]:
    """
    Run zero-LLM triage for an incident and attach the findings.

    Pipeline:
      1. observability_tool.service_overview over the trailing hour
      2. oncall_tool.service_health probe
      3. kb_tool.snippets runbook lookup
      4. deterministic JSON report → incident artifact + timeline note

    Returns the generated triage id.
    """
    import base64
    import hashlib
    import json

    service = alert.get("service", "unknown")
    env = alert.get("env", "prod")
    now = datetime.datetime.utcnow()
    hour_ago_iso = (now - datetime.timedelta(hours=1)).isoformat()
    now_iso = now.isoformat()
    # Id derived from incident + timestamp so repeated triages stay distinct.
    digest = hashlib.sha256(f"{incident_id}{now}".encode()).hexdigest()
    triage_id = f"tri_{digest[:8]}"
    # 1. Metrics snapshot for the service
    overview = await _call_tool(
        gw, "observability_tool", "service_overview",
        {"service": service, "env": env,
         "time_range": {"from": hour_ago_iso, "to": now_iso}},
        run_id=run_id, node="triage_overview",
        agent_id=agent_id, workspace_id=workspace_id,
    )
    # 2. Current health probe
    health = await _call_tool(
        gw, "oncall_tool", "service_health",
        {"service": service, "env": env},
        run_id=run_id, node="triage_health",
        agent_id=agent_id, workspace_id=workspace_id,
    )
    # 3. Runbook snippets from the knowledge base
    kb = await _call_tool(
        gw, "kb_tool", "snippets",
        {"query": f"{service} {alert.get('kind', '')} {alert.get('title', '')}",
         "limit": 3},
        run_id=run_id, node="triage_kb",
        agent_id=agent_id, workspace_id=workspace_id,
    )
    # 4. Assemble the deterministic report
    report = {
        "triage_id": triage_id,
        "incident_id": incident_id,
        "service": service,
        "env": env,
        "mode": "deterministic",
        "alert_ref": alert.get("alert_ref", ""),
        "generated_at": now.isoformat(),
        "summary": (
            f"Auto-triage for {service} {alert.get('kind','?')} "
            f"(severity={alert.get('severity','?')})"
        ),
        "suspected_root_causes": _build_root_causes(alert, overview, health),
        "impact_assessment": _build_impact(alert, overview),
        "mitigations_now": _build_mitigations(alert, health, kb),
        "next_checks": _build_next_checks(alert, overview),
        "references": {
            "metrics": overview.get("metrics", {}),
            "health": health,
            "runbook_snippets": (kb.get("snippets") or [])[:3],
        },
    }
    # Attach the report as a JSON artifact on the incident.
    payload = json.dumps(report, indent=2, default=str).encode()
    encoded = base64.b64encode(payload).decode()
    await _call_tool(
        gw, "oncall_tool", "incident_attach_artifact",
        {
            "incident_id": incident_id,
            "kind": "triage_report",
            "format": "json",
            "content_base64": encoded,
            "filename": f"triage_{triage_id}.json",
        },
        run_id=run_id, node="attach_triage",
        agent_id=agent_id, workspace_id=workspace_id,
    )
    # Leave a timeline breadcrumb on the incident.
    await _call_tool(
        gw, "oncall_tool", "incident_append_event",
        {
            "incident_id": incident_id,
            "type": "note",
            "message": f"Deterministic triage completed (triage_id={triage_id})",
            "meta": {"triage_id": triage_id, "mode": "deterministic"},
        },
        run_id=run_id, node="triage_event",
        agent_id=agent_id, workspace_id=workspace_id,
    )
    return triage_id
def _build_root_causes(alert: Dict, overview: Dict, health: Dict) -> List[Dict]:
    """Derive ranked suspected root causes from the alert and health data.

    Always emits at least one cause: a canned description keyed on the alert
    kind. Further causes are appended from metrics embedded in the alert
    (p95 latency > 500ms, error rate > 1%) and from a failed health probe.

    Note: `overview` is accepted for signature parity with the other
    _build_* helpers but is not currently consulted (the previous revision
    extracted overview["metrics"] into an unused local, now removed).
    """
    causes: List[Dict] = []
    kind = alert.get("kind", "custom")
    kind_cause_map = {
        "slo_breach": "SLO breach detected (latency/error rate exceeded threshold)",
        "latency": "High latency observed — possible overload or downstream dependency degradation",
        "error_rate": "Elevated error rate — check recent deployments and upstream dependencies",
        "crashloop": "Container crash-looping — OOM or unhandled exception in startup",
        "oom": "Out-of-memory condition — memory leak or insufficient limits",
        "disk": "Disk/PVC capacity pressure — check log rotation and data retention",
        "deploy": "Recent deployment may have introduced regression",
        "security": "Security event detected — unauthorized access or injection attempt",
    }
    description = kind_cause_map.get(kind, f"Alert kind '{kind}' triggered on {alert.get('service', '?')}")
    causes.append({"rank": 1, "cause": description, "evidence": [_alert_line(alert)]})
    # Metric-based hints carried on the alert itself.
    alert_metrics = alert.get("metrics", {})
    if alert_metrics.get("latency_p95_ms", 0) > 500:
        causes.append({
            "rank": 2,
            "cause": f"High p95 latency: {alert_metrics['latency_p95_ms']}ms",
            "evidence": ["From alert metrics"],
        })
    if alert_metrics.get("error_rate_pct", 0) > 1.0:
        causes.append({
            "rank": 3,
            "cause": f"Elevated error rate: {alert_metrics['error_rate_pct']}%",
            "evidence": ["From alert metrics"],
        })
    if health and not health.get("healthy", True):
        causes.append({
            "rank": len(causes) + 1,
            "cause": f"Service health check failed: {health.get('status', 'unknown')}",
            "evidence": [str(health.get("details", ""))[:200]],
        })
    return causes
def _build_impact(alert: Dict, overview: Dict) -> str:
sev = alert.get("severity", "P2")
svc = alert.get("service", "unknown")
env = alert.get("env", "prod")
impact_map = {
"P0": f"CRITICAL: {svc} is fully degraded in {env}. Immediate action required.",
"P1": f"HIGH: {svc} in {env} is significantly impaired. Users affected.",
"P2": f"MEDIUM: {svc} in {env} is partially degraded. Monitoring required.",
"P3": f"LOW: Minor degradation in {svc} ({env}). No immediate user impact.",
}
return impact_map.get(sev, f"{svc} affected in {env}")
def _build_mitigations(alert: Dict, health: Dict, kb: Dict) -> List[str]:
mitigations = []
kind = alert.get("kind", "custom")
kind_mitigations = {
"slo_breach": ["Check recent deployments and rollback if needed",
"Scale service if under load", "Review error budget"],
"latency": ["Check downstream dependency health",
"Review connection pool settings", "Check for resource contention"],
"error_rate": ["Review application logs for exceptions",
"Check recent config changes", "Verify upstream dependencies"],
"crashloop": ["Check pod logs: kubectl logs <pod> --previous",
"Review resource limits", "Check liveness probe configuration"],
"oom": ["Increase memory limits", "Check for memory leaks",
"Review heap dumps if available"],
"disk": ["Run log rotation", "Check data retention policies",
"Delete old artifacts / compact audit logs"],
"deploy": ["Review deployment diff", "Run smoke tests",
"Consider rollback if metrics degraded"],
"security": ["Block suspicious IPs", "Rotate affected credentials",
"Audit access logs", "Notify security team"],
}
mitigations.extend(kind_mitigations.get(kind, ["Investigate logs and metrics"]))
snippets = (kb.get("snippets") or [])[:2]
for s in snippets:
ref = s.get("path", "KB")
mitigations.append(f"See runbook: {ref}")
return mitigations
def _build_next_checks(alert: Dict, overview: Dict) -> List[str]:
svc = alert.get("service", "unknown")
return [
f"Monitor {svc} error rate and latency for next 15 min",
"Check incident_triage_graph for deeper analysis",
"Verify SLO status with observability_tool.slo_snapshot",
"If not resolved in 30 min → escalate to P0",
]
def _alert_line(alert: Dict) -> str:
"""Short single-line summary of an alert for evidence lists."""
return (
f"[{alert.get('severity','?')}] {alert.get('service','?')} "
f"{alert.get('kind','?')}: {alert.get('title','')[:80]}"
)
async def build_digest_node(state: AlertTriageState) -> AlertTriageState:
    """Build a short markdown digest for CTO/UI plus a numeric rollup.

    The digest is capped at MAX_DIGEST_CHARS; `result_summary` carries the
    per-run counters for programmatic consumers.

    Fixes vs. previous revision (lost characters, same extraction damage as
    elsewhere in this file): restored the " — " separator between the title
    and the timestamp in the header line, and the " → " arrow between the
    from/to severities in the escalation section (previously rendered as
    e.g. "P2P1"). Also removed the unused local `alerts`.
    """
    created = state.get("created_incidents", [])
    updated = state.get("updated_incidents", [])
    skipped = state.get("skipped_alerts", [])
    errors = state.get("errors", [])
    processed = state.get("processed", 0)
    dry_run = state.get("dry_run", False)
    triage_runs = state.get("triage_runs", 0)
    ts = _now_iso()
    dry_tag = " **[DRY RUN]**" if dry_run else ""
    lines = [
        f"## Alert Triage Digest{dry_tag} — {ts[:19]}Z",
        "",
        f"**Processed:** {processed} alerts | "
        f"**New incidents:** {len(created)} | "
        f"**Updated:** {len(updated)} | "
        f"**Skipped/Digest:** {len(skipped)} | "
        f"**Triages run:** {triage_runs} | "
        f"**Errors:** {len(errors)}",
        "",
    ]
    if created:
        lines.append("### 🆕 Created Incidents")
        for item in created[:10]:
            sev = item.get("severity", "?")
            svc = item.get("service", "?")
            inc_id = item.get("incident_id", "?")
            ref = item.get("alert_ref", "?")
            sig = (item.get("signature") or "")[:8]
            lines.append(f"- `{inc_id}` [{sev}] {svc} (alert: {ref}, sig: {sig})")
        lines.append("")
    if updated:
        lines.append("### 🔄 Updated Incidents (alert attached)")
        for item in updated[:10]:
            inc_id = item.get("incident_id", "?")
            ref = item.get("alert_ref", "?")
            lines.append(f"- `{inc_id}` ← alert `{ref}` ({item.get('note', '')})")
        lines.append("")
    if skipped:
        lines.append("### ⏭ Skipped / Digest-only")
        for item in skipped[:15]:
            svc = item.get("service", "?")
            sev = item.get("severity", "?")
            reason = item.get("reason", "policy")
            ref = item.get("alert_ref", "?")
            lines.append(f"- [{sev}] {svc} `{ref}` — {reason}")
        if len(skipped) > 15:
            lines.append(f"- … and {len(skipped) - 15} more")
        lines.append("")
    if errors:
        lines.append("### ⚠️ Errors (non-fatal)")
        for e in errors[:5]:
            lines.append(f"- `{e.get('node','?')}`: {str(e.get('error','?'))[:120]}")
        lines.append("")
    # ── Escalation results ─────────────────────────────────────────────────────
    escalation = state.get("escalation_result") or {}
    esc_count = escalation.get("escalated", 0)
    esc_candidates = escalation.get("candidates", [])
    if esc_count > 0:
        lines.append(f"### ⬆️ Escalated Incidents ({esc_count})")
        for c in esc_candidates[:5]:
            # Only show candidates whose severity actually changed.
            if c.get("from_severity") != c.get("to_severity"):
                lines.append(
                    f"- `{c.get('incident_id','?')}` {c.get('service','?')}: "
                    f"{c.get('from_severity')} → {c.get('to_severity')} "
                    f"(occ_60m={c.get('occurrences_60m',0)}, "
                    f"triage_24h={c.get('triage_count_24h',0)})"
                )
        lines.append("")
    # ── Auto-resolve candidates ────────────────────────────────────────────────
    ar = state.get("autoresolve_result") or {}
    ar_count = ar.get("candidates_count", 0)
    if ar_count > 0:
        lines.append(f"### 🟡 Auto-resolve Candidates ({ar_count})")
        for c in (ar.get("candidates") or [])[:5]:
            lines.append(
                f"- `{c.get('incident_id','?')}` [{c.get('severity','?')}] "
                f"{c.get('service','?')}: no alerts for "
                f"{c.get('minutes_without_alerts', '?')}min"
            )
        lines.append("")
    if not created and not updated and not skipped and not errors:
        lines.append("_No alerts to process in this window._")
    digest_md = "\n".join(lines)
    # Truncate if over limit
    if len(digest_md) > MAX_DIGEST_CHARS:
        digest_md = digest_md[:MAX_DIGEST_CHARS - 50] + "\n\n… *(digest truncated)*"
    result_summary = {
        "processed": processed,
        "created_incidents": len(created),
        "updated_incidents": len(updated),
        "skipped": len(skipped),
        "triage_runs": triage_runs,
        "escalated": esc_count,
        "autoresolve_candidates": ar_count,
        "errors": len(errors),
    }
    return {
        **state,
        "digest_md": digest_md,
        "result_summary": result_summary,
    }
async def post_process_escalation_node(state: AlertTriageState) -> AlertTriageState:
    """
    Post-pass: ask incident_escalation_tool.evaluate whether open incidents
    should be bumped in severity.

    Skipped entirely when no alerts were processed this run. Failures are
    non-fatal: the node logs a warning and records an empty result.
    """
    if state.get("processed", 0) == 0:
        return {**state, "escalation_result": {}}
    dry_run = state.get("dry_run", False)
    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")
    result: Dict = {}
    try:
        async with GatewayClient() as gw:
            result = await _call_tool(
                gw, "incident_escalation_tool", "evaluate",
                {
                    "window_minutes": 60,
                    "limit": 50,
                    "dry_run": dry_run,
                },
                run_id=run_id, node="post_escalation",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.warning("post_process_escalation_node failed (non-fatal): %s", e)
        result = {}
    return {**state, "escalation_result": result}
async def post_process_autoresolve_node(state: AlertTriageState) -> AlertTriageState:
    """
    Post-pass: list incidents that look quiet enough to auto-resolve.

    Always invoked with dry_run=True — candidate detection only; no incident
    is actually closed from this loop. Skipped when no alerts were processed.
    Failures are non-fatal.
    """
    if state.get("processed", 0) == 0:
        return {**state, "autoresolve_result": {}}
    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")
    result: Dict = {}
    try:
        async with GatewayClient() as gw:
            result = await _call_tool(
                gw, "incident_escalation_tool", "auto_resolve_candidates",
                {
                    "no_alerts_minutes": 60,
                    "limit": 50,
                    "dry_run": True,  # always candidate-only in loop
                },
                run_id=run_id, node="post_autoresolve",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.warning("post_process_autoresolve_node failed (non-fatal): %s", e)
        result = {}
    return {**state, "autoresolve_result": result}
# ─── Graph builder ─────────────────────────────────────────────────────────────
def build_alert_triage_graph():
    """
    Build the alert_triage LangGraph.

    LLM usage: ZERO in steady state (llm_mode=off in policy) — every node is
    deterministic Python plus gateway tool calls.

    Linear flow:
        load_policy → list_alerts → process_alerts
        → post_escalation → post_autoresolve → build_digest → END
    """
    # Table-driven wiring: node order here *is* the execution order.
    pipeline = [
        ("load_policy", load_policy_node),
        ("list_alerts", list_alerts_node),
        ("process_alerts", process_alerts_node),
        ("post_escalation", post_process_escalation_node),
        ("post_autoresolve", post_process_autoresolve_node),
        ("build_digest", build_digest_node),
    ]
    workflow = StateGraph(AlertTriageState)
    for name, fn in pipeline:
        workflow.add_node(name, fn)
    workflow.set_entry_point(pipeline[0][0])
    for (src, _), (dst, _) in zip(pipeline, pipeline[1:]):
        workflow.add_edge(src, dst)
    workflow.add_edge(pipeline[-1][0], END)
    return workflow.compile()