# NOTE(review): the following block is scraped page metadata / release notes,
# not Python source. Kept verbatim as comments so the file parses; move it to
# a CHANGELOG entry.
#
# New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
# 852 lines / 34 KiB / Python
"""
|
|
alert_triage_graph — Deterministic alert → incident → triage loop.
|
|
|
|
Runs every 5 min (via scheduler or cron). Zero LLM tokens in steady state
|
|
(llm_mode=off). Routing decisions driven entirely by alert_routing_policy.yml.
|
|
|
|
Node sequence:
|
|
load_policy
|
|
→ list_alerts
|
|
→ for_each_alert (process loop)
|
|
→ decide_action (policy match)
|
|
→ alert_to_incident (if auto_incident)
|
|
→ run_deterministic_triage (if auto_triage, no LLM)
|
|
→ ack_alert
|
|
→ build_digest
|
|
→ END
|
|
|
|
All tool calls via GatewayClient (RBAC/audit enforced by gateway).
|
|
LLM is only invoked if policy.llm_mode != off AND rule.triage_mode == llm.
|
|
"""
|
|
from __future__ import annotations

import datetime
import logging
import textwrap  # NOTE(review): not referenced in this module — confirm before removing
from typing import Any, Dict, List, Optional, TypedDict

from langgraph.graph import StateGraph, END

from ..alert_routing import (
    load_policy, match_alert, is_llm_allowed, compute_incident_signature
)
from ..config import settings
from ..gateway_client import GatewayClient

logger = logging.getLogger(__name__)

# Default triage cooldown (minutes) used when the policy does not provide
# defaults.triage_cooldown_minutes. (Moved out of the import block — it was
# previously interleaved between import statements, violating PEP 8 grouping.)
COOLDOWN_DEFAULT_MINUTES = 15

# Hard ceiling on digest size (chars) so the markdown stays chat/UI friendly.
MAX_DIGEST_CHARS = 3800
MAX_ALERTS_HARD_CAP = 50  # safety cap regardless of policy
|
|
|
|
|
|
# ─── State ────────────────────────────────────────────────────────────────────
|
|
|
|
class AlertTriageState(TypedDict, total=False):
    """Graph state threaded through every node (total=False: all keys optional)."""

    # Input
    workspace_id: str
    user_id: str
    agent_id: str
    policy_profile: str  # "default" (reserved for future multi-profile support)
    dry_run: bool  # if True: no writes, no acks

    # Policy (populated by load_policy_node; caps are clamped per-run budgets)
    policy: Dict
    max_alerts: int
    max_incidents: int
    max_triages: int

    # Runtime (accumulators filled by process_alerts_node)
    alerts: List[Dict]
    processed: int
    created_incidents: List[Dict]
    updated_incidents: List[Dict]
    skipped_alerts: List[Dict]
    errors: List[Dict]
    triage_runs: int

    # Post-process results (escalation / auto-resolve evaluation payloads)
    escalation_result: Dict
    autoresolve_result: Dict

    # Output (markdown digest + machine-readable counters)
    digest_md: str
    result_summary: Dict
|
|
|
|
|
|
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
|
|
def _now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Kept naive (no tzinfo / "+00:00" suffix) so downstream formatting such as
    build_digest_node's ``ts[:19]Z`` slice keeps producing identical text.
    ``datetime.utcnow()`` is deprecated since Python 3.12, so the naive value
    is derived from an aware "now" instead.
    """
    return (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )
|
|
|
|
|
|
def _truncate(text: str, max_chars: int = 200) -> str:
    """Clip *text* to *max_chars* characters, appending an ellipsis when clipped."""
    return text if len(text) <= max_chars else f"{text[:max_chars]}…"
|
|
|
|
|
|
def _alert_line(alert: Dict) -> str:
    """One-line alert summary: "[sev] service/kind: title (alert_ref)".

    NOTE(review): a second `_alert_line` is defined later in this module
    (without the trailing alert_ref); at import time that later definition
    wins, so this version is effectively dead code. Consolidate the two.
    """
    svc = alert.get("service", "?")
    sev = alert.get("severity", "?")
    kind = alert.get("kind", "?")
    title = _truncate(alert.get("title", ""), 80)
    ref = alert.get("alert_ref", "?")
    return f"[{sev}] {svc}/{kind}: {title} ({ref})"
|
|
|
|
|
|
async def _call_tool(
    gw: GatewayClient,
    tool: str,
    action: str,
    params: Dict,
    run_id: str,
    node: str,
    agent_id: str,
    workspace_id: str,
) -> Dict:
    """Invoke one gateway tool action; return its data payload, or {} on failure."""
    outcome = await gw.call_tool(
        tool_name=tool,
        action=action,
        params=params,
        run_id=run_id,
        node=node,
        agent_id=agent_id,
        workspace_id=workspace_id,
    )
    # Failures are logged and swallowed — callers treat {} as "no data".
    if not outcome.success:
        logger.warning("Tool %s.%s failed: %s", tool, action, outcome.error_message)
        return {}
    return outcome.data or {}
|
|
|
|
|
|
# ─── Nodes ────────────────────────────────────────────────────────────────────
|
|
|
|
async def load_policy_node(state: AlertTriageState) -> AlertTriageState:
    """Load alert routing policy. Never fails — falls back to built-in defaults."""
    policy = load_policy()
    defaults = policy.get("defaults", {})
    configured_max = int(defaults.get("max_alerts_per_run", 20))

    new_state = dict(state)
    new_state.update(
        policy=policy,
        # Policy may ask for more, but never exceed the module hard cap.
        max_alerts=min(configured_max, MAX_ALERTS_HARD_CAP),
        max_incidents=int(defaults.get("max_incidents_per_run", 5)),
        max_triages=int(defaults.get("max_triages_per_run", 5)),
        # Reset all per-run accumulators.
        created_incidents=[],
        updated_incidents=[],
        skipped_alerts=[],
        errors=[],
        triage_runs=0,
        processed=0,
    )
    return new_state
|
|
|
|
|
|
async def list_alerts_node(state: AlertTriageState) -> AlertTriageState:
    """
    Atomically claim a batch of new/failed alerts for processing.

    Uses alert_ingest_tool.claim (SELECT FOR UPDATE SKIP LOCKED in Postgres).
    On any gateway error, returns an empty batch and records the error
    (non-fatal: the next scheduled run retries the claim).
    """
    max_alerts = state.get("max_alerts", 20)

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")

    try:
        async with GatewayClient() as gw:
            data = await _call_tool(
                gw, "alert_ingest_tool", "claim",
                {
                    "window_minutes": 240,
                    "limit": max_alerts,
                    # Owner tag lets the store requeue stale claims from dead runs.
                    "owner": f"supervisor:{run_id[:12]}",
                    "lock_ttl_seconds": 600,
                },
                run_id=run_id, node="claim_alerts",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.error("claim_alerts_node failed: %s", e)
        # Append (not overwrite) so errors recorded by earlier nodes survive.
        errors = list(state.get("errors", []))
        errors.append({"node": "claim_alerts", "error": str(e)})
        return {**state, "alerts": [], "errors": errors}

    claimed = data.get("alerts", [])
    requeued = data.get("requeued_stale", 0)
    if requeued:
        logger.info("Requeued %d stale-processing alerts", requeued)

    # Defensive slice: the tool should honor "limit", but never trust it blindly.
    return {**state, "alerts": claimed[:max_alerts]}
|
|
|
|
|
|
async def process_alerts_node(state: AlertTriageState) -> AlertTriageState:
    """
    Main loop: for each alert → match policy → create/update incident → triage.
    Deterministic by default (0 LLM tokens unless policy.llm_mode != off).

    Per-alert failures are recorded in `errors` and the alert is marked
    "failed" via alert_ingest_tool so the next scheduled run retries it.
    """
    policy = state.get("policy", {})
    defaults = policy.get("defaults", {})
    alerts = state.get("alerts", [])
    dry_run = state.get("dry_run", False)

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")

    # Copy accumulators so the incoming state dict is never mutated in place.
    created_incidents: List[Dict] = list(state.get("created_incidents", []))
    updated_incidents: List[Dict] = list(state.get("updated_incidents", []))
    skipped_alerts: List[Dict] = list(state.get("skipped_alerts", []))
    errors: List[Dict] = list(state.get("errors", []))

    max_incidents = state.get("max_incidents", 5)
    max_triages = state.get("max_triages", 5)
    triage_runs = state.get("triage_runs", 0)
    processed = 0

    ack_prefix = defaults.get("ack_note_prefix", "alert_triage_loop")

    async with GatewayClient() as gw:
        for alert in alerts:
            alert_ref = alert.get("alert_ref", "?")
            try:
                actions = match_alert(alert, policy)
                incident_id = None
                triage_run_id = None

                # ── Digest-only: ack immediately, no incident ─────────────────
                if not actions.get("auto_incident", False):
                    if actions.get("digest_only"):
                        skipped_alerts.append({
                            "alert_ref": alert_ref,
                            "service": alert.get("service"),
                            "severity": alert.get("severity"),
                            "reason": "digest_only (policy)",
                        })
                        if not dry_run and actions.get("ack", True):
                            await _call_tool(
                                gw, "alert_ingest_tool", "ack",
                                {"alert_ref": alert_ref, "actor": agent_id,
                                 "note": f"{ack_prefix}:digest_only"},
                                run_id=run_id, node="ack_digest",
                                agent_id=agent_id, workspace_id=workspace_id,
                            )
                    processed += 1
                    continue

                # ── Auto incident creation ─────────────────────────────────────
                # Per-run budget: created + updated both count toward the cap.
                if len(created_incidents) + len(updated_incidents) >= max_incidents:
                    skipped_alerts.append({
                        "alert_ref": alert_ref,
                        "reason": "max_incidents_per_run reached",
                    })
                    # Don't ack — leave as processing; next run picks it up
                    processed += 1
                    continue

                if not dry_run:
                    inc_result = await _call_tool(
                        gw, "oncall_tool", "alert_to_incident",
                        {
                            "alert_ref": alert_ref,
                            "incident_severity_cap": actions.get("incident_severity_cap", "P1"),
                            "dedupe_window_minutes": int(actions.get("dedupe_window_minutes", 120)),
                            "attach_artifact": actions.get("attach_alert_artifact", True),
                        },
                        run_id=run_id, node="alert_to_incident",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )
                    if inc_result:
                        incident_id = inc_result.get("incident_id")
                        incident_signature = inc_result.get("incident_signature", "")
                        if inc_result.get("created"):
                            created_incidents.append({
                                "incident_id": incident_id,
                                "alert_ref": alert_ref,
                                "service": alert.get("service"),
                                "severity": inc_result.get("severity"),
                                "signature": incident_signature,
                            })
                        else:
                            # Deduped: alert was attached to an existing incident.
                            updated_incidents.append({
                                "incident_id": incident_id,
                                "alert_ref": alert_ref,
                                "note": inc_result.get("note", "attached"),
                            })
                    else:
                        # incident creation failed — mark alert as failed
                        await _call_tool(
                            gw, "alert_ingest_tool", "fail",
                            {"alert_ref": alert_ref,
                             "error": "alert_to_incident returned empty",
                             "retry_after_seconds": 300},
                            run_id=run_id, node="fail_alert",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                        errors.append({
                            "node": "alert_to_incident",
                            "alert_ref": alert_ref,
                            "error": "empty response",
                        })
                        processed += 1
                        continue
                else:
                    # Dry run: synthesize an incident id from the policy signature.
                    sig = compute_incident_signature(alert, policy)
                    incident_id = f"dry_run_inc_{sig[:8]}"
                    incident_signature = sig
                    created_incidents.append({
                        "incident_id": incident_id,
                        "alert_ref": alert_ref,
                        "service": alert.get("service"),
                        "dry_run": True,
                    })

                # ── Cooldown check before triage ──────────────────────────────
                # incident_signature is always bound here: both the live-success
                # and dry-run branches above set it, and the failure branch
                # `continue`s before reaching this point.
                cooldown_ok = True
                cooldown_minutes = int(
                    policy.get("defaults", {}).get("triage_cooldown_minutes",
                                                   COOLDOWN_DEFAULT_MINUTES)
                )
                if (
                    incident_id
                    and actions.get("auto_triage", False)
                    and incident_signature
                    and not dry_run
                ):
                    sig_check = await _call_tool(
                        gw, "oncall_tool", "signature_should_triage",
                        {"signature": incident_signature,
                         "cooldown_minutes": cooldown_minutes},
                        run_id=run_id, node="cooldown_check",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )
                    # Fail open: if the check errors ({}), triage proceeds.
                    cooldown_ok = sig_check.get("should_triage", True)

                    if not cooldown_ok:
                        # Cooldown active: append soft event but don't triage
                        await _call_tool(
                            gw, "oncall_tool", "incident_append_event",
                            {"incident_id": incident_id,
                             "type": "note",
                             "message": f"Alert observed during triage cooldown "
                                        f"(signature={incident_signature[:8]}, "
                                        f"cooldown={cooldown_minutes}min)",
                             "meta": {"alert_ref": alert_ref,
                                      "cooldown_active": True}},
                            run_id=run_id, node="cooldown_event",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )

                # ── Deterministic triage ──────────────────────────────────────
                if (
                    incident_id
                    and actions.get("auto_triage", False)
                    and triage_runs < max_triages
                    and cooldown_ok
                ):
                    triage_mode = actions.get("triage_mode", "deterministic")
                    # Policy gate: downgrade LLM triage when llm_mode is off.
                    if triage_mode == "llm" and not is_llm_allowed("triage", policy):
                        triage_mode = "deterministic"
                        logger.info("llm_mode=off → deterministic triage for %s", alert_ref)

                    if triage_mode == "deterministic" and not dry_run:
                        try:
                            triage_run_id = await _run_deterministic_triage(
                                gw, incident_id, alert, agent_id, workspace_id, run_id
                            )
                            triage_runs += 1
                            # Mark signature cooldown
                            await _call_tool(
                                gw, "oncall_tool", "signature_mark_triage",
                                {"signature": incident_signature},
                                run_id=run_id, node="mark_triage",
                                agent_id=agent_id, workspace_id=workspace_id,
                            )
                        except Exception as te:
                            # Triage failure is non-fatal: the incident exists,
                            # only the auto-generated report is missing.
                            logger.warning("Triage failed for %s: %s", incident_id, te)
                            errors.append({
                                "node": "triage",
                                "incident_id": incident_id,
                                "error": str(te),
                            })

                # ── Ack alert (success) ────────────────────────────────────────
                if not dry_run and actions.get("ack", True):
                    note_parts = [ack_prefix]
                    if incident_id:
                        note_parts.append(f"incident:{incident_id}")
                    if triage_run_id:
                        note_parts.append(f"triage:{triage_run_id}")
                    await _call_tool(
                        gw, "alert_ingest_tool", "ack",
                        {"alert_ref": alert_ref, "actor": agent_id,
                         "note": "|".join(note_parts)},
                        run_id=run_id, node="ack_alert",
                        agent_id=agent_id, workspace_id=workspace_id,
                    )

                processed += 1

            except Exception as e:
                logger.error("Error processing alert %s: %s", alert_ref, e)
                errors.append({
                    "node": "process_alerts",
                    "alert_ref": alert_ref,
                    "error": str(e),
                })
                # Mark alert as failed so it retries next run
                try:
                    # Fresh client: the outer `gw` may be in a broken state
                    # after the exception.
                    async with GatewayClient() as gw2:
                        await _call_tool(
                            gw2, "alert_ingest_tool", "fail",
                            {"alert_ref": alert_ref, "error": str(e)[:200],
                             "retry_after_seconds": 300},
                            run_id=run_id, node="fail_on_error",
                            agent_id=agent_id, workspace_id=workspace_id,
                        )
                except Exception:
                    pass  # non-fatal fail-marking
                processed += 1

    return {
        **state,
        "processed": state.get("processed", 0) + processed,
        "created_incidents": created_incidents,
        "updated_incidents": updated_incidents,
        "skipped_alerts": skipped_alerts,
        "errors": errors,
        "triage_runs": triage_runs,
    }
|
|
|
|
|
|
async def _run_deterministic_triage(
    gw: GatewayClient,
    incident_id: str,
    alert: Dict,
    agent_id: str,
    workspace_id: str,
    run_id: str,
) -> Optional[str]:
    """
    Run deterministic triage for an incident:
      1. service_overview (observability)
      2. health check (oncall)
      3. KB runbook snippets
      4. Compile and attach triage report artifact

    Returns the generated triage_id. No LLM involvement — the report is
    assembled purely from the _build_* rule tables and tool responses.
    Tool failures degrade gracefully: _call_tool returns {} on error.
    """
    import json, base64, hashlib  # local import — only needed for triage runs

    service = alert.get("service", "unknown")
    env = alert.get("env", "prod")
    now = datetime.datetime.utcnow()
    # Look back one hour of telemetry for the overview call.
    time_from = (now - datetime.timedelta(hours=1)).isoformat()
    time_to = now.isoformat()
    # Short, collision-resistant id derived from incident id + timestamp.
    triage_id = f"tri_{hashlib.sha256(f'{incident_id}{now}'.encode()).hexdigest()[:8]}"

    # 1. Service overview
    overview_data = await _call_tool(
        gw, "observability_tool", "service_overview",
        {"service": service, "env": env,
         "time_range": {"from": time_from, "to": time_to}},
        run_id=run_id, node="triage_overview",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # 2. Health check
    health_data = await _call_tool(
        gw, "oncall_tool", "service_health",
        {"service": service, "env": env},
        run_id=run_id, node="triage_health",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # 3. KB runbooks
    kb_data = await _call_tool(
        gw, "kb_tool", "snippets",
        {"query": f"{service} {alert.get('kind', '')} {alert.get('title', '')}",
         "limit": 3},
        run_id=run_id, node="triage_kb",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # 4. Compile deterministic report
    report = {
        "triage_id": triage_id,
        "incident_id": incident_id,
        "service": service,
        "env": env,
        "mode": "deterministic",
        "alert_ref": alert.get("alert_ref", ""),
        "generated_at": now.isoformat(),
        "summary": (
            f"Auto-triage for {service} {alert.get('kind','?')} "
            f"(severity={alert.get('severity','?')})"
        ),
        "suspected_root_causes": _build_root_causes(alert, overview_data, health_data),
        "impact_assessment": _build_impact(alert, overview_data),
        "mitigations_now": _build_mitigations(alert, health_data, kb_data),
        "next_checks": _build_next_checks(alert, overview_data),
        "references": {
            "metrics": overview_data.get("metrics", {}),
            "health": health_data,
            "runbook_snippets": (kb_data.get("snippets") or [])[:3],
        },
    }

    # Attach as incident artifact (default=str handles datetime values in refs)
    content = json.dumps(report, indent=2, default=str).encode()
    content_b64 = base64.b64encode(content).decode()

    await _call_tool(
        gw, "oncall_tool", "incident_attach_artifact",
        {
            "incident_id": incident_id,
            "kind": "triage_report",
            "format": "json",
            "content_base64": content_b64,
            "filename": f"triage_{triage_id}.json",
        },
        run_id=run_id, node="attach_triage",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    # Append timeline event
    await _call_tool(
        gw, "oncall_tool", "incident_append_event",
        {
            "incident_id": incident_id,
            "type": "note",
            "message": f"Deterministic triage completed (triage_id={triage_id})",
            "meta": {"triage_id": triage_id, "mode": "deterministic"},
        },
        run_id=run_id, node="triage_event",
        agent_id=agent_id, workspace_id=workspace_id,
    )

    return triage_id
|
|
|
|
|
|
def _build_root_causes(alert: Dict, overview: Dict, health: Dict) -> List[Dict]:
    """Derive a ranked list of suspected root causes from rule tables.

    Entries: the alert-kind lookup (rank 1), optional metric hints from the
    alert payload, and an optional failed-health-check entry.

    Fix: ranks are now assigned sequentially (len(causes) + 1) for every
    appended entry. The previous version hard-coded ranks 2/3 for the metric
    hints, which could collide with the health entry's computed rank (e.g.
    error-rate hint and health entry both getting rank 3 when the latency
    hint was absent).
    """
    causes: List[Dict] = []
    kind = alert.get("kind", "custom")

    kind_cause_map = {
        "slo_breach": "SLO breach detected (latency/error rate exceeded threshold)",
        "latency": "High latency observed — possible overload or downstream dependency degradation",
        "error_rate": "Elevated error rate — check recent deployments and upstream dependencies",
        "crashloop": "Container crash-looping — OOM or unhandled exception in startup",
        "oom": "Out-of-memory condition — memory leak or insufficient limits",
        "disk": "Disk/PVC capacity pressure — check log rotation and data retention",
        "deploy": "Recent deployment may have introduced regression",
        "security": "Security event detected — unauthorized access or injection attempt",
    }
    description = kind_cause_map.get(kind, f"Alert kind '{kind}' triggered on {alert.get('service', '?')}")
    causes.append({"rank": 1, "cause": description, "evidence": [_alert_line(alert)]})

    # Add metric-based hints from the alert payload itself.
    alert_metrics = alert.get("metrics", {})
    if alert_metrics.get("latency_p95_ms", 0) > 500:
        causes.append({
            "rank": len(causes) + 1,
            "cause": f"High p95 latency: {alert_metrics['latency_p95_ms']}ms",
            "evidence": ["From alert metrics"],
        })
    if alert_metrics.get("error_rate_pct", 0) > 1.0:
        causes.append({
            "rank": len(causes) + 1,
            "cause": f"Elevated error rate: {alert_metrics['error_rate_pct']}%",
            "evidence": ["From alert metrics"],
        })

    if health and not health.get("healthy", True):
        causes.append({
            "rank": len(causes) + 1,
            "cause": f"Service health check failed: {health.get('status', 'unknown')}",
            "evidence": [str(health.get("details", ""))[:200]],
        })
    return causes
|
|
|
|
|
|
def _build_impact(alert: Dict, overview: Dict) -> str:
    """Map the alert's severity to a one-line impact statement."""
    sev = alert.get("severity", "P2")
    svc = alert.get("service", "unknown")
    env = alert.get("env", "prod")
    by_severity = {
        "P0": f"CRITICAL: {svc} is fully degraded in {env}. Immediate action required.",
        "P1": f"HIGH: {svc} in {env} is significantly impaired. Users affected.",
        "P2": f"MEDIUM: {svc} in {env} is partially degraded. Monitoring required.",
        "P3": f"LOW: Minor degradation in {svc} ({env}). No immediate user impact.",
    }
    # Unknown severities fall back to a generic statement.
    fallback = f"{svc} affected in {env}"
    return by_severity.get(sev, fallback)
|
|
|
|
|
|
def _build_mitigations(alert: Dict, health: Dict, kb: Dict) -> List[str]:
    """Suggest immediate mitigations for the alert kind, plus up to two runbook links."""
    kind = alert.get("kind", "custom")

    playbook = {
        "slo_breach": ["Check recent deployments and rollback if needed",
                       "Scale service if under load", "Review error budget"],
        "latency": ["Check downstream dependency health",
                    "Review connection pool settings", "Check for resource contention"],
        "error_rate": ["Review application logs for exceptions",
                       "Check recent config changes", "Verify upstream dependencies"],
        "crashloop": ["Check pod logs: kubectl logs <pod> --previous",
                      "Review resource limits", "Check liveness probe configuration"],
        "oom": ["Increase memory limits", "Check for memory leaks",
                "Review heap dumps if available"],
        "disk": ["Run log rotation", "Check data retention policies",
                 "Delete old artifacts / compact audit logs"],
        "deploy": ["Review deployment diff", "Run smoke tests",
                   "Consider rollback if metrics degraded"],
        "security": ["Block suspicious IPs", "Rotate affected credentials",
                     "Audit access logs", "Notify security team"],
    }
    # Copy so the playbook table is never mutated by callers.
    steps = list(playbook.get(kind, ["Investigate logs and metrics"]))

    # At most two KB runbook references are appended.
    for snippet in (kb.get("snippets") or [])[:2]:
        steps.append(f"See runbook: {snippet.get('path', 'KB')}")

    return steps
|
|
|
|
|
|
def _build_next_checks(alert: Dict, overview: Dict) -> List[str]:
    """Standard follow-up checklist, personalized with the alert's service name."""
    service_name = alert.get("service", "unknown")
    checks = [
        f"Monitor {service_name} error rate and latency for next 15 min",
        "Check incident_triage_graph for deeper analysis",
        "Verify SLO status with observability_tool.slo_snapshot",
        "If not resolved in 30 min → escalate to P0",
    ]
    return checks
|
|
|
|
|
|
def _alert_line(alert: Dict) -> str:
    """Short single-line summary of an alert for evidence lists.

    NOTE(review): this shadows an earlier `_alert_line` defined above (which
    also appends the alert_ref) — consolidate the two definitions.
    """
    sev = alert.get('severity', '?')
    svc = alert.get('service', '?')
    kind = alert.get('kind', '?')
    title = alert.get('title', '')[:80]
    return f"[{sev}] {svc} {kind}: {title}"
|
|
|
|
|
|
async def build_digest_node(state: AlertTriageState) -> AlertTriageState:
    """Build short markdown digest for CTO/UI (max 3800 chars).

    Pure formatting over counters/lists already in state — no tool calls.
    Produces `digest_md` (markdown) and `result_summary` (machine-readable).
    """
    created = state.get("created_incidents", [])
    updated = state.get("updated_incidents", [])
    skipped = state.get("skipped_alerts", [])
    errors = state.get("errors", [])
    processed = state.get("processed", 0)
    alerts = state.get("alerts", [])  # NOTE(review): read but unused below
    dry_run = state.get("dry_run", False)
    triage_runs = state.get("triage_runs", 0)

    ts = _now_iso()
    dry_tag = " **[DRY RUN]**" if dry_run else ""
    # Header + one-line counter summary.
    lines = [
        f"## Alert Triage Digest{dry_tag} — {ts[:19]}Z",
        "",
        f"**Processed:** {processed} alerts | "
        f"**New incidents:** {len(created)} | "
        f"**Updated:** {len(updated)} | "
        f"**Skipped/Digest:** {len(skipped)} | "
        f"**Triages run:** {triage_runs} | "
        f"**Errors:** {len(errors)}",
        "",
    ]

    if created:
        lines.append("### 🆕 Created Incidents")
        for item in created[:10]:
            sev = item.get("severity", "?")
            svc = item.get("service", "?")
            inc_id = item.get("incident_id", "?")
            ref = item.get("alert_ref", "?")
            sig = (item.get("signature") or "")[:8]
            lines.append(f"- `{inc_id}` [{sev}] {svc} (alert: {ref}, sig: {sig})")
        lines.append("")

    if updated:
        lines.append("### 🔄 Updated Incidents (alert attached)")
        for item in updated[:10]:
            inc_id = item.get("incident_id", "?")
            ref = item.get("alert_ref", "?")
            lines.append(f"- `{inc_id}` ← alert `{ref}` ({item.get('note', '')})")
        lines.append("")

    if skipped:
        lines.append("### ⏭ Skipped / Digest-only")
        for item in skipped[:15]:
            svc = item.get("service", "?")
            sev = item.get("severity", "?")
            reason = item.get("reason", "policy")
            ref = item.get("alert_ref", "?")
            lines.append(f"- [{sev}] {svc} `{ref}` — {reason}")
        if len(skipped) > 15:
            lines.append(f"- … and {len(skipped) - 15} more")
        lines.append("")

    if errors:
        lines.append("### ⚠️ Errors (non-fatal)")
        for e in errors[:5]:
            lines.append(f"- `{e.get('node','?')}`: {str(e.get('error','?'))[:120]}")
        lines.append("")

    # ── Escalation results ─────────────────────────────────────────────────────
    escalation = state.get("escalation_result") or {}
    esc_count = escalation.get("escalated", 0)
    esc_candidates = escalation.get("candidates", [])

    if esc_count > 0:
        lines.append(f"### ⬆️ Escalated Incidents ({esc_count})")
        for c in esc_candidates[:5]:
            # Only render rows whose severity actually changed.
            if c.get("from_severity") != c.get("to_severity"):
                lines.append(
                    f"- `{c.get('incident_id','?')}` {c.get('service','?')}: "
                    f"{c.get('from_severity')} → {c.get('to_severity')} "
                    f"(occ_60m={c.get('occurrences_60m',0)}, "
                    f"triage_24h={c.get('triage_count_24h',0)})"
                )
        lines.append("")

    # ── Auto-resolve candidates ────────────────────────────────────────────────
    ar = state.get("autoresolve_result") or {}
    ar_count = ar.get("candidates_count", 0)
    if ar_count > 0:
        lines.append(f"### 🟡 Auto-resolve Candidates ({ar_count})")
        for c in (ar.get("candidates") or [])[:5]:
            lines.append(
                f"- `{c.get('incident_id','?')}` [{c.get('severity','?')}] "
                f"{c.get('service','?')}: no alerts for "
                f"{c.get('minutes_without_alerts', '?')}min"
            )
        lines.append("")

    if not created and not updated and not skipped and not errors:
        lines.append("_No alerts to process in this window._")

    digest_md = "\n".join(lines)

    # Truncate if over limit
    if len(digest_md) > MAX_DIGEST_CHARS:
        digest_md = digest_md[:MAX_DIGEST_CHARS - 50] + "\n\n… *(digest truncated)*"

    result_summary = {
        "processed": processed,
        "created_incidents": len(created),
        "updated_incidents": len(updated),
        "skipped": len(skipped),
        "triage_runs": triage_runs,
        "escalated": esc_count,
        "autoresolve_candidates": ar_count,
        "errors": len(errors),
    }

    return {
        **state,
        "digest_md": digest_md,
        "result_summary": result_summary,
    }
|
|
|
|
|
|
async def post_process_escalation_node(state: AlertTriageState) -> AlertTriageState:
    """
    After processing alerts: call incident_escalation_tool.evaluate.
    Only runs if at least 1 alert was processed. Non-fatal.
    """
    if not state.get("processed", 0):
        # Nothing processed this run → nothing to evaluate.
        return {**state, "escalation_result": {}}

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")
    dry_run = state.get("dry_run", False)

    result: Dict = {}
    try:
        async with GatewayClient() as gw:
            result = await _call_tool(
                gw, "incident_escalation_tool", "evaluate",
                {
                    "window_minutes": 60,
                    "limit": 50,
                    "dry_run": dry_run,
                },
                run_id=run_id, node="post_escalation",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.warning("post_process_escalation_node failed (non-fatal): %s", e)
        result = {}

    return {**state, "escalation_result": result}
|
|
|
|
|
|
async def post_process_autoresolve_node(state: AlertTriageState) -> AlertTriageState:
    """
    After processing alerts: find auto-resolve candidates.
    Always dry_run=True (candidate-only, no actual close unless policy says otherwise).
    Non-fatal.
    """
    if not state.get("processed", 0):
        # Nothing processed this run → nothing to inspect.
        return {**state, "autoresolve_result": {}}

    agent_id = state.get("agent_id", "sofiia")
    workspace_id = state.get("workspace_id", "default")
    run_id = state.get("_run_id", "unknown")

    result: Dict = {}
    try:
        async with GatewayClient() as gw:
            result = await _call_tool(
                gw, "incident_escalation_tool", "auto_resolve_candidates",
                {
                    "no_alerts_minutes": 60,
                    "limit": 50,
                    "dry_run": True,  # always candidate-only in loop
                },
                run_id=run_id, node="post_autoresolve",
                agent_id=agent_id, workspace_id=workspace_id,
            )
    except Exception as e:
        logger.warning("post_process_autoresolve_node failed (non-fatal): %s", e)
        result = {}

    return {**state, "autoresolve_result": result}
|
|
|
|
|
|
# ─── Graph builder ─────────────────────────────────────────────────────────────
|
|
|
|
def build_alert_triage_graph():
    """
    Build the alert_triage LangGraph.

    LLM usage: ZERO in steady state (llm_mode=off in policy).
    All nodes are deterministic Python + gateway tool calls.

    Flow: load_policy → claim_alerts → process_alerts
          → post_escalation → post_autoresolve → build_digest
    """
    # Linear pipeline declared once; wiring derived from the order below.
    pipeline = [
        ("load_policy", load_policy_node),
        ("list_alerts", list_alerts_node),
        ("process_alerts", process_alerts_node),
        ("post_escalation", post_process_escalation_node),
        ("post_autoresolve", post_process_autoresolve_node),
        ("build_digest", build_digest_node),
    ]

    workflow = StateGraph(AlertTriageState)
    for node_name, node_fn in pipeline:
        workflow.add_node(node_name, node_fn)

    workflow.set_entry_point(pipeline[0][0])
    for (src, _), (dst, _) in zip(pipeline, pipeline[1:]):
        workflow.add_edge(src, dst)
    workflow.add_edge(pipeline[-1][0], END)

    return workflow.compile()
|