Files
microdao-daarion/services/sofiia-supervisor/app/graphs/incident_triage_graph.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

743 lines
29 KiB
Python

"""
Graph 2: incident_triage_graph
Collects observability, logs, health, KB runbooks, optionally traces,
and governance context (privacy + cost), then builds a structured triage report.
Node sequence:
validate_input → service_overview → top_errors_logs
→ health_and_runbooks → trace_lookup (optional)
→ slo_context → privacy_context → cost_context
→ build_triage_report → END
All tool calls via gateway. No direct access to Prometheus/Loki/etc.
"""
from __future__ import annotations
import datetime
import logging
import re
from typing import Any, Dict, List, Optional, TypedDict
from langgraph.graph import StateGraph, END
from ..config import settings
from ..gateway_client import GatewayClient
logger = logging.getLogger(__name__)
_SECRET_PAT = re.compile(
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
re.IGNORECASE,
)
def _redact_lines(lines: List[str]) -> List[str]:
"""Mask secrets in log lines before including in report."""
return [_SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", line) for line in lines]
def _clamp_time_range(time_range: Optional[Dict[str, str]], max_hours: int) -> Dict[str, str]:
"""Ensure time window ≤ max_hours. Clamp end-start if larger."""
now = datetime.datetime.now(datetime.timezone.utc)
default_from = (now - datetime.timedelta(hours=1)).isoformat()
default_to = now.isoformat()
if not time_range:
return {"from": default_from, "to": default_to}
try:
from_dt = datetime.datetime.fromisoformat(time_range["from"].replace("Z", "+00:00"))
to_dt = datetime.datetime.fromisoformat(time_range.get("to", default_to).replace("Z", "+00:00"))
delta = to_dt - from_dt
if delta.total_seconds() > max_hours * 3600:
# Clamp: keep "to", shorten "from"
from_dt = to_dt - datetime.timedelta(hours=max_hours)
return {"from": from_dt.isoformat(), "to": to_dt.isoformat()}
return {"from": from_dt.isoformat(), "to": to_dt.isoformat()}
except Exception:
return {"from": default_from, "to": default_to}
# ─── State ────────────────────────────────────────────────────────────────────
class IncidentTriageState(TypedDict, total=False):
    """State dict threaded through the incident_triage graph.

    total=False: every key is optional. Nodes read with .get() and return a
    new dict merging their results over the incoming state.
    """
    # Context (injected before graph.invoke)
    run_id: str
    agent_id: str
    workspace_id: str
    user_id: str
    input: Dict[str, Any]  # raw caller payload; normalised by validate_input_node
    # Validated (written by validate_input_node)
    service: str
    symptom: str
    time_range: Dict[str, str]  # {"from": iso8601, "to": iso8601}, window clamped
    env: str
    include_traces: bool
    max_log_lines: int
    log_query_hint: Optional[str]
    validation_error: Optional[str]  # set (with graph_status="failed") on bad input
    # Node results — each stays absent/None until its node has run;
    # failed tool calls store {"error"/"skipped": ...} markers instead of raising.
    service_overview_data: Optional[Dict]
    top_errors_data: Optional[Dict]
    log_samples: List[str]  # secret-redacted log lines (see _redact_lines)
    health_data: Optional[Dict]
    runbook_snippets: List[Dict]
    trace_data: Optional[Dict]
    slo_context_data: Optional[Dict]
    privacy_context_data: Optional[Dict]
    cost_context_data: Optional[Dict]
    # Output (written by build_triage_report_node)
    result: Optional[Dict[str, Any]]
    graph_status: str  # "running" | "succeeded" | "failed"
    error: Optional[str]
# ─── Nodes ────────────────────────────────────────────────────────────────────
async def validate_input_node(state: IncidentTriageState) -> IncidentTriageState:
    """Validate and normalise triage inputs; clamp the time window and log limit.

    On invalid input, marks graph_status="failed" with a validation_error so
    the router sends the run straight to build_triage_report.

    Fix: a non-numeric ``max_log_lines`` used to raise an uncaught
    ValueError/TypeError and crash the graph; it is now reported as a
    validation error. The accepted value is also clamped to >= 1 so a
    zero/negative request cannot produce empty or negative log slices.
    """
    inp = state.get("input", {})
    service = inp.get("service", "").strip()
    symptom = inp.get("symptom", "").strip()
    if not service:
        return {**state, "graph_status": "failed", "validation_error": "service is required"}
    if not symptom:
        return {**state, "graph_status": "failed", "validation_error": "symptom is required"}
    # Parse the log limit before any settings-dependent work so a bad value
    # fails fast with a clear message.
    try:
        requested_lines = int(inp.get("max_log_lines", 120))
    except (TypeError, ValueError):
        return {
            **state,
            "graph_status": "failed",
            "validation_error": "max_log_lines must be an integer",
        }
    time_range = _clamp_time_range(
        inp.get("time_range"),
        settings.INCIDENT_MAX_TIME_WINDOW_H,
    )
    max_log_lines = max(1, min(requested_lines, settings.INCIDENT_MAX_LOG_LINES))
    return {
        **state,
        "service": service,
        "symptom": symptom,
        "time_range": time_range,
        "env": inp.get("env", "prod"),
        "include_traces": bool(inp.get("include_traces", False)),
        "max_log_lines": max_log_lines,
        "log_query_hint": inp.get("log_query_hint"),
        "log_samples": [],
        "runbook_snippets": [],
        "graph_status": "running",
    }
async def service_overview_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 1: observability_tool action=service_overview via the gateway.
    Gathers the metrics summary, recent alerts and SLO status for the service.
    Failures are non-fatal: an {"error": ...} marker is stored and triage continues.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    tool_params = {
        "service": state["service"],
        "time_range": state["time_range"],
        "env": state.get("env", "prod"),
    }
    async with GatewayClient() as gw:
        resp = await gw.call_tool(
            tool="observability_tool",
            action="service_overview",
            params=tool_params,
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="service_overview",
        )
    if resp.success:
        return {**state, "service_overview_data": resp.data or {}}
    logger.warning("incident_triage: service_overview failed run=%s err=%s", run_id, resp.error_message)
    # Non-fatal: continue with partial data
    return {**state, "service_overview_data": {"error": resp.error_message}}
async def top_errors_logs_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 2: observability_tool action=logs_query via the gateway.
    Keeps up to max_log_lines log samples, with secrets redacted, in state.
    Failures are non-fatal: an {"error": ...} marker is stored and triage continues.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    limit = state.get("max_log_lines", 120)
    # A caller-supplied hint wins; otherwise default to error-level lines for the service.
    query = state.get("log_query_hint") or f"service={state['service']} level=error"
    async with GatewayClient() as gw:
        resp = await gw.call_tool(
            tool="observability_tool",
            action="logs_query",
            params={
                "service": state["service"],
                "time_range": state["time_range"],
                "env": state.get("env", "prod"),
                "query": query,
                "limit": limit,
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="top_errors_logs",
        )
    if not resp.success:
        logger.warning("incident_triage: logs_query failed run=%s err=%s", run_id, resp.error_message)
        return {**state, "top_errors_data": {"error": resp.error_message}, "log_samples": []}
    payload = resp.data or {}
    # Tool payload key varies ("lines" vs "logs"); accept either.
    raw_lines: List[str] = payload.get("lines") or payload.get("logs") or []
    return {**state, "top_errors_data": payload, "log_samples": _redact_lines(raw_lines[:limit])}
async def health_and_runbooks_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 3: two sequential gateway lookups:
      a) oncall_tool action=service_health
      b) kb_tool action=search for runbook snippets (filtered to type=runbook)
    Both are non-fatal; results (or error markers) land in state and triage continues.

    Fixes:
    - The health_data one-liner mixed ``or`` with a conditional expression,
      which is easy to misread under Python precedence; replaced with
      explicit branches (same resulting values).
    - Both calls now share one GatewayClient session instead of opening a
      second client context for the KB search.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    service = state["service"]
    symptom = state.get("symptom", "")
    # Build KB query from service name + symptom text, capped to 200 chars.
    kb_query = f"{service} {symptom}"[:200]
    runbook_snippets: List[Dict] = []
    async with GatewayClient() as gw:
        # 3a — Health check
        hr = await gw.call_tool(
            tool="oncall_tool",
            action="service_health",
            params={"service": service, "env": state.get("env", "prod")},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="health_check",
        )
        if hr.success:
            health_data: Dict = hr.data or {}
        else:
            # Keep any partial data; otherwise record the error message.
            health_data = hr.data or {"error": hr.error_message}
        # 3b — KB runbook search
        kbr = await gw.call_tool(
            tool="kb_tool",
            action="search",
            params={"query": kb_query, "top_k": 5, "filter": {"type": "runbook"}},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="kb_runbooks",
        )
    if kbr.success and kbr.data:
        for item in (kbr.data.get("results") or [])[:5]:
            # Redact secrets from snippet text before it enters the report.
            snippet_text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***",
                                           item.get("content", "")[:500])
            runbook_snippets.append({
                "path": item.get("path", item.get("source", "")),
                "lines": item.get("lines", ""),
                "text": snippet_text,
            })
    return {**state, "health_data": health_data, "runbook_snippets": runbook_snippets}
async def trace_lookup_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 4 (optional): If include_traces=True, look for trace IDs in log samples
    and query observability_tool action=traces_query.
    Gracefully skips if no traces found or tool unavailable.

    Fix: trace IDs are now deduplicated (order-preserving) before the 3-ID cap
    is applied — previously repeated log lines for the same trace could consume
    all three query slots with one ID.
    """
    if state.get("graph_status") == "failed":
        return state
    if not state.get("include_traces", False):
        return {**state, "trace_data": None}
    run_id = state.get("run_id", "")
    # Extract trace IDs from log samples (trace_id=<hex> or traceId=<hex>).
    trace_pat = re.compile(r'(?:trace[_-]?id|traceId)[=:\s]+([0-9a-f]{16,32})', re.IGNORECASE)
    seen: set = set()
    trace_ids: List[str] = []
    for line in (state.get("log_samples") or [])[:50]:
        for m in trace_pat.finditer(line):
            tid = m.group(1)
            if tid not in seen:
                seen.add(tid)
                trace_ids.append(tid)
            if len(trace_ids) >= 3:
                break
        if len(trace_ids) >= 3:
            break
    if not trace_ids:
        logger.info("incident_triage: no trace IDs found in logs run=%s", run_id)
        return {**state, "trace_data": {"note": "no_trace_ids_in_logs"}}
    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="observability_tool",
            action="traces_query",
            params={
                "service": state["service"],
                "trace_ids": trace_ids[:3],
                "time_range": state["time_range"],
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="trace_lookup",
        )
    if not result.success:
        logger.info("incident_triage: trace_lookup skipped run=%s err=%s", run_id, result.error_message)
        return {**state, "trace_data": {"note": f"trace_query_failed: {result.error_message}"}}
    return {**state, "trace_data": result.data or {}}
async def slo_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 4b: SLO thresholds and current metrics for the incident service,
    via observability_tool.slo_snapshot through the gateway.
    Non-fatal: any failure stores a {"skipped": True, "reason": ...} marker
    and triage continues.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    svc = state.get("service", "")
    tr = state.get("time_range", {})
    # Derive the snapshot window in minutes from the incident range, clamped to 5..60;
    # any parse problem falls back to a full hour.
    try:
        start = datetime.datetime.fromisoformat(tr.get("from", "").replace("Z", "+00:00"))
        end = datetime.datetime.fromisoformat(tr.get("to", "").replace("Z", "+00:00"))
        window_min = max(5, min(60, int((end - start).total_seconds() / 60)))
    except Exception:
        window_min = 60
    try:
        async with GatewayClient() as gw:
            resp = await gw.call_tool(
                tool="observability_tool",
                action="slo_snapshot",
                params={
                    "service": svc,
                    "env": state.get("env", "prod"),
                    "window_minutes": window_min,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="slo_context",
            )
        if not resp.success:
            logger.info("incident_triage: slo_context skipped run=%s err=%s", run_id, resp.error_message)
            return {**state, "slo_context_data": {"skipped": True, "reason": resp.error_message}}
        payload = resp.data or {}
        snapshot = {
            "violations": payload.get("violations", []),
            "metrics": payload.get("metrics", {}),
            "thresholds": payload.get("thresholds", {}),
            "skipped": payload.get("skipped", False),
        }
        return {**state, "slo_context_data": snapshot}
    except Exception as e:
        logger.info("incident_triage: slo_context failed run=%s err=%s", run_id, e)
        return {**state, "slo_context_data": {"skipped": True, "reason": str(e)}}
async def privacy_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 5a: Scan audit events over the incident time window for privacy anomalies.
    Calls data_governance_tool.scan_audit via gateway.
    Non-fatal: if gateway fails, privacy_context_data is set to an error marker
    and the triage report continues normally.

    Fix: dropped the redundant function-local ``import datetime`` — the module
    already imports it at the top; the local import only shadowed it.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    time_range = state.get("time_range", {})
    # Compute window_hours from time_range (clamp 1..24); +1 rounds partial hours up.
    try:
        from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        window_h = max(1, min(24, int((to_dt - from_dt).total_seconds() / 3600) + 1))
    except Exception:
        window_h = 1
    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="data_governance_tool",
                action="scan_audit",
                params={
                    "backend": "jsonl",
                    "time_window_hours": window_h,
                    "max_events": 10000,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="privacy_context",
            )
        if not result.success:
            logger.info(
                "incident_triage: privacy_context skipped run=%s err=%s",
                run_id, result.error_message,
            )
            return {**state, "privacy_context_data": {"skipped": True, "reason": result.error_message}}
        data = result.data or {}
        stats = data.get("stats", {})
        return {**state, "privacy_context_data": {
            "findings_count": stats.get("errors", 0) + stats.get("warnings", 0),
            "findings": (data.get("findings") or [])[:5],  # top 5 only; evidence already masked
            "summary": data.get("summary", ""),
        }}
    except Exception as e:
        logger.info("incident_triage: privacy_context failed run=%s err=%s", run_id, e)
        return {**state, "privacy_context_data": {"skipped": True, "reason": str(e)}}
async def cost_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 5b: Detect cost/resource anomalies over the incident time window.
    Calls cost_analyzer_tool.anomalies via gateway.
    Non-fatal: on any failure, cost_context_data is set to skipped marker.

    Fix: dropped the redundant function-local ``import datetime`` — the module
    already imports it at the top; the local import only shadowed it.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    time_range = state.get("time_range", {})
    # Derive analysis window (15..60 min) and baseline (4..24 h) from the incident range.
    try:
        from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        window_minutes = max(15, min(60, int((to_dt - from_dt).total_seconds() / 60)))
        baseline_hours = max(4, min(24, int((to_dt - from_dt).total_seconds() / 3600) + 4))
    except Exception:
        window_minutes = 60
        baseline_hours = 24
    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="cost_analyzer_tool",
                action="anomalies",
                params={
                    "window_minutes": window_minutes,
                    "baseline_hours": baseline_hours,
                    "ratio_threshold": 3.0,
                    "min_calls": 5,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="cost_context",
            )
        if not result.success:
            logger.info(
                "incident_triage: cost_context skipped run=%s err=%s",
                run_id, result.error_message,
            )
            return {**state, "cost_context_data": {"skipped": True, "reason": result.error_message}}
        data = result.data or {}
        anomalies = data.get("anomalies") or []
        return {**state, "cost_context_data": {
            "anomaly_count": data.get("anomaly_count", len(anomalies)),
            "anomalies": anomalies[:5],  # top 5 spikes
            "recommendations": [a.get("recommendation", "") for a in anomalies[:3] if a.get("recommendation")],
        }}
    except Exception as e:
        logger.info("incident_triage: cost_context failed run=%s err=%s", run_id, e)
        return {**state, "cost_context_data": {"skipped": True, "reason": str(e)}}
async def build_triage_report_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 5: Pure aggregation — no tool calls.

    Assembles the structured triage report from everything earlier nodes
    collected (all fields optional): summary, ranked suspected root causes,
    impact assessment, immediate mitigations, next checks, and a
    references/context section. Sets graph_status to "succeeded", or returns
    an {"error": ...} result when validation had already failed.
    """
    # Validation failure short-circuits straight to an error result.
    if state.get("graph_status") == "failed":
        err = state.get("validation_error") or state.get("error", "Unknown error")
        return {**state, "result": {"error": err}, "graph_status": "failed"}
    service = state.get("service", "unknown")
    symptom = state.get("symptom", "")
    # Earlier nodes store None or error markers on failure; coalesce to empty
    # containers so the aggregation below never has to branch on None.
    overview = state.get("service_overview_data") or {}
    health = state.get("health_data") or {}
    log_samples = state.get("log_samples") or []
    runbooks = state.get("runbook_snippets") or []
    traces = state.get("trace_data")
    slo_ctx = state.get("slo_context_data") or {}
    privacy_ctx = state.get("privacy_context_data") or {}
    cost_ctx = state.get("cost_context_data") or {}
    # Extract alerts and error stats from observability overview.
    # NOTE(review): tolerates both key spellings ("alerts"/"active_alerts",
    # "slo"/"slo_status", "status"/"health") — tool payload schema not visible here.
    alerts = overview.get("alerts", overview.get("active_alerts", []))
    slo = overview.get("slo", overview.get("slo_status", {}))
    health_status = health.get("status", health.get("health", "unknown"))
    # Build suspected root causes from available signals. `rank` reflects the
    # order signals are considered: health → alerts → logs → SLO → cost → privacy.
    root_causes = []
    rank = 1
    if health_status in ("degraded", "down", "unhealthy", "error"):
        root_causes.append({
            "rank": rank,
            "cause": f"Service health: {health_status}",
            "evidence": [str(health.get("details", health_status))[:300]],  # cap evidence length
        })
        rank += 1
    for alert in alerts[:3]:
        # Alerts may be dicts (use their "name") or plain strings.
        root_causes.append({
            "rank": rank,
            "cause": f"Active alert: {alert.get('name', alert) if isinstance(alert, dict) else str(alert)}",
            "evidence": [str(alert)[:300]],
        })
        rank += 1
    if log_samples:
        # Count unique error patterns
        error_lines = [l for l in log_samples if "error" in l.lower() or "exception" in l.lower()][:10]
        if error_lines:
            root_causes.append({
                "rank": rank,
                "cause": f"Error patterns in logs ({len(error_lines)} samples)",
                "evidence": error_lines[:3],
            })
            rank += 1
    if not root_causes:
        # Nothing conclusive — surface the reported symptom as the only evidence.
        root_causes.append({
            "rank": 1,
            "cause": "No obvious signals found; investigation ongoing",
            "evidence": [symptom],
        })
    # Pre-extract SLO violations for impact and enrichment
    slo_violations = slo_ctx.get("violations") or []
    # Impact assessment from SLO + observability
    impact = "Unknown"
    if slo_violations and not slo_ctx.get("skipped"):
        slo_m = slo_ctx.get("metrics", {})
        impact = f"SLO breached: {', '.join(slo_violations)} (latency_p95={slo_m.get('latency_p95_ms', '?')}ms, error_rate={slo_m.get('error_rate_pct', '?')}%)"
    elif isinstance(slo, dict):
        error_rate = slo.get("error_rate") or slo.get("error_budget_consumed")
        if error_rate:
            impact = f"SLO impact: error_rate={error_rate}"
    if health_status in ("down", "unhealthy"):
        # A hard outage leads the impact text; any SLO detail is appended after it.
        impact = f"Service is {health_status}" + (f"; {impact}" if impact != "Unknown" else "")
    # Mitigations from runbooks: keep bullet lines and lines mentioning
    # restart/rollback from the top two snippets.
    mitigations_now = []
    for rb in runbooks[:2]:
        text = rb.get("text", "")
        lines = [l.strip() for l in text.split("\n") if l.strip().startswith("-") or "restart" in l.lower() or "rollback" in l.lower()]
        mitigations_now.extend(lines[:3])
    if not mitigations_now:
        # Generic fallback checklist when no runbook content matched.
        mitigations_now = ["Review logs for error patterns", "Check service health dashboard", "Consult runbook"]
    next_checks = [
        f"Verify {service} health endpoint returns 200",
        "Check upstream/downstream dependencies",
        "Review recent deployments in release history",
    ]
    if alerts:
        next_checks.insert(0, f"Acknowledge/resolve {len(alerts)} active alert(s)")
    # Enrich with SLO violations
    if slo_violations and not slo_ctx.get("skipped"):
        slo_metrics = slo_ctx.get("metrics", {})
        slo_thresholds = slo_ctx.get("thresholds", {})
        # Metric keys assumed to follow "<name>_ms" for latency and "<name>_pct"
        # otherwise — TODO confirm against the slo_snapshot payload.
        evidence = [
            f"{v}: actual={slo_metrics.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}, "
            f"threshold={slo_thresholds.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}"
            for v in slo_violations
        ]
        root_causes.append({
            "rank": rank,
            "cause": f"SLO violations: {', '.join(slo_violations)}",
            "evidence": evidence,
        })
        rank += 1
        next_checks.insert(0, f"Confirm SLO breach correlates with service degradation ({', '.join(slo_violations)})")
    # Enrich with cost context insights
    cost_anomalies = cost_ctx.get("anomalies") or []
    if cost_anomalies and not cost_ctx.get("skipped"):
        spike_tools = [a.get("tool", "?") for a in cost_anomalies[:2]]
        root_causes.append({
            "rank": rank,
            "cause": f"Resource/cost spike detected on: {', '.join(spike_tools)}",
            "evidence": [
                f"{a.get('tool')}: ratio={a.get('ratio')}, window_calls={a.get('window_calls')}"
                for a in cost_anomalies[:2]
            ],
        })
        rank += 1
        next_checks.append("Investigate resource spike — possible runaway process or retry storm")
    # Enrich with privacy context insights
    privacy_findings = privacy_ctx.get("findings") or []
    if privacy_findings and not privacy_ctx.get("skipped"):
        privacy_errors = [f for f in privacy_findings if f.get("severity") == "error"]
        if privacy_errors:
            root_causes.append({
                "rank": rank,
                "cause": f"Privacy/data governance issue during incident window ({len(privacy_errors)} error(s))",
                "evidence": [f.get("title", "")[:200] for f in privacy_errors[:2]],
            })
            rank += 1
            next_checks.append("Review data governance findings — possible PII/secrets exposure")
    # Build summary
    error_count = len([l for l in log_samples if "error" in l.lower()])
    summary = (
        f"Incident triage for '{service}' (symptom: {symptom[:100]}). "
        f"Health: {health_status}. "
        f"{len(root_causes)} suspected cause(s). "
        f"{error_count} error log samples. "
        f"{len(runbooks)} runbook snippet(s) found."
        + (f" Cost spikes: {len(cost_anomalies)}." if cost_anomalies else "")
        + (f" Privacy findings: {privacy_ctx.get('findings_count', 0)}." if not privacy_ctx.get("skipped") else "")
    )
    # Cost recommendations
    cost_recs = cost_ctx.get("recommendations") or []
    result = {
        "summary": summary,
        "suspected_root_causes": root_causes[:6],  # cap at six ranked causes
        "impact_assessment": impact,
        "mitigations_now": mitigations_now[:5],
        "next_checks": next_checks[:6],
        "references": {
            "metrics": {
                "slo": slo,
                "alerts_count": len(alerts),
            },
            "log_samples": log_samples[:10],
            "runbook_snippets": runbooks,
            **({"traces": traces} if traces else {}),  # only included when traces exist
        },
        "context": {
            "slo": {
                "violations": slo_violations,
                "metrics": slo_ctx.get("metrics", {}),
                "thresholds": slo_ctx.get("thresholds", {}),
                "skipped": slo_ctx.get("skipped", False),
            },
            "privacy": {
                "findings_count": privacy_ctx.get("findings_count", 0),
                "findings": privacy_findings[:3],
                "skipped": privacy_ctx.get("skipped", False),
            },
            "cost": {
                "anomaly_count": cost_ctx.get("anomaly_count", 0),
                "anomalies": cost_anomalies[:3],
                "recommendations": cost_recs,
                "skipped": cost_ctx.get("skipped", False),
            },
        },
    }
    return {**state, "result": result, "graph_status": "succeeded"}
# ─── Routing ─────────────────────────────────────────────────────────────────
def _after_validate(state: IncidentTriageState) -> str:
if state.get("graph_status") == "failed":
return "build_triage_report"
return "service_overview"
def _after_trace_lookup(state: IncidentTriageState) -> str:
return "build_triage_report"
# ─── Graph builder ────────────────────────────────────────────────────────────
def build_incident_triage_graph():
    """
    Build and compile the incident_triage LangGraph.
    Graph:
      validate_input → [if valid] service_overview → top_errors_logs
        → health_and_runbooks → trace_lookup
        → slo_context → privacy_context → cost_context
        → build_triage_report → END
      → [if invalid] build_triage_report → END
    """
    graph = StateGraph(IncidentTriageState)
    # Register every node; insertion order mirrors execution order.
    node_table = {
        "validate_input": validate_input_node,
        "service_overview": service_overview_node,
        "top_errors_logs": top_errors_logs_node,
        "health_and_runbooks": health_and_runbooks_node,
        "trace_lookup": trace_lookup_node,
        "slo_context": slo_context_node,
        "privacy_context": privacy_context_node,
        "cost_context": cost_context_node,
        "build_triage_report": build_triage_report_node,
    }
    for node_name, node_fn in node_table.items():
        graph.add_node(node_name, node_fn)
    graph.set_entry_point("validate_input")
    graph.add_conditional_edges(
        "validate_input",
        _after_validate,
        {"service_overview": "service_overview", "build_triage_report": "build_triage_report"},
    )
    # Linear chain after validation
    chain = [
        "service_overview",
        "top_errors_logs",
        "health_and_runbooks",
        "trace_lookup",
        "slo_context",
        "privacy_context",
        "cost_context",
        "build_triage_report",
    ]
    for src, dst in zip(chain, chain[1:]):
        graph.add_edge(src, dst)
    graph.add_edge("build_triage_report", END)
    return graph.compile()