New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
743 lines
29 KiB
Python
"""
|
|
Graph 2: incident_triage_graph
|
|
|
|
Collects observability, logs, health, KB runbooks, optionally traces,
|
|
and governance context (privacy + cost), then builds a structured triage report.
|
|
|
|
Node sequence:
|
|
validate_input → service_overview → top_errors_logs
|
|
→ health_and_runbooks → trace_lookup (optional)
|
|
→ slo_context → privacy_context → cost_context
|
|
→ build_triage_report → END
|
|
|
|
All tool calls via gateway. No direct access to Prometheus/Loki/etc.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional, TypedDict
|
|
|
|
from langgraph.graph import StateGraph, END
|
|
|
|
from ..config import settings
|
|
from ..gateway_client import GatewayClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SECRET_PAT = re.compile(
|
|
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _redact_lines(lines: List[str]) -> List[str]:
|
|
"""Mask secrets in log lines before including in report."""
|
|
return [_SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", line) for line in lines]
|
|
|
|
|
|
def _clamp_time_range(time_range: Optional[Dict[str, str]], max_hours: int) -> Dict[str, str]:
|
|
"""Ensure time window ≤ max_hours. Clamp end-start if larger."""
|
|
now = datetime.datetime.now(datetime.timezone.utc)
|
|
default_from = (now - datetime.timedelta(hours=1)).isoformat()
|
|
default_to = now.isoformat()
|
|
|
|
if not time_range:
|
|
return {"from": default_from, "to": default_to}
|
|
|
|
try:
|
|
from_dt = datetime.datetime.fromisoformat(time_range["from"].replace("Z", "+00:00"))
|
|
to_dt = datetime.datetime.fromisoformat(time_range.get("to", default_to).replace("Z", "+00:00"))
|
|
delta = to_dt - from_dt
|
|
if delta.total_seconds() > max_hours * 3600:
|
|
# Clamp: keep "to", shorten "from"
|
|
from_dt = to_dt - datetime.timedelta(hours=max_hours)
|
|
return {"from": from_dt.isoformat(), "to": to_dt.isoformat()}
|
|
return {"from": from_dt.isoformat(), "to": to_dt.isoformat()}
|
|
except Exception:
|
|
return {"from": default_from, "to": default_to}
|
|
|
|
|
|
# ─── State ────────────────────────────────────────────────────────────────────
|
|
|
|
class IncidentTriageState(TypedDict, total=False):
    """State dict threaded through the incident_triage LangGraph.

    ``total=False`` — every key is optional. Nodes add keys as they run,
    so downstream readers must use ``state.get(...)`` with defaults.
    """

    # Context (injected before graph.invoke)
    run_id: str
    agent_id: str
    workspace_id: str
    user_id: str
    input: Dict[str, Any]

    # Validated (written by validate_input_node)
    service: str
    symptom: str
    time_range: Dict[str, str]
    env: str
    include_traces: bool
    max_log_lines: int
    log_query_hint: Optional[str]
    validation_error: Optional[str]

    # Node results (None / empty when a node skipped or its tool call failed)
    service_overview_data: Optional[Dict]
    top_errors_data: Optional[Dict]
    log_samples: List[str]
    health_data: Optional[Dict]
    runbook_snippets: List[Dict]
    trace_data: Optional[Dict]
    slo_context_data: Optional[Dict]
    privacy_context_data: Optional[Dict]
    cost_context_data: Optional[Dict]

    # Output (written by build_triage_report_node)
    result: Optional[Dict[str, Any]]
    graph_status: str
    error: Optional[str]
|
|
|
|
|
|
# ─── Nodes ────────────────────────────────────────────────────────────────────
|
|
|
|
async def validate_input_node(state: IncidentTriageState) -> IncidentTriageState:
    """Validate and normalise triage inputs.

    Fails the graph (graph_status="failed" + validation_error) when
    ``service`` or ``symptom`` is missing; otherwise clamps the time window
    and the log-line budget to configured limits and seeds the list fields.
    """
    inp = state.get("input", {})
    service = inp.get("service", "").strip()
    symptom = inp.get("symptom", "").strip()

    if not service:
        return {**state, "graph_status": "failed", "validation_error": "service is required"}
    if not symptom:
        return {**state, "graph_status": "failed", "validation_error": "symptom is required"}

    time_range = _clamp_time_range(
        inp.get("time_range"),
        settings.INCIDENT_MAX_TIME_WINDOW_H,
    )

    # Defensive: a malformed max_log_lines (e.g. "lots") must not crash the
    # whole graph — fall back to the default, then clamp into [1, cap].
    try:
        requested_lines = int(inp.get("max_log_lines", 120))
    except (TypeError, ValueError):
        requested_lines = 120
    max_log_lines = max(1, min(requested_lines, settings.INCIDENT_MAX_LOG_LINES))

    return {
        **state,
        "service": service,
        "symptom": symptom,
        "time_range": time_range,
        "env": inp.get("env", "prod"),
        "include_traces": bool(inp.get("include_traces", False)),
        "max_log_lines": max_log_lines,
        "log_query_hint": inp.get("log_query_hint"),
        "log_samples": [],
        "runbook_snippets": [],
        "graph_status": "running",
    }
|
|
|
|
|
|
async def service_overview_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 1: fetch the observability service overview.

    Calls observability_tool action=service_overview via the gateway to
    collect the metrics summary, recent alerts, and SLO status. A tool
    failure is non-fatal: the error is recorded and triage continues.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    call_params = {
        "service": state["service"],
        "time_range": state["time_range"],
        "env": state.get("env", "prod"),
    }

    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="observability_tool",
            action="service_overview",
            params=call_params,
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="service_overview",
        )

    if result.success:
        return {**state, "service_overview_data": result.data or {}}

    logger.warning("incident_triage: service_overview failed run=%s err=%s", run_id, result.error_message)
    # Non-fatal: continue with partial data
    return {**state, "service_overview_data": {"error": result.error_message}}
|
|
|
|
|
|
async def top_errors_logs_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 2: pull the top error log lines for the service.

    Calls observability_tool action=logs_query via the gateway, then
    truncates to the configured budget and redacts secrets before the
    samples enter the triage state.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    line_budget = state.get("max_log_lines", 120)
    query_hint = state.get("log_query_hint") or f"service={state['service']} level=error"

    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="observability_tool",
            action="logs_query",
            params={
                "service": state["service"],
                "time_range": state["time_range"],
                "env": state.get("env", "prod"),
                "query": query_hint,
                "limit": line_budget,
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="top_errors_logs",
        )

    if not result.success:
        logger.warning("incident_triage: logs_query failed run=%s err=%s", run_id, result.error_message)
        return {**state, "top_errors_data": {"error": result.error_message}, "log_samples": []}

    payload = result.data or {}
    # Accept either "lines" or "logs" from the tool payload.
    raw_lines: List[str] = payload.get("lines") or payload.get("logs") or []
    safe_lines = _redact_lines(raw_lines[:line_budget])

    return {**state, "top_errors_data": payload, "log_samples": safe_lines}
|
|
|
|
|
|
async def health_and_runbooks_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 3: gather service health plus matching KB runbook snippets.

    Two sequential gateway calls (kept sequential for simplicity):
      a) oncall_tool action=service_health
      b) kb_tool action=search, filtered to runbook documents

    Both are non-fatal: a failed health call records an error marker, and a
    failed KB search simply yields no snippets.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    service = state["service"]
    symptom = state.get("symptom", "")

    # 3a — Health check
    async with GatewayClient() as gw:
        hr = await gw.call_tool(
            tool="oncall_tool",
            action="service_health",
            params={"service": service, "env": state.get("env", "prod")},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="health_check",
        )

    # Explicit branching: the original single-line conditional depended on
    # conditional-expression precedence and was easy to misread.
    if hr.success:
        health_data: Dict = hr.data or {}
    else:
        health_data = hr.data or {"error": hr.error_message}

    # 3b — KB runbook search
    runbook_snippets: List[Dict] = []
    # Build KB query from service name + symptom text, capped at 200 chars
    kb_query = f"{service} {symptom}"[:200]

    async with GatewayClient() as gw:
        kbr = await gw.call_tool(
            tool="kb_tool",
            action="search",
            params={"query": kb_query, "top_k": 5, "filter": {"type": "runbook"}},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="kb_runbooks",
        )
    if kbr.success and kbr.data:
        for item in (kbr.data.get("results") or [])[:5]:
            # Reuse the shared redaction helper so secret masking stays
            # consistent with the log-sample path.
            snippet_text = _redact_lines([item.get("content", "")[:500]])[0]
            runbook_snippets.append({
                "path": item.get("path", item.get("source", "")),
                "lines": item.get("lines", ""),
                "text": snippet_text,
            })

    return {**state, "health_data": health_data, "runbook_snippets": runbook_snippets}
|
|
|
|
|
|
async def trace_lookup_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 4 (optional): query traces for IDs discovered in the log samples.

    Only runs when include_traces=True. Extracts up to 3 *distinct* trace
    IDs (``trace_id=<hex>`` / ``traceId=<hex>``) from the first 50 log
    samples and calls observability_tool action=traces_query.
    Gracefully skips when no IDs are found or the tool is unavailable.
    """
    if state.get("graph_status") == "failed":
        return state
    if not state.get("include_traces", False):
        return {**state, "trace_data": None}

    run_id = state.get("run_id", "")
    # Simple regex: trace_id=<hex> or traceId=<hex>, 16-32 hex chars
    trace_pat = re.compile(r'(?:trace[_-]?id|traceId)[=:\s]+([0-9a-f]{16,32})', re.IGNORECASE)

    # Deduplicate while preserving first-seen order: logs commonly repeat
    # the same trace ID, and duplicates would waste the 3-trace query budget.
    trace_ids: List[str] = []
    seen = set()
    for line in (state.get("log_samples") or [])[:50]:
        for m in trace_pat.finditer(line):
            tid = m.group(1)
            if tid not in seen:
                seen.add(tid)
                trace_ids.append(tid)
                if len(trace_ids) >= 3:
                    break
        if len(trace_ids) >= 3:
            break

    if not trace_ids:
        logger.info("incident_triage: no trace IDs found in logs run=%s", run_id)
        return {**state, "trace_data": {"note": "no_trace_ids_in_logs"}}

    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="observability_tool",
            action="traces_query",
            params={
                "service": state["service"],
                "trace_ids": trace_ids[:3],
                "time_range": state["time_range"],
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="trace_lookup",
        )

    if not result.success:
        logger.info("incident_triage: trace_lookup skipped run=%s err=%s", run_id, result.error_message)
        return {**state, "trace_data": {"note": f"trace_query_failed: {result.error_message}"}}

    return {**state, "trace_data": result.data or {}}
|
|
|
|
|
|
async def slo_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 4b: snapshot SLO thresholds and current metrics for the service.

    Calls observability_tool.slo_snapshot via the gateway, deriving the
    snapshot window (5–60 minutes) from the triage time range.
    Non-fatal: any failure marks slo_context_data as skipped.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    service = state.get("service", "")
    time_range = state.get("time_range", {})

    # Derive the snapshot window in minutes, clamped to [5, 60].
    try:
        start = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        end = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        window_min = max(5, min(60, int((end - start).total_seconds() / 60)))
    except Exception:
        window_min = 60

    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="observability_tool",
                action="slo_snapshot",
                params={
                    "service": service,
                    "env": state.get("env", "prod"),
                    "window_minutes": window_min,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="slo_context",
            )

        if not result.success:
            logger.info("incident_triage: slo_context skipped run=%s err=%s", run_id, result.error_message)
            return {**state, "slo_context_data": {"skipped": True, "reason": result.error_message}}

        payload = result.data or {}
        slo_context = {
            "violations": payload.get("violations", []),
            "metrics": payload.get("metrics", {}),
            "thresholds": payload.get("thresholds", {}),
            "skipped": payload.get("skipped", False),
        }
        return {**state, "slo_context_data": slo_context}

    except Exception as e:
        logger.info("incident_triage: slo_context failed run=%s err=%s", run_id, e)
        return {**state, "slo_context_data": {"skipped": True, "reason": str(e)}}
|
|
|
|
|
|
async def privacy_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 5a: scan audit events over the incident window for privacy anomalies.

    Calls data_governance_tool.scan_audit via the gateway.
    Non-fatal: if the gateway fails, privacy_context_data is set to a
    skipped marker and the triage report continues normally.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    time_range = state.get("time_range", {})

    # Compute window_hours from time_range, clamped to 1..24.
    # (Uses the module-level datetime import; no local re-import needed.)
    try:
        from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        window_h = max(1, min(24, int((to_dt - from_dt).total_seconds() / 3600) + 1))
    except Exception:
        window_h = 1

    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="data_governance_tool",
                action="scan_audit",
                params={
                    "backend": "jsonl",
                    "time_window_hours": window_h,
                    "max_events": 10000,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="privacy_context",
            )

        if not result.success:
            logger.info(
                "incident_triage: privacy_context skipped run=%s err=%s",
                run_id, result.error_message,
            )
            return {**state, "privacy_context_data": {"skipped": True, "reason": result.error_message}}

        data = result.data or {}
        stats = data.get("stats", {})  # hoisted: was looked up twice
        return {**state, "privacy_context_data": {
            # Headline count = scanner errors + warnings
            "findings_count": stats.get("errors", 0) + stats.get("warnings", 0),
            "findings": (data.get("findings") or [])[:5],  # top 5 only; evidence already masked
            "summary": data.get("summary", ""),
        }}

    except Exception as e:
        logger.info("incident_triage: privacy_context failed run=%s err=%s", run_id, e)
        return {**state, "privacy_context_data": {"skipped": True, "reason": str(e)}}
|
|
|
|
|
|
async def cost_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 5b: detect cost/resource anomalies over the incident time window.

    Calls cost_analyzer_tool.anomalies via the gateway.
    Non-fatal: on any failure, cost_context_data is set to a skipped marker.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    time_range = state.get("time_range", {})

    # Derive anomaly window (15..60 min) and baseline (4..24 h) from the
    # triage window. (Uses the module-level datetime import — the original
    # re-imported datetime locally for no reason.)
    try:
        from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        span_seconds = (to_dt - from_dt).total_seconds()  # hoisted: computed once
        window_minutes = max(15, min(60, int(span_seconds / 60)))
        baseline_hours = max(4, min(24, int(span_seconds / 3600) + 4))
    except Exception:
        window_minutes = 60
        baseline_hours = 24

    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="cost_analyzer_tool",
                action="anomalies",
                params={
                    "window_minutes": window_minutes,
                    "baseline_hours": baseline_hours,
                    "ratio_threshold": 3.0,
                    "min_calls": 5,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="cost_context",
            )

        if not result.success:
            logger.info(
                "incident_triage: cost_context skipped run=%s err=%s",
                run_id, result.error_message,
            )
            return {**state, "cost_context_data": {"skipped": True, "reason": result.error_message}}

        data = result.data or {}
        anomalies = data.get("anomalies") or []
        return {**state, "cost_context_data": {
            "anomaly_count": data.get("anomaly_count", len(anomalies)),
            "anomalies": anomalies[:5],  # top 5 spikes
            "recommendations": [a.get("recommendation", "") for a in anomalies[:3] if a.get("recommendation")],
        }}

    except Exception as e:
        logger.info("incident_triage: cost_context failed run=%s err=%s", run_id, e)
        return {**state, "cost_context_data": {"skipped": True, "reason": str(e)}}
|
|
|
|
|
|
async def build_triage_report_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 5: Pure aggregation — no tool calls.
    Builds the structured triage report (summary, ranked suspected causes,
    impact, mitigations, next checks, references, context) from all data
    collected by the preceding nodes. Missing node data degrades gracefully
    to empty defaults.
    """
    # Validation failed earlier: surface the error as the result and stop.
    if state.get("graph_status") == "failed":
        err = state.get("validation_error") or state.get("error", "Unknown error")
        return {**state, "result": {"error": err}, "graph_status": "failed"}

    service = state.get("service", "unknown")
    symptom = state.get("symptom", "")
    overview = state.get("service_overview_data") or {}
    health = state.get("health_data") or {}
    log_samples = state.get("log_samples") or []
    runbooks = state.get("runbook_snippets") or []
    traces = state.get("trace_data")
    slo_ctx = state.get("slo_context_data") or {}
    privacy_ctx = state.get("privacy_context_data") or {}
    cost_ctx = state.get("cost_context_data") or {}

    # Extract alerts and error stats from observability overview.
    # Two key spellings are accepted per field because tool payloads vary.
    alerts = overview.get("alerts", overview.get("active_alerts", []))
    slo = overview.get("slo", overview.get("slo_status", {}))
    health_status = health.get("status", health.get("health", "unknown"))

    # Build suspected root causes from available signals; `rank` orders them
    # by the priority the signals are examined in (health > alerts > logs).
    root_causes = []
    rank = 1

    if health_status in ("degraded", "down", "unhealthy", "error"):
        root_causes.append({
            "rank": rank,
            "cause": f"Service health: {health_status}",
            "evidence": [str(health.get("details", health_status))[:300]],
        })
        rank += 1

    # Dict alerts contribute their name; anything else is stringified.
    for alert in alerts[:3]:
        root_causes.append({
            "rank": rank,
            "cause": f"Active alert: {alert.get('name', alert) if isinstance(alert, dict) else str(alert)}",
            "evidence": [str(alert)[:300]],
        })
        rank += 1

    if log_samples:
        # Count unique error patterns (simple keyword filter, first 10 hits)
        error_lines = [l for l in log_samples if "error" in l.lower() or "exception" in l.lower()][:10]
        if error_lines:
            root_causes.append({
                "rank": rank,
                "cause": f"Error patterns in logs ({len(error_lines)} samples)",
                "evidence": error_lines[:3],
            })
            rank += 1

    # Guarantee at least one entry so the report is never empty.
    if not root_causes:
        root_causes.append({
            "rank": 1,
            "cause": "No obvious signals found; investigation ongoing",
            "evidence": [symptom],
        })

    # Pre-extract SLO violations for impact and enrichment
    slo_violations = slo_ctx.get("violations") or []

    # Impact assessment from SLO + observability: prefer the explicit SLO
    # snapshot; fall back to the overview's SLO block; a down/unhealthy
    # service prepends (or replaces) the message.
    impact = "Unknown"
    if slo_violations and not slo_ctx.get("skipped"):
        slo_m = slo_ctx.get("metrics", {})
        impact = f"SLO breached: {', '.join(slo_violations)} (latency_p95={slo_m.get('latency_p95_ms', '?')}ms, error_rate={slo_m.get('error_rate_pct', '?')}%)"
    elif isinstance(slo, dict):
        error_rate = slo.get("error_rate") or slo.get("error_budget_consumed")
        if error_rate:
            impact = f"SLO impact: error_rate={error_rate}"
    if health_status in ("down", "unhealthy"):
        impact = f"Service is {health_status}" + (f"; {impact}" if impact != "Unknown" else "")

    # Mitigations from runbooks: bullet lines and restart/rollback mentions
    # from the first two snippets; generic fallbacks when nothing matched.
    mitigations_now = []
    for rb in runbooks[:2]:
        text = rb.get("text", "")
        lines = [l.strip() for l in text.split("\n") if l.strip().startswith("-") or "restart" in l.lower() or "rollback" in l.lower()]
        mitigations_now.extend(lines[:3])
    if not mitigations_now:
        mitigations_now = ["Review logs for error patterns", "Check service health dashboard", "Consult runbook"]

    next_checks = [
        f"Verify {service} health endpoint returns 200",
        "Check upstream/downstream dependencies",
        "Review recent deployments in release history",
    ]
    if alerts:
        next_checks.insert(0, f"Acknowledge/resolve {len(alerts)} active alert(s)")

    # Enrich with SLO violations: evidence pairs actual vs threshold, using
    # the "_ms" suffix for latency metrics and "_pct" for the rest.
    if slo_violations and not slo_ctx.get("skipped"):
        slo_metrics = slo_ctx.get("metrics", {})
        slo_thresholds = slo_ctx.get("thresholds", {})
        evidence = [
            f"{v}: actual={slo_metrics.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}, "
            f"threshold={slo_thresholds.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}"
            for v in slo_violations
        ]
        root_causes.append({
            "rank": rank,
            "cause": f"SLO violations: {', '.join(slo_violations)}",
            "evidence": evidence,
        })
        rank += 1
        next_checks.insert(0, f"Confirm SLO breach correlates with service degradation ({', '.join(slo_violations)})")

    # Enrich with cost context insights
    cost_anomalies = cost_ctx.get("anomalies") or []
    if cost_anomalies and not cost_ctx.get("skipped"):
        spike_tools = [a.get("tool", "?") for a in cost_anomalies[:2]]
        root_causes.append({
            "rank": rank,
            "cause": f"Resource/cost spike detected on: {', '.join(spike_tools)}",
            "evidence": [
                f"{a.get('tool')}: ratio={a.get('ratio')}, window_calls={a.get('window_calls')}"
                for a in cost_anomalies[:2]
            ],
        })
        rank += 1
        next_checks.append("Investigate resource spike — possible runaway process or retry storm")

    # Enrich with privacy context insights (only error-severity findings
    # become a suspected cause)
    privacy_findings = privacy_ctx.get("findings") or []
    if privacy_findings and not privacy_ctx.get("skipped"):
        privacy_errors = [f for f in privacy_findings if f.get("severity") == "error"]
        if privacy_errors:
            root_causes.append({
                "rank": rank,
                "cause": f"Privacy/data governance issue during incident window ({len(privacy_errors)} error(s))",
                "evidence": [f.get("title", "")[:200] for f in privacy_errors[:2]],
            })
            rank += 1
            next_checks.append("Review data governance findings — possible PII/secrets exposure")

    # Build summary
    error_count = len([l for l in log_samples if "error" in l.lower()])
    summary = (
        f"Incident triage for '{service}' (symptom: {symptom[:100]}). "
        f"Health: {health_status}. "
        f"{len(root_causes)} suspected cause(s). "
        f"{error_count} error log samples. "
        f"{len(runbooks)} runbook snippet(s) found."
        + (f" Cost spikes: {len(cost_anomalies)}." if cost_anomalies else "")
        + (f" Privacy findings: {privacy_ctx.get('findings_count', 0)}." if not privacy_ctx.get("skipped") else "")
    )

    # Cost recommendations
    cost_recs = cost_ctx.get("recommendations") or []

    # Assemble the final report. Lists are truncated to keep the payload
    # bounded; "traces" appears in references only when trace data exists.
    result = {
        "summary": summary,
        "suspected_root_causes": root_causes[:6],
        "impact_assessment": impact,
        "mitigations_now": mitigations_now[:5],
        "next_checks": next_checks[:6],
        "references": {
            "metrics": {
                "slo": slo,
                "alerts_count": len(alerts),
            },
            "log_samples": log_samples[:10],
            "runbook_snippets": runbooks,
            **({"traces": traces} if traces else {}),
        },
        "context": {
            "slo": {
                "violations": slo_violations,
                "metrics": slo_ctx.get("metrics", {}),
                "thresholds": slo_ctx.get("thresholds", {}),
                "skipped": slo_ctx.get("skipped", False),
            },
            "privacy": {
                "findings_count": privacy_ctx.get("findings_count", 0),
                "findings": privacy_findings[:3],
                "skipped": privacy_ctx.get("skipped", False),
            },
            "cost": {
                "anomaly_count": cost_ctx.get("anomaly_count", 0),
                "anomalies": cost_anomalies[:3],
                "recommendations": cost_recs,
                "skipped": cost_ctx.get("skipped", False),
            },
        },
    }

    return {**state, "result": result, "graph_status": "succeeded"}
|
|
|
|
|
|
# ─── Routing ─────────────────────────────────────────────────────────────────
|
|
|
|
def _after_validate(state: IncidentTriageState) -> str:
|
|
if state.get("graph_status") == "failed":
|
|
return "build_triage_report"
|
|
return "service_overview"
|
|
|
|
|
|
def _after_trace_lookup(state: IncidentTriageState) -> str:
|
|
return "build_triage_report"
|
|
|
|
|
|
# ─── Graph builder ────────────────────────────────────────────────────────────
|
|
|
|
def build_incident_triage_graph():
    """
    Build and compile the incident_triage LangGraph.

    Graph:
        validate_input → [if valid] service_overview → top_errors_logs
                       → health_and_runbooks → trace_lookup
                       → slo_context → privacy_context → cost_context
                       → build_triage_report → END
                       → [if invalid] build_triage_report → END
    """
    graph = StateGraph(IncidentTriageState)

    # Table-driven registration keeps node names and callables side by side.
    node_table = [
        ("validate_input", validate_input_node),
        ("service_overview", service_overview_node),
        ("top_errors_logs", top_errors_logs_node),
        ("health_and_runbooks", health_and_runbooks_node),
        ("trace_lookup", trace_lookup_node),
        ("slo_context", slo_context_node),
        ("privacy_context", privacy_context_node),
        ("cost_context", cost_context_node),
        ("build_triage_report", build_triage_report_node),
    ]
    for node_name, node_fn in node_table:
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("validate_input")

    # Validation branch: invalid input short-circuits to the report builder.
    graph.add_conditional_edges(
        "validate_input",
        _after_validate,
        {"service_overview": "service_overview", "build_triage_report": "build_triage_report"},
    )

    # Linear chain after validation
    pipeline = [
        "service_overview",
        "top_errors_logs",
        "health_and_runbooks",
        "trace_lookup",
        "slo_context",
        "privacy_context",
        "cost_context",
        "build_triage_report",
    ]
    for upstream, downstream in zip(pipeline, pipeline[1:]):
        graph.add_edge(upstream, downstream)
    graph.add_edge("build_triage_report", END)

    return graph.compile()
|