""" Graph 2: incident_triage_graph Collects observability, logs, health, KB runbooks, optionally traces, and governance context (privacy + cost), then builds a structured triage report. Node sequence: validate_input → service_overview → top_errors_logs → health_and_runbooks → trace_lookup (optional) → slo_context → privacy_context → cost_context → build_triage_report → END All tool calls via gateway. No direct access to Prometheus/Loki/etc. """ from __future__ import annotations import datetime import logging import re from typing import Any, Dict, List, Optional, TypedDict from langgraph.graph import StateGraph, END from ..config import settings from ..gateway_client import GatewayClient logger = logging.getLogger(__name__) _SECRET_PAT = re.compile( r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+', re.IGNORECASE, ) def _redact_lines(lines: List[str]) -> List[str]: """Mask secrets in log lines before including in report.""" return [_SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", line) for line in lines] def _clamp_time_range(time_range: Optional[Dict[str, str]], max_hours: int) -> Dict[str, str]: """Ensure time window ≤ max_hours. Clamp end-start if larger.""" now = datetime.datetime.now(datetime.timezone.utc) default_from = (now - datetime.timedelta(hours=1)).isoformat() default_to = now.isoformat() if not time_range: return {"from": default_from, "to": default_to} try: from_dt = datetime.datetime.fromisoformat(time_range["from"].replace("Z", "+00:00")) to_dt = datetime.datetime.fromisoformat(time_range.get("to", default_to).replace("Z", "+00:00")) delta = to_dt - from_dt if delta.total_seconds() > max_hours * 3600: # Clamp: keep "to", shorten "from" from_dt = to_dt - datetime.timedelta(hours=max_hours) return {"from": from_dt.isoformat(), "to": to_dt.isoformat()} return {"from": from_dt.isoformat(), "to": to_dt.isoformat()} except Exception: return {"from": default_from, "to": default_to} # ─── State ──────────────────────────────────────────────────────────────────── class IncidentTriageState(TypedDict, total=False): # Context (injected before graph.invoke) run_id: str agent_id: str workspace_id: str user_id: str input: Dict[str, Any] # Validated service: str symptom: str time_range: Dict[str, str] env: str include_traces: bool max_log_lines: int log_query_hint: Optional[str] validation_error: Optional[str] # Node results service_overview_data: Optional[Dict] top_errors_data: Optional[Dict] log_samples: List[str] health_data: Optional[Dict] runbook_snippets: List[Dict] trace_data: Optional[Dict] slo_context_data: Optional[Dict] privacy_context_data: Optional[Dict] cost_context_data: Optional[Dict] # Output result: Optional[Dict[str, Any]] graph_status: str error: Optional[str] # ─── Nodes ──────────────────────────────────────────────────────────────────── async def validate_input_node(state: IncidentTriageState) -> IncidentTriageState: """Validate and normalise triage inputs. Clamp time window to max allowed.""" inp = state.get("input", {}) service = inp.get("service", "").strip() symptom = inp.get("symptom", "").strip() if not service: return {**state, "graph_status": "failed", "validation_error": "service is required"} if not symptom: return {**state, "graph_status": "failed", "validation_error": "symptom is required"} time_range = _clamp_time_range( inp.get("time_range"), settings.INCIDENT_MAX_TIME_WINDOW_H, ) max_log_lines = min( int(inp.get("max_log_lines", 120)), settings.INCIDENT_MAX_LOG_LINES, ) return { **state, "service": service, "symptom": symptom, "time_range": time_range, "env": inp.get("env", "prod"), "include_traces": bool(inp.get("include_traces", False)), "max_log_lines": max_log_lines, "log_query_hint": inp.get("log_query_hint"), "log_samples": [], "runbook_snippets": [], "graph_status": "running", } async def service_overview_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 1: Call observability_tool action=service_overview. Collects metrics summary, recent alerts, SLO status. """ if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") async with GatewayClient() as gw: result = await gw.call_tool( tool="observability_tool", action="service_overview", params={ "service": state["service"], "time_range": state["time_range"], "env": state.get("env", "prod"), }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="service_overview", ) if not result.success: logger.warning("incident_triage: service_overview failed run=%s err=%s", run_id, result.error_message) # Non-fatal: continue with partial data return {**state, "service_overview_data": {"error": result.error_message}} return {**state, "service_overview_data": result.data or {}} async def top_errors_logs_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 2: Call observability_tool action=logs_query. Extract top N log lines and sample errors. Redact secrets. """ if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") query_hint = state.get("log_query_hint") or f"service={state['service']} level=error" async with GatewayClient() as gw: result = await gw.call_tool( tool="observability_tool", action="logs_query", params={ "service": state["service"], "time_range": state["time_range"], "env": state.get("env", "prod"), "query": query_hint, "limit": state.get("max_log_lines", 120), }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="top_errors_logs", ) if not result.success: logger.warning("incident_triage: logs_query failed run=%s err=%s", run_id, result.error_message) return {**state, "top_errors_data": {"error": result.error_message}, "log_samples": []} data = result.data or {} raw_lines: List[str] = data.get("lines") or data.get("logs") or [] safe_lines = _redact_lines(raw_lines[: state.get("max_log_lines", 120)]) return {**state, "top_errors_data": data, "log_samples": safe_lines} async def health_and_runbooks_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 3: Parallel-ish (sequential for simplicity): a) oncall_tool action=service_health b) kb_tool action=search for runbook snippets """ if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") service = state["service"] symptom = state.get("symptom", "") # 3a — Health check health_data: Dict = {} async with GatewayClient() as gw: hr = await gw.call_tool( tool="oncall_tool", action="service_health", params={"service": service, "env": state.get("env", "prod")}, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="health_check", ) health_data = hr.data or {"error": hr.error_message} if not hr.success else hr.data or {} # 3b — KB runbook search runbook_snippets: List[Dict] = [] # Build KB query from service name + top error keywords from symptom kb_query = f"{service} {symptom}"[:200] async with GatewayClient() as gw: kbr = await gw.call_tool( tool="kb_tool", action="search", params={"query": kb_query, "top_k": 5, "filter": {"type": "runbook"}}, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="kb_runbooks", ) if kbr.success and kbr.data: for item in (kbr.data.get("results") or [])[:5]: snippet_text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", item.get("content", "")[:500]) runbook_snippets.append({ "path": item.get("path", item.get("source", "")), "lines": item.get("lines", ""), "text": snippet_text, }) return {**state, "health_data": health_data, "runbook_snippets": runbook_snippets} async def trace_lookup_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 4 (optional): If include_traces=True, look for trace IDs in log samples and query observability_tool action=traces_query. Gracefully skips if no traces found or tool unavailable. """ if state.get("graph_status") == "failed": return state if not state.get("include_traces", False): return {**state, "trace_data": None} run_id = state.get("run_id", "") # Extract trace IDs from log samples (simple regex: trace_id= or traceId=) trace_pat = re.compile(r'(?:trace[_-]?id|traceId)[=:\s]+([0-9a-f]{16,32})', re.IGNORECASE) trace_ids = [] for line in (state.get("log_samples") or [])[:50]: for m in trace_pat.finditer(line): trace_ids.append(m.group(1)) if len(trace_ids) >= 3: break if len(trace_ids) >= 3: break if not trace_ids: logger.info("incident_triage: no trace IDs found in logs run=%s", run_id) return {**state, "trace_data": {"note": "no_trace_ids_in_logs"}} async with GatewayClient() as gw: result = await gw.call_tool( tool="observability_tool", action="traces_query", params={ "service": state["service"], "trace_ids": trace_ids[:3], "time_range": state["time_range"], }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="trace_lookup", ) if not result.success: logger.info("incident_triage: trace_lookup skipped run=%s err=%s", run_id, result.error_message) return {**state, "trace_data": {"note": f"trace_query_failed: {result.error_message}"}} return {**state, "trace_data": result.data or {}} async def slo_context_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 4b: Query SLO thresholds and current metrics for the incident service. Calls observability_tool.slo_snapshot via gateway. Non-fatal: if tool unavailable, slo_context_data is set to skipped marker. """ if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") service = state.get("service", "") time_range = state.get("time_range", {}) try: from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00")) to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00")) window_min = max(5, min(60, int((to_dt - from_dt).total_seconds() / 60))) except Exception: window_min = 60 try: async with GatewayClient() as gw: result = await gw.call_tool( tool="observability_tool", action="slo_snapshot", params={ "service": service, "env": state.get("env", "prod"), "window_minutes": window_min, }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="slo_context", ) if not result.success: logger.info("incident_triage: slo_context skipped run=%s err=%s", run_id, result.error_message) return {**state, "slo_context_data": {"skipped": True, "reason": result.error_message}} data = result.data or {} return {**state, "slo_context_data": { "violations": data.get("violations", []), "metrics": data.get("metrics", {}), "thresholds": data.get("thresholds", {}), "skipped": data.get("skipped", False), }} except Exception as e: logger.info("incident_triage: slo_context failed run=%s err=%s", run_id, e) return {**state, "slo_context_data": {"skipped": True, "reason": str(e)}} async def privacy_context_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 5a: Scan audit events over the incident time window for privacy anomalies. Calls data_governance_tool.scan_audit via gateway. Non-fatal: if gateway fails, privacy_context_data is set to an error marker and the triage report continues normally. """ if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") time_range = state.get("time_range", {}) # Compute window_hours from time_range (clamp 1..24) try: import datetime from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00")) to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00")) window_h = max(1, min(24, int((to_dt - from_dt).total_seconds() / 3600) + 1)) except Exception: window_h = 1 try: async with GatewayClient() as gw: result = await gw.call_tool( tool="data_governance_tool", action="scan_audit", params={ "backend": "jsonl", "time_window_hours": window_h, "max_events": 10000, }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="privacy_context", ) if not result.success: logger.info( "incident_triage: privacy_context skipped run=%s err=%s", run_id, result.error_message, ) return {**state, "privacy_context_data": {"skipped": True, "reason": result.error_message}} data = result.data or {} return {**state, "privacy_context_data": { "findings_count": data.get("stats", {}).get("errors", 0) + data.get("stats", {}).get("warnings", 0), "findings": (data.get("findings") or [])[:5], # top 5 only; evidence already masked "summary": data.get("summary", ""), }} except Exception as e: logger.info("incident_triage: privacy_context failed run=%s err=%s", run_id, e) return {**state, "privacy_context_data": {"skipped": True, "reason": str(e)}} async def cost_context_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 5b: Detect cost/resource anomalies over the incident time window. Calls cost_analyzer_tool.anomalies via gateway. Non-fatal: on any failure, cost_context_data is set to skipped marker. """ if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") time_range = state.get("time_range", {}) try: import datetime from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00")) to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00")) window_minutes = max(15, min(60, int((to_dt - from_dt).total_seconds() / 60))) baseline_hours = max(4, min(24, int((to_dt - from_dt).total_seconds() / 3600) + 4)) except Exception: window_minutes = 60 baseline_hours = 24 try: async with GatewayClient() as gw: result = await gw.call_tool( tool="cost_analyzer_tool", action="anomalies", params={ "window_minutes": window_minutes, "baseline_hours": baseline_hours, "ratio_threshold": 3.0, "min_calls": 5, }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="cost_context", ) if not result.success: logger.info( "incident_triage: cost_context skipped run=%s err=%s", run_id, result.error_message, ) return {**state, "cost_context_data": {"skipped": True, "reason": result.error_message}} data = result.data or {} anomalies = data.get("anomalies") or [] return {**state, "cost_context_data": { "anomaly_count": data.get("anomaly_count", len(anomalies)), "anomalies": anomalies[:5], # top 5 spikes "recommendations": [a.get("recommendation", "") for a in anomalies[:3] if a.get("recommendation")], }} except Exception as e: logger.info("incident_triage: cost_context failed run=%s err=%s", run_id, e) return {**state, "cost_context_data": {"skipped": True, "reason": str(e)}} async def build_triage_report_node(state: IncidentTriageState) -> IncidentTriageState: """ Node 5: Pure aggregation — no tool calls. Builds structured triage report from all collected data. """ if state.get("graph_status") == "failed": err = state.get("validation_error") or state.get("error", "Unknown error") return {**state, "result": {"error": err}, "graph_status": "failed"} service = state.get("service", "unknown") symptom = state.get("symptom", "") overview = state.get("service_overview_data") or {} health = state.get("health_data") or {} log_samples = state.get("log_samples") or [] runbooks = state.get("runbook_snippets") or [] traces = state.get("trace_data") slo_ctx = state.get("slo_context_data") or {} privacy_ctx = state.get("privacy_context_data") or {} cost_ctx = state.get("cost_context_data") or {} # Extract alerts and error stats from observability overview alerts = overview.get("alerts", overview.get("active_alerts", [])) slo = overview.get("slo", overview.get("slo_status", {})) health_status = health.get("status", health.get("health", "unknown")) # Build suspected root causes from available signals root_causes = [] rank = 1 if health_status in ("degraded", "down", "unhealthy", "error"): root_causes.append({ "rank": rank, "cause": f"Service health: {health_status}", "evidence": [str(health.get("details", health_status))[:300]], }) rank += 1 for alert in alerts[:3]: root_causes.append({ "rank": rank, "cause": f"Active alert: {alert.get('name', alert) if isinstance(alert, dict) else str(alert)}", "evidence": [str(alert)[:300]], }) rank += 1 if log_samples: # Count unique error patterns error_lines = [l for l in log_samples if "error" in l.lower() or "exception" in l.lower()][:10] if error_lines: root_causes.append({ "rank": rank, "cause": f"Error patterns in logs ({len(error_lines)} samples)", "evidence": error_lines[:3], }) rank += 1 if not root_causes: root_causes.append({ "rank": 1, "cause": "No obvious signals found; investigation ongoing", "evidence": [symptom], }) # Pre-extract SLO violations for impact and enrichment slo_violations = slo_ctx.get("violations") or [] # Impact assessment from SLO + observability impact = "Unknown" if slo_violations and not slo_ctx.get("skipped"): slo_m = slo_ctx.get("metrics", {}) impact = f"SLO breached: {', '.join(slo_violations)} (latency_p95={slo_m.get('latency_p95_ms', '?')}ms, error_rate={slo_m.get('error_rate_pct', '?')}%)" elif isinstance(slo, dict): error_rate = slo.get("error_rate") or slo.get("error_budget_consumed") if error_rate: impact = f"SLO impact: error_rate={error_rate}" if health_status in ("down", "unhealthy"): impact = f"Service is {health_status}" + (f"; {impact}" if impact != "Unknown" else "") # Mitigations from runbooks mitigations_now = [] for rb in runbooks[:2]: text = rb.get("text", "") lines = [l.strip() for l in text.split("\n") if l.strip().startswith("-") or "restart" in l.lower() or "rollback" in l.lower()] mitigations_now.extend(lines[:3]) if not mitigations_now: mitigations_now = ["Review logs for error patterns", "Check service health dashboard", "Consult runbook"] next_checks = [ f"Verify {service} health endpoint returns 200", "Check upstream/downstream dependencies", "Review recent deployments in release history", ] if alerts: next_checks.insert(0, f"Acknowledge/resolve {len(alerts)} active alert(s)") # Enrich with SLO violations if slo_violations and not slo_ctx.get("skipped"): slo_metrics = slo_ctx.get("metrics", {}) slo_thresholds = slo_ctx.get("thresholds", {}) evidence = [ f"{v}: actual={slo_metrics.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}, " f"threshold={slo_thresholds.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}" for v in slo_violations ] root_causes.append({ "rank": rank, "cause": f"SLO violations: {', '.join(slo_violations)}", "evidence": evidence, }) rank += 1 next_checks.insert(0, f"Confirm SLO breach correlates with service degradation ({', '.join(slo_violations)})") # Enrich with cost context insights cost_anomalies = cost_ctx.get("anomalies") or [] if cost_anomalies and not cost_ctx.get("skipped"): spike_tools = [a.get("tool", "?") for a in cost_anomalies[:2]] root_causes.append({ "rank": rank, "cause": f"Resource/cost spike detected on: {', '.join(spike_tools)}", "evidence": [ f"{a.get('tool')}: ratio={a.get('ratio')}, window_calls={a.get('window_calls')}" for a in cost_anomalies[:2] ], }) rank += 1 next_checks.append("Investigate resource spike — possible runaway process or retry storm") # Enrich with privacy context insights privacy_findings = privacy_ctx.get("findings") or [] if privacy_findings and not privacy_ctx.get("skipped"): privacy_errors = [f for f in privacy_findings if f.get("severity") == "error"] if privacy_errors: root_causes.append({ "rank": rank, "cause": f"Privacy/data governance issue during incident window ({len(privacy_errors)} error(s))", "evidence": [f.get("title", "")[:200] for f in privacy_errors[:2]], }) rank += 1 next_checks.append("Review data governance findings — possible PII/secrets exposure") # Build summary error_count = len([l for l in log_samples if "error" in l.lower()]) summary = ( f"Incident triage for '{service}' (symptom: {symptom[:100]}). " f"Health: {health_status}. " f"{len(root_causes)} suspected cause(s). " f"{error_count} error log samples. " f"{len(runbooks)} runbook snippet(s) found." + (f" Cost spikes: {len(cost_anomalies)}." if cost_anomalies else "") + (f" Privacy findings: {privacy_ctx.get('findings_count', 0)}." if not privacy_ctx.get("skipped") else "") ) # Cost recommendations cost_recs = cost_ctx.get("recommendations") or [] result = { "summary": summary, "suspected_root_causes": root_causes[:6], "impact_assessment": impact, "mitigations_now": mitigations_now[:5], "next_checks": next_checks[:6], "references": { "metrics": { "slo": slo, "alerts_count": len(alerts), }, "log_samples": log_samples[:10], "runbook_snippets": runbooks, **({"traces": traces} if traces else {}), }, "context": { "slo": { "violations": slo_violations, "metrics": slo_ctx.get("metrics", {}), "thresholds": slo_ctx.get("thresholds", {}), "skipped": slo_ctx.get("skipped", False), }, "privacy": { "findings_count": privacy_ctx.get("findings_count", 0), "findings": privacy_findings[:3], "skipped": privacy_ctx.get("skipped", False), }, "cost": { "anomaly_count": cost_ctx.get("anomaly_count", 0), "anomalies": cost_anomalies[:3], "recommendations": cost_recs, "skipped": cost_ctx.get("skipped", False), }, }, } return {**state, "result": result, "graph_status": "succeeded"} # ─── Routing ───────────────────────────────────────────────────────────────── def _after_validate(state: IncidentTriageState) -> str: if state.get("graph_status") == "failed": return "build_triage_report" return "service_overview" def _after_trace_lookup(state: IncidentTriageState) -> str: return "build_triage_report" # ─── Graph builder ──────────────────────────────────────────────────────────── def build_incident_triage_graph(): """ Build and compile the incident_triage LangGraph. Graph: validate_input → [if valid] service_overview → top_errors_logs → health_and_runbooks → trace_lookup → slo_context → privacy_context → cost_context → build_triage_report → END → [if invalid] build_triage_report → END """ graph = StateGraph(IncidentTriageState) graph.add_node("validate_input", validate_input_node) graph.add_node("service_overview", service_overview_node) graph.add_node("top_errors_logs", top_errors_logs_node) graph.add_node("health_and_runbooks", health_and_runbooks_node) graph.add_node("trace_lookup", trace_lookup_node) graph.add_node("slo_context", slo_context_node) graph.add_node("privacy_context", privacy_context_node) graph.add_node("cost_context", cost_context_node) graph.add_node("build_triage_report", build_triage_report_node) graph.set_entry_point("validate_input") graph.add_conditional_edges( "validate_input", _after_validate, {"service_overview": "service_overview", "build_triage_report": "build_triage_report"}, ) # Linear chain after validation graph.add_edge("service_overview", "top_errors_logs") graph.add_edge("top_errors_logs", "health_and_runbooks") graph.add_edge("health_and_runbooks", "trace_lookup") graph.add_edge("trace_lookup", "slo_context") graph.add_edge("slo_context", "privacy_context") graph.add_edge("privacy_context", "cost_context") graph.add_edge("cost_context", "build_triage_report") graph.add_edge("build_triage_report", END) return graph.compile()