New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
743 lines
29 KiB
Python
"""
|
|
Graph 2: incident_triage_graph
|
|
|
|
Collects observability, logs, health, KB runbooks, optionally traces,
|
|
and governance context (privacy + cost), then builds a structured triage report.
|
|
|
|
Node sequence:
|
|
validate_input → service_overview → top_errors_logs
|
|
→ health_and_runbooks → trace_lookup (optional)
|
|
→ slo_context → privacy_context → cost_context
|
|
→ build_triage_report → END
|
|
|
|
All tool calls via gateway. No direct access to Prometheus/Loki/etc.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import datetime
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional, TypedDict
|
|
|
|
from langgraph.graph import StateGraph, END
|
|
|
|
from ..config import settings
|
|
from ..gateway_client import GatewayClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_SECRET_PAT = re.compile(
|
|
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _redact_lines(lines: List[str]) -> List[str]:
|
|
"""Mask secrets in log lines before including in report."""
|
|
return [_SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", line) for line in lines]
|
|
|
|
|
|
def _clamp_time_range(time_range: Optional[Dict[str, str]], max_hours: int) -> Dict[str, str]:
|
|
"""Ensure time window ≤ max_hours. Clamp end-start if larger."""
|
|
now = datetime.datetime.now(datetime.timezone.utc)
|
|
default_from = (now - datetime.timedelta(hours=1)).isoformat()
|
|
default_to = now.isoformat()
|
|
|
|
if not time_range:
|
|
return {"from": default_from, "to": default_to}
|
|
|
|
try:
|
|
from_dt = datetime.datetime.fromisoformat(time_range["from"].replace("Z", "+00:00"))
|
|
to_dt = datetime.datetime.fromisoformat(time_range.get("to", default_to).replace("Z", "+00:00"))
|
|
delta = to_dt - from_dt
|
|
if delta.total_seconds() > max_hours * 3600:
|
|
# Clamp: keep "to", shorten "from"
|
|
from_dt = to_dt - datetime.timedelta(hours=max_hours)
|
|
return {"from": from_dt.isoformat(), "to": to_dt.isoformat()}
|
|
return {"from": from_dt.isoformat(), "to": to_dt.isoformat()}
|
|
except Exception:
|
|
return {"from": default_from, "to": default_to}
|
|
|
|
|
|
# ─── State ────────────────────────────────────────────────────────────────────
|
|
|
|
class IncidentTriageState(TypedDict, total=False):
    """State dict threaded through the incident_triage LangGraph.

    ``total=False`` — every key is optional. Nodes add keys as they run,
    so downstream readers must use ``state.get(...)`` with defaults.
    """

    # Context (injected before graph.invoke)
    run_id: str
    agent_id: str
    workspace_id: str
    user_id: str
    input: Dict[str, Any]

    # Validated (written by validate_input_node)
    service: str
    symptom: str
    time_range: Dict[str, str]
    env: str
    include_traces: bool
    max_log_lines: int
    log_query_hint: Optional[str]
    validation_error: Optional[str]

    # Node results (None / empty when a node skipped or its tool call failed)
    service_overview_data: Optional[Dict]
    top_errors_data: Optional[Dict]
    log_samples: List[str]
    health_data: Optional[Dict]
    runbook_snippets: List[Dict]
    trace_data: Optional[Dict]
    slo_context_data: Optional[Dict]
    privacy_context_data: Optional[Dict]
    cost_context_data: Optional[Dict]

    # Output (written by build_triage_report_node)
    result: Optional[Dict[str, Any]]
    graph_status: str
    error: Optional[str]
|
|
|
|
|
|
# ─── Nodes ────────────────────────────────────────────────────────────────────
|
|
|
|
async def validate_input_node(state: IncidentTriageState) -> IncidentTriageState:
    """Validate and normalise triage inputs.

    Fails the graph (graph_status="failed" + validation_error) when
    ``service`` or ``symptom`` is missing; otherwise clamps the time window
    and the log-line budget to configured limits and seeds the list fields.
    """
    inp = state.get("input", {})
    service = inp.get("service", "").strip()
    symptom = inp.get("symptom", "").strip()

    if not service:
        return {**state, "graph_status": "failed", "validation_error": "service is required"}
    if not symptom:
        return {**state, "graph_status": "failed", "validation_error": "symptom is required"}

    time_range = _clamp_time_range(
        inp.get("time_range"),
        settings.INCIDENT_MAX_TIME_WINDOW_H,
    )

    # Defensive: a malformed max_log_lines (e.g. "lots") must not crash the
    # whole graph — fall back to the default, then clamp into [1, cap].
    try:
        requested_lines = int(inp.get("max_log_lines", 120))
    except (TypeError, ValueError):
        requested_lines = 120
    max_log_lines = max(1, min(requested_lines, settings.INCIDENT_MAX_LOG_LINES))

    return {
        **state,
        "service": service,
        "symptom": symptom,
        "time_range": time_range,
        "env": inp.get("env", "prod"),
        "include_traces": bool(inp.get("include_traces", False)),
        "max_log_lines": max_log_lines,
        "log_query_hint": inp.get("log_query_hint"),
        "log_samples": [],
        "runbook_snippets": [],
        "graph_status": "running",
    }
|
|
|
|
|
|
async def service_overview_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 1: fetch the observability service overview.

    Calls observability_tool action=service_overview via the gateway to
    collect the metrics summary, recent alerts, and SLO status. A tool
    failure is non-fatal: the error is recorded and triage continues.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    call_params = {
        "service": state["service"],
        "time_range": state["time_range"],
        "env": state.get("env", "prod"),
    }

    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="observability_tool",
            action="service_overview",
            params=call_params,
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="service_overview",
        )

    if result.success:
        return {**state, "service_overview_data": result.data or {}}

    logger.warning("incident_triage: service_overview failed run=%s err=%s", run_id, result.error_message)
    # Non-fatal: continue with partial data
    return {**state, "service_overview_data": {"error": result.error_message}}
|
|
|
|
|
|
async def top_errors_logs_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 2: pull the top error log lines for the service.

    Calls observability_tool action=logs_query via the gateway, then
    truncates to the configured budget and redacts secrets before the
    samples enter the triage state.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    line_budget = state.get("max_log_lines", 120)
    query_hint = state.get("log_query_hint") or f"service={state['service']} level=error"

    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="observability_tool",
            action="logs_query",
            params={
                "service": state["service"],
                "time_range": state["time_range"],
                "env": state.get("env", "prod"),
                "query": query_hint,
                "limit": line_budget,
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="top_errors_logs",
        )

    if not result.success:
        logger.warning("incident_triage: logs_query failed run=%s err=%s", run_id, result.error_message)
        return {**state, "top_errors_data": {"error": result.error_message}, "log_samples": []}

    payload = result.data or {}
    # Accept either "lines" or "logs" from the tool payload.
    raw_lines: List[str] = payload.get("lines") or payload.get("logs") or []
    safe_lines = _redact_lines(raw_lines[:line_budget])

    return {**state, "top_errors_data": payload, "log_samples": safe_lines}
|
|
|
|
|
|
async def health_and_runbooks_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 3: gather service health plus matching KB runbook snippets.

    Two sequential gateway calls (kept sequential for simplicity):
      a) oncall_tool action=service_health
      b) kb_tool action=search, filtered to runbook documents

    Both are non-fatal: a failed health call records an error marker, and a
    failed KB search simply yields no snippets.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    service = state["service"]
    symptom = state.get("symptom", "")

    # 3a — Health check
    async with GatewayClient() as gw:
        hr = await gw.call_tool(
            tool="oncall_tool",
            action="service_health",
            params={"service": service, "env": state.get("env", "prod")},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="health_check",
        )

    # Explicit branching: the original single-line conditional depended on
    # conditional-expression precedence and was easy to misread.
    if hr.success:
        health_data: Dict = hr.data or {}
    else:
        health_data = hr.data or {"error": hr.error_message}

    # 3b — KB runbook search
    runbook_snippets: List[Dict] = []
    # Build KB query from service name + symptom text, capped at 200 chars
    kb_query = f"{service} {symptom}"[:200]

    async with GatewayClient() as gw:
        kbr = await gw.call_tool(
            tool="kb_tool",
            action="search",
            params={"query": kb_query, "top_k": 5, "filter": {"type": "runbook"}},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="kb_runbooks",
        )
    if kbr.success and kbr.data:
        for item in (kbr.data.get("results") or [])[:5]:
            # Reuse the shared redaction helper so secret masking stays
            # consistent with the log-sample path.
            snippet_text = _redact_lines([item.get("content", "")[:500]])[0]
            runbook_snippets.append({
                "path": item.get("path", item.get("source", "")),
                "lines": item.get("lines", ""),
                "text": snippet_text,
            })

    return {**state, "health_data": health_data, "runbook_snippets": runbook_snippets}
|
|
|
|
|
|
async def trace_lookup_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 4 (optional): query traces for IDs discovered in the log samples.

    Only runs when include_traces=True. Extracts up to 3 *distinct* trace
    IDs (``trace_id=<hex>`` / ``traceId=<hex>``) from the first 50 log
    samples and calls observability_tool action=traces_query.
    Gracefully skips when no IDs are found or the tool is unavailable.
    """
    if state.get("graph_status") == "failed":
        return state
    if not state.get("include_traces", False):
        return {**state, "trace_data": None}

    run_id = state.get("run_id", "")
    # Simple regex: trace_id=<hex> or traceId=<hex>, 16-32 hex chars
    trace_pat = re.compile(r'(?:trace[_-]?id|traceId)[=:\s]+([0-9a-f]{16,32})', re.IGNORECASE)

    # Deduplicate while preserving first-seen order: logs commonly repeat
    # the same trace ID, and duplicates would waste the 3-trace query budget.
    trace_ids: List[str] = []
    seen = set()
    for line in (state.get("log_samples") or [])[:50]:
        for m in trace_pat.finditer(line):
            tid = m.group(1)
            if tid not in seen:
                seen.add(tid)
                trace_ids.append(tid)
                if len(trace_ids) >= 3:
                    break
        if len(trace_ids) >= 3:
            break

    if not trace_ids:
        logger.info("incident_triage: no trace IDs found in logs run=%s", run_id)
        return {**state, "trace_data": {"note": "no_trace_ids_in_logs"}}

    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="observability_tool",
            action="traces_query",
            params={
                "service": state["service"],
                "trace_ids": trace_ids[:3],
                "time_range": state["time_range"],
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="trace_lookup",
        )

    if not result.success:
        logger.info("incident_triage: trace_lookup skipped run=%s err=%s", run_id, result.error_message)
        return {**state, "trace_data": {"note": f"trace_query_failed: {result.error_message}"}}

    return {**state, "trace_data": result.data or {}}
|
|
|
|
|
|
async def slo_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 4b: snapshot SLO thresholds and current metrics for the service.

    Calls observability_tool.slo_snapshot via the gateway, deriving the
    snapshot window (5–60 minutes) from the triage time range.
    Non-fatal: any failure marks slo_context_data as skipped.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    service = state.get("service", "")
    time_range = state.get("time_range", {})

    # Derive the snapshot window in minutes, clamped to [5, 60].
    try:
        start = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        end = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        window_min = max(5, min(60, int((end - start).total_seconds() / 60)))
    except Exception:
        window_min = 60

    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="observability_tool",
                action="slo_snapshot",
                params={
                    "service": service,
                    "env": state.get("env", "prod"),
                    "window_minutes": window_min,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="slo_context",
            )

        if not result.success:
            logger.info("incident_triage: slo_context skipped run=%s err=%s", run_id, result.error_message)
            return {**state, "slo_context_data": {"skipped": True, "reason": result.error_message}}

        payload = result.data or {}
        slo_context = {
            "violations": payload.get("violations", []),
            "metrics": payload.get("metrics", {}),
            "thresholds": payload.get("thresholds", {}),
            "skipped": payload.get("skipped", False),
        }
        return {**state, "slo_context_data": slo_context}

    except Exception as e:
        logger.info("incident_triage: slo_context failed run=%s err=%s", run_id, e)
        return {**state, "slo_context_data": {"skipped": True, "reason": str(e)}}
|
|
|
|
|
|
async def privacy_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 5a: scan audit events over the incident window for privacy anomalies.

    Calls data_governance_tool.scan_audit via the gateway.
    Non-fatal: if the gateway fails, privacy_context_data is set to a
    skipped marker and the triage report continues normally.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    time_range = state.get("time_range", {})

    # Compute window_hours from time_range, clamped to 1..24.
    # (Uses the module-level datetime import; no local re-import needed.)
    try:
        from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        window_h = max(1, min(24, int((to_dt - from_dt).total_seconds() / 3600) + 1))
    except Exception:
        window_h = 1

    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="data_governance_tool",
                action="scan_audit",
                params={
                    "backend": "jsonl",
                    "time_window_hours": window_h,
                    "max_events": 10000,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="privacy_context",
            )

        if not result.success:
            logger.info(
                "incident_triage: privacy_context skipped run=%s err=%s",
                run_id, result.error_message,
            )
            return {**state, "privacy_context_data": {"skipped": True, "reason": result.error_message}}

        data = result.data or {}
        stats = data.get("stats", {})  # hoisted: was looked up twice
        return {**state, "privacy_context_data": {
            # Headline count = scanner errors + warnings
            "findings_count": stats.get("errors", 0) + stats.get("warnings", 0),
            "findings": (data.get("findings") or [])[:5],  # top 5 only; evidence already masked
            "summary": data.get("summary", ""),
        }}

    except Exception as e:
        logger.info("incident_triage: privacy_context failed run=%s err=%s", run_id, e)
        return {**state, "privacy_context_data": {"skipped": True, "reason": str(e)}}
|
|
|
|
|
|
async def cost_context_node(state: IncidentTriageState) -> IncidentTriageState:
    """Node 5b: detect cost/resource anomalies over the incident time window.

    Calls cost_analyzer_tool.anomalies via the gateway.
    Non-fatal: on any failure, cost_context_data is set to a skipped marker.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    time_range = state.get("time_range", {})

    # Derive anomaly window (15..60 min) and baseline (4..24 h) from the
    # triage window. (Uses the module-level datetime import — the original
    # re-imported datetime locally for no reason.)
    try:
        from_dt = datetime.datetime.fromisoformat(time_range.get("from", "").replace("Z", "+00:00"))
        to_dt = datetime.datetime.fromisoformat(time_range.get("to", "").replace("Z", "+00:00"))
        span_seconds = (to_dt - from_dt).total_seconds()  # hoisted: computed once
        window_minutes = max(15, min(60, int(span_seconds / 60)))
        baseline_hours = max(4, min(24, int(span_seconds / 3600) + 4))
    except Exception:
        window_minutes = 60
        baseline_hours = 24

    try:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="cost_analyzer_tool",
                action="anomalies",
                params={
                    "window_minutes": window_minutes,
                    "baseline_hours": baseline_hours,
                    "ratio_threshold": 3.0,
                    "min_calls": 5,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id,
                graph_node="cost_context",
            )

        if not result.success:
            logger.info(
                "incident_triage: cost_context skipped run=%s err=%s",
                run_id, result.error_message,
            )
            return {**state, "cost_context_data": {"skipped": True, "reason": result.error_message}}

        data = result.data or {}
        anomalies = data.get("anomalies") or []
        return {**state, "cost_context_data": {
            "anomaly_count": data.get("anomaly_count", len(anomalies)),
            "anomalies": anomalies[:5],  # top 5 spikes
            "recommendations": [a.get("recommendation", "") for a in anomalies[:3] if a.get("recommendation")],
        }}

    except Exception as e:
        logger.info("incident_triage: cost_context failed run=%s err=%s", run_id, e)
        return {**state, "cost_context_data": {"skipped": True, "reason": str(e)}}
|
|
|
|
|
|
async def build_triage_report_node(state: IncidentTriageState) -> IncidentTriageState:
    """
    Node 5: Pure aggregation — no tool calls.
    Builds the structured triage report (summary, ranked suspected causes,
    impact, mitigations, next checks, references, context) from all data
    collected by the preceding nodes. Missing node data degrades gracefully
    to empty defaults.
    """
    # Validation failed earlier: surface the error as the result and stop.
    if state.get("graph_status") == "failed":
        err = state.get("validation_error") or state.get("error", "Unknown error")
        return {**state, "result": {"error": err}, "graph_status": "failed"}

    service = state.get("service", "unknown")
    symptom = state.get("symptom", "")
    overview = state.get("service_overview_data") or {}
    health = state.get("health_data") or {}
    log_samples = state.get("log_samples") or []
    runbooks = state.get("runbook_snippets") or []
    traces = state.get("trace_data")
    slo_ctx = state.get("slo_context_data") or {}
    privacy_ctx = state.get("privacy_context_data") or {}
    cost_ctx = state.get("cost_context_data") or {}

    # Extract alerts and error stats from observability overview.
    # Two key spellings are accepted per field because tool payloads vary.
    alerts = overview.get("alerts", overview.get("active_alerts", []))
    slo = overview.get("slo", overview.get("slo_status", {}))
    health_status = health.get("status", health.get("health", "unknown"))

    # Build suspected root causes from available signals; `rank` orders them
    # by the priority the signals are examined in (health > alerts > logs).
    root_causes = []
    rank = 1

    if health_status in ("degraded", "down", "unhealthy", "error"):
        root_causes.append({
            "rank": rank,
            "cause": f"Service health: {health_status}",
            "evidence": [str(health.get("details", health_status))[:300]],
        })
        rank += 1

    # Dict alerts contribute their name; anything else is stringified.
    for alert in alerts[:3]:
        root_causes.append({
            "rank": rank,
            "cause": f"Active alert: {alert.get('name', alert) if isinstance(alert, dict) else str(alert)}",
            "evidence": [str(alert)[:300]],
        })
        rank += 1

    if log_samples:
        # Count unique error patterns (simple keyword filter, first 10 hits)
        error_lines = [l for l in log_samples if "error" in l.lower() or "exception" in l.lower()][:10]
        if error_lines:
            root_causes.append({
                "rank": rank,
                "cause": f"Error patterns in logs ({len(error_lines)} samples)",
                "evidence": error_lines[:3],
            })
            rank += 1

    # Guarantee at least one entry so the report is never empty.
    if not root_causes:
        root_causes.append({
            "rank": 1,
            "cause": "No obvious signals found; investigation ongoing",
            "evidence": [symptom],
        })

    # Pre-extract SLO violations for impact and enrichment
    slo_violations = slo_ctx.get("violations") or []

    # Impact assessment from SLO + observability: prefer the explicit SLO
    # snapshot; fall back to the overview's SLO block; a down/unhealthy
    # service prepends (or replaces) the message.
    impact = "Unknown"
    if slo_violations and not slo_ctx.get("skipped"):
        slo_m = slo_ctx.get("metrics", {})
        impact = f"SLO breached: {', '.join(slo_violations)} (latency_p95={slo_m.get('latency_p95_ms', '?')}ms, error_rate={slo_m.get('error_rate_pct', '?')}%)"
    elif isinstance(slo, dict):
        error_rate = slo.get("error_rate") or slo.get("error_budget_consumed")
        if error_rate:
            impact = f"SLO impact: error_rate={error_rate}"
    if health_status in ("down", "unhealthy"):
        impact = f"Service is {health_status}" + (f"; {impact}" if impact != "Unknown" else "")

    # Mitigations from runbooks: bullet lines and restart/rollback mentions
    # from the first two snippets; generic fallbacks when nothing matched.
    mitigations_now = []
    for rb in runbooks[:2]:
        text = rb.get("text", "")
        lines = [l.strip() for l in text.split("\n") if l.strip().startswith("-") or "restart" in l.lower() or "rollback" in l.lower()]
        mitigations_now.extend(lines[:3])
    if not mitigations_now:
        mitigations_now = ["Review logs for error patterns", "Check service health dashboard", "Consult runbook"]

    next_checks = [
        f"Verify {service} health endpoint returns 200",
        "Check upstream/downstream dependencies",
        "Review recent deployments in release history",
    ]
    if alerts:
        next_checks.insert(0, f"Acknowledge/resolve {len(alerts)} active alert(s)")

    # Enrich with SLO violations: evidence pairs actual vs threshold, using
    # the "_ms" suffix for latency metrics and "_pct" for the rest.
    if slo_violations and not slo_ctx.get("skipped"):
        slo_metrics = slo_ctx.get("metrics", {})
        slo_thresholds = slo_ctx.get("thresholds", {})
        evidence = [
            f"{v}: actual={slo_metrics.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}, "
            f"threshold={slo_thresholds.get(v + '_ms' if 'latency' in v else v + '_pct', '?')}"
            for v in slo_violations
        ]
        root_causes.append({
            "rank": rank,
            "cause": f"SLO violations: {', '.join(slo_violations)}",
            "evidence": evidence,
        })
        rank += 1
        next_checks.insert(0, f"Confirm SLO breach correlates with service degradation ({', '.join(slo_violations)})")

    # Enrich with cost context insights
    cost_anomalies = cost_ctx.get("anomalies") or []
    if cost_anomalies and not cost_ctx.get("skipped"):
        spike_tools = [a.get("tool", "?") for a in cost_anomalies[:2]]
        root_causes.append({
            "rank": rank,
            "cause": f"Resource/cost spike detected on: {', '.join(spike_tools)}",
            "evidence": [
                f"{a.get('tool')}: ratio={a.get('ratio')}, window_calls={a.get('window_calls')}"
                for a in cost_anomalies[:2]
            ],
        })
        rank += 1
        next_checks.append("Investigate resource spike — possible runaway process or retry storm")

    # Enrich with privacy context insights (only error-severity findings
    # become a suspected cause)
    privacy_findings = privacy_ctx.get("findings") or []
    if privacy_findings and not privacy_ctx.get("skipped"):
        privacy_errors = [f for f in privacy_findings if f.get("severity") == "error"]
        if privacy_errors:
            root_causes.append({
                "rank": rank,
                "cause": f"Privacy/data governance issue during incident window ({len(privacy_errors)} error(s))",
                "evidence": [f.get("title", "")[:200] for f in privacy_errors[:2]],
            })
            rank += 1
            next_checks.append("Review data governance findings — possible PII/secrets exposure")

    # Build summary
    error_count = len([l for l in log_samples if "error" in l.lower()])
    summary = (
        f"Incident triage for '{service}' (symptom: {symptom[:100]}). "
        f"Health: {health_status}. "
        f"{len(root_causes)} suspected cause(s). "
        f"{error_count} error log samples. "
        f"{len(runbooks)} runbook snippet(s) found."
        + (f" Cost spikes: {len(cost_anomalies)}." if cost_anomalies else "")
        + (f" Privacy findings: {privacy_ctx.get('findings_count', 0)}." if not privacy_ctx.get("skipped") else "")
    )

    # Cost recommendations
    cost_recs = cost_ctx.get("recommendations") or []

    # Assemble the final report. Lists are truncated to keep the payload
    # bounded; "traces" appears in references only when trace data exists.
    result = {
        "summary": summary,
        "suspected_root_causes": root_causes[:6],
        "impact_assessment": impact,
        "mitigations_now": mitigations_now[:5],
        "next_checks": next_checks[:6],
        "references": {
            "metrics": {
                "slo": slo,
                "alerts_count": len(alerts),
            },
            "log_samples": log_samples[:10],
            "runbook_snippets": runbooks,
            **({"traces": traces} if traces else {}),
        },
        "context": {
            "slo": {
                "violations": slo_violations,
                "metrics": slo_ctx.get("metrics", {}),
                "thresholds": slo_ctx.get("thresholds", {}),
                "skipped": slo_ctx.get("skipped", False),
            },
            "privacy": {
                "findings_count": privacy_ctx.get("findings_count", 0),
                "findings": privacy_findings[:3],
                "skipped": privacy_ctx.get("skipped", False),
            },
            "cost": {
                "anomaly_count": cost_ctx.get("anomaly_count", 0),
                "anomalies": cost_anomalies[:3],
                "recommendations": cost_recs,
                "skipped": cost_ctx.get("skipped", False),
            },
        },
    }

    return {**state, "result": result, "graph_status": "succeeded"}
|
|
|
|
|
|
# ─── Routing ─────────────────────────────────────────────────────────────────
|
|
|
|
def _after_validate(state: IncidentTriageState) -> str:
|
|
if state.get("graph_status") == "failed":
|
|
return "build_triage_report"
|
|
return "service_overview"
|
|
|
|
|
|
def _after_trace_lookup(state: IncidentTriageState) -> str:
|
|
return "build_triage_report"
|
|
|
|
|
|
# ─── Graph builder ────────────────────────────────────────────────────────────
|
|
|
|
def build_incident_triage_graph():
    """
    Build and compile the incident_triage LangGraph.

    Graph:
        validate_input → [if valid] service_overview → top_errors_logs
                       → health_and_runbooks → trace_lookup
                       → slo_context → privacy_context → cost_context
                       → build_triage_report → END
                       → [if invalid] build_triage_report → END
    """
    graph = StateGraph(IncidentTriageState)

    # Table-driven registration keeps node names and callables side by side.
    node_table = [
        ("validate_input", validate_input_node),
        ("service_overview", service_overview_node),
        ("top_errors_logs", top_errors_logs_node),
        ("health_and_runbooks", health_and_runbooks_node),
        ("trace_lookup", trace_lookup_node),
        ("slo_context", slo_context_node),
        ("privacy_context", privacy_context_node),
        ("cost_context", cost_context_node),
        ("build_triage_report", build_triage_report_node),
    ]
    for node_name, node_fn in node_table:
        graph.add_node(node_name, node_fn)

    graph.set_entry_point("validate_input")

    # Validation branch: invalid input short-circuits to the report builder.
    graph.add_conditional_edges(
        "validate_input",
        _after_validate,
        {"service_overview": "service_overview", "build_triage_report": "build_triage_report"},
    )

    # Linear chain after validation
    pipeline = [
        "service_overview",
        "top_errors_logs",
        "health_and_runbooks",
        "trace_lookup",
        "slo_context",
        "privacy_context",
        "cost_context",
        "build_triage_report",
    ]
    for upstream, downstream in zip(pipeline, pipeline[1:]):
        graph.add_edge(upstream, downstream)
    graph.add_edge("build_triage_report", END)

    return graph.compile()
|