Files
microdao-daarion/services/sofiia-supervisor/app/graphs/postmortem_draft_graph.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

542 lines
20 KiB
Python

"""
Graph 3: postmortem_draft_graph
Generates a structured postmortem draft from an incident + triage report.
Node sequence:
validate → load_incident → ensure_triage → draft_postmortem
→ attach_artifacts → append_followups → build_result → END
All tool calls via gateway. No direct DB or file access.
"""
from __future__ import annotations
import base64
import datetime
import json
import logging
import re
from typing import Any, Dict, List, Optional, TypedDict
from langgraph.graph import StateGraph, END
from ..config import settings
from ..gateway_client import GatewayClient
logger = logging.getLogger(__name__)
_SECRET_PAT = re.compile(
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
)
def _redact(text: str, max_len: int = 4000) -> str:
text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text)
return text[:max_len] if len(text) > max_len else text
def _now_iso() -> str:
return datetime.datetime.now(datetime.timezone.utc).isoformat()
# ─── State ────────────────────────────────────────────────────────────────────
class PostmortemDraftState(TypedDict, total=False):
    """Mutable state threaded through the postmortem_draft graph.

    ``total=False``: every key is optional. Nodes read with ``.get()`` and
    return updated copies (``{**state, ...}``) rather than mutating in place.
    """
    # Run identity / routing context — forwarded on every gateway call.
    run_id: str
    agent_id: str
    workspace_id: str
    user_id: str
    # Raw caller payload; normalized by validate_node.
    input: Dict[str, Any]
    # Validated
    incident_id: str
    service: str
    env: str
    time_range: Dict[str, str]  # {"from": iso-ts, "to": iso-ts} (see ensure_triage_node)
    include_traces: bool
    validation_error: Optional[str]  # set only when validate_node fails the run
    # Node results
    incident_data: Optional[Dict]  # incident payload returned by oncall_tool
    triage_report: Optional[Dict]  # pre-existing artifact wrapper, or auto-generated triage
    triage_was_generated: bool  # True when ensure_triage_node built the triage itself
    postmortem_md: str  # rendered markdown draft
    postmortem_json: Optional[Dict]  # structured counterpart of the markdown
    artifacts_attached: List[Dict]
    followups_appended: int
    # Output
    result: Optional[Dict[str, Any]]
    graph_status: str  # "running" | "failed" | "succeeded"
    error: Optional[str]
# ─── Nodes ────────────────────────────────────────────────────────────────────
async def validate_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Validate the graph input and seed working defaults.

    Requires a non-empty ``incident_id``; on failure marks the graph failed
    with a ``validation_error`` instead of raising. All other input fields
    are optional and normalized here.
    """
    inp = state.get("input", {})
    # dict.get's default only applies when the key is absent, so a payload
    # carrying incident_id=None (or a non-string) used to raise
    # AttributeError on .strip(). Coerce defensively first.
    raw = inp.get("incident_id") or ""
    incident_id = (raw if isinstance(raw, str) else str(raw)).strip()
    if not incident_id:
        return {**state, "graph_status": "failed", "validation_error": "incident_id is required"}
    return {
        **state,
        "incident_id": incident_id,
        "service": inp.get("service", ""),
        "env": inp.get("env", "prod"),
        "time_range": inp.get("time_range") or {},
        "include_traces": bool(inp.get("include_traces", False)),
        # Accumulators consumed by downstream nodes / build_result.
        "triage_was_generated": False,
        "artifacts_attached": [],
        "followups_appended": 0,
        "graph_status": "running",
    }
async def load_incident_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Fetch the incident via the gateway and detect an existing triage report.

    Fails the graph (graph_status="failed") when the incident cannot be
    loaded. If the incident already carries a ``triage_report`` artifact,
    it is recorded so ensure_triage_node can skip regeneration.
    """
    # An earlier node already failed: pass state through untouched.
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="oncall_tool",
            action="incident_get",
            params={"incident_id": state["incident_id"]},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="load_incident",
        )
    if not result.success:
        return {**state, "graph_status": "failed",
                "error": f"Incident not found: {result.error_message}"}
    inc = result.data or {}
    # An explicit input `service` wins over the incident's own field.
    service = state.get("service") or inc.get("service", "unknown")
    # Check if triage_report artifact exists
    artifacts = inc.get("artifacts") or []
    triage_art = next((a for a in artifacts if a.get("kind") == "triage_report"), None)
    triage = None
    if triage_art:
        # Wrap rather than load the artifact body — downstream only needs
        # to know a triage exists (presence short-circuits ensure_triage).
        triage = {"note": "pre-existing triage_report artifact found", "artifact": triage_art}
    return {**state, "incident_data": inc, "service": service, "triage_report": triage}
async def ensure_triage_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """If no triage report exists, run incident_triage_graph via gateway.

    Builds a minimal "auto-triage" dict from three gateway calls
    (observability overview, on-call health, KB runbook search). Individual
    tool failures are recorded in-band under an ``error`` key rather than
    failing the graph, so a postmortem can still be drafted from partial data.
    """
    if state.get("graph_status") == "failed":
        return state
    if state.get("triage_report"):
        # load_incident_node already found a pre-existing triage artifact.
        return state
    run_id = state.get("run_id", "")
    inc = state.get("incident_data") or {}
    service = state.get("service") or inc.get("service", "unknown")
    symptom = inc.get("title") or inc.get("summary") or "unknown symptom"
    time_range = state.get("time_range") or {}
    if not time_range.get("from"):
        # No caller-supplied window: fall back to the incident's lifetime,
        # treating a still-open incident as ending "now".
        started = inc.get("started_at", _now_iso())
        ended = inc.get("ended_at") or _now_iso()
        time_range = {"from": started, "to": ended}
    # Call observability + oncall + kb (simplified triage — mirror of incident_triage_graph)
    triage_data: Dict[str, Any] = {"generated": True, "service": service}
    async with GatewayClient() as gw:
        # Service overview
        overview = await gw.call_tool(
            tool="observability_tool", action="service_overview",
            params={"service": service, "time_range": time_range, "env": state.get("env", "prod")},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="ensure_triage.overview",
        )
        triage_data["overview"] = overview.data if overview.success else {"error": overview.error_message}
        # Health
        health = await gw.call_tool(
            tool="oncall_tool", action="service_health",
            params={"service": service, "env": state.get("env", "prod")},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="ensure_triage.health",
        )
        triage_data["health"] = health.data if health.success else {"error": health.error_message}
        # KB runbooks
        kb = await gw.call_tool(
            tool="kb_tool", action="search",
            # Query is length-capped at 200 chars for the KB backend.
            params={"query": f"{service} {symptom}"[:200], "top_k": 3},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="ensure_triage.kb",
        )
        triage_data["runbooks"] = (kb.data.get("results") or [])[:3] if kb.success and kb.data else []
    # Human-readable rollup; symptom is redacted in case it embeds secrets.
    triage_data["summary"] = (
        f"Auto-generated triage for {service}: "
        f"health={triage_data.get('health', {}).get('status', '?')}, "
        f"symptom='{_redact(symptom, 100)}'"
    )
    # Seed a single rank-1 hypothesis from the symptom; evidence is left
    # empty for human reviewers to fill in.
    triage_data["suspected_root_causes"] = [
        {"rank": 1, "cause": symptom, "evidence": []}
    ]
    return {**state, "triage_report": triage_data, "triage_was_generated": True}
async def draft_postmortem_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Generate postmortem from incident data + triage report (deterministic template).

    Produces both a markdown document (``postmortem_md``) and a structured
    dict (``postmortem_json``). Free-text fields pass through _redact() with
    length caps, and list sections are truncated (30 timeline events,
    5 root causes, 5 mitigations, 10 follow-ups) to bound artifact size.
    """
    if state.get("graph_status") == "failed":
        return state
    inc = state.get("incident_data") or {}
    triage = state.get("triage_report") or {}
    service = state.get("service") or inc.get("service", "unknown")
    events = inc.get("events") or []
    # Build timeline from events
    timeline_lines = []
    for ev in events[:30]:
        # Trim ISO timestamp to second precision (first 19 chars).
        ts_short = (ev.get("ts") or "")[:19]
        ev_type = ev.get("type", "note")
        msg = _redact(ev.get("message", ""), 300)
        timeline_lines.append(f"- **{ts_short}** [{ev_type}] {msg}")
    # Root causes
    causes = triage.get("suspected_root_causes") or []
    causes_lines = []
    for c in causes[:5]:
        causes_lines.append(f"- **#{c.get('rank', '?')}**: {_redact(c.get('cause', '?'), 200)}")
        # At most two evidence bullets per cause.
        for e in (c.get("evidence") or [])[:2]:
            causes_lines.append(f" - Evidence: {_redact(str(e), 200)}")
    # Mitigations
    mitigations = triage.get("mitigations_now") or []
    if not mitigations:
        # Placeholder keeps the section non-empty in the rendered document.
        mitigations = ["(no mitigations recorded)"]
    # Follow-ups
    followups = _extract_followups(triage, inc)
    # Impact
    impact = triage.get("impact_assessment") or inc.get("summary") or "Unknown impact"
    # Build markdown
    md_lines = [
        f"# Postmortem: {_redact(inc.get('title', service), 200)}",
        "",
        f"**Incident ID:** `{inc.get('id', '?')}`",
        f"**Service:** {service}",
        f"**Environment:** {inc.get('env', '?')}",
        f"**Severity:** {inc.get('severity', '?')}",
        f"**Status:** {inc.get('status', '?')}",
        f"**Started:** {inc.get('started_at', '?')}",
        f"**Ended:** {inc.get('ended_at', 'ongoing')}",
        f"**Created by:** {inc.get('created_by', '?')}",
        "",
        "---",
        "",
        "## Summary",
        "",
        _redact(triage.get("summary") or inc.get("summary") or "No summary available.", 1000),
        "",
        "## Impact",
        "",
        _redact(str(impact), 500),
        "",
        "## Detection",
        "",
        f"Incident was reported at {inc.get('started_at', '?')} with symptom: "
        f"*{_redact(inc.get('title', ''), 200)}*.",
        "",
        "## Timeline",
        "",
    ]
    if timeline_lines:
        md_lines.extend(timeline_lines)
    else:
        md_lines.append("- (no timeline events recorded)")
    md_lines.extend([
        "",
        "## Root Cause Analysis",
        "",
    ])
    if causes_lines:
        md_lines.extend(causes_lines)
    else:
        md_lines.append("- Investigation ongoing")
    md_lines.extend([
        "",
        "## Mitigations Applied",
        "",
    ])
    for m in mitigations[:5]:
        md_lines.append(f"- {_redact(str(m), 200)}")
    md_lines.extend([
        "",
        "## Follow-ups",
        "",
    ])
    # Numbered list; priority defaults to P2 when absent.
    for i, fu in enumerate(followups, 1):
        md_lines.append(f"{i}. **[{fu.get('priority', 'P2')}]** {_redact(fu.get('title', '?'), 200)}")
    if not followups:
        md_lines.append("- (no follow-ups identified)")
    md_lines.extend([
        "",
        "## Prevention",
        "",
        "- Review and address all follow-up items",
        "- Update runbooks if this is a new failure mode",
        "- Consider adding alerts/monitors for early detection",
        "",
        "---",
        f"*Generated at {_now_iso()} by postmortem_draft_graph*",
    ])
    postmortem_md = "\n".join(md_lines)
    # Structured twin of the markdown for machine consumers (attached as
    # postmortem_draft.json in the next node).
    postmortem_json = {
        "incident_id": inc.get("id"),
        "service": service,
        "env": inc.get("env"),
        "severity": inc.get("severity"),
        "started_at": inc.get("started_at"),
        "ended_at": inc.get("ended_at"),
        "summary": _redact(triage.get("summary") or inc.get("summary") or "", 1000),
        "impact": _redact(str(impact), 500),
        "root_causes": causes[:5],
        "mitigations": mitigations[:5],
        "followups": followups,
        "timeline_event_count": len(events),
        "generated_at": _now_iso(),
    }
    return {
        **state,
        "postmortem_md": postmortem_md,
        "postmortem_json": postmortem_json,
    }
async def attach_artifacts_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Attach postmortem_draft.md and .json as incident artifacts.

    Also attaches the triage report when it was auto-generated by this run.
    Content travels base64-encoded through the gateway. A failed attach is
    non-fatal: the artifact is simply omitted from ``artifacts_attached``.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    incident_id = state["incident_id"]
    attached = []
    async with GatewayClient() as gw:
        # Attach markdown
        md_bytes = state.get("postmortem_md", "").encode("utf-8")
        md_b64 = base64.b64encode(md_bytes).decode("ascii")
        md_res = await gw.call_tool(
            tool="oncall_tool", action="incident_attach_artifact",
            params={
                "incident_id": incident_id,
                "kind": "postmortem_draft",
                "format": "md",
                "content_base64": md_b64,
                "filename": "postmortem_draft.md",
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="attach_artifacts.md",
        )
        if md_res.success:
            attached.append({"type": "md", "artifact": md_res.data})
        # Attach JSON
        # default=str stringifies any non-JSON-serializable value instead
        # of raising during dumps.
        json_bytes = json.dumps(
            state.get("postmortem_json") or {}, indent=2, ensure_ascii=False, default=str,
        ).encode("utf-8")
        json_b64 = base64.b64encode(json_bytes).decode("ascii")
        json_res = await gw.call_tool(
            tool="oncall_tool", action="incident_attach_artifact",
            params={
                "incident_id": incident_id,
                "kind": "postmortem_draft",
                "format": "json",
                "content_base64": json_b64,
                "filename": "postmortem_draft.json",
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="attach_artifacts.json",
        )
        if json_res.success:
            attached.append({"type": "json", "artifact": json_res.data})
        # Also attach triage if it was auto-generated
        # (a pre-existing triage artifact is already on the incident).
        if state.get("triage_was_generated") and state.get("triage_report"):
            triage_bytes = json.dumps(
                state["triage_report"], indent=2, ensure_ascii=False, default=str,
            ).encode("utf-8")
            triage_b64 = base64.b64encode(triage_bytes).decode("ascii")
            tr_res = await gw.call_tool(
                tool="oncall_tool", action="incident_attach_artifact",
                params={
                    "incident_id": incident_id,
                    "kind": "triage_report",
                    "format": "json",
                    "content_base64": triage_b64,
                    "filename": "triage_report.json",
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id, graph_node="attach_artifacts.triage",
            )
            if tr_res.success:
                attached.append({"type": "triage_json", "artifact": tr_res.data})
    return {**state, "artifacts_attached": attached}
async def append_followups_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Append follow-up items as incident timeline events.

    Caps at 10 events. Individual failures — an unsuccessful tool result or
    a raised exception — are logged and skipped so the graph always reaches
    build_result; ``followups_appended`` counts only successful appends.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    incident_id = state["incident_id"]
    pm_json = state.get("postmortem_json") or {}
    followups = pm_json.get("followups") or []
    count = 0
    async with GatewayClient() as gw:
        for fu in followups[:10]:
            try:
                res = await gw.call_tool(
                    tool="oncall_tool", action="incident_append_event",
                    params={
                        "incident_id": incident_id,
                        "type": "followup",
                        "message": _redact(fu.get("title", ""), 500),
                        "meta": {"priority": fu.get("priority", "P2"), "source": "postmortem_draft"},
                    },
                    agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                    workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                    user_id=state.get("user_id", ""),
                    graph_run_id=run_id, graph_node="append_followups",
                )
                if res.success:
                    count += 1
            except Exception as e:
                # Best-effort: a single bad event must not abort the rest.
                logger.warning("postmortem: followup append failed (non-fatal): %s", e)
    return {**state, "followups_appended": count}
async def build_result_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Build final output.

    On failure, surfaces validation_error/error in ``result``; otherwise
    summarizes the run (artifact and follow-up counts, a markdown preview
    capped at 1500 chars) and marks the graph succeeded.
    """
    if state.get("graph_status") == "failed":
        # Use an or-chain: dict.get's default does not apply when the key
        # exists with a stored None, which previously let err end up None.
        err = state.get("validation_error") or state.get("error") or "Unknown error"
        return {**state, "result": {"error": err}}
    md = state.get("postmortem_md", "")
    preview = md[:1500] + "\n…[truncated]" if len(md) > 1500 else md
    return {
        **state,
        "result": {
            "incident_id": state.get("incident_id"),
            "artifacts_count": len(state.get("artifacts_attached") or []),
            "artifacts": state.get("artifacts_attached") or [],
            "followups_count": state.get("followups_appended", 0),
            "triage_was_generated": state.get("triage_was_generated", False),
            "markdown_preview": preview,
        },
        "graph_status": "succeeded",
    }
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _extract_followups(triage: Dict, incident: Dict) -> List[Dict]:
    """Extract actionable follow-ups from triage report.

    Sources, in order: triage "next_checks" (max 5, P2), FinOps cost
    recommendations (max 2, P3), and one privacy-review item when data
    governance findings exist. Hard-capped at 10 items total.

    NOTE(review): `incident` is currently unused; kept for interface
    stability with the caller in draft_postmortem_node.
    """
    followups: List[Dict] = []
    # From triage next_checks
    for check in (triage.get("next_checks") or [])[:5]:
        followups.append({"title": _redact(str(check), 300), "priority": "P2"})
    # Use `or {}` instead of .get(key, {}): a key stored with value None
    # previously raised AttributeError on the chained .get() calls.
    context = triage.get("context") or {}
    # From cost recommendations
    for rec in ((context.get("cost") or {}).get("recommendations") or [])[:2]:
        followups.append({"title": f"[FinOps] {_redact(str(rec), 300)}", "priority": "P3"})
    # From privacy findings
    priv = context.get("privacy") or {}
    if priv.get("findings_count", 0) > 0:
        followups.append({
            "title": f"[Privacy] Review {priv['findings_count']} data governance finding(s)",
            "priority": "P2",
        })
    return followups[:10]
# ─── Routing ──────────────────────────────────────────────────────────────────
def _after_validate(state: PostmortemDraftState) -> str:
if state.get("graph_status") == "failed":
return "build_result"
return "load_incident"
def _after_load(state: PostmortemDraftState) -> str:
if state.get("graph_status") == "failed":
return "build_result"
return "ensure_triage"
# ─── Graph builder ────────────────────────────────────────────────────────────
def build_postmortem_draft_graph():
    """
    Build and compile the postmortem_draft LangGraph.
    Graph:
    validate → load_incident → ensure_triage → draft_postmortem
    → attach_artifacts → append_followups → build_result → END
    """
    graph = StateGraph(PostmortemDraftState)
    # Register every node; dict insertion order mirrors execution order.
    node_map = {
        "validate": validate_node,
        "load_incident": load_incident_node,
        "ensure_triage": ensure_triage_node,
        "draft_postmortem": draft_postmortem_node,
        "attach_artifacts": attach_artifacts_node,
        "append_followups": append_followups_node,
        "build_result": build_result_node,
    }
    for node_name, node_fn in node_map.items():
        graph.add_node(node_name, node_fn)
    graph.set_entry_point("validate")
    # A failure in validate or load_incident routes straight to build_result.
    graph.add_conditional_edges(
        "validate", _after_validate,
        {"load_incident": "load_incident", "build_result": "build_result"},
    )
    graph.add_conditional_edges(
        "load_incident", _after_load,
        {"ensure_triage": "ensure_triage", "build_result": "build_result"},
    )
    # The remaining hops are unconditional.
    for src, dst in (
        ("ensure_triage", "draft_postmortem"),
        ("draft_postmortem", "attach_artifacts"),
        ("attach_artifacts", "append_followups"),
        ("append_followups", "build_result"),
        ("build_result", END),
    ):
        graph.add_edge(src, dst)
    return graph.compile()