""" Graph 3: postmortem_draft_graph Generates a structured postmortem draft from an incident + triage report. Node sequence: validate → load_incident → ensure_triage → draft_postmortem → attach_artifacts → append_followups → END All tool calls via gateway. No direct DB or file access. """ from __future__ import annotations import base64 import datetime import json import logging import re from typing import Any, Dict, List, Optional, TypedDict from langgraph.graph import StateGraph, END from ..config import settings from ..gateway_client import GatewayClient logger = logging.getLogger(__name__) _SECRET_PAT = re.compile( r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+', ) def _redact(text: str, max_len: int = 4000) -> str: text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text) return text[:max_len] if len(text) > max_len else text def _now_iso() -> str: return datetime.datetime.now(datetime.timezone.utc).isoformat() # ─── State ──────────────────────────────────────────────────────────────────── class PostmortemDraftState(TypedDict, total=False): run_id: str agent_id: str workspace_id: str user_id: str input: Dict[str, Any] # Validated incident_id: str service: str env: str time_range: Dict[str, str] include_traces: bool validation_error: Optional[str] # Node results incident_data: Optional[Dict] triage_report: Optional[Dict] triage_was_generated: bool postmortem_md: str postmortem_json: Optional[Dict] artifacts_attached: List[Dict] followups_appended: int # Output result: Optional[Dict[str, Any]] graph_status: str error: Optional[str] # ─── Nodes ──────────────────────────────────────────────────────────────────── async def validate_node(state: PostmortemDraftState) -> PostmortemDraftState: inp = state.get("input", {}) incident_id = inp.get("incident_id", "").strip() if not incident_id: return {**state, "graph_status": "failed", "validation_error": "incident_id is required"} return { **state, "incident_id": incident_id, "service": inp.get("service", ""), "env": inp.get("env", "prod"), "time_range": inp.get("time_range") or {}, "include_traces": bool(inp.get("include_traces", False)), "triage_was_generated": False, "artifacts_attached": [], "followups_appended": 0, "graph_status": "running", } async def load_incident_node(state: PostmortemDraftState) -> PostmortemDraftState: if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") async with GatewayClient() as gw: result = await gw.call_tool( tool="oncall_tool", action="incident_get", params={"incident_id": state["incident_id"]}, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="load_incident", ) if not result.success: return {**state, "graph_status": "failed", "error": f"Incident not found: {result.error_message}"} inc = result.data or {} service = state.get("service") or inc.get("service", "unknown") # Check if triage_report artifact exists artifacts = inc.get("artifacts") or [] triage_art = next((a for a in artifacts if a.get("kind") == "triage_report"), None) triage = None if triage_art: triage = {"note": "pre-existing triage_report artifact found", "artifact": triage_art} return {**state, "incident_data": inc, "service": service, "triage_report": triage} async def ensure_triage_node(state: PostmortemDraftState) -> PostmortemDraftState: """If no triage report exists, run incident_triage_graph via gateway.""" if state.get("graph_status") == "failed": return state if state.get("triage_report"): return state run_id = state.get("run_id", "") inc = state.get("incident_data") or {} service = state.get("service") or inc.get("service", "unknown") symptom = inc.get("title") or inc.get("summary") or "unknown symptom" time_range = state.get("time_range") or {} if not time_range.get("from"): started = inc.get("started_at", _now_iso()) ended = inc.get("ended_at") or _now_iso() time_range = {"from": started, "to": ended} # Call observability + oncall + kb (simplified triage — mirror of incident_triage_graph) triage_data: Dict[str, Any] = {"generated": True, "service": service} async with GatewayClient() as gw: # Service overview overview = await gw.call_tool( tool="observability_tool", action="service_overview", params={"service": service, "time_range": time_range, "env": state.get("env", "prod")}, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="ensure_triage.overview", ) triage_data["overview"] = overview.data if overview.success else {"error": overview.error_message} # Health health = await gw.call_tool( tool="oncall_tool", action="service_health", params={"service": service, "env": state.get("env", "prod")}, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="ensure_triage.health", ) triage_data["health"] = health.data if health.success else {"error": health.error_message} # KB runbooks kb = await gw.call_tool( tool="kb_tool", action="search", params={"query": f"{service} {symptom}"[:200], "top_k": 3}, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="ensure_triage.kb", ) triage_data["runbooks"] = (kb.data.get("results") or [])[:3] if kb.success and kb.data else [] triage_data["summary"] = ( f"Auto-generated triage for {service}: " f"health={triage_data.get('health', {}).get('status', '?')}, " f"symptom='{_redact(symptom, 100)}'" ) triage_data["suspected_root_causes"] = [ {"rank": 1, "cause": symptom, "evidence": []} ] return {**state, "triage_report": triage_data, "triage_was_generated": True} async def draft_postmortem_node(state: PostmortemDraftState) -> PostmortemDraftState: """Generate postmortem from incident data + triage report (deterministic template).""" if state.get("graph_status") == "failed": return state inc = state.get("incident_data") or {} triage = state.get("triage_report") or {} service = state.get("service") or inc.get("service", "unknown") events = inc.get("events") or [] # Build timeline from events timeline_lines = [] for ev in events[:30]: ts_short = (ev.get("ts") or "")[:19] ev_type = ev.get("type", "note") msg = _redact(ev.get("message", ""), 300) timeline_lines.append(f"- **{ts_short}** [{ev_type}] {msg}") # Root causes causes = triage.get("suspected_root_causes") or [] causes_lines = [] for c in causes[:5]: causes_lines.append(f"- **#{c.get('rank', '?')}**: {_redact(c.get('cause', '?'), 200)}") for e in (c.get("evidence") or [])[:2]: causes_lines.append(f" - Evidence: {_redact(str(e), 200)}") # Mitigations mitigations = triage.get("mitigations_now") or [] if not mitigations: mitigations = ["(no mitigations recorded)"] # Follow-ups followups = _extract_followups(triage, inc) # Impact impact = triage.get("impact_assessment") or inc.get("summary") or "Unknown impact" # Build markdown md_lines = [ f"# Postmortem: {_redact(inc.get('title', service), 200)}", "", f"**Incident ID:** `{inc.get('id', '?')}`", f"**Service:** {service}", f"**Environment:** {inc.get('env', '?')}", f"**Severity:** {inc.get('severity', '?')}", f"**Status:** {inc.get('status', '?')}", f"**Started:** {inc.get('started_at', '?')}", f"**Ended:** {inc.get('ended_at', 'ongoing')}", f"**Created by:** {inc.get('created_by', '?')}", "", "---", "", "## Summary", "", _redact(triage.get("summary") or inc.get("summary") or "No summary available.", 1000), "", "## Impact", "", _redact(str(impact), 500), "", "## Detection", "", f"Incident was reported at {inc.get('started_at', '?')} with symptom: " f"*{_redact(inc.get('title', ''), 200)}*.", "", "## Timeline", "", ] if timeline_lines: md_lines.extend(timeline_lines) else: md_lines.append("- (no timeline events recorded)") md_lines.extend([ "", "## Root Cause Analysis", "", ]) if causes_lines: md_lines.extend(causes_lines) else: md_lines.append("- Investigation ongoing") md_lines.extend([ "", "## Mitigations Applied", "", ]) for m in mitigations[:5]: md_lines.append(f"- {_redact(str(m), 200)}") md_lines.extend([ "", "## Follow-ups", "", ]) for i, fu in enumerate(followups, 1): md_lines.append(f"{i}. **[{fu.get('priority', 'P2')}]** {_redact(fu.get('title', '?'), 200)}") if not followups: md_lines.append("- (no follow-ups identified)") md_lines.extend([ "", "## Prevention", "", "- Review and address all follow-up items", "- Update runbooks if this is a new failure mode", "- Consider adding alerts/monitors for early detection", "", "---", f"*Generated at {_now_iso()} by postmortem_draft_graph*", ]) postmortem_md = "\n".join(md_lines) postmortem_json = { "incident_id": inc.get("id"), "service": service, "env": inc.get("env"), "severity": inc.get("severity"), "started_at": inc.get("started_at"), "ended_at": inc.get("ended_at"), "summary": _redact(triage.get("summary") or inc.get("summary") or "", 1000), "impact": _redact(str(impact), 500), "root_causes": causes[:5], "mitigations": mitigations[:5], "followups": followups, "timeline_event_count": len(events), "generated_at": _now_iso(), } return { **state, "postmortem_md": postmortem_md, "postmortem_json": postmortem_json, } async def attach_artifacts_node(state: PostmortemDraftState) -> PostmortemDraftState: """Attach postmortem_draft.md and .json as incident artifacts.""" if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") incident_id = state["incident_id"] attached = [] async with GatewayClient() as gw: # Attach markdown md_bytes = state.get("postmortem_md", "").encode("utf-8") md_b64 = base64.b64encode(md_bytes).decode("ascii") md_res = await gw.call_tool( tool="oncall_tool", action="incident_attach_artifact", params={ "incident_id": incident_id, "kind": "postmortem_draft", "format": "md", "content_base64": md_b64, "filename": "postmortem_draft.md", }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="attach_artifacts.md", ) if md_res.success: attached.append({"type": "md", "artifact": md_res.data}) # Attach JSON json_bytes = json.dumps( state.get("postmortem_json") or {}, indent=2, ensure_ascii=False, default=str, ).encode("utf-8") json_b64 = base64.b64encode(json_bytes).decode("ascii") json_res = await gw.call_tool( tool="oncall_tool", action="incident_attach_artifact", params={ "incident_id": incident_id, "kind": "postmortem_draft", "format": "json", "content_base64": json_b64, "filename": "postmortem_draft.json", }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="attach_artifacts.json", ) if json_res.success: attached.append({"type": "json", "artifact": json_res.data}) # Also attach triage if it was auto-generated if state.get("triage_was_generated") and state.get("triage_report"): triage_bytes = json.dumps( state["triage_report"], indent=2, ensure_ascii=False, default=str, ).encode("utf-8") triage_b64 = base64.b64encode(triage_bytes).decode("ascii") tr_res = await gw.call_tool( tool="oncall_tool", action="incident_attach_artifact", params={ "incident_id": incident_id, "kind": "triage_report", "format": "json", "content_base64": triage_b64, "filename": "triage_report.json", }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="attach_artifacts.triage", ) if tr_res.success: attached.append({"type": "triage_json", "artifact": tr_res.data}) return {**state, "artifacts_attached": attached} async def append_followups_node(state: PostmortemDraftState) -> PostmortemDraftState: """Append follow-up items as incident timeline events.""" if state.get("graph_status") == "failed": return state run_id = state.get("run_id", "") incident_id = state["incident_id"] pm_json = state.get("postmortem_json") or {} followups = pm_json.get("followups") or [] count = 0 async with GatewayClient() as gw: for fu in followups[:10]: try: res = await gw.call_tool( tool="oncall_tool", action="incident_append_event", params={ "incident_id": incident_id, "type": "followup", "message": _redact(fu.get("title", ""), 500), "meta": {"priority": fu.get("priority", "P2"), "source": "postmortem_draft"}, }, agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID), workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID), user_id=state.get("user_id", ""), graph_run_id=run_id, graph_node="append_followups", ) if res.success: count += 1 except Exception as e: logger.warning("postmortem: followup append failed (non-fatal): %s", e) return {**state, "followups_appended": count} async def build_result_node(state: PostmortemDraftState) -> PostmortemDraftState: """Build final output.""" if state.get("graph_status") == "failed": err = state.get("validation_error") or state.get("error", "Unknown error") return {**state, "result": {"error": err}} md = state.get("postmortem_md", "") preview = md[:1500] + "\n…[truncated]" if len(md) > 1500 else md return { **state, "result": { "incident_id": state.get("incident_id"), "artifacts_count": len(state.get("artifacts_attached") or []), "artifacts": state.get("artifacts_attached") or [], "followups_count": state.get("followups_appended", 0), "triage_was_generated": state.get("triage_was_generated", False), "markdown_preview": preview, }, "graph_status": "succeeded", } # ─── Helpers ────────────────────────────────────────────────────────────────── def _extract_followups(triage: Dict, incident: Dict) -> List[Dict]: """Extract actionable follow-ups from triage report.""" followups = [] # From triage next_checks for check in (triage.get("next_checks") or [])[:5]: followups.append({"title": _redact(str(check), 300), "priority": "P2"}) # From cost recommendations for rec in (triage.get("context", {}).get("cost", {}).get("recommendations") or [])[:2]: followups.append({"title": f"[FinOps] {_redact(str(rec), 300)}", "priority": "P3"}) # From privacy findings priv = triage.get("context", {}).get("privacy", {}) if priv.get("findings_count", 0) > 0: followups.append({ "title": f"[Privacy] Review {priv['findings_count']} data governance finding(s)", "priority": "P2", }) return followups[:10] # ─── Routing ────────────────────────────────────────────────────────────────── def _after_validate(state: PostmortemDraftState) -> str: if state.get("graph_status") == "failed": return "build_result" return "load_incident" def _after_load(state: PostmortemDraftState) -> str: if state.get("graph_status") == "failed": return "build_result" return "ensure_triage" # ─── Graph builder ──────────────────────────────────────────────────────────── def build_postmortem_draft_graph(): """ Build and compile the postmortem_draft LangGraph. Graph: validate → load_incident → ensure_triage → draft_postmortem → attach_artifacts → append_followups → build_result → END """ graph = StateGraph(PostmortemDraftState) graph.add_node("validate", validate_node) graph.add_node("load_incident", load_incident_node) graph.add_node("ensure_triage", ensure_triage_node) graph.add_node("draft_postmortem", draft_postmortem_node) graph.add_node("attach_artifacts", attach_artifacts_node) graph.add_node("append_followups", append_followups_node) graph.add_node("build_result", build_result_node) graph.set_entry_point("validate") graph.add_conditional_edges( "validate", _after_validate, {"load_incident": "load_incident", "build_result": "build_result"}, ) graph.add_conditional_edges( "load_incident", _after_load, {"ensure_triage": "ensure_triage", "build_result": "build_result"}, ) graph.add_edge("ensure_triage", "draft_postmortem") graph.add_edge("draft_postmortem", "attach_artifacts") graph.add_edge("attach_artifacts", "append_followups") graph.add_edge("append_followups", "build_result") graph.add_edge("build_result", END) return graph.compile()