Files
microdao-daarion/services/sofiia-supervisor/app/graphs/postmortem_draft_graph.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

542 lines
20 KiB
Python

"""
Graph 3: postmortem_draft_graph
Generates a structured postmortem draft from an incident + triage report.
Node sequence:
validate → load_incident → ensure_triage → draft_postmortem
→ attach_artifacts → append_followups → build_result → END
All tool calls via gateway. No direct DB or file access.
"""
from __future__ import annotations
import base64
import datetime
import json
import logging
import re
from typing import Any, Dict, List, Optional, TypedDict
from langgraph.graph import StateGraph, END
from ..config import settings
from ..gateway_client import GatewayClient
logger = logging.getLogger(__name__)
_SECRET_PAT = re.compile(
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
)
def _redact(text: str, max_len: int = 4000) -> str:
text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text)
return text[:max_len] if len(text) > max_len else text
def _now_iso() -> str:
return datetime.datetime.now(datetime.timezone.utc).isoformat()
# ─── State ────────────────────────────────────────────────────────────────────
class PostmortemDraftState(TypedDict, total=False):
    """Mutable state threaded through the postmortem_draft graph.

    ``total=False``: every key is optional. Nodes read with ``.get()`` and
    return updated copies (``{**state, ...}``) rather than mutating in place.
    """
    # Run identity / routing context — forwarded on every gateway call.
    run_id: str
    agent_id: str
    workspace_id: str
    user_id: str
    # Raw caller payload; normalized by validate_node.
    input: Dict[str, Any]
    # Validated
    incident_id: str
    service: str
    env: str
    time_range: Dict[str, str]  # {"from": iso-ts, "to": iso-ts} (see ensure_triage_node)
    include_traces: bool
    validation_error: Optional[str]  # set only when validate_node fails the run
    # Node results
    incident_data: Optional[Dict]  # incident payload returned by oncall_tool
    triage_report: Optional[Dict]  # pre-existing artifact wrapper, or auto-generated triage
    triage_was_generated: bool  # True when ensure_triage_node built the triage itself
    postmortem_md: str  # rendered markdown draft
    postmortem_json: Optional[Dict]  # structured counterpart of the markdown
    artifacts_attached: List[Dict]
    followups_appended: int
    # Output
    result: Optional[Dict[str, Any]]
    graph_status: str  # "running" | "failed" | "succeeded"
    error: Optional[str]
# ─── Nodes ────────────────────────────────────────────────────────────────────
async def validate_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Validate the graph input and seed working defaults.

    Requires a non-empty ``incident_id``; on failure marks the graph failed
    with a ``validation_error`` instead of raising. All other input fields
    are optional and normalized here.
    """
    inp = state.get("input", {})
    # dict.get's default only applies when the key is absent, so a payload
    # carrying incident_id=None (or a non-string) used to raise
    # AttributeError on .strip(). Coerce defensively first.
    raw = inp.get("incident_id") or ""
    incident_id = (raw if isinstance(raw, str) else str(raw)).strip()
    if not incident_id:
        return {**state, "graph_status": "failed", "validation_error": "incident_id is required"}
    return {
        **state,
        "incident_id": incident_id,
        "service": inp.get("service", ""),
        "env": inp.get("env", "prod"),
        "time_range": inp.get("time_range") or {},
        "include_traces": bool(inp.get("include_traces", False)),
        # Accumulators consumed by downstream nodes / build_result.
        "triage_was_generated": False,
        "artifacts_attached": [],
        "followups_appended": 0,
        "graph_status": "running",
    }
async def load_incident_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Fetch the incident via the gateway and detect an existing triage report.

    Fails the graph (graph_status="failed") when the incident cannot be
    loaded. If the incident already carries a ``triage_report`` artifact,
    it is recorded so ensure_triage_node can skip regeneration.
    """
    # An earlier node already failed: pass state through untouched.
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    async with GatewayClient() as gw:
        result = await gw.call_tool(
            tool="oncall_tool",
            action="incident_get",
            params={"incident_id": state["incident_id"]},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id,
            graph_node="load_incident",
        )
    if not result.success:
        return {**state, "graph_status": "failed",
                "error": f"Incident not found: {result.error_message}"}
    inc = result.data or {}
    # An explicit input `service` wins over the incident's own field.
    service = state.get("service") or inc.get("service", "unknown")
    # Check if triage_report artifact exists
    artifacts = inc.get("artifacts") or []
    triage_art = next((a for a in artifacts if a.get("kind") == "triage_report"), None)
    triage = None
    if triage_art:
        # Wrap rather than load the artifact body — downstream only needs
        # to know a triage exists (presence short-circuits ensure_triage).
        triage = {"note": "pre-existing triage_report artifact found", "artifact": triage_art}
    return {**state, "incident_data": inc, "service": service, "triage_report": triage}
async def ensure_triage_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """If no triage report exists, run incident_triage_graph via gateway.

    Builds a minimal "auto-triage" dict from three gateway calls
    (observability overview, on-call health, KB runbook search). Individual
    tool failures are recorded in-band under an ``error`` key rather than
    failing the graph, so a postmortem can still be drafted from partial data.
    """
    if state.get("graph_status") == "failed":
        return state
    if state.get("triage_report"):
        # load_incident_node already found a pre-existing triage artifact.
        return state
    run_id = state.get("run_id", "")
    inc = state.get("incident_data") or {}
    service = state.get("service") or inc.get("service", "unknown")
    symptom = inc.get("title") or inc.get("summary") or "unknown symptom"
    time_range = state.get("time_range") or {}
    if not time_range.get("from"):
        # No caller-supplied window: fall back to the incident's lifetime,
        # treating a still-open incident as ending "now".
        started = inc.get("started_at", _now_iso())
        ended = inc.get("ended_at") or _now_iso()
        time_range = {"from": started, "to": ended}
    # Call observability + oncall + kb (simplified triage — mirror of incident_triage_graph)
    triage_data: Dict[str, Any] = {"generated": True, "service": service}
    async with GatewayClient() as gw:
        # Service overview
        overview = await gw.call_tool(
            tool="observability_tool", action="service_overview",
            params={"service": service, "time_range": time_range, "env": state.get("env", "prod")},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="ensure_triage.overview",
        )
        triage_data["overview"] = overview.data if overview.success else {"error": overview.error_message}
        # Health
        health = await gw.call_tool(
            tool="oncall_tool", action="service_health",
            params={"service": service, "env": state.get("env", "prod")},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="ensure_triage.health",
        )
        triage_data["health"] = health.data if health.success else {"error": health.error_message}
        # KB runbooks
        kb = await gw.call_tool(
            tool="kb_tool", action="search",
            # Query is length-capped at 200 chars for the KB backend.
            params={"query": f"{service} {symptom}"[:200], "top_k": 3},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="ensure_triage.kb",
        )
        triage_data["runbooks"] = (kb.data.get("results") or [])[:3] if kb.success and kb.data else []
    # Human-readable rollup; symptom is redacted in case it embeds secrets.
    triage_data["summary"] = (
        f"Auto-generated triage for {service}: "
        f"health={triage_data.get('health', {}).get('status', '?')}, "
        f"symptom='{_redact(symptom, 100)}'"
    )
    # Seed a single rank-1 hypothesis from the symptom; evidence is left
    # empty for human reviewers to fill in.
    triage_data["suspected_root_causes"] = [
        {"rank": 1, "cause": symptom, "evidence": []}
    ]
    return {**state, "triage_report": triage_data, "triage_was_generated": True}
async def draft_postmortem_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Generate postmortem from incident data + triage report (deterministic template).

    Produces both a markdown document (``postmortem_md``) and a structured
    dict (``postmortem_json``). Free-text fields pass through _redact() with
    length caps, and list sections are truncated (30 timeline events,
    5 root causes, 5 mitigations, 10 follow-ups) to bound artifact size.
    """
    if state.get("graph_status") == "failed":
        return state
    inc = state.get("incident_data") or {}
    triage = state.get("triage_report") or {}
    service = state.get("service") or inc.get("service", "unknown")
    events = inc.get("events") or []
    # Build timeline from events
    timeline_lines = []
    for ev in events[:30]:
        # Trim ISO timestamp to second precision (first 19 chars).
        ts_short = (ev.get("ts") or "")[:19]
        ev_type = ev.get("type", "note")
        msg = _redact(ev.get("message", ""), 300)
        timeline_lines.append(f"- **{ts_short}** [{ev_type}] {msg}")
    # Root causes
    causes = triage.get("suspected_root_causes") or []
    causes_lines = []
    for c in causes[:5]:
        causes_lines.append(f"- **#{c.get('rank', '?')}**: {_redact(c.get('cause', '?'), 200)}")
        # At most two evidence bullets per cause.
        for e in (c.get("evidence") or [])[:2]:
            causes_lines.append(f" - Evidence: {_redact(str(e), 200)}")
    # Mitigations
    mitigations = triage.get("mitigations_now") or []
    if not mitigations:
        # Placeholder keeps the section non-empty in the rendered document.
        mitigations = ["(no mitigations recorded)"]
    # Follow-ups
    followups = _extract_followups(triage, inc)
    # Impact
    impact = triage.get("impact_assessment") or inc.get("summary") or "Unknown impact"
    # Build markdown
    md_lines = [
        f"# Postmortem: {_redact(inc.get('title', service), 200)}",
        "",
        f"**Incident ID:** `{inc.get('id', '?')}`",
        f"**Service:** {service}",
        f"**Environment:** {inc.get('env', '?')}",
        f"**Severity:** {inc.get('severity', '?')}",
        f"**Status:** {inc.get('status', '?')}",
        f"**Started:** {inc.get('started_at', '?')}",
        f"**Ended:** {inc.get('ended_at', 'ongoing')}",
        f"**Created by:** {inc.get('created_by', '?')}",
        "",
        "---",
        "",
        "## Summary",
        "",
        _redact(triage.get("summary") or inc.get("summary") or "No summary available.", 1000),
        "",
        "## Impact",
        "",
        _redact(str(impact), 500),
        "",
        "## Detection",
        "",
        f"Incident was reported at {inc.get('started_at', '?')} with symptom: "
        f"*{_redact(inc.get('title', ''), 200)}*.",
        "",
        "## Timeline",
        "",
    ]
    if timeline_lines:
        md_lines.extend(timeline_lines)
    else:
        md_lines.append("- (no timeline events recorded)")
    md_lines.extend([
        "",
        "## Root Cause Analysis",
        "",
    ])
    if causes_lines:
        md_lines.extend(causes_lines)
    else:
        md_lines.append("- Investigation ongoing")
    md_lines.extend([
        "",
        "## Mitigations Applied",
        "",
    ])
    for m in mitigations[:5]:
        md_lines.append(f"- {_redact(str(m), 200)}")
    md_lines.extend([
        "",
        "## Follow-ups",
        "",
    ])
    # Numbered list; priority defaults to P2 when absent.
    for i, fu in enumerate(followups, 1):
        md_lines.append(f"{i}. **[{fu.get('priority', 'P2')}]** {_redact(fu.get('title', '?'), 200)}")
    if not followups:
        md_lines.append("- (no follow-ups identified)")
    md_lines.extend([
        "",
        "## Prevention",
        "",
        "- Review and address all follow-up items",
        "- Update runbooks if this is a new failure mode",
        "- Consider adding alerts/monitors for early detection",
        "",
        "---",
        f"*Generated at {_now_iso()} by postmortem_draft_graph*",
    ])
    postmortem_md = "\n".join(md_lines)
    # Structured twin of the markdown for machine consumers (attached as
    # postmortem_draft.json in the next node).
    postmortem_json = {
        "incident_id": inc.get("id"),
        "service": service,
        "env": inc.get("env"),
        "severity": inc.get("severity"),
        "started_at": inc.get("started_at"),
        "ended_at": inc.get("ended_at"),
        "summary": _redact(triage.get("summary") or inc.get("summary") or "", 1000),
        "impact": _redact(str(impact), 500),
        "root_causes": causes[:5],
        "mitigations": mitigations[:5],
        "followups": followups,
        "timeline_event_count": len(events),
        "generated_at": _now_iso(),
    }
    return {
        **state,
        "postmortem_md": postmortem_md,
        "postmortem_json": postmortem_json,
    }
async def attach_artifacts_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Attach postmortem_draft.md and .json as incident artifacts.

    Also attaches the triage report when it was auto-generated by this run.
    Content travels base64-encoded through the gateway. A failed attach is
    non-fatal: the artifact is simply omitted from ``artifacts_attached``.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    incident_id = state["incident_id"]
    attached = []
    async with GatewayClient() as gw:
        # Attach markdown
        md_bytes = state.get("postmortem_md", "").encode("utf-8")
        md_b64 = base64.b64encode(md_bytes).decode("ascii")
        md_res = await gw.call_tool(
            tool="oncall_tool", action="incident_attach_artifact",
            params={
                "incident_id": incident_id,
                "kind": "postmortem_draft",
                "format": "md",
                "content_base64": md_b64,
                "filename": "postmortem_draft.md",
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="attach_artifacts.md",
        )
        if md_res.success:
            attached.append({"type": "md", "artifact": md_res.data})
        # Attach JSON
        # default=str stringifies any non-JSON-serializable value instead
        # of raising during dumps.
        json_bytes = json.dumps(
            state.get("postmortem_json") or {}, indent=2, ensure_ascii=False, default=str,
        ).encode("utf-8")
        json_b64 = base64.b64encode(json_bytes).decode("ascii")
        json_res = await gw.call_tool(
            tool="oncall_tool", action="incident_attach_artifact",
            params={
                "incident_id": incident_id,
                "kind": "postmortem_draft",
                "format": "json",
                "content_base64": json_b64,
                "filename": "postmortem_draft.json",
            },
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=run_id, graph_node="attach_artifacts.json",
        )
        if json_res.success:
            attached.append({"type": "json", "artifact": json_res.data})
        # Also attach triage if it was auto-generated
        # (a pre-existing triage artifact is already on the incident).
        if state.get("triage_was_generated") and state.get("triage_report"):
            triage_bytes = json.dumps(
                state["triage_report"], indent=2, ensure_ascii=False, default=str,
            ).encode("utf-8")
            triage_b64 = base64.b64encode(triage_bytes).decode("ascii")
            tr_res = await gw.call_tool(
                tool="oncall_tool", action="incident_attach_artifact",
                params={
                    "incident_id": incident_id,
                    "kind": "triage_report",
                    "format": "json",
                    "content_base64": triage_b64,
                    "filename": "triage_report.json",
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id, graph_node="attach_artifacts.triage",
            )
            if tr_res.success:
                attached.append({"type": "triage_json", "artifact": tr_res.data})
    return {**state, "artifacts_attached": attached}
async def append_followups_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Append follow-up items as incident timeline events.

    Caps at 10 events. Individual failures — an unsuccessful tool result or
    a raised exception — are logged and skipped so the graph always reaches
    build_result; ``followups_appended`` counts only successful appends.
    """
    if state.get("graph_status") == "failed":
        return state
    run_id = state.get("run_id", "")
    incident_id = state["incident_id"]
    pm_json = state.get("postmortem_json") or {}
    followups = pm_json.get("followups") or []
    count = 0
    async with GatewayClient() as gw:
        for fu in followups[:10]:
            try:
                res = await gw.call_tool(
                    tool="oncall_tool", action="incident_append_event",
                    params={
                        "incident_id": incident_id,
                        "type": "followup",
                        "message": _redact(fu.get("title", ""), 500),
                        "meta": {"priority": fu.get("priority", "P2"), "source": "postmortem_draft"},
                    },
                    agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                    workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                    user_id=state.get("user_id", ""),
                    graph_run_id=run_id, graph_node="append_followups",
                )
                if res.success:
                    count += 1
            except Exception as e:
                # Best-effort: a single bad event must not abort the rest.
                logger.warning("postmortem: followup append failed (non-fatal): %s", e)
    return {**state, "followups_appended": count}
async def build_result_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Build final output.

    On failure, surfaces validation_error/error in ``result``; otherwise
    summarizes the run (artifact and follow-up counts, a markdown preview
    capped at 1500 chars) and marks the graph succeeded.
    """
    if state.get("graph_status") == "failed":
        # Use an or-chain: dict.get's default does not apply when the key
        # exists with a stored None, which previously let err end up None.
        err = state.get("validation_error") or state.get("error") or "Unknown error"
        return {**state, "result": {"error": err}}
    md = state.get("postmortem_md", "")
    preview = md[:1500] + "\n…[truncated]" if len(md) > 1500 else md
    return {
        **state,
        "result": {
            "incident_id": state.get("incident_id"),
            "artifacts_count": len(state.get("artifacts_attached") or []),
            "artifacts": state.get("artifacts_attached") or [],
            "followups_count": state.get("followups_appended", 0),
            "triage_was_generated": state.get("triage_was_generated", False),
            "markdown_preview": preview,
        },
        "graph_status": "succeeded",
    }
# ─── Helpers ──────────────────────────────────────────────────────────────────
def _extract_followups(triage: Dict, incident: Dict) -> List[Dict]:
    """Extract actionable follow-ups from triage report.

    Sources, in order: triage "next_checks" (max 5, P2), FinOps cost
    recommendations (max 2, P3), and one privacy-review item when data
    governance findings exist. Hard-capped at 10 items total.

    NOTE(review): `incident` is currently unused; kept for interface
    stability with the caller in draft_postmortem_node.
    """
    followups: List[Dict] = []
    # From triage next_checks
    for check in (triage.get("next_checks") or [])[:5]:
        followups.append({"title": _redact(str(check), 300), "priority": "P2"})
    # Use `or {}` instead of .get(key, {}): a key stored with value None
    # previously raised AttributeError on the chained .get() calls.
    context = triage.get("context") or {}
    # From cost recommendations
    for rec in ((context.get("cost") or {}).get("recommendations") or [])[:2]:
        followups.append({"title": f"[FinOps] {_redact(str(rec), 300)}", "priority": "P3"})
    # From privacy findings
    priv = context.get("privacy") or {}
    if priv.get("findings_count", 0) > 0:
        followups.append({
            "title": f"[Privacy] Review {priv['findings_count']} data governance finding(s)",
            "priority": "P2",
        })
    return followups[:10]
# ─── Routing ──────────────────────────────────────────────────────────────────
def _after_validate(state: PostmortemDraftState) -> str:
if state.get("graph_status") == "failed":
return "build_result"
return "load_incident"
def _after_load(state: PostmortemDraftState) -> str:
if state.get("graph_status") == "failed":
return "build_result"
return "ensure_triage"
# ─── Graph builder ────────────────────────────────────────────────────────────
def build_postmortem_draft_graph():
    """
    Build and compile the postmortem_draft LangGraph.
    Graph:
    validate → load_incident → ensure_triage → draft_postmortem
    → attach_artifacts → append_followups → build_result → END
    """
    graph = StateGraph(PostmortemDraftState)
    # Register every node; dict insertion order mirrors execution order.
    node_map = {
        "validate": validate_node,
        "load_incident": load_incident_node,
        "ensure_triage": ensure_triage_node,
        "draft_postmortem": draft_postmortem_node,
        "attach_artifacts": attach_artifacts_node,
        "append_followups": append_followups_node,
        "build_result": build_result_node,
    }
    for node_name, node_fn in node_map.items():
        graph.add_node(node_name, node_fn)
    graph.set_entry_point("validate")
    # A failure in validate or load_incident routes straight to build_result.
    graph.add_conditional_edges(
        "validate", _after_validate,
        {"load_incident": "load_incident", "build_result": "build_result"},
    )
    graph.add_conditional_edges(
        "load_incident", _after_load,
        {"ensure_triage": "ensure_triage", "build_result": "build_result"},
    )
    # The remaining hops are unconditional.
    for src, dst in (
        ("ensure_triage", "draft_postmortem"),
        ("draft_postmortem", "attach_artifacts"),
        ("attach_artifacts", "append_followups"),
        ("append_followups", "build_result"),
        ("build_result", END),
    ):
        graph.add_edge(src, dst)
    return graph.compile()