New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
542 lines
20 KiB
Python
542 lines
20 KiB
Python
"""
|
|
Graph 3: postmortem_draft_graph
|
|
|
|
Generates a structured postmortem draft from an incident + triage report.
|
|
|
|
Node sequence:
|
|
validate → load_incident → ensure_triage → draft_postmortem
|
|
→ attach_artifacts → append_followups → build_result → END
|
|
|
|
All tool calls via gateway. No direct DB or file access.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import datetime
|
|
import json
|
|
import logging
|
|
import re
|
|
from typing import Any, Dict, List, Optional, TypedDict
|
|
|
|
from langgraph.graph import StateGraph, END
|
|
|
|
from ..config import settings
|
|
from ..gateway_client import GatewayClient
|
|
|
|
# Module-level logger named after this module (used for non-fatal warnings below).
logger = logging.getLogger(__name__)
|
|
|
|
_SECRET_PAT = re.compile(
|
|
r'(?i)(token|api[_-]?key|password|secret|bearer)\s*[=:]\s*\S+',
|
|
)
|
|
|
|
|
|
def _redact(text: str, max_len: int = 4000) -> str:
|
|
text = _SECRET_PAT.sub(lambda m: f"{m.group(1)}=***", text)
|
|
return text[:max_len] if len(text) > max_len else text
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.datetime.now(datetime.timezone.utc).isoformat()
|
|
|
|
|
|
# ─── State ────────────────────────────────────────────────────────────────────
|
|
|
|
class PostmortemDraftState(TypedDict, total=False):
    """State dictionary handed from node to node in the postmortem_draft graph.

    Declared with ``total=False``, so every key is optional; each node returns
    a copy of the incoming state with its own keys added or overwritten.
    """

    # Caller-supplied context. run_id is forwarded to the gateway as
    # graph_run_id; agent_id / workspace_id fall back to settings defaults and
    # user_id to "" when absent.
    run_id: str
    agent_id: str
    workspace_id: str
    user_id: str
    # Raw request payload; read only by validate_node.
    input: Dict[str, Any]

    # Validated
    incident_id: str  # stripped input id; required
    service: str  # from input, else the incident's service, else "unknown"
    env: str  # defaults to "prod"
    time_range: Dict[str, str]  # {"from", "to"}; derived from the incident when "from" is missing
    include_traces: bool  # copied from input; not read by the nodes in this module
    validation_error: Optional[str]  # set when validation fails

    # Node results
    incident_data: Optional[Dict]  # data returned by oncall_tool.incident_get
    triage_report: Optional[Dict]  # pre-existing artifact wrapper or generated triage
    triage_was_generated: bool  # True when ensure_triage_node built the report
    postmortem_md: str  # rendered markdown draft
    postmortem_json: Optional[Dict]  # structured form of the same draft
    artifacts_attached: List[Dict]  # entries of {"type", "artifact"} for successful attaches
    followups_appended: int  # number of follow-up events appended successfully

    # Output
    result: Optional[Dict[str, Any]]  # final payload built by build_result_node
    graph_status: str  # "running", "failed" or "succeeded"
    error: Optional[str]  # set by load_incident_node when the lookup fails
|
|
|
|
|
|
# ─── Nodes ────────────────────────────────────────────────────────────────────
|
|
|
|
async def validate_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Validate the raw ``input`` payload and seed the working state.

    ``incident_id`` is the only required field. A missing, ``None`` or
    non-dict ``input`` is treated as empty, and an integer id is accepted and
    converted to its string form; any other non-string id is rejected.

    Returns:
        On failure, the state with ``graph_status="failed"`` and
        ``validation_error`` set. On success, the state extended with the
        normalised inputs, zeroed counters and ``graph_status="running"``.
    """
    inp = state.get("input")
    if not isinstance(inp, dict):
        # Covers both an absent key and an explicit null / wrong-typed payload.
        inp = {}

    raw_id = inp.get("incident_id")
    if isinstance(raw_id, str):
        incident_id = raw_id.strip()
    elif isinstance(raw_id, int) and not isinstance(raw_id, bool):
        # JSON callers may send a numeric id; downstream tools receive a string.
        incident_id = str(raw_id)
    else:
        incident_id = ""

    if not incident_id:
        return {**state, "graph_status": "failed", "validation_error": "incident_id is required"}

    return {
        **state,
        "incident_id": incident_id,
        "service": inp.get("service", ""),
        "env": inp.get("env", "prod"),
        "time_range": inp.get("time_range") or {},
        "include_traces": bool(inp.get("include_traces", False)),
        "triage_was_generated": False,
        "artifacts_attached": [],
        "followups_appended": 0,
        "graph_status": "running",
    }
|
|
|
|
|
|
async def load_incident_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Fetch the incident through the gateway and look for an existing triage artifact.

    A state that is already failed is passed through untouched. When the
    ``incident_get`` call is unsuccessful the state is marked failed with an
    ``error`` message; otherwise ``incident_data``, ``service`` and
    ``triage_report`` (``None`` when no triage artifact exists) are filled in.
    """
    if state.get("graph_status") == "failed":
        return state

    async with GatewayClient() as gateway:
        response = await gateway.call_tool(
            tool="oncall_tool",
            action="incident_get",
            params={"incident_id": state["incident_id"]},
            agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
            workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            user_id=state.get("user_id", ""),
            graph_run_id=state.get("run_id", ""),
            graph_node="load_incident",
        )

    if not response.success:
        failed_state = dict(state)
        failed_state["graph_status"] = "failed"
        failed_state["error"] = f"Incident not found: {response.error_message}"
        return failed_state

    incident = response.data or {}
    # An explicitly requested service wins over the one stored on the incident.
    resolved_service = state.get("service") or incident.get("service", "unknown")

    # Pick the first artifact of kind "triage_report", if any.
    existing_artifact = None
    for candidate in incident.get("artifacts") or []:
        if candidate.get("kind") == "triage_report":
            existing_artifact = candidate
            break

    triage_report = None
    if existing_artifact:
        triage_report = {
            "note": "pre-existing triage_report artifact found",
            "artifact": existing_artifact,
        }

    updated_state = dict(state)
    updated_state["incident_data"] = incident
    updated_state["service"] = resolved_service
    updated_state["triage_report"] = triage_report
    return updated_state
|
|
|
|
|
|
async def ensure_triage_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Build a simplified triage report via gateway tools when none exists.

    Passes the state through unchanged when the graph has already failed or a
    ``triage_report`` is present. Otherwise calls, in order,
    ``observability_tool.service_overview``, ``oncall_tool.service_health``
    and ``kb_tool.search`` and combines their results into ``triage_report``,
    setting ``triage_was_generated`` to ``True``. A failed tool call is
    recorded as ``{"error": ...}`` (or an empty runbook list) and does not
    fail the graph.
    """
    if state.get("graph_status") == "failed":
        return state
    if state.get("triage_report"):
        return state

    run_id = state.get("run_id", "")
    inc = state.get("incident_data") or {}
    service = state.get("service") or inc.get("service", "unknown")
    symptom = inc.get("title") or inc.get("summary") or "unknown symptom"
    env = state.get("env", "prod")

    time_range = state.get("time_range") or {}
    if not time_range.get("from"):
        # `or` instead of a .get() default: a stored null must fall back to
        # "now" as well, otherwise "from" would be sent as None.
        started = inc.get("started_at") or _now_iso()
        ended = inc.get("ended_at") or _now_iso()
        time_range = {"from": started, "to": ended}

    # Call observability + oncall + kb (simplified triage — mirror of incident_triage_graph)
    triage_data: Dict[str, Any] = {"generated": True, "service": service}

    async with GatewayClient() as gw:
        # Identity / tracing arguments shared by all three calls.
        call_ctx = {
            "agent_id": state.get("agent_id", settings.DEFAULT_AGENT_ID),
            "workspace_id": state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
            "user_id": state.get("user_id", ""),
            "graph_run_id": run_id,
        }

        # Service overview
        overview = await gw.call_tool(
            tool="observability_tool", action="service_overview",
            params={"service": service, "time_range": time_range, "env": env},
            graph_node="ensure_triage.overview",
            **call_ctx,
        )
        triage_data["overview"] = overview.data if overview.success else {"error": overview.error_message}

        # Health
        health = await gw.call_tool(
            tool="oncall_tool", action="service_health",
            params={"service": service, "env": env},
            graph_node="ensure_triage.health",
            **call_ctx,
        )
        health_data = health.data if health.success else {"error": health.error_message}
        triage_data["health"] = health_data

        # KB runbooks
        kb = await gw.call_tool(
            tool="kb_tool", action="search",
            params={"query": f"{service} {symptom}"[:200], "top_k": 3},
            graph_node="ensure_triage.kb",
            **call_ctx,
        )
        if kb.success and isinstance(kb.data, dict):
            triage_data["runbooks"] = (kb.data.get("results") or [])[:3]
        else:
            triage_data["runbooks"] = []

    # A successful health call may still carry no (or non-dict) data.
    health_status = health_data.get("status", "?") if isinstance(health_data, dict) else "?"
    triage_data["summary"] = (
        f"Auto-generated triage for {service}: "
        f"health={health_status}, "
        f"symptom='{_redact(str(symptom), 100)}'"
    )
    triage_data["suspected_root_causes"] = [
        {"rank": 1, "cause": symptom, "evidence": []}
    ]

    return {**state, "triage_report": triage_data, "triage_was_generated": True}
|
|
|
|
|
|
async def draft_postmortem_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Render the postmortem draft (markdown + JSON) from a fixed template.

    Reads ``incident_data`` and ``triage_report`` from the state; no tool
    calls are made. Free-text values pass through ``_redact`` before being
    written. Null or non-string values in the incident / triage dicts are
    tolerated: a null header field renders as ``?``, a missing or null
    ``ended_at`` as ``ongoing``, and a missing title falls back to the service
    name in the heading.

    Returns:
        The state unchanged when the graph has already failed; otherwise the
        state extended with ``postmortem_md`` and ``postmortem_json``.
    """
    if state.get("graph_status") == "failed":
        return state

    inc = state.get("incident_data") or {}
    triage = state.get("triage_report") or {}
    service = state.get("service") or inc.get("service", "unknown")
    events = inc.get("events") or []
    # One timestamp for both artifacts so the markdown footer and JSON agree.
    generated_at = _now_iso()

    def field(key: str, default: str = "?") -> Any:
        """Return ``inc[key]``, or ``default`` when the key is absent or null."""
        value = inc.get(key)
        return default if value is None else value

    title = inc.get("title") or ""
    summary_text = triage.get("summary") or inc.get("summary")

    # Build timeline from events
    timeline_lines = []
    for ev in events[:30]:
        ts_short = str(ev.get("ts") or "")[:19]
        ev_type = ev.get("type", "note")
        msg = _redact(str(ev.get("message") or ""), 300)
        timeline_lines.append(f"- **{ts_short}** [{ev_type}] {msg}")

    # Root causes
    causes = triage.get("suspected_root_causes") or []
    causes_lines = []
    for c in causes[:5]:
        causes_lines.append(f"- **#{c.get('rank', '?')}**: {_redact(str(c.get('cause', '?')), 200)}")
        for e in (c.get("evidence") or [])[:2]:
            # NOTE(review): a single leading space does not nest this bullet
            # under its cause in CommonMark — confirm the intended indentation.
            causes_lines.append(f" - Evidence: {_redact(str(e), 200)}")

    # Mitigations
    mitigations = triage.get("mitigations_now") or []
    if not mitigations:
        mitigations = ["(no mitigations recorded)"]

    # Follow-ups
    followups = _extract_followups(triage, inc)

    # Impact
    impact = triage.get("impact_assessment") or inc.get("summary") or "Unknown impact"

    # Build markdown
    md_lines = [
        f"# Postmortem: {_redact(str(title or service), 200)}",
        "",
        f"**Incident ID:** `{field('id')}`",
        f"**Service:** {service}",
        f"**Environment:** {field('env')}",
        f"**Severity:** {field('severity')}",
        f"**Status:** {field('status')}",
        f"**Started:** {field('started_at')}",
        f"**Ended:** {inc.get('ended_at') or 'ongoing'}",
        f"**Created by:** {field('created_by')}",
        "",
        "---",
        "",
        "## Summary",
        "",
        _redact(str(summary_text or "No summary available."), 1000),
        "",
        "## Impact",
        "",
        _redact(str(impact), 500),
        "",
        "## Detection",
        "",
        f"Incident was reported at {field('started_at')} with symptom: "
        f"*{_redact(str(title), 200)}*.",
        "",
        "## Timeline",
        "",
    ]
    if timeline_lines:
        md_lines.extend(timeline_lines)
    else:
        md_lines.append("- (no timeline events recorded)")

    md_lines.extend(["", "## Root Cause Analysis", ""])
    if causes_lines:
        md_lines.extend(causes_lines)
    else:
        md_lines.append("- Investigation ongoing")

    md_lines.extend(["", "## Mitigations Applied", ""])
    md_lines.extend(f"- {_redact(str(m), 200)}" for m in mitigations[:5])

    md_lines.extend(["", "## Follow-ups", ""])
    for i, fu in enumerate(followups, 1):
        md_lines.append(f"{i}. **[{fu.get('priority', 'P2')}]** {_redact(fu.get('title', '?'), 200)}")
    if not followups:
        md_lines.append("- (no follow-ups identified)")

    md_lines.extend([
        "",
        "## Prevention",
        "",
        "- Review and address all follow-up items",
        "- Update runbooks if this is a new failure mode",
        "- Consider adding alerts/monitors for early detection",
        "",
        "---",
        f"*Generated at {generated_at} by postmortem_draft_graph*",
    ])

    postmortem_md = "\n".join(md_lines)

    postmortem_json = {
        "incident_id": inc.get("id"),
        "service": service,
        "env": inc.get("env"),
        "severity": inc.get("severity"),
        "started_at": inc.get("started_at"),
        "ended_at": inc.get("ended_at"),
        "summary": _redact(str(summary_text or ""), 1000),
        "impact": _redact(str(impact), 500),
        "root_causes": causes[:5],
        "mitigations": mitigations[:5],
        "followups": followups,
        "timeline_event_count": len(events),
        "generated_at": generated_at,
    }

    return {
        **state,
        "postmortem_md": postmortem_md,
        "postmortem_json": postmortem_json,
    }
|
|
|
|
|
|
async def attach_artifacts_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Attach postmortem_draft.md and .json as incident artifacts.

    Uploads, in order, the markdown draft, the JSON draft and — only when the
    triage report was generated by this graph — the triage report, each
    base64-encoded through ``oncall_tool.incident_attach_artifact``.
    An unsuccessful attach is logged as a warning and skipped; it does not
    fail the graph.

    Returns:
        The state unchanged when the graph has already failed; otherwise the
        state with ``artifacts_attached`` listing the successful uploads.
    """
    if state.get("graph_status") == "failed":
        return state

    run_id = state.get("run_id", "")
    incident_id = state["incident_id"]
    attached: List[Dict[str, Any]] = []

    def to_json_bytes(payload: Any) -> bytes:
        """Serialise ``payload`` as pretty UTF-8 JSON (unknown types via ``str``)."""
        return json.dumps(payload, indent=2, ensure_ascii=False, default=str).encode("utf-8")

    async with GatewayClient() as gw:

        async def attach(label: str, kind: str, fmt: str, filename: str,
                         content: bytes, node: str) -> None:
            """Upload one artifact and record it in ``attached`` on success."""
            res = await gw.call_tool(
                tool="oncall_tool", action="incident_attach_artifact",
                params={
                    "incident_id": incident_id,
                    "kind": kind,
                    "format": fmt,
                    "content_base64": base64.b64encode(content).decode("ascii"),
                    "filename": filename,
                },
                agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                user_id=state.get("user_id", ""),
                graph_run_id=run_id, graph_node=node,
            )
            if res.success:
                attached.append({"type": label, "artifact": res.data})
            else:
                # Surface the failure; otherwise the only trace is a lower artifacts_count.
                logger.warning(
                    "postmortem: attaching %s failed (non-fatal): %s",
                    filename, res.error_message,
                )

        # Attach markdown
        await attach(
            "md", "postmortem_draft", "md", "postmortem_draft.md",
            (state.get("postmortem_md") or "").encode("utf-8"),
            "attach_artifacts.md",
        )

        # Attach JSON
        await attach(
            "json", "postmortem_draft", "json", "postmortem_draft.json",
            to_json_bytes(state.get("postmortem_json") or {}),
            "attach_artifacts.json",
        )

        # Also attach triage if it was auto-generated
        if state.get("triage_was_generated") and state.get("triage_report"):
            await attach(
                "triage_json", "triage_report", "json", "triage_report.json",
                to_json_bytes(state["triage_report"]),
                "attach_artifacts.triage",
            )

    return {**state, "artifacts_attached": attached}
|
|
|
|
|
|
async def append_followups_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Record the draft's follow-up items as ``followup`` incident events.

    At most the first ten follow-ups from ``postmortem_json`` are sent through
    ``oncall_tool.incident_append_event``. Each append is best-effort: an
    exception is logged as a warning and the loop continues. The number of
    successful appends is stored in ``followups_appended``.
    """
    if state.get("graph_status") == "failed":
        return state

    graph_run_id = state.get("run_id", "")
    incident_id = state["incident_id"]
    draft = state.get("postmortem_json") or {}
    pending = draft.get("followups") or []
    appended = 0

    async with GatewayClient() as client:
        for item in pending[:10]:
            try:
                event_params = {
                    "incident_id": incident_id,
                    "type": "followup",
                    "message": _redact(item.get("title", ""), 500),
                    "meta": {"priority": item.get("priority", "P2"), "source": "postmortem_draft"},
                }
                outcome = await client.call_tool(
                    tool="oncall_tool",
                    action="incident_append_event",
                    params=event_params,
                    agent_id=state.get("agent_id", settings.DEFAULT_AGENT_ID),
                    workspace_id=state.get("workspace_id", settings.DEFAULT_WORKSPACE_ID),
                    user_id=state.get("user_id", ""),
                    graph_run_id=graph_run_id,
                    graph_node="append_followups",
                )
                if outcome.success:
                    appended += 1
            except Exception as exc:
                # Deliberately non-fatal: one bad follow-up must not sink the run.
                logger.warning("postmortem: followup append failed (non-fatal): %s", exc)

    updated_state = dict(state)
    updated_state["followups_appended"] = appended
    return updated_state
|
|
|
|
|
|
async def build_result_node(state: PostmortemDraftState) -> PostmortemDraftState:
    """Assemble the final ``result`` payload for the graph run.

    For a failed run the result is ``{"error": ...}``, preferring
    ``validation_error`` over ``error``. Otherwise the result summarises the
    attached artifacts, the follow-up count and a markdown preview limited to
    1500 characters, and ``graph_status`` becomes ``"succeeded"``.
    """
    if state.get("graph_status") == "failed":
        reason = state.get("validation_error") or state.get("error", "Unknown error")
        failed_state = dict(state)
        failed_state["result"] = {"error": reason}
        return failed_state

    markdown = state.get("postmortem_md", "")
    if len(markdown) > 1500:
        preview = markdown[:1500] + "\n…[truncated]"
    else:
        preview = markdown

    attached = state.get("artifacts_attached") or []
    summary = {
        "incident_id": state.get("incident_id"),
        "artifacts_count": len(attached),
        "artifacts": attached,
        "followups_count": state.get("followups_appended", 0),
        "triage_was_generated": state.get("triage_was_generated", False),
        "markdown_preview": preview,
    }
    return {**state, "result": summary, "graph_status": "succeeded"}
|
|
|
|
|
|
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
|
|
|
def _extract_followups(triage: Dict, incident: Dict) -> List[Dict]:
|
|
"""Extract actionable follow-ups from triage report."""
|
|
followups = []
|
|
|
|
# From triage next_checks
|
|
for check in (triage.get("next_checks") or [])[:5]:
|
|
followups.append({"title": _redact(str(check), 300), "priority": "P2"})
|
|
|
|
# From cost recommendations
|
|
for rec in (triage.get("context", {}).get("cost", {}).get("recommendations") or [])[:2]:
|
|
followups.append({"title": f"[FinOps] {_redact(str(rec), 300)}", "priority": "P3"})
|
|
|
|
# From privacy findings
|
|
priv = triage.get("context", {}).get("privacy", {})
|
|
if priv.get("findings_count", 0) > 0:
|
|
followups.append({
|
|
"title": f"[Privacy] Review {priv['findings_count']} data governance finding(s)",
|
|
"priority": "P2",
|
|
})
|
|
|
|
return followups[:10]
|
|
|
|
|
|
# ─── Routing ──────────────────────────────────────────────────────────────────
|
|
|
|
def _after_validate(state: PostmortemDraftState) -> str:
|
|
if state.get("graph_status") == "failed":
|
|
return "build_result"
|
|
return "load_incident"
|
|
|
|
|
|
def _after_load(state: PostmortemDraftState) -> str:
|
|
if state.get("graph_status") == "failed":
|
|
return "build_result"
|
|
return "ensure_triage"
|
|
|
|
|
|
# ─── Graph builder ────────────────────────────────────────────────────────────
|
|
|
|
def build_postmortem_draft_graph():
    """
    Build and compile the postmortem_draft LangGraph.

    Happy path:
        validate → load_incident → ensure_triage → draft_postmortem
                 → attach_artifacts → append_followups → build_result → END

    ``validate`` and ``load_incident`` branch straight to ``build_result``
    when they mark the state as failed.
    """
    builder = StateGraph(PostmortemDraftState)

    # Register nodes in execution order.
    node_table = (
        ("validate", validate_node),
        ("load_incident", load_incident_node),
        ("ensure_triage", ensure_triage_node),
        ("draft_postmortem", draft_postmortem_node),
        ("attach_artifacts", attach_artifacts_node),
        ("append_followups", append_followups_node),
        ("build_result", build_result_node),
    )
    for node_name, handler in node_table:
        builder.add_node(node_name, handler)

    builder.set_entry_point("validate")

    # The first two nodes can short-circuit to the result builder on failure.
    builder.add_conditional_edges(
        "validate",
        _after_validate,
        {"load_incident": "load_incident", "build_result": "build_result"},
    )
    builder.add_conditional_edges(
        "load_incident",
        _after_load,
        {"ensure_triage": "ensure_triage", "build_result": "build_result"},
    )

    # Everything after load_incident is a straight line to END.
    linear_path = (
        "ensure_triage",
        "draft_postmortem",
        "attach_artifacts",
        "append_followups",
        "build_result",
        END,
    )
    for source, target in zip(linear_path, linear_path[1:]):
        builder.add_edge(source, target)

    return builder.compile()
|