feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
0
services/sofiia-supervisor/tests/__init__.py
Normal file
0
services/sofiia-supervisor/tests/__init__.py
Normal file
112
services/sofiia-supervisor/tests/conftest.py
Normal file
112
services/sofiia-supervisor/tests/conftest.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Shared test fixtures for sofiia-supervisor.
|
||||
|
||||
Uses httpx.MockTransport to mock gateway responses — no real network calls.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
# ─── Path bootstrap ───────────────────────────────────────────────────────────
|
||||
_svc_root = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(_svc_root))
|
||||
|
||||
|
||||
# ─── Gateway mock helpers ─────────────────────────────────────────────────────
|
||||
|
||||
class MockGatewayClient:
    """
    Drop-in replacement for GatewayClient that intercepts call_tool and returns
    pre-configured responses without making HTTP requests.

    Responses registered for the same (tool, action) pair are consumed FIFO;
    once only one entry remains it is replayed for every further call.

    Usage:
        mock_gw = MockGatewayClient()
        mock_gw.register("job_orchestrator_tool", "start_task", {"job_id": "j_001", "status": "running"})
        mock_gw.register("job_orchestrator_tool", "get_job", {"status": "succeeded", "result": {...}})
    """

    def __init__(self):
        # "tool:action" → FIFO queue of canned responses
        self._responses: Dict[str, List[Any]] = {}
        # Call metadata only — payloads are deliberately never recorded
        self.calls: List[Dict] = []

    def register(self, tool: str, action: str, data: Any, *, error: Optional[str] = None, retryable: bool = False):
        """Register a response for (tool, action). Multiple registrations → FIFO queue."""
        self._responses.setdefault(f"{tool}:{action}", []).append(
            {"data": data, "error": error, "retryable": retryable}
        )

    def _pop(self, tool: str, action: str) -> Dict:
        """Dequeue the next canned response, retaining the final one for replay."""
        pending = self._responses.get(f"{tool}:{action}")
        if not pending:
            # Nothing registered → generic empty success
            return {"data": {}, "error": None, "retryable": False}
        entry = pending.pop(0)
        if not pending:
            # Keep last response for further calls
            pending.append(entry)
        return entry

    async def __aenter__(self):
        return self

    async def __aexit__(self, *args):
        pass

    async def call_tool(
        self,
        tool: str,
        action: str,
        params: Optional[Dict] = None,
        agent_id: str = "",
        workspace_id: str = "",
        user_id: str = "",
        graph_run_id: str = "",
        graph_node: str = "",
        **kwargs,
    ):
        """Record call metadata (no payload logged) and return a ToolCallResult."""
        record = {
            "tool": tool,
            "action": action,
            "graph_run_id": graph_run_id,
            "graph_node": graph_node,
            "agent_id": agent_id,
        }
        self.calls.append(record)

        entry = self._pop(tool, action)
        from app.gateway_client import ToolCallResult
        if not entry["error"]:
            return ToolCallResult(success=True, data=entry["data"])
        return ToolCallResult(
            success=False,
            error_code="mock_error",
            error_message=entry["error"],
            retryable=entry.get("retryable", False),
        )
|
||||
|
||||
|
||||
# ─── Fixtures ────────────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.fixture
def mock_gw_factory():
    """Factory fixture: build a fresh MockGatewayClient per invocation.

    NOTE(review): despite the docstring in the original ("patches
    app.gateway_client.GatewayClient"), no patching is performed here and
    `patch_target` is accepted but unused — callers must apply the patch
    themselves. Confirm whether auto-patching was intended.
    """
    def _make(patch_target: str = "app.gateway_client.GatewayClient"):
        return MockGatewayClient()
    return _make
|
||||
|
||||
|
||||
@pytest.fixture
def in_memory_backend():
    """Fresh in-memory state backend — no disk or network I/O between tests."""
    # Local import: app.* is only resolvable after the sys.path bootstrap above.
    from app.state_backend import MemoryStateBackend
    return MemoryStateBackend()
|
||||
|
||||
|
||||
def _run(coro):
|
||||
return asyncio.run(coro)
|
||||
752
services/sofiia-supervisor/tests/test_alert_triage_graph.py
Normal file
752
services/sofiia-supervisor/tests/test_alert_triage_graph.py
Normal file
@@ -0,0 +1,752 @@
|
||||
"""
|
||||
Tests for alert_triage_graph.
|
||||
|
||||
Covers:
|
||||
- P1 prod alert → incident created + deterministic triage + ack (no LLM)
|
||||
- P3 alert → digest-only, no incident
|
||||
- Signature dedupe → same signature reuses existing incident
|
||||
- Gateway error on one alert → loop continues (non-fatal)
|
||||
- Policy loader fallback (missing file)
|
||||
- LLM guard: llm_mode=off forces deterministic even when rule says llm
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from unittest.mock import patch, MagicMock, AsyncMock
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
SUPERVISOR = ROOT / "services" / "sofiia-supervisor"
|
||||
if str(SUPERVISOR) not in sys.path:
|
||||
sys.path.insert(0, str(SUPERVISOR))
|
||||
|
||||
# ─── Mock GatewayClient ───────────────────────────────────────────────────────
|
||||
|
||||
class MockToolCallResult:
    """Minimal stand-in for the gateway's ToolCallResult (success/data/error_message)."""

    def __init__(self, success=True, data=None, error_message=""):
        # Falsy payloads (None, {}, 0, "") normalize to an empty dict.
        self.data = data if data else {}
        self.success = success
        self.error_message = error_message
|
||||
|
||||
|
||||
class MockGatewayClient:
    """Records all calls, returns configurable responses per (tool, action).

    `responses` maps "tool.action" either to a payload dict or to a callable
    (tool_name, action, params) → MockToolCallResult; anything unregistered
    falls back to a table of benign per-tool defaults.
    """

    def __init__(self, responses: Optional[Dict] = None):
        self.responses = responses or {}
        self.calls: List[Dict] = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, *args):
        pass

    async def call_tool(self, tool_name, action, params=None, **kwargs) -> MockToolCallResult:
        self.calls.append({"tool": tool_name, "action": action, "params": params or {}})
        key = f"{tool_name}.{action}"
        if key in self.responses:
            configured = self.responses[key]
            if callable(configured):
                return configured(tool_name, action, params)
            return MockToolCallResult(True, configured)
        # Default success responses per tool/action.
        # Rebuilt per call on purpose: callers that mutate a returned payload
        # must not leak that mutation into later calls.
        defaults = {
            "alert_ingest_tool.claim": {
                "alerts": [], "claimed": 0, "requeued_stale": 0,
            },
            "alert_ingest_tool.list": {
                "alerts": [], "count": 0,
            },
            "alert_ingest_tool.ack": {"ack_status": "acked"},
            "alert_ingest_tool.fail": {"alert_ref": "?", "status": "failed"},
            "oncall_tool.signature_should_triage": {"should_triage": True},
            "oncall_tool.signature_mark_triage": {"marked": "triage_run"},
            "oncall_tool.signature_mark_alert": {"marked": "alert_seen"},
            "incident_escalation_tool.evaluate": {
                "evaluated": 0, "escalated": 0, "followups_created": 0,
                "candidates": [], "recommendations": [], "dry_run": False,
            },
            "incident_escalation_tool.auto_resolve_candidates": {
                "candidates": [], "candidates_count": 0,
                "closed": [], "closed_count": 0, "dry_run": True,
            },
            "oncall_tool.alert_to_incident": {
                "incident_id": "inc_test_001",
                "created": True,
                "severity": "P1",
                "incident_signature": "abcd1234" * 4,
            },
            "oncall_tool.incident_attach_artifact": {"artifact": {"path": "ops/incidents/test/triage.json"}},
            "oncall_tool.incident_append_event": {"event": {"id": 1}},
            "oncall_tool.service_health": {"healthy": True, "status": "ok"},
            "observability_tool.service_overview": {"metrics": {}, "status": "ok"},
            "kb_tool.snippets": {"snippets": []},
        }
        return MockToolCallResult(True, defaults.get(key, {}))
|
||||
|
||||
|
||||
# ─── Alert fixtures ───────────────────────────────────────────────────────────
|
||||
|
||||
def _make_alert(
|
||||
service="gateway", severity="P1", kind="slo_breach",
|
||||
env="prod", fingerprint="fp1", ref="alrt_001",
|
||||
):
|
||||
return {
|
||||
"alert_ref": ref,
|
||||
"source": "monitor@node1",
|
||||
"service": service,
|
||||
"env": env,
|
||||
"severity": severity,
|
||||
"kind": kind,
|
||||
"title": f"{service} {kind} alert",
|
||||
"summary": f"{service} is experiencing {kind}",
|
||||
"started_at": "2025-01-23T09:00:00",
|
||||
"labels": {"node": "node1", "fingerprint": fingerprint},
|
||||
"metrics": {"latency_p95_ms": 450, "error_rate_pct": 2.5},
|
||||
"ack_status": "pending",
|
||||
}
|
||||
|
||||
|
||||
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _run_graph(state_input: Dict, mock_gw: MockGatewayClient) -> Dict:
    """Execute alert_triage_graph with mocked GatewayClient.

    Returns the final state dict produced by graph.ainvoke(state_input).
    """
    from app.graphs.alert_triage_graph import build_alert_triage_graph

    graph = build_alert_triage_graph()

    async def _run():
        # Replacing the class makes GatewayClient(...) return mock_gw; the
        # graph then uses mock_gw's own real __aenter__/__aexit__.
        with patch("app.graphs.alert_triage_graph.GatewayClient", return_value=mock_gw):
            # NOTE(review): the two inner patches target dunders of the
            # already-mocked class object, so they appear to be effective
            # no-ops — the graph enters mock_gw via its instance methods.
            # Consider removing; verify before relying on them.
            with patch("app.graphs.alert_triage_graph.GatewayClient.__aenter__",
                       return_value=mock_gw):
                with patch("app.graphs.alert_triage_graph.GatewayClient.__aexit__",
                           return_value=AsyncMock(return_value=None)):
                    return await graph.ainvoke(state_input)

    return asyncio.run(_run())
|
||||
|
||||
|
||||
# ─── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestAlertTriageNoLLM:
    """P1 prod alert → incident + deterministic triage, zero LLM calls."""

    def _run_with_p1_alert(self, alert_ref="alrt_p1"):
        """Drive the graph nodes over one P1 prod alert; return (state, gw)."""
        p1_alert = _make_alert(severity="P1", env="prod", ref=alert_ref)
        # Fix: plain string literal (was an f-string with no placeholders).
        inc_sig = hashlib.sha256("gateway|prod|slo_breach|fp1".encode()).hexdigest()[:32]

        gw = MockGatewayClient(responses={
            "alert_ingest_tool.claim": {
                "alerts": [p1_alert], "claimed": 1, "requeued_stale": 0,
            },
            "alert_ingest_tool.list": {
                "alerts": [p1_alert], "count": 1,
            },
            "oncall_tool.signature_should_triage": {"should_triage": False},
            "oncall_tool.alert_to_incident": {
                "incident_id": "inc_test_p1",
                "created": True,
                "severity": "P1",
                "incident_signature": inc_sig,
            },
        })

        state = {
            "workspace_id": "default",
            "user_id": "test",
            "agent_id": "sofiia",
            "_run_id": "test_run_001",
        }

        # Policy: P0/P1 in prod → auto incident, deterministic triage, LLM off.
        with patch("app.graphs.alert_triage_graph.load_policy") as mp:
            mp.return_value = {
                "defaults": {
                    "max_alerts_per_run": 10,
                    "only_unacked": False,
                    "max_incidents_per_run": 5,
                    "max_triages_per_run": 5,
                    "llm_mode": "off",
                    "llm_on": {"triage": False},
                    "dedupe_window_minutes_default": 120,
                    "ack_note_prefix": "test_loop",
                },
                "routing": [
                    {
                        "match": {"env_in": ["prod"], "severity_in": ["P0", "P1"]},
                        "actions": {
                            "auto_incident": True,
                            "auto_triage": False,  # skip triage in unit test
                            "triage_mode": "deterministic",
                            "incident_severity_cap": "P1",
                            "dedupe_window_minutes": 120,
                            "attach_alert_artifact": True,
                            "ack": True,
                        },
                    },
                ],
            }
            with patch("app.graphs.alert_triage_graph.match_alert",
                       side_effect=lambda a, p: {
                           "auto_incident": True, "auto_triage": False,
                           "triage_mode": "deterministic",
                           "incident_severity_cap": "P1",
                           "dedupe_window_minutes": 120,
                           "ack": True,
                           "_normalized_kind": "slo_breach",
                       }):
                result = asyncio.run(self._async_run_graph(state, gw))
        return result, gw

    async def _async_run_graph(self, state, gw):
        """Run the graph nodes in order with GatewayClient patched to gw."""
        from app.graphs.alert_triage_graph import (
            load_policy_node, list_alerts_node, process_alerts_node, build_digest_node
        )
        s = await load_policy_node(state)
        s["_run_id"] = "test_run_001"
        with patch("app.graphs.alert_triage_graph.GatewayClient", return_value=gw):
            s = await list_alerts_node(s)
            s = await process_alerts_node(s)
            s = await build_digest_node(s)
        return s

    def test_incident_created_for_p1_prod(self):
        """The mocked P1 prod alert yields exactly the configured incident."""
        result, gw = self._run_with_p1_alert()
        created = result.get("created_incidents", [])
        assert len(created) >= 1
        assert created[0]["incident_id"] == "inc_test_p1"

    def test_no_llm_calls(self):
        """llm_mode=off → no llm_tool/chat_tool calls recorded."""
        result, gw = self._run_with_p1_alert()
        llm_tools = [c for c in gw.calls if c["tool"] in ("llm_tool", "chat_tool")]
        assert len(llm_tools) == 0, f"Unexpected LLM calls: {llm_tools}"

    def test_alert_acked(self):
        """The routed alert is acknowledged via alert_ingest_tool.ack."""
        result, gw = self._run_with_p1_alert()
        ack_calls = [c for c in gw.calls
                     if c["tool"] == "alert_ingest_tool" and c["action"] == "ack"]
        assert len(ack_calls) >= 1

    def test_digest_contains_incident(self):
        """The digest markdown mentions the created incident id."""
        result, gw = self._run_with_p1_alert()
        digest = result.get("digest_md", "")
        assert "inc_test_p1" in digest

    def test_result_summary_populated(self):
        """result_summary counts at least one created incident."""
        result, gw = self._run_with_p1_alert()
        summary = result.get("result_summary", {})
        assert summary.get("created_incidents", 0) >= 1
|
||||
|
||||
|
||||
class TestAlertTriageDigestOnly:
    """P3 alert → digest_only, no incident created, alert acked."""

    async def _run(self, gw, state):
        # Drives load_policy → list → process → digest with both the policy
        # loader and GatewayClient patched; returns the final state dict.
        from app.graphs.alert_triage_graph import (
            load_policy_node, list_alerts_node, process_alerts_node, build_digest_node
        )
        with patch("app.graphs.alert_triage_graph.load_policy") as mp:
            # P2/P3/INFO route to digest-only; LLM disabled entirely.
            mp.return_value = {
                "defaults": {
                    "max_alerts_per_run": 10,
                    "only_unacked": False,
                    "max_incidents_per_run": 5,
                    "max_triages_per_run": 5,
                    "llm_mode": "off",
                    "llm_on": {"triage": False},
                    "dedupe_window_minutes_default": 120,
                    "ack_note_prefix": "test_loop",
                },
                "routing": [
                    {
                        "match": {"severity_in": ["P2", "P3", "INFO"]},
                        "actions": {"auto_incident": False, "digest_only": True, "ack": True},
                    },
                ],
            }
            s = await load_policy_node(state)
        s["_run_id"] = "test_p3"
        with patch("app.graphs.alert_triage_graph.GatewayClient") as MockGW:
            # The graph enters the client via async-with, so both dunders
            # must hand back the recording gw.
            MockGW.return_value.__aenter__ = AsyncMock(return_value=gw)
            MockGW.return_value.__aexit__ = AsyncMock(return_value=None)
            with patch("app.graphs.alert_triage_graph.load_policy") as mp2:
                # Re-patch: the first load_policy patch has already exited.
                mp2.return_value = s["policy"]
                with patch("app.graphs.alert_triage_graph.match_alert",
                           side_effect=lambda a, p: {
                               "auto_incident": False, "digest_only": True, "ack": True,
                           }):
                    s = await list_alerts_node(s)
                    s = await process_alerts_node(s)
                    return await build_digest_node(s)

    def test_no_incident_created(self):
        # Digest-only routing must not create incidents; the alert is skipped.
        p3_alert = _make_alert(severity="P3", env="prod", ref="alrt_p3")
        gw = MockGatewayClient(responses={
            "alert_ingest_tool.claim": {"alerts": [p3_alert], "claimed": 1, "requeued_stale": 0},
            "alert_ingest_tool.list": {"alerts": [p3_alert], "count": 1},
        })
        state = {"workspace_id": "default", "user_id": "test", "agent_id": "sofiia"}
        result = asyncio.run(self._run(gw, state))
        assert result.get("created_incidents", []) == []
        assert len(result.get("skipped_alerts", [])) >= 1

    def test_no_oncall_write_calls(self):
        # No oncall_tool incident_* actions may be issued for digest-only alerts.
        p3_alert = _make_alert(severity="P3", env="prod", ref="alrt_p3_2")
        gw = MockGatewayClient(responses={
            "alert_ingest_tool.claim": {"alerts": [p3_alert], "claimed": 1, "requeued_stale": 0},
            "alert_ingest_tool.list": {"alerts": [p3_alert], "count": 1},
        })
        state = {"workspace_id": "default", "user_id": "test", "agent_id": "sofiia"}
        asyncio.run(self._run(gw, state))
        write_calls = [c for c in gw.calls if c["tool"] == "oncall_tool"
                       and "incident" in c["action"]]
        assert len(write_calls) == 0

    def test_digest_shows_skipped(self):
        # The digest markdown must surface skipped alerts in some form.
        p3_alert = _make_alert(severity="P3", env="prod", ref="alrt_p3_3")
        gw = MockGatewayClient(responses={
            "alert_ingest_tool.claim": {"alerts": [p3_alert], "claimed": 1, "requeued_stale": 0},
            "alert_ingest_tool.list": {"alerts": [p3_alert], "count": 1},
        })
        state = {"workspace_id": "default", "user_id": "test", "agent_id": "sofiia"}
        result = asyncio.run(self._run(gw, state))
        digest = result.get("digest_md", "")
        assert "Skipped" in digest or "skipped" in digest.lower()
|
||||
|
||||
|
||||
class TestAlertTriageSignatureDedupe:
    """Same signature → existing incident reused, no duplicate created."""

    def test_same_signature_reuse(self):
        """Two alerts sharing a fingerprint must hash to one signature."""
        from app.alert_routing import compute_incident_signature

        first = _make_alert(ref="alrt_sig1", fingerprint="samefp")
        second = _make_alert(ref="alrt_sig2", fingerprint="samefp")

        sig1 = compute_incident_signature(first)
        sig2 = compute_incident_signature(second)
        assert sig1 == sig2, f"Signatures differ: {sig1} vs {sig2}"

    def test_different_fingerprint_different_signature(self):
        """Distinct fingerprints must never collide."""
        from app.alert_routing import compute_incident_signature

        sig_a = compute_incident_signature(_make_alert(ref="alrt_diff1", fingerprint="fp_a"))
        sig_b = compute_incident_signature(_make_alert(ref="alrt_diff2", fingerprint="fp_b"))
        assert sig_a != sig_b

    def test_different_service_different_signature(self):
        """The service name participates in the signature."""
        from app.alert_routing import compute_incident_signature

        gw_sig = compute_incident_signature(_make_alert(service="gateway", fingerprint="fp1"))
        rt_sig = compute_incident_signature(_make_alert(service="router", fingerprint="fp1"))
        assert gw_sig != rt_sig

    def test_signature_stored_in_incident_meta(self):
        """Verify that alert_to_incident stores incident_signature in result."""
        from app.alert_routing import compute_incident_signature

        # The router tool_manager stores sig in incident meta and returns it;
        # the compute function is tested here, integration elsewhere.
        sig = compute_incident_signature(_make_alert(ref="alrt_meta_test"))
        assert len(sig) == 32
        assert all(ch in "0123456789abcdef" for ch in sig)
|
||||
|
||||
|
||||
class TestAlertTriageNonFatalErrors:
    """Gateway error on one alert → loop continues others."""

    async def _run_mixed(self, alerts, gw, state):
        # Injects `alerts` directly into state (no list node call), then runs
        # process + digest; the first match_alert invocation raises to
        # simulate a per-alert failure.
        from app.graphs.alert_triage_graph import (
            load_policy_node, list_alerts_node, process_alerts_node, build_digest_node
        )
        # NOTE(review): list_alerts_node is imported but never invoked here —
        # alerts are injected via s["alerts"] below; confirm the import is
        # intentional.
        with patch("app.graphs.alert_triage_graph.load_policy") as mp:
            mp.return_value = {
                "defaults": {
                    "max_alerts_per_run": 10,
                    "only_unacked": False,
                    "max_incidents_per_run": 5,
                    "max_triages_per_run": 5,
                    "llm_mode": "off",
                    "llm_on": {},
                    "dedupe_window_minutes_default": 120,
                    "ack_note_prefix": "test",
                },
                "routing": [],
            }
            s = await load_policy_node(state)
        s["_run_id"] = "test_nonfatal"
        s["alerts"] = alerts

        with patch("app.graphs.alert_triage_graph.GatewayClient") as MockGW:
            MockGW.return_value.__aenter__ = AsyncMock(return_value=gw)
            MockGW.return_value.__aexit__ = AsyncMock(return_value=None)
            with patch("app.graphs.alert_triage_graph.match_alert") as mock_match:
                # Mutable cell so the closure can count invocations.
                call_count = [0]
                def match_side_effect(alert, policy=None):
                    call_count[0] += 1
                    if call_count[0] == 1:
                        # First alert raises (simulated via actions that trigger error)
                        raise RuntimeError("Gateway timeout for first alert")
                    return {
                        "auto_incident": False, "digest_only": True, "ack": True,
                    }
                mock_match.side_effect = match_side_effect
                s = await process_alerts_node(s)
                return await build_digest_node(s)

    def test_error_on_one_continues_others(self):
        # One failing alert must not abort processing of the rest.
        alerts = [
            _make_alert(ref="alrt_fail", severity="P1"),
            _make_alert(ref="alrt_ok", severity="P3"),
        ]
        gw = MockGatewayClient()
        state = {"workspace_id": "default", "user_id": "test", "agent_id": "sofiia"}
        result = asyncio.run(self._run_mixed(alerts, gw, state))

        # Both should be counted as processed
        assert result.get("processed", 0) == 2
        # Error recorded
        errors = result.get("errors", [])
        assert len(errors) >= 1

    def test_digest_shows_errors(self):
        # The digest markdown surfaces the recorded per-alert error.
        alerts = [
            _make_alert(ref="alrt_err", severity="P1"),
            _make_alert(ref="alrt_ok2", severity="P3"),
        ]
        gw = MockGatewayClient()
        state = {"workspace_id": "default", "user_id": "test", "agent_id": "sofiia"}
        result = asyncio.run(self._run_mixed(alerts, gw, state))
        digest = result.get("digest_md", "")
        assert "Error" in digest or "error" in digest.lower()
|
||||
|
||||
|
||||
class TestPostProcessNodes:
    """Test escalation + autoresolve post-process nodes."""

    def setup_method(self):
        # Fix: the original computed ROOT.parent / "services" / ... which
        # points one directory ABOVE the repo root; reuse the module-level
        # SUPERVISOR constant (ROOT / "services" / "sofiia-supervisor").
        if str(SUPERVISOR) not in sys.path:
            sys.path.insert(0, str(SUPERVISOR))

    def test_escalation_result_in_digest(self):
        """Escalation results appear in digest when incidents are escalated."""
        import asyncio
        from app.graphs.alert_triage_graph import (
            load_policy_node, list_alerts_node, process_alerts_node,
            post_process_escalation_node, post_process_autoresolve_node, build_digest_node
        )

        p1_alert = _make_alert(severity="P1", fingerprint="fp_esc")
        gw = MockGatewayClient(responses={
            "alert_ingest_tool.claim": {
                "alerts": [p1_alert], "claimed": 1, "requeued_stale": 0,
            },
            "oncall_tool.alert_to_incident": {
                "incident_id": "inc_esc_001", "created": True,
                "incident_signature": "esc_sig_001",
            },
            "oncall_tool.signature_should_triage": {"should_triage": False},
            "incident_escalation_tool.evaluate": {
                "evaluated": 1, "escalated": 1, "followups_created": 1,
                "candidates": [{"incident_id": "inc_esc_001", "service": "gateway",
                                "from_severity": "P2", "to_severity": "P1",
                                "occurrences_60m": 15, "triage_count_24h": 2}],
                "recommendations": ["Escalated inc_esc_001"],
                "dry_run": False,
            },
            "incident_escalation_tool.auto_resolve_candidates": {
                "candidates": [], "candidates_count": 0,
                "closed": [], "closed_count": 0, "dry_run": True,
            },
        })

        # Pre-built policy in state: incident on, triage off, LLM off.
        state = {
            "workspace_id": "ws1", "user_id": "u1", "agent_id": "sofiia",
            "policy": {
                "defaults": {"only_unacked": True, "auto_incident": True,
                             "auto_triage": False, "llm_mode": "off", "ack": True},
                "routing": [],
            },
            "dry_run": False, "max_alerts": 20,
            "max_incidents_per_run": 5, "max_triages_per_run": 5,
            "created_incidents": [], "updated_incidents": [], "skipped_alerts": [],
            "errors": [],
        }

        async def run():
            s = {**state, "_run_id": "test_esc_001"}
            with patch("app.graphs.alert_triage_graph.GatewayClient", return_value=gw):
                s = await list_alerts_node(s)
                s = await process_alerts_node(s)
                s = await post_process_escalation_node(s)
                s = await post_process_autoresolve_node(s)
                s = await build_digest_node(s)
            return s

        result = asyncio.run(run())
        assert result["escalation_result"]["escalated"] == 1
        assert result["result_summary"]["escalated"] == 1
        assert "Escalated Incidents" in result["digest_md"]

    def test_post_process_skipped_when_no_alerts_processed(self):
        """If 0 alerts processed, post-process nodes skip gracefully."""
        import asyncio
        from app.graphs.alert_triage_graph import (
            post_process_escalation_node, post_process_autoresolve_node
        )

        state = {"processed": 0, "agent_id": "sofiia", "workspace_id": "ws1",
                 "_run_id": "test_skip_001", "dry_run": False}
        gw = MockGatewayClient()

        async def run():
            s = {**state}
            with patch("app.graphs.alert_triage_graph.GatewayClient", return_value=gw):
                s = await post_process_escalation_node(s)
                s = await post_process_autoresolve_node(s)
            return s

        result = asyncio.run(run())
        assert result["escalation_result"] == {}
        assert result["autoresolve_result"] == {}
        # No tool calls made
        esc_calls = [c for c in gw.calls if c["tool"] == "incident_escalation_tool"]
        assert len(esc_calls) == 0
|
||||
|
||||
|
||||
class TestCooldownPreventsTriage:
    """Triage cooldown: incident still created and acked, triage skipped."""

    def setup_method(self):
        # Fix: the original computed ROOT.parent / "services" / ... which
        # points one directory ABOVE the repo root; reuse the module-level
        # SUPERVISOR constant (ROOT / "services" / "sofiia-supervisor").
        if str(SUPERVISOR) not in sys.path:
            sys.path.insert(0, str(SUPERVISOR))

    def test_cooldown_active_appends_event_but_acks(self):
        """When cooldown is active: no triage, but alert is acked and event appended."""
        import asyncio
        from app.graphs.alert_triage_graph import (
            load_policy_node, list_alerts_node, process_alerts_node, build_digest_node
        )
        policy = {
            "defaults": {
                "only_unacked": True, "auto_incident": True, "auto_triage": True,
                "triage_mode": "deterministic", "triage_cooldown_minutes": 15,
                "llm_mode": "off",
            },
            "routing": [
                {"match": {"severity": "P1"}, "actions": {
                    "auto_incident": True, "auto_triage": True,
                    "triage_mode": "deterministic", "incident_severity_cap": "P1",
                    "ack": True,
                }}
            ],
        }
        p1_alert = _make_alert(severity="P1", fingerprint="fp_cooldown")

        # signature_should_triage returns False (cooldown active)
        gw = MockGatewayClient(responses={
            "alert_ingest_tool.claim": {"alerts": [p1_alert], "claimed": 1, "requeued_stale": 0},
            "oncall_tool.alert_to_incident": {
                "incident_id": "inc_cooldown_001", "created": True,
                "incident_signature": "abcd1234",
            },
            "oncall_tool.signature_should_triage": {"should_triage": False},
            "oncall_tool.incident_append_event": {"event_id": 10},
            "alert_ingest_tool.ack": {"alert_ref": p1_alert["alert_ref"], "status": "acked"},
        })

        state = {
            "workspace_id": "ws1", "user_id": "u1", "agent_id": "sofiia",
            "policy": policy, "dry_run": False, "max_alerts": 20,
            "max_incidents_per_run": 5, "max_triages_per_run": 5,
            "created_incidents": [], "updated_incidents": [], "skipped_alerts": [],
            "errors": [],
        }

        async def run():
            s = {**state, "_run_id": "test_cooldown_001"}
            with patch("app.graphs.alert_triage_graph.GatewayClient", return_value=gw):
                s = await list_alerts_node(s)
                s = await process_alerts_node(s)
            return s

        result = asyncio.run(run())
        # Incident was created
        assert len(result.get("created_incidents", [])) >= 1
        # No triage_run_id appended (cooldown blocked it)
        # Verify append_event was called (for cooldown notification)
        calls = gw.calls
        append_calls = [c for c in calls
                        if c["tool"] == "oncall_tool" and c["action"] == "incident_append_event"]
        assert len(append_calls) >= 1
        # Ack was still called
        ack_calls = [c for c in calls
                     if c["tool"] == "alert_ingest_tool" and c["action"] == "ack"]
        assert len(ack_calls) >= 1
|
||||
|
||||
|
||||
class TestAlertRoutingPolicy:
    """Policy loader and match_alert tests."""

    def test_load_policy_builtin_fallback(self):
        """A missing policy file falls back to the built-in policy."""
        from app.alert_routing import load_policy
        from pathlib import Path

        fallback = load_policy(Path("/nonexistent/path.yml"))
        assert "defaults" in fallback
        assert "routing" in fallback

    def test_match_p1_prod_returns_auto_incident(self):
        """A prod P1 alert matches a rule that auto-creates an incident."""
        from app.alert_routing import match_alert, load_policy

        actions = match_alert(_make_alert(severity="P1", env="prod"), load_policy())
        assert actions["auto_incident"] is True

    def test_match_p3_returns_digest_only(self):
        """A prod P3 alert is routed to the digest, never to an incident."""
        from app.alert_routing import match_alert, load_policy

        actions = match_alert(_make_alert(severity="P3", env="prod"), load_policy())
        assert actions.get("auto_incident", True) is False
        assert actions.get("digest_only", False) is True

    def test_match_security_returns_auto_incident(self):
        """Security-kind alerts auto-incident regardless of severity/env."""
        from app.alert_routing import match_alert

        # Use inline policy with security rule (avoids path resolution in tests)
        inline_policy = {
            "defaults": {"dedupe_window_minutes_default": 120},
            "routing": [
                {
                    "match": {"kind_in": ["security"]},
                    "actions": {
                        "auto_incident": True, "auto_triage": True,
                        "triage_mode": "deterministic",
                        "incident_severity_cap": "P0",
                        "ack": True,
                    },
                },
            ],
            "kind_map": {},
        }
        actions = match_alert(_make_alert(kind="security", severity="P2", env="dev"),
                              inline_policy)
        assert actions.get("auto_incident") is True

    def test_llm_guard_off_mode(self):
        """llm_mode=off wins even when llm_on enables the feature."""
        from app.alert_routing import is_llm_allowed

        guard_policy = {
            "defaults": {
                "llm_mode": "off",
                "llm_on": {"triage": True},
            }
        }
        assert is_llm_allowed("triage", guard_policy) is False

    def test_llm_guard_local_mode_enabled(self):
        """llm_mode=local with llm_on.triage → LLM use permitted."""
        from app.alert_routing import is_llm_allowed

        guard_policy = {
            "defaults": {
                "llm_mode": "local",
                "llm_on": {"triage": True},
            }
        }
        assert is_llm_allowed("triage", guard_policy) is True

    def test_kind_normalization(self):
        """Alias kinds still match routing rules after normalization."""
        from app.alert_routing import match_alert, load_policy

        # "oom_kill" is an alias for "oom" in kind_map
        actions = match_alert(_make_alert(kind="oom_kill", severity="P1", env="prod"),
                              load_policy())
        assert actions["auto_incident"] is True

    def test_fallback_no_match(self):
        """Alert with severity=P2 and no matching rule → digest_only."""
        from app.alert_routing import match_alert

        narrow_policy = {
            "defaults": {"dedupe_window_minutes_default": 120},
            "routing": [
                {
                    "match": {"env_in": ["prod"], "severity_in": ["P0", "P1"]},
                    "actions": {"auto_incident": True, "ack": True},
                }
            ],
        }
        actions = match_alert(_make_alert(severity="P2", env="staging"), narrow_policy)
        assert actions["auto_incident"] is False
        assert actions["digest_only"] is True
|
||||
|
||||
|
||||
class TestDryRunMode:
    """Dry run should not write anything but still build digest."""

    async def _run_dry(self, alerts, gw, state):
        """Drive load_policy → process_alerts → build_digest with dry_run=True.

        All collaborators (policy loader, gateway, matcher, signature) are
        patched so the run is fully deterministic and offline.
        """
        from app.graphs.alert_triage_graph import (
            load_policy_node, list_alerts_node, process_alerts_node, build_digest_node
        )
        dry_policy = {
            "defaults": {
                "max_alerts_per_run": 10,
                "only_unacked": False,
                "max_incidents_per_run": 5,
                "max_triages_per_run": 5,
                "llm_mode": "off",
                "llm_on": {},
                "dedupe_window_minutes_default": 120,
                "ack_note_prefix": "dry",
            },
            "routing": [],
        }
        # Every alert is forced through the same routing decision
        forced_actions = {
            "auto_incident": True, "auto_triage": False,
            "triage_mode": "deterministic",
            "incident_severity_cap": "P1",
            "dedupe_window_minutes": 120,
            "ack": False,
        }
        with patch("app.graphs.alert_triage_graph.load_policy", return_value=dry_policy):
            graph_state = await load_policy_node({**state, "dry_run": True})
            graph_state["_run_id"] = "dry_run_test"
            graph_state["alerts"] = alerts

            with patch("app.graphs.alert_triage_graph.GatewayClient") as MockGW:
                MockGW.return_value.__aenter__ = AsyncMock(return_value=gw)
                MockGW.return_value.__aexit__ = AsyncMock(return_value=None)
                with patch("app.graphs.alert_triage_graph.match_alert",
                           side_effect=lambda a, p=None: forced_actions), \
                     patch("app.graphs.alert_triage_graph.compute_incident_signature",
                           return_value="drysigsig"):
                    graph_state = await process_alerts_node(graph_state)
                    return await build_digest_node(graph_state)

    def test_dry_run_no_write_calls(self):
        """No oncall incident write calls may happen during a dry run."""
        gw = MockGatewayClient()
        base_state = {"workspace_id": "default", "user_id": "test", "agent_id": "sofiia"}
        asyncio.run(self._run_dry([_make_alert(ref="alrt_dry", severity="P1")], gw, base_state))

        incident_writes = [
            c for c in gw.calls
            if c["tool"] == "oncall_tool" and "incident" in c["action"]
        ]
        assert len(incident_writes) == 0

    def test_dry_run_digest_has_marker(self):
        """The produced digest must be visibly marked as a dry run."""
        gw = MockGatewayClient()
        base_state = {"workspace_id": "default", "user_id": "test", "agent_id": "sofiia"}
        outcome = asyncio.run(
            self._run_dry([_make_alert(ref="alrt_dry2", severity="P1")], gw, base_state)
        )
        assert "DRY RUN" in outcome.get("digest_md", "")
|
||||
391
services/sofiia-supervisor/tests/test_incident_triage_graph.py
Normal file
391
services/sofiia-supervisor/tests/test_incident_triage_graph.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""
|
||||
Tests for incident_triage_graph.
|
||||
|
||||
Mocks the GatewayClient.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from tests.conftest import MockGatewayClient, _run
|
||||
|
||||
|
||||
# ─── Canned gateway payloads used by the tests below ─────────────────────────

# observability_tool.service_overview: one firing alert plus SLO/metric stats.
_OVERVIEW_DATA = {
    "status": "ok",
    "alerts": [{"name": "HighErrorRate", "severity": "warning"}],
    "slo": {"error_rate": "2.1%", "error_budget_consumed": "42%"},
    "metrics": {"request_rate": "120/s", "p99_latency_ms": 890},
}

# observability_tool.logs_query: three raw log lines (two ERROR, one WARN).
_LOGS_DATA = {
    "lines": [
        "2026-02-23T10:00:01Z ERROR router: connection refused to db host",
        "2026-02-23T10:00:02Z ERROR router: timeout after 30s waiting for upstream",
        "2026-02-23T10:00:03Z WARN router: retry 2/3 on POST /v1/agents/sofiia/infer",
    ],
    "total": 3,
}

# oncall_tool.service_health: degraded service with a failing db check.
_HEALTH_DATA = {
    "status": "degraded",
    "details": "DB connection pool exhausted",
    "checks": {"db": "fail", "redis": "ok", "nats": "ok"},
}

# kb_tool.search: a single matching runbook hit.
_KB_DATA = {
    "results": [
        {
            "path": "docs/runbooks/router-db-exhausted.md",
            "lines": "L1-L30",
            "content": "## DB Pool Exhaustion\n- Increase pool size in DB_POOL_SIZE env\n- Check for long-running transactions\n- Restart service if needed",
        }
    ]
}
|
||||
|
||||
|
||||
class TestIncidentTriageGraph:
    """Full happy-path test for incident_triage_graph."""

    @staticmethod
    def _invoke(mock_gw, payload):
        """Compile the graph, patch in the mock gateway, run one invocation."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph
        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            return _run(compiled.ainvoke(payload))

    def test_full_triage(self):
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("observability_tool", "logs_query", _LOGS_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)
        # trace_lookup is skipped (include_traces=False)

        final = self._invoke(mock_gw, {
            "run_id": "gr_triage_001",
            "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u_001",
            "input": {
                "service": "router",
                "symptom": "high error rate and slow responses",
                "env": "prod",
                "include_traces": False,
                "max_log_lines": 50,
            },
        })

        assert final["graph_status"] == "succeeded"
        report = final["result"]

        # Required fields
        for field in ("summary", "suspected_root_causes", "impact_assessment",
                      "mitigations_now", "next_checks", "references"):
            assert field in report

        # Root causes derived from health=degraded and the firing alert
        causes = report["suspected_root_causes"]
        assert len(causes) >= 1
        assert all("rank" in c and "cause" in c and "evidence" in c for c in causes)

        # Log samples in references (redacted)
        assert len(report["references"]["log_samples"]) > 0

        # Runbook snippets in references
        runbooks = report["references"]["runbook_snippets"]
        assert len(runbooks) == 1
        assert "router-db-exhausted" in runbooks[0]["path"]

    def test_with_traces_enabled(self):
        """When include_traces=True, trace_lookup node runs."""
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        # Logs include a trace_id so the lookup has something to follow
        mock_gw.register("observability_tool", "logs_query", {
            "lines": [
                "2026-02-23T10:00:01Z ERROR router: trace_id=abcdef1234567890 connection refused",
            ]
        })
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)
        mock_gw.register("observability_tool", "traces_query", {
            "traces": [{"trace_id": "abcdef1234567890", "duration_ms": 1250, "status": "error"}]
        })

        final = self._invoke(mock_gw, {
            "run_id": "gr_trace_001",
            "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u",
            "input": {
                "service": "router",
                "symptom": "errors",
                "include_traces": True,
            },
        })

        assert final["graph_status"] == "succeeded"
        # Trace data should be in references
        assert "traces" in final["result"]["references"]

    def test_invalid_service_fails_gracefully(self):
        """Empty service → validation error → graph_status=failed."""
        mock_gw = MockGatewayClient()

        final = self._invoke(mock_gw, {
            "run_id": "gr_invalid_001",
            "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
            "input": {"service": "", "symptom": "something"},
        })

        assert final["graph_status"] == "failed"
        # Validation must short-circuit before any observability call is made
        assert not any(c["tool"] == "observability_tool" for c in mock_gw.calls)

    def test_observability_failure_is_non_fatal(self):
        """If observability_tool fails, triage continues with partial data."""
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview",
                         None, error="observability tool timeout")
        mock_gw.register("observability_tool", "logs_query",
                         None, error="logs unavailable")
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)

        final = self._invoke(mock_gw, {
            "run_id": "gr_partial_001",
            "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
            "input": {"service": "router", "symptom": "slow"},
        })

        # Should still produce a result (degraded mode)
        assert final["graph_status"] == "succeeded"
        assert "summary" in final["result"]

    def test_secret_redaction_in_logs(self):
        """Log lines containing secrets should be redacted in output."""
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", {})
        mock_gw.register("observability_tool", "logs_query", {
            "lines": [
                "2026-02-23T10:00:01Z ERROR svc: token=sk-supersecretkey123 auth failed",
                "2026-02-23T10:00:02Z INFO svc: api_key=abc12345 request failed",
            ]
        })
        mock_gw.register("oncall_tool", "service_health", {"status": "ok"})
        mock_gw.register("kb_tool", "search", {"results": []})

        final = self._invoke(mock_gw, {
            "run_id": "gr_secret_001",
            "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
            "input": {"service": "svc", "symptom": "auth issues"},
        })

        joined_samples = " ".join(final["result"]["references"]["log_samples"])
        assert "sk-supersecretkey123" not in joined_samples
        assert "abc12345" not in joined_samples
        assert "***" in joined_samples
|
||||
|
||||
|
||||
class TestTimeWindowLimit:
    """incident_triage_graph rejects or clamps time windows > 24h."""

    @staticmethod
    def _parse(ts):
        """Parse an ISO timestamp, tolerating a trailing 'Z' suffix."""
        import datetime
        return datetime.datetime.fromisoformat(ts.replace("Z", "+00:00"))

    def test_time_window_clamped_to_24h(self):
        from app.graphs.incident_triage_graph import _clamp_time_range
        import datetime

        # 48h window → should be clamped to 24h
        now = datetime.datetime.now(datetime.timezone.utc)
        requested = {
            "from": (now - datetime.timedelta(hours=48)).isoformat(),
            "to": now.isoformat(),
        }

        clamped = _clamp_time_range(requested, max_hours=24)

        span = self._parse(clamped["to"]) - self._parse(clamped["from"])
        assert span.total_seconds() <= 24 * 3600 + 1  # 1s tolerance

    def test_valid_window_unchanged(self):
        from app.graphs.incident_triage_graph import _clamp_time_range
        import datetime

        now = datetime.datetime.now(datetime.timezone.utc)
        one_hour_ago = (now - datetime.timedelta(hours=1)).isoformat()

        clamped = _clamp_time_range({"from": one_hour_ago, "to": now.isoformat()}, max_hours=24)

        span = self._parse(clamped["to"]) - self._parse(clamped["from"])
        assert 3500 < span.total_seconds() < 3700  # ~1h

    def test_no_time_range_gets_default(self):
        from app.graphs.incident_triage_graph import _clamp_time_range
        defaulted = _clamp_time_range(None, max_hours=24)
        assert "from" in defaulted and "to" in defaulted
|
||||
|
||||
|
||||
class TestCorrelationIds:
    """All tool calls in incident_triage must contain graph_run_id."""

    def test_all_calls_carry_run_id(self):
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        run_id = "gr_triage_corr_001"
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("observability_tool", "logs_query", _LOGS_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)
        # Register governance context tools
        mock_gw.register("data_governance_tool", "scan_audit", {
            "pass": True, "findings": [], "stats": {"errors": 0, "warnings": 0}, "recommendations": [],
        })
        mock_gw.register("cost_analyzer_tool", "anomalies", {"anomalies": [], "anomaly_count": 0})

        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            _run(compiled.ainvoke({
                "run_id": run_id,
                "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
                "input": {"service": "router", "symptom": "errors"},
            }))

        for recorded in mock_gw.calls:
            assert recorded["graph_run_id"] == run_id, (
                f"Call {recorded['tool']}:{recorded['action']} missing graph_run_id={run_id}"
            )
|
||||
|
||||
|
||||
class TestPrivacyCostContext:
    """Tests for privacy_context and cost_context nodes."""

    def test_incident_triage_includes_privacy_and_cost_context(self):
        """Full triage should include context.privacy and context.cost in result."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("observability_tool", "logs_query", _LOGS_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)

        # Privacy context: 2 findings
        mock_gw.register("data_governance_tool", "scan_audit", {
            "pass": True,
            "summary": "2 audit findings",
            "stats": {"errors": 1, "warnings": 1, "infos": 0},
            "findings": [
                {"id": "DG-AUD-101", "severity": "warning",
                 "title": "PII in audit meta", "category": "audit",
                 "evidence": {"details": "user***@***.com"}, "recommended_fix": "Use opaque IDs"},
                {"id": "DG-AUD-102", "severity": "error",
                 "title": "Large output detected", "category": "audit",
                 "evidence": {"details": "out_size=200000"}, "recommended_fix": "Enforce max_bytes_out"},
            ],
            "recommendations": ["Use opaque identifiers"],
        })

        # Cost context: one spike
        mock_gw.register("cost_analyzer_tool", "anomalies", {
            "anomalies": [{
                "type": "cost_spike",
                "tool": "observability_tool",
                "ratio": 5.2,
                "window_calls": 200,
                "baseline_calls": 10,
                "recommendation": "Reduce polling frequency.",
            }],
            "anomaly_count": 1,
        })

        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_ctx_test_001",
                "agent_id": "sofiia", "workspace_id": "ws", "user_id": "u",
                "input": {"service": "router", "symptom": "errors + cost spike"},
            }))

        assert final["graph_status"] == "succeeded"
        report = final["result"]

        # context block must exist
        assert "context" in report
        privacy_ctx = report["context"]["privacy"]
        cost_ctx = report["context"]["cost"]

        assert privacy_ctx["findings_count"] == 2
        assert not privacy_ctx["skipped"]

        assert cost_ctx["anomaly_count"] == 1
        assert not cost_ctx["skipped"]
        assert len(cost_ctx["anomalies"]) == 1
        assert cost_ctx["anomalies"][0]["tool"] == "observability_tool"

        # Cost spike should enrich root_causes
        causes_text = " ".join(str(c) for c in report["suspected_root_causes"])
        assert "observability_tool" in causes_text or "spike" in causes_text.lower()

        # Privacy error should also appear in root_causes
        assert any(
            "privacy" in str(c).lower() or "governance" in str(c).lower()
            for c in report["suspected_root_causes"]
        )

    def test_incident_triage_context_nonfatal_on_gateway_error(self):
        """privacy_context and cost_context failures are non-fatal — triage still succeeds."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("observability_tool", "logs_query", _LOGS_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)
        # Both governance tools return errors
        mock_gw.register("data_governance_tool", "scan_audit",
                         None, error="gateway timeout")
        mock_gw.register("cost_analyzer_tool", "anomalies",
                         None, error="rate limit exceeded")

        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_ctx_fail_001",
                "agent_id": "sofiia", "workspace_id": "ws", "user_id": "u",
                "input": {"service": "router", "symptom": "errors"},
            }))

        # Triage must succeed despite governance context failures
        assert final["graph_status"] == "succeeded"
        report = final["result"]

        # context block present with skipped=True
        assert "context" in report
        assert report["context"]["privacy"]["skipped"] is True
        assert report["context"]["cost"]["skipped"] is True

        # Core triage fields still present
        assert "summary" in report
        assert "suspected_root_causes" in report
|
||||
@@ -0,0 +1,255 @@
|
||||
"""
|
||||
Tests for slo_context_node in incident_triage_graph.
|
||||
Verifies SLO violations are detected, enrich triage, and non-fatal on error.
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
ROOT_SUPERVISOR = Path(__file__).resolve().parent.parent
|
||||
if str(ROOT_SUPERVISOR) not in sys.path:
|
||||
sys.path.insert(0, str(ROOT_SUPERVISOR))
|
||||
|
||||
|
||||
class MockGatewayResult:
    """Minimal stand-in for the gateway tool-call result object.

    Mirrors the three attributes the graph nodes read: success flag,
    payload data, and the error message (None on success).
    """

    def __init__(self, success, data=None, error_message=None):
        self.success = success          # True when the tool call succeeded
        self.data = data                # tool payload (dict) or None
        self.error_message = error_message  # failure description or None
|
||||
|
||||
|
||||
class MockGatewayClient:
    """Configurable mock for GatewayClient that routes by tool+action.

    ``overrides`` maps ``"tool.action"`` keys to canned results; any call
    not covered by an override gets a generic successful response. Every
    call is recorded in ``self.calls`` for later assertions.
    """

    def __init__(self, overrides=None):
        self.overrides = overrides or {}
        self.calls = []  # chronological record of {"tool", "action", "params"}

    async def call_tool(self, tool, action, params=None, **kwargs):
        self.calls.append({"tool": tool, "action": action, "params": params})
        try:
            # EAFP: an explicit override wins, whatever its value
            return self.overrides[f"{tool}.{action}"]
        except KeyError:
            return MockGatewayResult(True, {"status": "ok", "lines": [], "results": []})

    async def __aenter__(self):
        return self

    async def __aexit__(self, *args):
        pass
|
||||
|
||||
|
||||
class TestSLOContextNode:
    """Tests for the slo_context_node in isolation."""

    def _run(self, coro):
        return asyncio.run(coro)

    @staticmethod
    def _base_state(run_id, **extra):
        """Build the minimal node state, optionally extended with identity keys."""
        state = {
            "run_id": run_id,
            "service": "gateway",
            "env": "prod",
            "time_range": {"from": "2025-01-01T00:00:00+00:00", "to": "2025-01-01T01:00:00+00:00"},
            "graph_status": "running",
        }
        state.update(extra)
        return state

    def test_slo_violations_detected(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": MockGatewayResult(True, {
                "service": "gateway",
                "window_minutes": 60,
                "metrics": {"latency_p95_ms": 450, "error_rate_pct": 2.5, "req_rate_rps": 100},
                "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
                "violations": ["latency_p95", "error_rate"],
                "skipped": False,
            }),
        })
        node_state = self._base_state(
            "test_run_1", agent_id="sofiia", workspace_id="default", user_id="test"
        )

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            outcome = self._run(slo_context_node(node_state))

        slo_data = outcome.get("slo_context_data", {})
        assert not slo_data.get("skipped", False)
        assert "latency_p95" in slo_data["violations"]
        assert "error_rate" in slo_data["violations"]
        assert slo_data["metrics"]["latency_p95_ms"] == 450

    def test_slo_no_violations(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": MockGatewayResult(True, {
                "service": "gateway",
                "window_minutes": 60,
                "metrics": {"latency_p95_ms": 150, "error_rate_pct": 0.3},
                "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
                "violations": [],
                "skipped": False,
            }),
        })

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            outcome = self._run(slo_context_node(self._base_state("test_run_2")))

        slo_data = outcome.get("slo_context_data", {})
        assert slo_data["violations"] == []
        assert not slo_data.get("skipped")

    def test_slo_gateway_error_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": MockGatewayResult(False, error_message="timeout"),
        })

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            outcome = self._run(slo_context_node(self._base_state("test_run_3")))

        slo_data = outcome.get("slo_context_data", {})
        assert slo_data["skipped"] is True
        # Node failure must not flip the graph out of "running"
        assert outcome.get("graph_status") == "running"

    def test_slo_exception_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node

        class FailingGW:
            async def __aenter__(self):
                return self

            async def __aexit__(self, *a):
                pass

            async def call_tool(self, **kwargs):
                raise ConnectionError("connection refused")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=FailingGW()):
            outcome = self._run(slo_context_node(self._base_state("test_run_4")))

        assert outcome.get("slo_context_data", {}).get("skipped") is True
        assert outcome.get("graph_status") == "running"
|
||||
|
||||
|
||||
class TestTriageReportWithSLO:
    """Tests that build_triage_report_node includes SLO context properly.

    Fix: this class previously relied on a module-level ``import json``
    placed at the very BOTTOM of the file — a fragile ordering hazard.
    ``json`` is now imported locally where it is used. The duplicated
    15-key state dict is factored into ``_report_state``.
    """

    def _run(self, coro):
        return asyncio.run(coro)

    @staticmethod
    def _report_state(**overrides):
        """Base state for build_triage_report_node; overrides replace keys."""
        state = {
            "service": "gateway",
            "symptom": "high latency",
            "time_range": {"from": "2025-01-01T00:00:00", "to": "2025-01-01T01:00:00"},
            "env": "prod",
            "graph_status": "running",
            "service_overview_data": {},
            "top_errors_data": {},
            "log_samples": [],
            "health_data": {"status": "degraded"},
            "runbook_snippets": [],
            "trace_data": None,
            "slo_context_data": {"skipped": True},
            "privacy_context_data": {"skipped": True},
            "cost_context_data": {"skipped": True},
        }
        state.update(overrides)
        return state

    def test_slo_violations_appear_in_root_causes(self):
        """Active SLO violations must surface in root causes, context and next_checks."""
        import json
        from app.graphs.incident_triage_graph import build_triage_report_node

        state = self._report_state(slo_context_data={
            "violations": ["latency_p95", "error_rate"],
            "metrics": {"latency_p95_ms": 500, "error_rate_pct": 3.0},
            "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
            "skipped": False,
        })

        result = self._run(build_triage_report_node(state))
        report = result["result"]
        assert result["graph_status"] == "succeeded"

        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" in causes_text

        assert "slo" in report["context"]
        assert report["context"]["slo"]["violations"] == ["latency_p95", "error_rate"]

        assert any("SLO breach" in c for c in report["next_checks"])

    def test_slo_skipped_does_not_add_causes(self):
        """A skipped SLO context must not inject SLO root causes."""
        import json
        from app.graphs.incident_triage_graph import build_triage_report_node

        state = self._report_state(
            symptom="slow",
            health_data={"status": "healthy"},
            slo_context_data={"skipped": True, "reason": "no metrics"},
        )

        result = self._run(build_triage_report_node(state))
        report = result["result"]
        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" not in causes_text
        assert report["context"]["slo"]["skipped"] is True

    def test_slo_in_impact_assessment(self):
        """A breached SLO must be called out in the impact assessment."""
        from app.graphs.incident_triage_graph import build_triage_report_node

        state = self._report_state(
            service="router",
            symptom="errors spike",
            health_data={"status": "healthy"},
            slo_context_data={
                "violations": ["error_rate"],
                "metrics": {"error_rate_pct": 5.0},
                "thresholds": {"error_rate_pct": 0.5},
                "skipped": False,
            },
        )

        result = self._run(build_triage_report_node(state))
        assert "SLO breached" in result["result"]["impact_assessment"]
|
||||
|
||||
|
||||
import json
|
||||
203
services/sofiia-supervisor/tests/test_postmortem_graph.py
Normal file
203
services/sofiia-supervisor/tests/test_postmortem_graph.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""
|
||||
Tests for postmortem_draft_graph.
|
||||
|
||||
Mocks the GatewayClient — no real network calls.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from tests.conftest import MockGatewayClient, _run
|
||||
|
||||
|
||||
# ─── Mock data ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Base incident: open P1 on router, two timeline events, no artifacts yet.
_INCIDENT_DATA = {
    "id": "inc_20260223_1000_abc123",
    "service": "router",
    "env": "prod",
    "severity": "P1",
    "status": "open",
    "title": "Router OOM",
    "summary": "Router pods running out of memory under high load",
    "started_at": "2026-02-23T10:00:00Z",
    "ended_at": None,
    "created_by": "sofiia",
    "events": [
        {"ts": "2026-02-23T10:01:00Z", "type": "note", "message": "Memory usage >90%"},
        {"ts": "2026-02-23T10:10:00Z", "type": "action", "message": "Restarted pods"},
    ],
    "artifacts": [],
}

# Same incident, but with an existing triage_report artifact attached.
_INCIDENT_WITH_TRIAGE = {
    **_INCIDENT_DATA,
    "artifacts": [
        {"kind": "triage_report", "format": "json", "path": "ops/incidents/inc_test/triage_report.json"},
    ],
}

# observability_tool.service_overview: critical OOM alert firing.
_OVERVIEW_DATA = {
    "status": "degraded",
    "alerts": [{"name": "OOMKilled", "severity": "critical"}],
}

# oncall_tool.service_health: unhealthy due to OOM.
_HEALTH_DATA = {"status": "unhealthy", "error": "OOM"}

# kb_tool.search: a single OOM runbook hit.
_KB_DATA = {"results": [
    {"path": "docs/runbooks/oom.md", "content": "## OOM Runbook\n- Check memory limits\n- Restart pods"}
]}
|
||||
|
||||
|
||||
# ─── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
class TestPostmortemDraftGraph:
    """Happy path: incident exists, triage exists, postmortem generated."""

    def test_happy_path_with_triage(self):
        """Incident already has a triage artifact → graph attaches postmortem artifacts."""
        from app.graphs.postmortem_draft_graph import build_postmortem_draft_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("oncall_tool", "incident_get", _INCIDENT_WITH_TRIAGE)
        mock_gw.register("oncall_tool", "incident_attach_artifact", {"artifact": {"path": "test", "sha256": "abc"}})
        mock_gw.register("oncall_tool", "incident_append_event", {"event": {"ts": "now", "type": "followup"}})

        graph = build_postmortem_draft_graph()

        # Patch the gateway class so every node talks to the mock, not the network.
        with patch("app.graphs.postmortem_draft_graph.GatewayClient", return_value=mock_gw):
            result = _run(graph.ainvoke({
                "run_id": "gr_test_01",
                "agent_id": "sofiia",
                "workspace_id": "ws1",
                "user_id": "u1",
                "input": {
                    "incident_id": "inc_20260223_1000_abc123",
                    "service": "router",
                },
            }))

        assert result["graph_status"] == "succeeded"
        pm = result["result"]
        assert pm["incident_id"] == "inc_20260223_1000_abc123"
        assert pm["artifacts_count"] >= 2  # md + json
        assert "postmortem" in result["postmortem_md"].lower()

    def test_triage_missing_triggers_generation(self):
        """When incident has no triage artifact, the graph generates one."""
        from app.graphs.postmortem_draft_graph import build_postmortem_draft_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("oncall_tool", "incident_get", _INCIDENT_DATA)  # no triage artifact
        # Triage generation path pulls overview, health and KB context:
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)
        mock_gw.register("oncall_tool", "incident_attach_artifact", {"artifact": {"path": "t", "sha256": "x"}})
        mock_gw.register("oncall_tool", "incident_append_event", {"event": {}})

        graph = build_postmortem_draft_graph()

        with patch("app.graphs.postmortem_draft_graph.GatewayClient", return_value=mock_gw):
            result = _run(graph.ainvoke({
                "run_id": "gr_test_02",
                "agent_id": "sofiia",
                "workspace_id": "ws1",
                "user_id": "u1",
                "input": {"incident_id": "inc_20260223_1000_abc123"},
            }))

        assert result["graph_status"] == "succeeded"
        assert result.get("triage_was_generated") is True
        # Should have triage + postmortem artifacts (3 total)
        assert result["result"]["artifacts_count"] >= 2

    def test_incident_not_found_fails_gracefully(self):
        """Gateway error on incident_get → graph fails with a clear error message."""
        from app.graphs.postmortem_draft_graph import build_postmortem_draft_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("oncall_tool", "incident_get", None, error="Incident not found")

        graph = build_postmortem_draft_graph()

        with patch("app.graphs.postmortem_draft_graph.GatewayClient", return_value=mock_gw):
            result = _run(graph.ainvoke({
                "run_id": "gr_test_03",
                "agent_id": "sofiia",
                "workspace_id": "ws1",
                "user_id": "u1",
                "input": {"incident_id": "inc_nonexistent"},
            }))

        assert result["graph_status"] == "failed"
        assert "not found" in (result.get("error") or "").lower()

    def test_missing_incident_id_fails(self):
        """Empty input (no incident_id) → validation failure before any tool call."""
        from app.graphs.postmortem_draft_graph import build_postmortem_draft_graph

        mock_gw = MockGatewayClient()
        graph = build_postmortem_draft_graph()

        with patch("app.graphs.postmortem_draft_graph.GatewayClient", return_value=mock_gw):
            result = _run(graph.ainvoke({
                "run_id": "gr_test_04",
                "agent_id": "sofiia",
                "workspace_id": "ws1",
                "user_id": "u1",
                "input": {},
            }))

        assert result["graph_status"] == "failed"
        assert "incident_id" in (result.get("validation_error") or "").lower()

    def test_gateway_error_on_followup_nonfatal(self):
        """If follow-up append fails, graph still succeeds."""
        from app.graphs.postmortem_draft_graph import build_postmortem_draft_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("oncall_tool", "incident_get", _INCIDENT_WITH_TRIAGE)
        mock_gw.register("oncall_tool", "incident_attach_artifact", {"artifact": {"path": "t", "sha256": "x"}})
        mock_gw.register("oncall_tool", "incident_append_event", None, error="gateway timeout")

        graph = build_postmortem_draft_graph()

        with patch("app.graphs.postmortem_draft_graph.GatewayClient", return_value=mock_gw):
            result = _run(graph.ainvoke({
                "run_id": "gr_test_05",
                "agent_id": "sofiia",
                "workspace_id": "ws1",
                "user_id": "u1",
                "input": {"incident_id": "inc_20260223_1000_abc123"},
            }))

        assert result["graph_status"] == "succeeded"
        # followups may be 0 due to error, but graph still completed
        assert result["result"]["followups_count"] == 0

    def test_correlation_ids_present(self):
        """Every recorded gateway call must carry the graph's run_id."""
        from app.graphs.postmortem_draft_graph import build_postmortem_draft_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("oncall_tool", "incident_get", _INCIDENT_WITH_TRIAGE)
        mock_gw.register("oncall_tool", "incident_attach_artifact", {"artifact": {}})
        mock_gw.register("oncall_tool", "incident_append_event", {"event": {}})

        graph = build_postmortem_draft_graph()

        with patch("app.graphs.postmortem_draft_graph.GatewayClient", return_value=mock_gw):
            _run(graph.ainvoke({
                "run_id": "gr_corr_01",
                "agent_id": "sofiia",
                "workspace_id": "ws1",
                "user_id": "u1",
                "input": {"incident_id": "inc_20260223_1000_abc123"},
            }))

        # All calls should have graph_run_id
        for call in mock_gw.calls:
            assert call["graph_run_id"] == "gr_corr_01"
|
||||
225
services/sofiia-supervisor/tests/test_release_check_graph.py
Normal file
225
services/sofiia-supervisor/tests/test_release_check_graph.py
Normal file
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
Tests for release_check_graph.
|
||||
|
||||
Mocks the GatewayClient — no real network calls.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from tests.conftest import MockGatewayClient, _run
|
||||
|
||||
|
||||
# Canned report returned by the job orchestrator when every release gate passes.
RELEASE_CHECK_PASS_REPORT = {
    "pass": True,
    "gates": [
        {"name": "pr_review", "status": "pass"},
        {"name": "config_lint", "status": "pass"},
        {"name": "dependency_scan", "status": "pass"},
        {"name": "contract_diff", "status": "pass"},
    ],
    "recommendations": [],
    "summary": "All gates passed.",
    "elapsed_ms": 1200,
}


# Canned report for a failed release check (pr_review gate fails).
RELEASE_CHECK_FAIL_REPORT = {
    "pass": False,
    "gates": [
        {"name": "pr_review", "status": "fail"},
        {"name": "config_lint", "status": "pass"},
    ],
    "recommendations": ["Fix PR review issues before release."],
    "summary": "PR review failed.",
    "elapsed_ms": 800,
}
|
||||
|
||||
|
||||
class TestReleaseCheckGraphSuccess:
    """release_check_graph: job starts → job succeeds → returns pass=True."""

    def test_async_job_flow(self):
        """start_task returns job_id, then get_job returns succeeded."""
        from app.graphs.release_check_graph import build_release_check_graph

        mock_gw = MockGatewayClient()
        # start_task: returns a job that needs polling
        mock_gw.register("job_orchestrator_tool", "start_task", {
            "job_id": "j_test_001", "status": "running"
        })
        # First poll: still running
        # NOTE: registration order matters — the mock is assumed to serve
        # get_job responses in FIFO order across repeated calls.
        mock_gw.register("job_orchestrator_tool", "get_job", {"status": "running"})
        # Second poll: succeeded with result
        mock_gw.register("job_orchestrator_tool", "get_job", {
            "status": "succeeded",
            "result": RELEASE_CHECK_PASS_REPORT,
        })

        compiled = build_release_check_graph()
        initial_state = {
            "run_id": "gr_test_release_001",
            "agent_id": "sofiia",
            "workspace_id": "daarion",
            "user_id": "u_001",
            "input": {
                "service_name": "router",
                "fail_fast": True,
                "run_deps": True,
                "run_drift": True,
            },
        }

        with patch("app.graphs.release_check_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke(initial_state))

        assert final["graph_status"] == "succeeded"
        assert final["result"]["pass"] is True
        assert final["result"]["summary"] == "All gates passed."

    def test_synchronous_job_completion(self):
        """start_task returns result immediately (no polling needed)."""
        from app.graphs.release_check_graph import build_release_check_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("job_orchestrator_tool", "start_task", {
            "job_id": "j_sync_001",
            "status": "succeeded",
            "result": RELEASE_CHECK_PASS_REPORT,
        })

        compiled = build_release_check_graph()
        with patch("app.graphs.release_check_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_sync_001",
                "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u_001",
                "input": {"service_name": "router"},
            }))

        assert final["graph_status"] == "succeeded"
        assert final["result"]["pass"] is True
        # Only one call made (no polling)
        tool_calls = [c for c in mock_gw.calls if c["tool"] == "job_orchestrator_tool"]
        assert len(tool_calls) == 1
|
||||
class TestReleaseCheckGraphFail:
    """release_check_graph: job fails → pass=False with error."""

    def test_job_fails(self):
        """get_job returns failed → result.pass=False."""
        from app.graphs.release_check_graph import build_release_check_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("job_orchestrator_tool", "start_task", {
            "job_id": "j_fail_001", "status": "running"
        })
        mock_gw.register("job_orchestrator_tool", "get_job", {
            "status": "failed",
            "error": "PR review failed",
            "result": RELEASE_CHECK_FAIL_REPORT,
        })

        compiled = build_release_check_graph()
        with patch("app.graphs.release_check_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_fail_001",
                "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u_001",
                "input": {"service_name": "router"},
            }))

        assert final["graph_status"] == "failed"

    def test_start_task_gateway_error(self):
        """Gateway returns error on start_task → graph fails gracefully."""
        from app.graphs.release_check_graph import build_release_check_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("job_orchestrator_tool", "start_task",
                         None, error="RBAC denied: tools.jobs.run not found")

        compiled = build_release_check_graph()
        with patch("app.graphs.release_check_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_err_001",
                "agent_id": "nobody", "workspace_id": "w", "user_id": "u",
                "input": {},
            }))

        assert final["graph_status"] == "failed"
        assert "start_task failed" in (final.get("error") or "")

    def test_finalize_produces_valid_report(self):
        """Even on failure, finalize returns a valid report structure."""
        from app.graphs.release_check_graph import build_release_check_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("job_orchestrator_tool", "start_task",
                         None, error="timeout")

        compiled = build_release_check_graph()
        with patch("app.graphs.release_check_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_fin_001",
                "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u",
                "input": {},
            }))

        # The finalize node must always leave a well-formed report in state,
        # even when the job never started.
        result = final.get("result")
        assert result is not None
        assert "pass" in result
        assert "summary" in result
|
||||
|
||||
# ─── Correlation IDs test ─────────────────────────────────────────────────────
|
||||
|
||||
class TestCorrelationIds:
    """Every tool call must carry graph_run_id in metadata."""

    def test_all_calls_have_run_id(self):
        """The run_id passed into ainvoke must propagate to each gateway call."""
        from app.graphs.release_check_graph import build_release_check_graph

        run_id = "gr_correlation_test_001"
        gateway = MockGatewayClient()
        start_response = {
            "job_id": "j_corr_001",
            "status": "succeeded",
            "result": RELEASE_CHECK_PASS_REPORT,
        }
        gateway.register("job_orchestrator_tool", "start_task", start_response)

        state = {
            "run_id": run_id,
            "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u",
            "input": {"service_name": "router"},
        }
        graph = build_release_check_graph()
        with patch("app.graphs.release_check_graph.GatewayClient", return_value=gateway):
            _run(graph.ainvoke(state))

        for call in gateway.calls:
            assert call["graph_run_id"] == run_id, (
                f"Call {call['tool']}:{call['action']} missing graph_run_id"
            )

    def test_graph_node_included_in_calls(self):
        """Each call should have a non-empty graph_node."""
        from app.graphs.release_check_graph import build_release_check_graph

        gateway = MockGatewayClient()
        gateway.register("job_orchestrator_tool", "start_task", {
            "job_id": "j_node_001",
            "status": "succeeded",
            "result": RELEASE_CHECK_PASS_REPORT,
        })

        graph = build_release_check_graph()
        with patch("app.graphs.release_check_graph.GatewayClient", return_value=gateway):
            _run(graph.ainvoke({
                "run_id": "gr_node_001",
                "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u",
                "input": {},
            }))

        for call in gateway.calls:
            assert call["graph_node"], f"Call missing graph_node: {call}"
|
||||
91
services/sofiia-supervisor/tests/test_state_backend.py
Normal file
91
services/sofiia-supervisor/tests/test_state_backend.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Tests for in-memory state backend (Redis tested in integration)."""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from tests.conftest import _run
|
||||
from app.models import EventType, RunEvent, RunRecord, RunStatus
|
||||
|
||||
|
||||
def _make_run(run_id: str = "gr_test_001") -> RunRecord:
    """Build a minimal queued RunRecord for the release_check graph."""
    fields = {
        "run_id": run_id,
        "graph": "release_check",
        "status": RunStatus.QUEUED,
        "agent_id": "sofiia",
        "workspace_id": "daarion",
        "user_id": "u_001",
    }
    return RunRecord(**fields)
|
||||
|
||||
|
||||
class TestMemoryBackend:
    """Unit tests for the in-memory state backend (save/get/cancel/events)."""

    def test_save_and_get_run(self):
        """A saved run can be fetched back with the same id and status."""
        from app.state_backend import MemoryStateBackend
        backend = MemoryStateBackend()
        run = _make_run("gr_001")
        _run(backend.save_run(run))
        fetched = _run(backend.get_run("gr_001"))
        assert fetched is not None
        assert fetched.run_id == "gr_001"
        assert fetched.status == RunStatus.QUEUED

    def test_get_missing_run_returns_none(self):
        """Unknown run_id → None, not an exception."""
        from app.state_backend import MemoryStateBackend
        backend = MemoryStateBackend()
        assert _run(backend.get_run("does_not_exist")) is None

    def test_append_and_get_events(self):
        """Events append in order and round-trip with their metadata details."""
        from app.state_backend import MemoryStateBackend
        backend = MemoryStateBackend()
        run = _make_run("gr_002")
        _run(backend.save_run(run))

        ev1 = RunEvent(ts="2026-01-01T00:00:00Z", type=EventType.NODE_START, node="start_job")
        ev2 = RunEvent(ts="2026-01-01T00:00:01Z", type=EventType.TOOL_CALL, tool="job_orchestrator_tool",
                       details={"hash": "abc123", "size": 200})
        _run(backend.append_event("gr_002", ev1))
        _run(backend.append_event("gr_002", ev2))

        events = _run(backend.get_events("gr_002"))
        assert len(events) == 2
        assert events[0].type == EventType.NODE_START
        assert events[1].tool == "job_orchestrator_tool"
        # details carry only payload *metadata* (hash/size), which must survive
        # the round-trip through the backend
        assert "size" in events[1].details

    def test_cancel_queued_run(self):
        """Cancelling a queued run succeeds and flips status to CANCELLED."""
        from app.state_backend import MemoryStateBackend
        backend = MemoryStateBackend()
        run = _make_run("gr_003")
        _run(backend.save_run(run))

        ok = _run(backend.cancel_run("gr_003"))
        assert ok is True
        fetched = _run(backend.get_run("gr_003"))
        assert fetched.status == RunStatus.CANCELLED

    def test_cancel_completed_run_returns_false(self):
        """A terminal (SUCCEEDED) run cannot be cancelled."""
        from app.state_backend import MemoryStateBackend
        backend = MemoryStateBackend()
        run = _make_run("gr_004")
        run.status = RunStatus.SUCCEEDED
        _run(backend.save_run(run))

        ok = _run(backend.cancel_run("gr_004"))
        assert ok is False

    def test_update_run_status(self):
        """Re-saving a run overwrites the stored record (status + timestamps)."""
        from app.state_backend import MemoryStateBackend
        backend = MemoryStateBackend()
        run = _make_run("gr_005")
        _run(backend.save_run(run))

        run.status = RunStatus.RUNNING
        run.started_at = "2026-01-01T00:00:00Z"
        _run(backend.save_run(run))

        fetched = _run(backend.get_run("gr_005"))
        assert fetched.status == RunStatus.RUNNING
        assert fetched.started_at == "2026-01-01T00:00:00Z"
||||
Reference in New Issue
Block a user