Files
microdao-daarion/services/sofiia-supervisor/tests/test_incident_triage_slo_context.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

256 lines
9.3 KiB
Python

"""
Tests for slo_context_node in incident_triage_graph.
Verifies SLO violations are detected, enrich triage, and non-fatal on error.
"""
import asyncio
import json
import os
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
# Make the supervisor service root (the parent of tests/) importable so the
# tests can do `from app.graphs... import ...` without installing the package.
ROOT_SUPERVISOR = Path(__file__).resolve().parent.parent
if str(ROOT_SUPERVISOR) not in sys.path:
    sys.path.insert(0, str(ROOT_SUPERVISOR))
class MockGatewayResult:
    """Minimal stand-in for a GatewayClient tool-call result.

    Carries only the three attributes the triage nodes inspect:
    ``success``, ``data`` and ``error_message``.
    """

    def __init__(self, success, data=None, error_message=None):
        self.success, self.data, self.error_message = success, data, error_message
class MockGatewayClient:
"""Configurable mock for GatewayClient that routes by tool+action."""
def __init__(self, overrides=None):
self.overrides = overrides or {}
self.calls = []
async def call_tool(self, tool, action, params=None, **kwargs):
self.calls.append({"tool": tool, "action": action, "params": params})
key = f"{tool}.{action}"
if key in self.overrides:
return self.overrides[key]
return MockGatewayResult(True, {"status": "ok", "lines": [], "results": []})
async def __aenter__(self):
return self
async def __aexit__(self, *args):
pass
class TestSLOContextNode:
    """Tests for the slo_context_node in isolation."""

    def _run(self, coro):
        # Drive the async node to completion on a fresh event loop.
        return asyncio.run(coro)

    def _state(self, run_id, **extra):
        """Minimal graph state accepted by slo_context_node, plus any extras."""
        base = {
            "run_id": run_id,
            "service": "gateway",
            "env": "prod",
            "time_range": {
                "from": "2025-01-01T00:00:00+00:00",
                "to": "2025-01-01T01:00:00+00:00",
            },
            "graph_status": "running",
        }
        base.update(extra)
        return base

    @staticmethod
    def _gateway_with_snapshot(snapshot_result):
        """Mock gateway whose slo_snapshot action yields the given result."""
        return MockGatewayClient(
            overrides={"observability_tool.slo_snapshot": snapshot_result}
        )

    def test_slo_violations_detected(self):
        from app.graphs.incident_triage_graph import slo_context_node

        snapshot = MockGatewayResult(True, {
            "service": "gateway",
            "window_minutes": 60,
            "metrics": {"latency_p95_ms": 450, "error_rate_pct": 2.5, "req_rate_rps": 100},
            "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
            "violations": ["latency_p95", "error_rate"],
            "skipped": False,
        })
        mock_gw = self._gateway_with_snapshot(snapshot)
        state = self._state(
            "test_run_1",
            agent_id="sofiia",
            workspace_id="default",
            user_id="test",
        )
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(state))
        slo_data = result.get("slo_context_data", {})
        assert not slo_data.get("skipped", False)
        assert "latency_p95" in slo_data["violations"]
        assert "error_rate" in slo_data["violations"]
        assert slo_data["metrics"]["latency_p95_ms"] == 450

    def test_slo_no_violations(self):
        from app.graphs.incident_triage_graph import slo_context_node

        snapshot = MockGatewayResult(True, {
            "service": "gateway",
            "window_minutes": 60,
            "metrics": {"latency_p95_ms": 150, "error_rate_pct": 0.3},
            "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
            "violations": [],
            "skipped": False,
        })
        mock_gw = self._gateway_with_snapshot(snapshot)
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(self._state("test_run_2")))
        slo_data = result.get("slo_context_data", {})
        assert slo_data["violations"] == []
        assert not slo_data.get("skipped")

    def test_slo_gateway_error_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node

        failure = MockGatewayResult(False, error_message="timeout")
        mock_gw = self._gateway_with_snapshot(failure)
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(self._state("test_run_3")))
        slo_data = result.get("slo_context_data", {})
        # A failed gateway call marks the SLO context skipped but must not
        # abort the graph run.
        assert slo_data["skipped"] is True
        assert result.get("graph_status") == "running"

    def test_slo_exception_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node

        class FailingGW:
            """Gateway double whose every tool call raises a network error."""

            async def __aenter__(self):
                return self

            async def __aexit__(self, *a):
                pass

            async def call_tool(self, **kwargs):
                raise ConnectionError("connection refused")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=FailingGW()):
            result = self._run(slo_context_node(self._state("test_run_4")))
        # Even an exception inside the gateway leaves the graph running.
        assert result.get("slo_context_data", {}).get("skipped") is True
        assert result.get("graph_status") == "running"
class TestTriageReportWithSLO:
    """Tests that build_triage_report_node includes SLO context properly."""

    def _run(self, coro):
        # Drive the async node to completion on a fresh event loop.
        return asyncio.run(coro)

    @staticmethod
    def _report_state(service, symptom, health_status, slo_context):
        """Full report-node state with empty evidence and the given SLO block."""
        return {
            "service": service,
            "symptom": symptom,
            "time_range": {"from": "2025-01-01T00:00:00", "to": "2025-01-01T01:00:00"},
            "env": "prod",
            "graph_status": "running",
            "service_overview_data": {},
            "top_errors_data": {},
            "log_samples": [],
            "health_data": {"status": health_status},
            "runbook_snippets": [],
            "trace_data": None,
            "slo_context_data": slo_context,
            "privacy_context_data": {"skipped": True},
            "cost_context_data": {"skipped": True},
        }

    def test_slo_violations_appear_in_root_causes(self):
        from app.graphs.incident_triage_graph import build_triage_report_node

        state = self._report_state(
            "gateway",
            "high latency",
            "degraded",
            {
                "violations": ["latency_p95", "error_rate"],
                "metrics": {"latency_p95_ms": 500, "error_rate_pct": 3.0},
                "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
                "skipped": False,
            },
        )
        result = self._run(build_triage_report_node(state))
        report = result["result"]
        assert result["graph_status"] == "succeeded"
        # Serialize the causes so the substring check covers nested structures.
        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" in causes_text
        assert "slo" in report["context"]
        assert report["context"]["slo"]["violations"] == ["latency_p95", "error_rate"]
        assert any("SLO breach" in c for c in report["next_checks"])

    def test_slo_skipped_does_not_add_causes(self):
        from app.graphs.incident_triage_graph import build_triage_report_node

        state = self._report_state(
            "gateway", "slow", "healthy", {"skipped": True, "reason": "no metrics"}
        )
        result = self._run(build_triage_report_node(state))
        report = result["result"]
        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" not in causes_text
        assert report["context"]["slo"]["skipped"] is True

    def test_slo_in_impact_assessment(self):
        from app.graphs.incident_triage_graph import build_triage_report_node

        state = self._report_state(
            "router",
            "errors spike",
            "healthy",
            {
                "violations": ["error_rate"],
                "metrics": {"error_rate_pct": 5.0},
                "thresholds": {"error_rate_pct": 0.5},
                "skipped": False,
            },
        )
        result = self._run(build_triage_report_node(state))
        assert "SLO breached" in result["result"]["impact_assessment"]
import json