New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
256 lines
9.3 KiB
Python
256 lines
9.3 KiB
Python
"""
|
|
Tests for slo_context_node in incident_triage_graph.
|
|
Verifies SLO violations are detected, enrich triage, and non-fatal on error.
|
|
"""
|
|
import asyncio
import json
import os
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
|
|
|
|
# Make the supervisor package importable regardless of where pytest is invoked.
ROOT_SUPERVISOR = Path(__file__).resolve().parents[1]
_root_str = str(ROOT_SUPERVISOR)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)
|
|
|
|
|
|
class MockGatewayResult:
    """Lightweight stand-in for the gateway's tool-call result object.

    Exposes the three attributes the code under test reads: ``success``,
    ``data`` and ``error_message``.
    """

    def __init__(self, success, data=None, error_message=None):
        # One tuple assignment keeps the constructor to a single statement.
        self.success, self.data, self.error_message = success, data, error_message
|
|
|
|
|
|
class MockGatewayClient:
    """Configurable mock for GatewayClient that routes by tool+action."""

    def __init__(self, overrides=None):
        # Map of "tool.action" -> canned result; falls back to an empty map.
        self.overrides = overrides or {}
        # Every call_tool invocation is recorded here for later inspection.
        self.calls = []

    async def call_tool(self, tool, action, params=None, **kwargs):
        """Record the call, then return the configured override or a generic OK result."""
        self.calls.append({"tool": tool, "action": action, "params": params})
        try:
            return self.overrides[f"{tool}.{action}"]
        except KeyError:
            # No override configured: emulate a successful, empty gateway reply.
            return MockGatewayResult(True, {"status": "ok", "lines": [], "results": []})

    async def __aenter__(self):
        # Supports `async with` the same way the real client does.
        return self

    async def __aexit__(self, *args):
        return None
|
|
|
|
|
|
class TestSLOContextNode:
    """Tests for the slo_context_node in isolation.

    GatewayClient is patched inside the graph module so the node talks to a
    mock; the mock's ``observability_tool.slo_snapshot`` override drives each
    scenario (violations, clean metrics, gateway error, raised exception).
    """

    # Time window shared by every test's state dict.
    _TIME_RANGE = {"from": "2025-01-01T00:00:00+00:00", "to": "2025-01-01T01:00:00+00:00"}

    def _run(self, coro):
        # Tests are synchronous; drive the async node to completion here.
        return asyncio.run(coro)

    def _make_state(self, run_id, **extra):
        """Return a minimal running-graph state dict for slo_context_node.

        ``extra`` lets individual tests add fields (agent_id, etc.) without
        repeating the common ones.
        """
        state = {
            "run_id": run_id,
            "service": "gateway",
            "env": "prod",
            "time_range": dict(self._TIME_RANGE),
            "graph_status": "running",
        }
        state.update(extra)
        return state

    def _snapshot_result(self, metrics, thresholds, violations):
        """Build a successful observability_tool.slo_snapshot gateway result."""
        return MockGatewayResult(True, {
            "service": "gateway",
            "window_minutes": 60,
            "metrics": metrics,
            "thresholds": thresholds,
            "violations": violations,
            "skipped": False,
        })

    def test_slo_violations_detected(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": self._snapshot_result(
                metrics={"latency_p95_ms": 450, "error_rate_pct": 2.5, "req_rate_rps": 100},
                thresholds={"latency_p95_ms": 300, "error_rate_pct": 1.0},
                violations=["latency_p95", "error_rate"],
            ),
        })
        state = self._make_state(
            "test_run_1", agent_id="sofiia", workspace_id="default", user_id="test"
        )

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(state))

        slo_data = result.get("slo_context_data", {})
        assert not slo_data.get("skipped", False)
        assert "latency_p95" in slo_data["violations"]
        assert "error_rate" in slo_data["violations"]
        assert slo_data["metrics"]["latency_p95_ms"] == 450

    def test_slo_no_violations(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": self._snapshot_result(
                metrics={"latency_p95_ms": 150, "error_rate_pct": 0.3},
                thresholds={"latency_p95_ms": 300, "error_rate_pct": 1.0},
                violations=[],
            ),
        })
        state = self._make_state("test_run_2")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(state))

        slo_data = result.get("slo_context_data", {})
        assert slo_data["violations"] == []
        assert not slo_data.get("skipped")

    def test_slo_gateway_error_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": MockGatewayResult(False, error_message="timeout"),
        })
        state = self._make_state("test_run_3")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(state))

        slo_data = result.get("slo_context_data", {})
        assert slo_data["skipped"] is True
        # A gateway error must not fail the whole graph run.
        assert result.get("graph_status") == "running"

    def test_slo_exception_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node

        class FailingGW:
            async def __aenter__(self):
                return self

            async def __aexit__(self, *a):
                pass

            # Accept positional args too, so the stub matches any call style
            # the node uses (a **kwargs-only signature would raise TypeError
            # on positional calls instead of the intended ConnectionError).
            async def call_tool(self, *args, **kwargs):
                raise ConnectionError("connection refused")

        state = self._make_state("test_run_4")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=FailingGW()):
            result = self._run(slo_context_node(state))

        assert result.get("slo_context_data", {}).get("skipped") is True
        assert result.get("graph_status") == "running"
|
|
|
|
|
|
class TestTriageReportWithSLO:
    """Tests that build_triage_report_node includes SLO context properly."""

    def _run(self, coro):
        # Tests are synchronous; drive the async node to completion here.
        return asyncio.run(coro)

    def _base_state(self, service, symptom, slo_context_data, health_status="healthy"):
        """Return a full pre-report graph state with the given SLO context.

        Only the fields the individual tests vary are parameters; everything
        else is the common fixture every report test needs.
        """
        return {
            "service": service,
            "symptom": symptom,
            "time_range": {"from": "2025-01-01T00:00:00", "to": "2025-01-01T01:00:00"},
            "env": "prod",
            "graph_status": "running",
            "service_overview_data": {},
            "top_errors_data": {},
            "log_samples": [],
            "health_data": {"status": health_status},
            "runbook_snippets": [],
            "trace_data": None,
            "slo_context_data": slo_context_data,
            "privacy_context_data": {"skipped": True},
            "cost_context_data": {"skipped": True},
        }

    def test_slo_violations_appear_in_root_causes(self):
        from app.graphs.incident_triage_graph import build_triage_report_node
        state = self._base_state(
            service="gateway",
            symptom="high latency",
            health_status="degraded",
            slo_context_data={
                "violations": ["latency_p95", "error_rate"],
                "metrics": {"latency_p95_ms": 500, "error_rate_pct": 3.0},
                "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
                "skipped": False,
            },
        )

        result = self._run(build_triage_report_node(state))
        report = result["result"]
        assert result["graph_status"] == "succeeded"

        # Serialize so the substring check covers nested cause structures.
        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" in causes_text

        assert "slo" in report["context"]
        assert report["context"]["slo"]["violations"] == ["latency_p95", "error_rate"]

        assert any("SLO breach" in c for c in report["next_checks"])

    def test_slo_skipped_does_not_add_causes(self):
        from app.graphs.incident_triage_graph import build_triage_report_node
        state = self._base_state(
            service="gateway",
            symptom="slow",
            slo_context_data={"skipped": True, "reason": "no metrics"},
        )

        result = self._run(build_triage_report_node(state))
        report = result["result"]
        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" not in causes_text
        assert report["context"]["slo"]["skipped"] is True

    def test_slo_in_impact_assessment(self):
        from app.graphs.incident_triage_graph import build_triage_report_node
        state = self._base_state(
            service="router",
            symptom="errors spike",
            slo_context_data={
                "violations": ["error_rate"],
                "metrics": {"error_rate_pct": 5.0},
                "thresholds": {"error_rate_pct": 0.5},
                "skipped": False,
            },
        )

        result = self._run(build_triage_report_node(state))
        assert "SLO breached" in result["result"]["impact_assessment"]
|
|
|
|
|
|
import json
|