""" Tests for slo_context_node in incident_triage_graph. Verifies SLO violations are detected, enrich triage, and non-fatal on error. """ import asyncio import os import sys from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest ROOT_SUPERVISOR = Path(__file__).resolve().parent.parent if str(ROOT_SUPERVISOR) not in sys.path: sys.path.insert(0, str(ROOT_SUPERVISOR)) class MockGatewayResult: def __init__(self, success, data=None, error_message=None): self.success = success self.data = data self.error_message = error_message class MockGatewayClient: """Configurable mock for GatewayClient that routes by tool+action.""" def __init__(self, overrides=None): self.overrides = overrides or {} self.calls = [] async def call_tool(self, tool, action, params=None, **kwargs): self.calls.append({"tool": tool, "action": action, "params": params}) key = f"{tool}.{action}" if key in self.overrides: return self.overrides[key] return MockGatewayResult(True, {"status": "ok", "lines": [], "results": []}) async def __aenter__(self): return self async def __aexit__(self, *args): pass class TestSLOContextNode: """Tests for the slo_context_node in isolation.""" def _run(self, coro): return asyncio.run(coro) def test_slo_violations_detected(self): from app.graphs.incident_triage_graph import slo_context_node mock_gw = MockGatewayClient(overrides={ "observability_tool.slo_snapshot": MockGatewayResult(True, { "service": "gateway", "window_minutes": 60, "metrics": {"latency_p95_ms": 450, "error_rate_pct": 2.5, "req_rate_rps": 100}, "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0}, "violations": ["latency_p95", "error_rate"], "skipped": False, }), }) state = { "run_id": "test_run_1", "service": "gateway", "env": "prod", "time_range": {"from": "2025-01-01T00:00:00+00:00", "to": "2025-01-01T01:00:00+00:00"}, "agent_id": "sofiia", "workspace_id": "default", "user_id": "test", "graph_status": "running", } with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw): result = self._run(slo_context_node(state)) slo_data = result.get("slo_context_data", {}) assert not slo_data.get("skipped", False) assert "latency_p95" in slo_data["violations"] assert "error_rate" in slo_data["violations"] assert slo_data["metrics"]["latency_p95_ms"] == 450 def test_slo_no_violations(self): from app.graphs.incident_triage_graph import slo_context_node mock_gw = MockGatewayClient(overrides={ "observability_tool.slo_snapshot": MockGatewayResult(True, { "service": "gateway", "window_minutes": 60, "metrics": {"latency_p95_ms": 150, "error_rate_pct": 0.3}, "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0}, "violations": [], "skipped": False, }), }) state = { "run_id": "test_run_2", "service": "gateway", "env": "prod", "time_range": {"from": "2025-01-01T00:00:00+00:00", "to": "2025-01-01T01:00:00+00:00"}, "graph_status": "running", } with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw): result = self._run(slo_context_node(state)) slo_data = result.get("slo_context_data", {}) assert slo_data["violations"] == [] assert not slo_data.get("skipped") def test_slo_gateway_error_nonfatal(self): from app.graphs.incident_triage_graph import slo_context_node mock_gw = MockGatewayClient(overrides={ "observability_tool.slo_snapshot": MockGatewayResult(False, error_message="timeout"), }) state = { "run_id": "test_run_3", "service": "gateway", "env": "prod", "time_range": {"from": "2025-01-01T00:00:00+00:00", "to": "2025-01-01T01:00:00+00:00"}, "graph_status": "running", } with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw): result = self._run(slo_context_node(state)) slo_data = result.get("slo_context_data", {}) assert slo_data["skipped"] is True assert result.get("graph_status") == "running" def test_slo_exception_nonfatal(self): from app.graphs.incident_triage_graph import slo_context_node class FailingGW: async def __aenter__(self): return self async def __aexit__(self, *a): pass async def call_tool(self, **kwargs): raise ConnectionError("connection refused") state = { "run_id": "test_run_4", "service": "gateway", "env": "prod", "time_range": {"from": "2025-01-01T00:00:00+00:00", "to": "2025-01-01T01:00:00+00:00"}, "graph_status": "running", } with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=FailingGW()): result = self._run(slo_context_node(state)) assert result.get("slo_context_data", {}).get("skipped") is True assert result.get("graph_status") == "running" class TestTriageReportWithSLO: """Tests that build_triage_report_node includes SLO context properly.""" def _run(self, coro): return asyncio.run(coro) def test_slo_violations_appear_in_root_causes(self): from app.graphs.incident_triage_graph import build_triage_report_node state = { "service": "gateway", "symptom": "high latency", "time_range": {"from": "2025-01-01T00:00:00", "to": "2025-01-01T01:00:00"}, "env": "prod", "graph_status": "running", "service_overview_data": {}, "top_errors_data": {}, "log_samples": [], "health_data": {"status": "degraded"}, "runbook_snippets": [], "trace_data": None, "slo_context_data": { "violations": ["latency_p95", "error_rate"], "metrics": {"latency_p95_ms": 500, "error_rate_pct": 3.0}, "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0}, "skipped": False, }, "privacy_context_data": {"skipped": True}, "cost_context_data": {"skipped": True}, } result = self._run(build_triage_report_node(state)) report = result["result"] assert result["graph_status"] == "succeeded" causes_text = json.dumps(report["suspected_root_causes"]) assert "SLO violations" in causes_text assert "slo" in report["context"] assert report["context"]["slo"]["violations"] == ["latency_p95", "error_rate"] assert any("SLO breach" in c for c in report["next_checks"]) def test_slo_skipped_does_not_add_causes(self): from app.graphs.incident_triage_graph import build_triage_report_node state = { "service": "gateway", "symptom": "slow", "time_range": {"from": "2025-01-01T00:00:00", "to": "2025-01-01T01:00:00"}, "env": "prod", "graph_status": "running", "service_overview_data": {}, "top_errors_data": {}, "log_samples": [], "health_data": {"status": "healthy"}, "runbook_snippets": [], "trace_data": None, "slo_context_data": {"skipped": True, "reason": "no metrics"}, "privacy_context_data": {"skipped": True}, "cost_context_data": {"skipped": True}, } result = self._run(build_triage_report_node(state)) report = result["result"] causes_text = json.dumps(report["suspected_root_causes"]) assert "SLO violations" not in causes_text assert report["context"]["slo"]["skipped"] is True def test_slo_in_impact_assessment(self): from app.graphs.incident_triage_graph import build_triage_report_node state = { "service": "router", "symptom": "errors spike", "time_range": {"from": "2025-01-01T00:00:00", "to": "2025-01-01T01:00:00"}, "env": "prod", "graph_status": "running", "service_overview_data": {}, "top_errors_data": {}, "log_samples": [], "health_data": {"status": "healthy"}, "runbook_snippets": [], "trace_data": None, "slo_context_data": { "violations": ["error_rate"], "metrics": {"error_rate_pct": 5.0}, "thresholds": {"error_rate_pct": 0.5}, "skipped": False, }, "privacy_context_data": {"skipped": True}, "cost_context_data": {"skipped": True}, } result = self._run(build_triage_report_node(state)) assert "SLO breached" in result["result"]["impact_assessment"] import json