New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
256 lines
9.3 KiB
Python
256 lines
9.3 KiB
Python
"""
|
|
Tests for slo_context_node in incident_triage_graph.
|
|
Verifies SLO violations are detected, enrich triage, and non-fatal on error.
|
|
"""
|
|
import asyncio
import json
import os
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
|
|
|
|
# Make the supervisor package importable regardless of where pytest is invoked.
ROOT_SUPERVISOR = Path(__file__).resolve().parents[1]
_root_str = str(ROOT_SUPERVISOR)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)
|
|
|
|
|
|
class MockGatewayResult:
    """Lightweight stand-in for the gateway's tool-call result object.

    Exposes the three attributes the code under test reads: ``success``,
    ``data`` and ``error_message``.
    """

    def __init__(self, success, data=None, error_message=None):
        # One tuple assignment keeps the constructor to a single statement.
        self.success, self.data, self.error_message = success, data, error_message
|
|
|
|
|
|
class MockGatewayClient:
    """Configurable mock for GatewayClient that routes by tool+action."""

    def __init__(self, overrides=None):
        # Map of "tool.action" -> canned result; falls back to an empty map.
        self.overrides = overrides or {}
        # Every call_tool invocation is recorded here for later inspection.
        self.calls = []

    async def call_tool(self, tool, action, params=None, **kwargs):
        """Record the call, then return the configured override or a generic OK result."""
        self.calls.append({"tool": tool, "action": action, "params": params})
        try:
            return self.overrides[f"{tool}.{action}"]
        except KeyError:
            # No override configured: emulate a successful, empty gateway reply.
            return MockGatewayResult(True, {"status": "ok", "lines": [], "results": []})

    async def __aenter__(self):
        # Supports `async with` the same way the real client does.
        return self

    async def __aexit__(self, *args):
        return None
|
|
|
|
|
|
class TestSLOContextNode:
    """Tests for the slo_context_node in isolation.

    GatewayClient is patched inside the graph module so the node talks to a
    mock; the mock's ``observability_tool.slo_snapshot`` override drives each
    scenario (violations, clean metrics, gateway error, raised exception).
    """

    # Time window shared by every test's state dict.
    _TIME_RANGE = {"from": "2025-01-01T00:00:00+00:00", "to": "2025-01-01T01:00:00+00:00"}

    def _run(self, coro):
        # Tests are synchronous; drive the async node to completion here.
        return asyncio.run(coro)

    def _make_state(self, run_id, **extra):
        """Return a minimal running-graph state dict for slo_context_node.

        ``extra`` lets individual tests add fields (agent_id, etc.) without
        repeating the common ones.
        """
        state = {
            "run_id": run_id,
            "service": "gateway",
            "env": "prod",
            "time_range": dict(self._TIME_RANGE),
            "graph_status": "running",
        }
        state.update(extra)
        return state

    def _snapshot_result(self, metrics, thresholds, violations):
        """Build a successful observability_tool.slo_snapshot gateway result."""
        return MockGatewayResult(True, {
            "service": "gateway",
            "window_minutes": 60,
            "metrics": metrics,
            "thresholds": thresholds,
            "violations": violations,
            "skipped": False,
        })

    def test_slo_violations_detected(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": self._snapshot_result(
                metrics={"latency_p95_ms": 450, "error_rate_pct": 2.5, "req_rate_rps": 100},
                thresholds={"latency_p95_ms": 300, "error_rate_pct": 1.0},
                violations=["latency_p95", "error_rate"],
            ),
        })
        state = self._make_state(
            "test_run_1", agent_id="sofiia", workspace_id="default", user_id="test"
        )

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(state))

        slo_data = result.get("slo_context_data", {})
        assert not slo_data.get("skipped", False)
        assert "latency_p95" in slo_data["violations"]
        assert "error_rate" in slo_data["violations"]
        assert slo_data["metrics"]["latency_p95_ms"] == 450

    def test_slo_no_violations(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": self._snapshot_result(
                metrics={"latency_p95_ms": 150, "error_rate_pct": 0.3},
                thresholds={"latency_p95_ms": 300, "error_rate_pct": 1.0},
                violations=[],
            ),
        })
        state = self._make_state("test_run_2")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(state))

        slo_data = result.get("slo_context_data", {})
        assert slo_data["violations"] == []
        assert not slo_data.get("skipped")

    def test_slo_gateway_error_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node
        mock_gw = MockGatewayClient(overrides={
            "observability_tool.slo_snapshot": MockGatewayResult(False, error_message="timeout"),
        })
        state = self._make_state("test_run_3")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            result = self._run(slo_context_node(state))

        slo_data = result.get("slo_context_data", {})
        assert slo_data["skipped"] is True
        # A gateway error must not fail the whole graph run.
        assert result.get("graph_status") == "running"

    def test_slo_exception_nonfatal(self):
        from app.graphs.incident_triage_graph import slo_context_node

        class FailingGW:
            async def __aenter__(self):
                return self

            async def __aexit__(self, *a):
                pass

            # Accept positional args too, so the stub matches any call style
            # the node uses (a **kwargs-only signature would raise TypeError
            # on positional calls instead of the intended ConnectionError).
            async def call_tool(self, *args, **kwargs):
                raise ConnectionError("connection refused")

        state = self._make_state("test_run_4")

        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=FailingGW()):
            result = self._run(slo_context_node(state))

        assert result.get("slo_context_data", {}).get("skipped") is True
        assert result.get("graph_status") == "running"
|
|
|
|
|
|
class TestTriageReportWithSLO:
    """Tests that build_triage_report_node includes SLO context properly."""

    def _run(self, coro):
        # Tests are synchronous; drive the async node to completion here.
        return asyncio.run(coro)

    def _base_state(self, service, symptom, slo_context_data, health_status="healthy"):
        """Return a full pre-report graph state with the given SLO context.

        Only the fields the individual tests vary are parameters; everything
        else is the common fixture every report test needs.
        """
        return {
            "service": service,
            "symptom": symptom,
            "time_range": {"from": "2025-01-01T00:00:00", "to": "2025-01-01T01:00:00"},
            "env": "prod",
            "graph_status": "running",
            "service_overview_data": {},
            "top_errors_data": {},
            "log_samples": [],
            "health_data": {"status": health_status},
            "runbook_snippets": [],
            "trace_data": None,
            "slo_context_data": slo_context_data,
            "privacy_context_data": {"skipped": True},
            "cost_context_data": {"skipped": True},
        }

    def test_slo_violations_appear_in_root_causes(self):
        from app.graphs.incident_triage_graph import build_triage_report_node
        state = self._base_state(
            service="gateway",
            symptom="high latency",
            health_status="degraded",
            slo_context_data={
                "violations": ["latency_p95", "error_rate"],
                "metrics": {"latency_p95_ms": 500, "error_rate_pct": 3.0},
                "thresholds": {"latency_p95_ms": 300, "error_rate_pct": 1.0},
                "skipped": False,
            },
        )

        result = self._run(build_triage_report_node(state))
        report = result["result"]
        assert result["graph_status"] == "succeeded"

        # Serialize so the substring check covers nested cause structures.
        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" in causes_text

        assert "slo" in report["context"]
        assert report["context"]["slo"]["violations"] == ["latency_p95", "error_rate"]

        assert any("SLO breach" in c for c in report["next_checks"])

    def test_slo_skipped_does_not_add_causes(self):
        from app.graphs.incident_triage_graph import build_triage_report_node
        state = self._base_state(
            service="gateway",
            symptom="slow",
            slo_context_data={"skipped": True, "reason": "no metrics"},
        )

        result = self._run(build_triage_report_node(state))
        report = result["result"]
        causes_text = json.dumps(report["suspected_root_causes"])
        assert "SLO violations" not in causes_text
        assert report["context"]["slo"]["skipped"] is True

    def test_slo_in_impact_assessment(self):
        from app.graphs.incident_triage_graph import build_triage_report_node
        state = self._base_state(
            service="router",
            symptom="errors spike",
            slo_context_data={
                "violations": ["error_rate"],
                "metrics": {"error_rate_pct": 5.0},
                "thresholds": {"error_rate_pct": 0.5},
                "skipped": False,
            },
        )

        result = self._run(build_triage_report_node(state))
        assert "SLO breached" in result["result"]["impact_assessment"]
|
|
|
|
|
|
import json
|