# microdao-daarion/services/sofiia-supervisor/tests/test_incident_triage_graph.py

"""
Tests for incident_triage_graph.

Mocks the GatewayClient.
"""

import asyncio
import sys
from pathlib import Path
from unittest.mock import patch

import pytest

sys.path.insert(0, str(Path(__file__).parent.parent))
from tests.conftest import MockGatewayClient, _run


_OVERVIEW_DATA = {
    "status": "ok",
    "alerts": [{"name": "HighErrorRate", "severity": "warning"}],
    "slo": {"error_rate": "2.1%", "error_budget_consumed": "42%"},
    "metrics": {"request_rate": "120/s", "p99_latency_ms": 890},
}

_LOGS_DATA = {
    "lines": [
        "2026-02-23T10:00:01Z ERROR router: connection refused to db host",
        "2026-02-23T10:00:02Z ERROR router: timeout after 30s waiting for upstream",
        "2026-02-23T10:00:03Z WARN  router: retry 2/3 on POST /v1/agents/sofiia/infer",
    ],
    "total": 3,
}

_HEALTH_DATA = {
    "status": "degraded",
    "details": "DB connection pool exhausted",
    "checks": {"db": "fail", "redis": "ok", "nats": "ok"},
}

_KB_DATA = {
    "results": [
        {
            "path": "docs/runbooks/router-db-exhausted.md",
            "lines": "L1-L30",
            "content": "## DB Pool Exhaustion\n- Increase pool size in DB_POOL_SIZE env\n- Check for long-running transactions\n- Restart service if needed",
        }
    ]
}


class TestIncidentTriageGraph:
    """End-to-end runs of incident_triage_graph against a mocked gateway."""

    @staticmethod
    def _triage(gateway, state):
        """Compile the graph and invoke it once with GatewayClient patched.

        ``GatewayClient`` inside ``app.graphs.incident_triage_graph`` is
        patched so that constructing it yields ``gateway``. Returns the value
        produced by ``_run(compiled.ainvoke(state))``.
        """
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        graph = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=gateway):
            return _run(graph.ainvoke(state))

    def test_full_triage(self):
        gateway = MockGatewayClient()
        for tool, action, payload in (
            ("observability_tool", "service_overview", _OVERVIEW_DATA),
            ("observability_tool", "logs_query", _LOGS_DATA),
            ("oncall_tool", "service_health", _HEALTH_DATA),
            ("kb_tool", "search", _KB_DATA),
        ):
            gateway.register(tool, action, payload)
        # No trace response is registered: include_traces=False below is
        # expected to skip the trace lookup.

        outcome = self._triage(gateway, {
            "run_id": "gr_triage_001",
            "agent_id": "sofiia",
            "workspace_id": "daarion",
            "user_id": "u_001",
            "input": {
                "service": "router",
                "symptom": "high error rate and slow responses",
                "env": "prod",
                "include_traces": False,
                "max_log_lines": 50,
            },
        })

        assert outcome["graph_status"] == "succeeded"
        report = outcome["result"]

        # Every top-level section of the triage report must be present.
        for section in (
            "summary",
            "suspected_root_causes",
            "impact_assessment",
            "mitigations_now",
            "next_checks",
            "references",
        ):
            assert section in report

        # At least one root cause, each with rank / cause / evidence.
        root_causes = report["suspected_root_causes"]
        assert len(root_causes) >= 1
        for entry in root_causes:
            for key in ("rank", "cause", "evidence"):
                assert key in entry

        references = report["references"]

        # Log samples are carried into the references section.
        assert len(references["log_samples"]) > 0

        # The single KB hit shows up as the only runbook snippet.
        snippets = references["runbook_snippets"]
        assert len(snippets) == 1
        assert "router-db-exhausted" in snippets[0]["path"]

    def test_with_traces_enabled(self):
        """With include_traces=True the result references carry trace data."""
        # The only log line embeds a trace_id.
        traced_logs = {
            "lines": [
                "2026-02-23T10:00:01Z ERROR router: trace_id=abcdef1234567890 connection refused",
            ]
        }
        trace_hits = {
            "traces": [{"trace_id": "abcdef1234567890", "duration_ms": 1250, "status": "error"}]
        }

        gateway = MockGatewayClient()
        gateway.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        gateway.register("observability_tool", "logs_query", traced_logs)
        gateway.register("oncall_tool", "service_health", _HEALTH_DATA)
        gateway.register("kb_tool", "search", _KB_DATA)
        gateway.register("observability_tool", "traces_query", trace_hits)

        outcome = self._triage(gateway, {
            "run_id": "gr_trace_001",
            "agent_id": "sofiia",
            "workspace_id": "daarion",
            "user_id": "u",
            "input": {
                "service": "router",
                "symptom": "errors",
                "include_traces": True,
            },
        })

        assert outcome["graph_status"] == "succeeded"
        # A "traces" entry must be present in the references.
        assert "traces" in outcome["result"]["references"]

    def test_invalid_service_fails_gracefully(self):
        """An empty service name ends the run with graph_status=failed."""
        gateway = MockGatewayClient()

        outcome = self._triage(gateway, {
            "run_id": "gr_invalid_001",
            "agent_id": "sofiia",
            "workspace_id": "d",
            "user_id": "u",
            "input": {"service": "", "symptom": "something"},
        })

        assert outcome["graph_status"] == "failed"
        # None of the recorded calls may target observability_tool.
        assert all(call["tool"] != "observability_tool" for call in gateway.calls)

    def test_observability_failure_is_non_fatal(self):
        """Errors from observability_tool still leave a succeeded run with a summary."""
        gateway = MockGatewayClient()
        # Both observability actions answer with an error instead of data.
        gateway.register(
            "observability_tool", "service_overview", None, error="observability tool timeout"
        )
        gateway.register("observability_tool", "logs_query", None, error="logs unavailable")
        gateway.register("oncall_tool", "service_health", _HEALTH_DATA)
        gateway.register("kb_tool", "search", _KB_DATA)

        outcome = self._triage(gateway, {
            "run_id": "gr_partial_001",
            "agent_id": "sofiia",
            "workspace_id": "d",
            "user_id": "u",
            "input": {"service": "router", "symptom": "slow"},
        })

        # Degraded mode: the run still succeeds and yields a summary.
        assert outcome["graph_status"] == "succeeded"
        assert "summary" in outcome["result"]

    def test_secret_redaction_in_logs(self):
        """Secret values found in log lines must not appear in the log samples."""
        leaky_logs = {
            "lines": [
                "2026-02-23T10:00:01Z ERROR svc: token=sk-supersecretkey123 auth failed",
                "2026-02-23T10:00:02Z INFO  svc: api_key=abc12345 request failed",
            ]
        }

        gateway = MockGatewayClient()
        gateway.register("observability_tool", "service_overview", {})
        gateway.register("observability_tool", "logs_query", leaky_logs)
        gateway.register("oncall_tool", "service_health", {"status": "ok"})
        gateway.register("kb_tool", "search", {"results": []})

        outcome = self._triage(gateway, {
            "run_id": "gr_secret_001",
            "agent_id": "sofiia",
            "workspace_id": "d",
            "user_id": "u",
            "input": {"service": "svc", "symptom": "auth issues"},
        })

        joined = " ".join(outcome["result"]["references"]["log_samples"])
        for secret in ("sk-supersecretkey123", "abc12345"):
            assert secret not in joined
        # The redaction marker must be present instead.
        assert "***" in joined


class TestTimeWindowLimit:
    """Checks _clamp_time_range with max_hours=24 on oversized, valid and missing windows."""

    @staticmethod
    def _span_seconds(window):
        """Return the seconds between ``window["from"]`` and ``window["to"]``.

        Both values are parsed as ISO-8601 strings. A "Z" is rewritten to
        "+00:00" before parsing.
        """
        from datetime import datetime

        start, end = (
            datetime.fromisoformat(window[key].replace("Z", "+00:00"))
            for key in ("from", "to")
        )
        return (end - start).total_seconds()

    def test_time_window_clamped_to_24h(self):
        from datetime import datetime, timedelta, timezone

        from app.graphs.incident_triage_graph import _clamp_time_range

        # Ask for 48 hours ending now; the result may span at most 24 hours.
        end = datetime.now(timezone.utc)
        start = end - timedelta(hours=48)
        requested = {"from": start.isoformat(), "to": end.isoformat()}

        clamped = _clamp_time_range(requested, max_hours=24)

        assert self._span_seconds(clamped) <= 24 * 3600 + 1  # 1s tolerance

    def test_valid_window_unchanged(self):
        from datetime import datetime, timedelta, timezone

        from app.graphs.incident_triage_graph import _clamp_time_range

        end = datetime.now(timezone.utc)
        start = end - timedelta(hours=1)
        requested = {"from": start.isoformat(), "to": end.isoformat()}

        clamped = _clamp_time_range(requested, max_hours=24)

        # A one-hour request should come back roughly one hour wide.
        assert 3500 < self._span_seconds(clamped) < 3700

    def test_no_time_range_gets_default(self):
        from app.graphs.incident_triage_graph import _clamp_time_range

        window = _clamp_time_range(None, max_hours=24)
        for bound in ("from", "to"):
            assert bound in window


class TestCorrelationIds:
    """All tool calls in incident_triage must contain graph_run_id."""

    def test_all_calls_carry_run_id(self):
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        run_id = "gr_triage_corr_001"
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("observability_tool", "logs_query", _LOGS_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)
        # Register governance context tools
        mock_gw.register("data_governance_tool", "scan_audit", {
            "pass": True, "findings": [], "stats": {"errors": 0, "warnings": 0}, "recommendations": [],
        })
        mock_gw.register("cost_analyzer_tool", "anomalies", {"anomalies": [], "anomaly_count": 0})

        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            _run(compiled.ainvoke({
                "run_id": run_id,
                "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
                "input": {"service": "router", "symptom": "errors"},
            }))

        for call in mock_gw.calls:
            assert call["graph_run_id"] == run_id, (
                f"Call {call['tool']}:{call['action']} missing graph_run_id={run_id}"
            )


class TestPrivacyCostContext:
    """Covers the privacy and cost entries under ``context`` in the triage result."""

    @staticmethod
    def _register_core_tools(gateway):
        """Register the four baseline responses (overview, logs, health, KB) on ``gateway``."""
        for tool, action, payload in (
            ("observability_tool", "service_overview", _OVERVIEW_DATA),
            ("observability_tool", "logs_query", _LOGS_DATA),
            ("oncall_tool", "service_health", _HEALTH_DATA),
            ("kb_tool", "search", _KB_DATA),
        ):
            gateway.register(tool, action, payload)

    def test_incident_triage_includes_privacy_and_cost_context(self):
        """A full triage exposes context.privacy and context.cost and reflects them in root causes."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        # Audit scan response with two findings (one warning, one error).
        audit_scan = {
            "pass": True,
            "summary": "2 audit findings",
            "stats": {"errors": 1, "warnings": 1, "infos": 0},
            "findings": [
                {
                    "id": "DG-AUD-101",
                    "severity": "warning",
                    "title": "PII in audit meta",
                    "category": "audit",
                    "evidence": {"details": "user***@***.com"},
                    "recommended_fix": "Use opaque IDs",
                },
                {
                    "id": "DG-AUD-102",
                    "severity": "error",
                    "title": "Large output detected",
                    "category": "audit",
                    "evidence": {"details": "out_size=200000"},
                    "recommended_fix": "Enforce max_bytes_out",
                },
            ],
            "recommendations": ["Use opaque identifiers"],
        }
        # Cost anomalies response with a single spike on observability_tool.
        cost_report = {
            "anomalies": [
                {
                    "type": "cost_spike",
                    "tool": "observability_tool",
                    "ratio": 5.2,
                    "window_calls": 200,
                    "baseline_calls": 10,
                    "recommendation": "Reduce polling frequency.",
                }
            ],
            "anomaly_count": 1,
        }

        gateway = MockGatewayClient()
        self._register_core_tools(gateway)
        gateway.register("data_governance_tool", "scan_audit", audit_scan)
        gateway.register("cost_analyzer_tool", "anomalies", cost_report)

        state = {
            "run_id": "gr_ctx_test_001",
            "agent_id": "sofiia",
            "workspace_id": "ws",
            "user_id": "u",
            "input": {"service": "router", "symptom": "errors + cost spike"},
        }
        graph = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=gateway):
            outcome = _run(graph.ainvoke(state))

        assert outcome["graph_status"] == "succeeded"
        report = outcome["result"]

        # The context block must be present with both sub-sections.
        assert "context" in report
        privacy_ctx = report["context"]["privacy"]
        cost_ctx = report["context"]["cost"]

        assert privacy_ctx["findings_count"] == 2
        assert not privacy_ctx["skipped"]

        assert cost_ctx["anomaly_count"] == 1
        assert not cost_ctx["skipped"]
        assert len(cost_ctx["anomalies"]) == 1
        assert cost_ctx["anomalies"][0]["tool"] == "observability_tool"

        root_causes = report["suspected_root_causes"]

        # The cost spike should be reflected somewhere in the root causes.
        flattened = " ".join(map(str, root_causes))
        assert "observability_tool" in flattened or "spike" in flattened.lower()

        # So should the privacy/governance error.
        lowered = [str(cause).lower() for cause in root_causes]
        assert any("privacy" in text or "governance" in text for text in lowered)

    def test_incident_triage_context_nonfatal_on_gateway_error(self):
        """Gateway errors for the privacy and cost tools leave the triage succeeded."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        gateway = MockGatewayClient()
        self._register_core_tools(gateway)
        # Both context tools answer with an error instead of data.
        gateway.register("data_governance_tool", "scan_audit", None, error="gateway timeout")
        gateway.register("cost_analyzer_tool", "anomalies", None, error="rate limit exceeded")

        state = {
            "run_id": "gr_ctx_fail_001",
            "agent_id": "sofiia",
            "workspace_id": "ws",
            "user_id": "u",
            "input": {"service": "router", "symptom": "errors"},
        }
        graph = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=gateway):
            outcome = _run(graph.ainvoke(state))

        # The run must still be reported as succeeded.
        assert outcome["graph_status"] == "succeeded"
        report = outcome["result"]

        # Both context sections are present and flagged as skipped.
        assert "context" in report
        context = report["context"]
        assert context["privacy"]["skipped"] is True
        assert context["cost"]["skipped"] is True

        # Core triage output is unaffected.
        for section in ("summary", "suspected_root_causes"):
            assert section in report