"""Tests for incident_triage_graph.

Each graph-level test runs the compiled graph with ``GatewayClient`` patched
to return a ``MockGatewayClient``.
"""

import asyncio
import datetime
import sys
from pathlib import Path
from unittest.mock import patch

import pytest

# Put the parent of this file's directory on sys.path before the project
# imports below are resolved.
sys.path.insert(0, str(Path(__file__).parent.parent))

from tests.conftest import MockGatewayClient, _run

# Dotted path of the GatewayClient name that every graph-level test patches.
_GATEWAY_PATCH_TARGET = "app.graphs.incident_triage_graph.GatewayClient"

_OVERVIEW_DATA = {
    "status": "ok",
    "alerts": [{"name": "HighErrorRate", "severity": "warning"}],
    "slo": {"error_rate": "2.1%", "error_budget_consumed": "42%"},
    "metrics": {"request_rate": "120/s", "p99_latency_ms": 890},
}

_LOGS_DATA = {
    "lines": [
        "2026-02-23T10:00:01Z ERROR router: connection refused to db host",
        "2026-02-23T10:00:02Z ERROR router: timeout after 30s waiting for upstream",
        "2026-02-23T10:00:03Z WARN router: retry 2/3 on POST /v1/agents/sofiia/infer",
    ],
    "total": 3,
}

_HEALTH_DATA = {
    "status": "degraded",
    "details": "DB connection pool exhausted",
    "checks": {"db": "fail", "redis": "ok", "nats": "ok"},
}

_KB_DATA = {
    "results": [
        {
            "path": "docs/runbooks/router-db-exhausted.md",
            "lines": "L1-L30",
            "content": "## DB Pool Exhaustion\n- Increase pool size in DB_POOL_SIZE env\n- Check for long-running transactions\n- Restart service if needed",
        }
    ]
}


def _baseline_gateway(logs=None):
    """Return a MockGatewayClient with the four core triage tools registered.

    Args:
        logs: Payload registered for ``observability_tool``/``logs_query``.
            Defaults to ``_LOGS_DATA`` when ``None``.
    """
    mock_gw = MockGatewayClient()
    mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
    mock_gw.register(
        "observability_tool", "logs_query", _LOGS_DATA if logs is None else logs
    )
    mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
    mock_gw.register("kb_tool", "search", _KB_DATA)
    return mock_gw


def _invoke_triage(mock_gw, state):
    """Build the triage graph and run it on ``state`` with ``mock_gw`` patched in.

    The graph module is imported lazily so that an import failure surfaces
    inside the calling test instead of at collection time.

    Returns:
        Whatever ``_run(compiled.ainvoke(state))`` returns (the final state).
    """
    from app.graphs.incident_triage_graph import build_incident_triage_graph

    # The graph is compiled outside the patch; only the invocation runs
    # with GatewayClient replaced.
    compiled = build_incident_triage_graph()
    with patch(_GATEWAY_PATCH_TARGET, return_value=mock_gw):
        return _run(compiled.ainvoke(state))


def _window_seconds(time_range):
    """Return the number of seconds between ``time_range['from']`` and ``['to']``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing because
    ``datetime.fromisoformat`` does not accept it on older Python versions.
    """
    start = datetime.datetime.fromisoformat(time_range["from"].replace("Z", "+00:00"))
    end = datetime.datetime.fromisoformat(time_range["to"].replace("Z", "+00:00"))
    return (end - start).total_seconds()


class TestIncidentTriageGraph:
    """Full happy-path test for incident_triage_graph."""

    def test_full_triage(self):
        """A fully mocked run succeeds and returns every required result field."""
        # trace_lookup is skipped (include_traces=False)
        mock_gw = _baseline_gateway()

        final = _invoke_triage(mock_gw, {
            "run_id": "gr_triage_001",
            "agent_id": "sofiia",
            "workspace_id": "daarion",
            "user_id": "u_001",
            "input": {
                "service": "router",
                "symptom": "high error rate and slow responses",
                "env": "prod",
                "include_traces": False,
                "max_log_lines": 50,
            },
        })

        assert final["graph_status"] == "succeeded"
        result = final["result"]

        # Required fields
        for field in (
            "summary",
            "suspected_root_causes",
            "impact_assessment",
            "mitigations_now",
            "next_checks",
            "references",
        ):
            assert field in result

        # Root causes derived from health=degraded and alert
        causes = result["suspected_root_causes"]
        assert len(causes) >= 1
        assert all("rank" in c and "cause" in c and "evidence" in c for c in causes)

        # Log samples in references (redacted)
        ref_logs = result["references"]["log_samples"]
        assert len(ref_logs) > 0

        # Runbook snippets in references
        runbooks = result["references"]["runbook_snippets"]
        assert len(runbooks) == 1
        assert "router-db-exhausted" in runbooks[0]["path"]

    def test_with_traces_enabled(self):
        """When include_traces=True, trace_lookup node runs."""
        # Include a trace_id in logs
        logs_with_trace = {
            "lines": [
                "2026-02-23T10:00:01Z ERROR router: trace_id=abcdef1234567890 connection refused",
            ]
        }
        mock_gw = _baseline_gateway(logs=logs_with_trace)
        mock_gw.register("observability_tool", "traces_query", {
            "traces": [{"trace_id": "abcdef1234567890", "duration_ms": 1250, "status": "error"}]
        })

        final = _invoke_triage(mock_gw, {
            "run_id": "gr_trace_001",
            "agent_id": "sofiia",
            "workspace_id": "daarion",
            "user_id": "u",
            "input": {
                "service": "router",
                "symptom": "errors",
                "include_traces": True,
            },
        })

        assert final["graph_status"] == "succeeded"
        # Trace data should be in references
        assert "traces" in final["result"]["references"]

    def test_invalid_service_fails_gracefully(self):
        """Empty service → validation error → graph_status=failed."""
        mock_gw = MockGatewayClient()

        final = _invoke_triage(mock_gw, {
            "run_id": "gr_invalid_001",
            "agent_id": "sofiia",
            "workspace_id": "d",
            "user_id": "u",
            "input": {"service": "", "symptom": "something"},
        })

        assert final["graph_status"] == "failed"
        # No observability calls should have been made
        assert not any(c["tool"] == "observability_tool" for c in mock_gw.calls)

    def test_observability_failure_is_non_fatal(self):
        """If observability_tool fails, triage continues with partial data."""
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview",
                         None, error="observability tool timeout")
        mock_gw.register("observability_tool", "logs_query",
                         None, error="logs unavailable")
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)

        final = _invoke_triage(mock_gw, {
            "run_id": "gr_partial_001",
            "agent_id": "sofiia",
            "workspace_id": "d",
            "user_id": "u",
            "input": {"service": "router", "symptom": "slow"},
        })

        # Should still produce a result (degraded mode)
        assert final["graph_status"] == "succeeded"
        assert "summary" in final["result"]

    def test_secret_redaction_in_logs(self):
        """Log lines containing secrets should be redacted in output."""
        secret_logs = {
            "lines": [
                "2026-02-23T10:00:01Z ERROR svc: token=sk-supersecretkey123 auth failed",
                "2026-02-23T10:00:02Z INFO svc: api_key=abc12345 request failed",
            ]
        }
        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", {})
        mock_gw.register("observability_tool", "logs_query", secret_logs)
        mock_gw.register("oncall_tool", "service_health", {"status": "ok"})
        mock_gw.register("kb_tool", "search", {"results": []})

        final = _invoke_triage(mock_gw, {
            "run_id": "gr_secret_001",
            "agent_id": "sofiia",
            "workspace_id": "d",
            "user_id": "u",
            "input": {"service": "svc", "symptom": "auth issues"},
        })

        log_samples = final["result"]["references"]["log_samples"]
        all_text = " ".join(log_samples)
        assert "sk-supersecretkey123" not in all_text
        assert "abc12345" not in all_text
        assert "***" in all_text


class TestTimeWindowLimit:
    """incident_triage_graph rejects or clamps time windows > 24h."""

    def test_time_window_clamped_to_24h(self):
        """A 48h request comes back no wider than 24h."""
        from app.graphs.incident_triage_graph import _clamp_time_range

        # 48h window → should be clamped to 24h
        now = datetime.datetime.now(datetime.timezone.utc)
        from_48h = (now - datetime.timedelta(hours=48)).isoformat()
        to_now = now.isoformat()

        clamped = _clamp_time_range({"from": from_48h, "to": to_now}, max_hours=24)

        assert _window_seconds(clamped) <= 24 * 3600 + 1  # 1s tolerance

    def test_valid_window_unchanged(self):
        """A 1h request keeps its ~1h width."""
        from app.graphs.incident_triage_graph import _clamp_time_range

        now = datetime.datetime.now(datetime.timezone.utc)
        from_1h = (now - datetime.timedelta(hours=1)).isoformat()

        clamped = _clamp_time_range({"from": from_1h, "to": now.isoformat()}, max_hours=24)

        assert 3500 < _window_seconds(clamped) < 3700  # ~1h

    def test_no_time_range_gets_default(self):
        """Passing None yields a range with both 'from' and 'to' keys."""
        from app.graphs.incident_triage_graph import _clamp_time_range

        result = _clamp_time_range(None, max_hours=24)
        assert "from" in result and "to" in result


class TestCorrelationIds:
    """All tool calls in incident_triage must contain graph_run_id."""

    def test_all_calls_carry_run_id(self):
        """Every recorded gateway call carries the run's graph_run_id."""
        run_id = "gr_triage_corr_001"
        mock_gw = _baseline_gateway()
        # Register governance context tools
        mock_gw.register("data_governance_tool", "scan_audit", {
            "pass": True, "findings": [], "stats": {"errors": 0, "warnings": 0}, "recommendations": [],
        })
        mock_gw.register("cost_analyzer_tool", "anomalies", {"anomalies": [], "anomaly_count": 0})

        _invoke_triage(mock_gw, {
            "run_id": run_id,
            "agent_id": "sofiia",
            "workspace_id": "d",
            "user_id": "u",
            "input": {"service": "router", "symptom": "errors"},
        })

        # Without this guard the loop below would pass vacuously if the graph
        # made no gateway calls at all.
        assert mock_gw.calls, "triage made no gateway calls; nothing was verified"

        for call in mock_gw.calls:
            # .get() so an absent key fails the assertion with the message
            # below instead of raising KeyError before it can be shown.
            assert call.get("graph_run_id") == run_id, (
                f"Call {call['tool']}:{call['action']} missing graph_run_id={run_id}"
            )


class TestPrivacyCostContext:
    """Tests for privacy_context and cost_context nodes."""

    def test_incident_triage_includes_privacy_and_cost_context(self):
        """Full triage should include context.privacy and context.cost in result."""
        mock_gw = _baseline_gateway()

        # Privacy context: 2 findings
        mock_gw.register("data_governance_tool", "scan_audit", {
            "pass": True,
            "summary": "2 audit findings",
            "stats": {"errors": 1, "warnings": 1, "infos": 0},
            "findings": [
                {"id": "DG-AUD-101", "severity": "warning",
                 "title": "PII in audit meta", "category": "audit",
                 "evidence": {"details": "user***@***.com"},
                 "recommended_fix": "Use opaque IDs"},
                {"id": "DG-AUD-102", "severity": "error",
                 "title": "Large output detected", "category": "audit",
                 "evidence": {"details": "out_size=200000"},
                 "recommended_fix": "Enforce max_bytes_out"},
            ],
            "recommendations": ["Use opaque identifiers"],
        })

        # Cost context: one spike
        mock_gw.register("cost_analyzer_tool", "anomalies", {
            "anomalies": [{
                "type": "cost_spike",
                "tool": "observability_tool",
                "ratio": 5.2,
                "window_calls": 200,
                "baseline_calls": 10,
                "recommendation": "Reduce polling frequency.",
            }],
            "anomaly_count": 1,
        })

        final = _invoke_triage(mock_gw, {
            "run_id": "gr_ctx_test_001",
            "agent_id": "sofiia",
            "workspace_id": "ws",
            "user_id": "u",
            "input": {"service": "router", "symptom": "errors + cost spike"},
        })

        assert final["graph_status"] == "succeeded"
        result = final["result"]

        # context block must exist
        assert "context" in result
        privacy = result["context"]["privacy"]
        cost = result["context"]["cost"]

        assert privacy["findings_count"] == 2
        assert not privacy["skipped"]

        assert cost["anomaly_count"] == 1
        assert not cost["skipped"]
        assert len(cost["anomalies"]) == 1
        assert cost["anomalies"][0]["tool"] == "observability_tool"

        # Cost spike should enrich root_causes
        causes_text = " ".join(str(c) for c in result["suspected_root_causes"])
        assert "observability_tool" in causes_text or "spike" in causes_text.lower()

        # Privacy error should also appear in root_causes
        assert any(
            "privacy" in str(c).lower() or "governance" in str(c).lower()
            for c in result["suspected_root_causes"]
        )

    def test_incident_triage_context_nonfatal_on_gateway_error(self):
        """privacy_context and cost_context failures are non-fatal — triage still succeeds."""
        mock_gw = _baseline_gateway()

        # Both governance tools return errors
        mock_gw.register("data_governance_tool", "scan_audit",
                         None, error="gateway timeout")
        mock_gw.register("cost_analyzer_tool", "anomalies",
                         None, error="rate limit exceeded")

        final = _invoke_triage(mock_gw, {
            "run_id": "gr_ctx_fail_001",
            "agent_id": "sofiia",
            "workspace_id": "ws",
            "user_id": "u",
            "input": {"service": "router", "symptom": "errors"},
        })

        # Triage must succeed despite governance context failures
        assert final["graph_status"] == "succeeded"
        result = final["result"]

        # context block present with skipped=True
        assert "context" in result
        assert result["context"]["privacy"]["skipped"] is True
        assert result["context"]["cost"]["skipped"] is True

        # Core triage fields still present
        assert "summary" in result
        assert "suspected_root_causes" in result