New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
392 lines
16 KiB
Python
392 lines
16 KiB
Python
"""
|
|
Tests for incident_triage_graph.
|
|
|
|
Mocks the GatewayClient.
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
from tests.conftest import MockGatewayClient, _run
|
|
|
|
|
|
# --- Canned gateway responses shared by the tests below. Each constant is the
# --- payload returned by one mocked (tool, action) pair.

# observability_tool / service_overview: one firing alert plus SLO and metric
# snapshots for the "router" service.
_OVERVIEW_DATA = {
    "status": "ok",
    "alerts": [{"name": "HighErrorRate", "severity": "warning"}],
    "slo": {"error_rate": "2.1%", "error_budget_consumed": "42%"},
    "metrics": {"request_rate": "120/s", "p99_latency_ms": 890},
}

# observability_tool / logs_query: recent ERROR/WARN lines pointing at a
# database connectivity problem.
_LOGS_DATA = {
    "lines": [
        "2026-02-23T10:00:01Z ERROR router: connection refused to db host",
        "2026-02-23T10:00:02Z ERROR router: timeout after 30s waiting for upstream",
        "2026-02-23T10:00:03Z WARN router: retry 2/3 on POST /v1/agents/sofiia/infer",
    ],
    "total": 3,
}

# oncall_tool / service_health: degraded status with a failing DB check —
# the tests expect this to surface in suspected_root_causes.
_HEALTH_DATA = {
    "status": "degraded",
    "details": "DB connection pool exhausted",
    "checks": {"db": "fail", "redis": "ok", "nats": "ok"},
}

# kb_tool / search: a single matching runbook snippet for the DB-pool scenario.
_KB_DATA = {
    "results": [
        {
            "path": "docs/runbooks/router-db-exhausted.md",
            "lines": "L1-L30",
            "content": "## DB Pool Exhaustion\n- Increase pool size in DB_POOL_SIZE env\n- Check for long-running transactions\n- Restart service if needed",
        }
    ]
}
|
|
|
|
|
|
class TestIncidentTriageGraph:
    """End-to-end tests for incident_triage_graph: happy path, traces,
    validation failure, degraded mode, and secret redaction."""

    @staticmethod
    def _invoke(compiled, gateway, payload):
        """Run *compiled* with GatewayClient patched to *gateway*; return the final state."""
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=gateway):
            return _run(compiled.ainvoke(payload))

    def test_full_triage(self):
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        gateway = MockGatewayClient()
        gateway.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        gateway.register("observability_tool", "logs_query", _LOGS_DATA)
        gateway.register("oncall_tool", "service_health", _HEALTH_DATA)
        gateway.register("kb_tool", "search", _KB_DATA)
        # trace_lookup is skipped (include_traces=False)

        outcome = self._invoke(build_incident_triage_graph(), gateway, {
            "run_id": "gr_triage_001",
            "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u_001",
            "input": {
                "service": "router",
                "symptom": "high error rate and slow responses",
                "env": "prod",
                "include_traces": False,
                "max_log_lines": 50,
            },
        })

        assert outcome["graph_status"] == "succeeded"
        triage = outcome["result"]

        # Every required top-level field must be present.
        for field in ("summary", "suspected_root_causes", "impact_assessment",
                      "mitigations_now", "next_checks", "references"):
            assert field in triage

        # Root causes derived from health=degraded and the firing alert.
        causes = triage["suspected_root_causes"]
        assert len(causes) >= 1
        for cause in causes:
            assert "rank" in cause and "cause" in cause and "evidence" in cause

        # Redacted log samples must be carried into references.
        assert len(triage["references"]["log_samples"]) > 0

        # The matching runbook snippet must be carried into references.
        runbooks = triage["references"]["runbook_snippets"]
        assert len(runbooks) == 1
        assert "router-db-exhausted" in runbooks[0]["path"]

    def test_with_traces_enabled(self):
        """When include_traces=True, trace_lookup node runs."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        gateway = MockGatewayClient()
        gateway.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        # One log line carries a trace_id so trace_lookup has something to chase.
        gateway.register("observability_tool", "logs_query", {
            "lines": [
                "2026-02-23T10:00:01Z ERROR router: trace_id=abcdef1234567890 connection refused",
            ]
        })
        gateway.register("oncall_tool", "service_health", _HEALTH_DATA)
        gateway.register("kb_tool", "search", _KB_DATA)
        gateway.register("observability_tool", "traces_query", {
            "traces": [{"trace_id": "abcdef1234567890", "duration_ms": 1250, "status": "error"}]
        })

        outcome = self._invoke(build_incident_triage_graph(), gateway, {
            "run_id": "gr_trace_001",
            "agent_id": "sofiia", "workspace_id": "daarion", "user_id": "u",
            "input": {
                "service": "router",
                "symptom": "errors",
                "include_traces": True,
            },
        })

        assert outcome["graph_status"] == "succeeded"
        # Trace data should be surfaced in references.
        assert "traces" in outcome["result"]["references"]

    def test_invalid_service_fails_gracefully(self):
        """Empty service → validation error → graph_status=failed."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        gateway = MockGatewayClient()
        outcome = self._invoke(build_incident_triage_graph(), gateway, {
            "run_id": "gr_invalid_001",
            "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
            "input": {"service": "", "symptom": "something"},
        })

        assert outcome["graph_status"] == "failed"
        # Validation must short-circuit before any observability call is made.
        observability_calls = [c for c in gateway.calls if c["tool"] == "observability_tool"]
        assert not observability_calls

    def test_observability_failure_is_non_fatal(self):
        """If observability_tool fails, triage continues with partial data."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        gateway = MockGatewayClient()
        gateway.register("observability_tool", "service_overview",
                         None, error="observability tool timeout")
        gateway.register("observability_tool", "logs_query",
                         None, error="logs unavailable")
        gateway.register("oncall_tool", "service_health", _HEALTH_DATA)
        gateway.register("kb_tool", "search", _KB_DATA)

        outcome = self._invoke(build_incident_triage_graph(), gateway, {
            "run_id": "gr_partial_001",
            "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
            "input": {"service": "router", "symptom": "slow"},
        })

        # Degraded mode: a result must still be produced.
        assert outcome["graph_status"] == "succeeded"
        assert "summary" in outcome["result"]

    def test_secret_redaction_in_logs(self):
        """Log lines containing secrets should be redacted in output."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        gateway = MockGatewayClient()
        gateway.register("observability_tool", "service_overview", {})
        gateway.register("observability_tool", "logs_query", {
            "lines": [
                "2026-02-23T10:00:01Z ERROR svc: token=sk-supersecretkey123 auth failed",
                "2026-02-23T10:00:02Z INFO svc: api_key=abc12345 request failed",
            ]
        })
        gateway.register("oncall_tool", "service_health", {"status": "ok"})
        gateway.register("kb_tool", "search", {"results": []})

        outcome = self._invoke(build_incident_triage_graph(), gateway, {
            "run_id": "gr_secret_001",
            "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
            "input": {"service": "svc", "symptom": "auth issues"},
        })

        # Neither raw secret may survive; the redaction marker must appear.
        joined = " ".join(outcome["result"]["references"]["log_samples"])
        assert "sk-supersecretkey123" not in joined
        assert "abc12345" not in joined
        assert "***" in joined
|
|
|
|
|
|
class TestTimeWindowLimit:
    """incident_triage_graph rejects or clamps time windows > 24h."""

    @staticmethod
    def _window_seconds(window):
        """Width of a {'from': ..., 'to': ...} ISO-8601 window, in seconds."""
        import datetime
        # .replace handles a trailing 'Z' that fromisoformat (pre-3.11) rejects.
        start = datetime.datetime.fromisoformat(window["from"].replace("Z", "+00:00"))
        end = datetime.datetime.fromisoformat(window["to"].replace("Z", "+00:00"))
        return (end - start).total_seconds()

    def test_time_window_clamped_to_24h(self):
        from app.graphs.incident_triage_graph import _clamp_time_range
        import datetime

        # A 48h window must be narrowed to at most 24h.
        now = datetime.datetime.now(datetime.timezone.utc)
        window = {
            "from": (now - datetime.timedelta(hours=48)).isoformat(),
            "to": now.isoformat(),
        }

        clamped = _clamp_time_range(window, max_hours=24)

        assert self._window_seconds(clamped) <= 24 * 3600 + 1  # 1s tolerance

    def test_valid_window_unchanged(self):
        from app.graphs.incident_triage_graph import _clamp_time_range
        import datetime

        # A 1h window is already within the limit and must survive intact.
        now = datetime.datetime.now(datetime.timezone.utc)
        window = {
            "from": (now - datetime.timedelta(hours=1)).isoformat(),
            "to": now.isoformat(),
        }

        clamped = _clamp_time_range(window, max_hours=24)

        assert 3500 < self._window_seconds(clamped) < 3700  # ~1h

    def test_no_time_range_gets_default(self):
        from app.graphs.incident_triage_graph import _clamp_time_range

        # No window supplied → a default one is synthesized.
        defaulted = _clamp_time_range(None, max_hours=24)
        assert "from" in defaulted and "to" in defaulted
|
|
|
|
|
|
class TestCorrelationIds:
    """All tool calls in incident_triage must contain graph_run_id."""

    def test_all_calls_carry_run_id(self):
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        expected = "gr_triage_corr_001"
        gateway = MockGatewayClient()
        for tool, action, payload in (
            ("observability_tool", "service_overview", _OVERVIEW_DATA),
            ("observability_tool", "logs_query", _LOGS_DATA),
            ("oncall_tool", "service_health", _HEALTH_DATA),
            ("kb_tool", "search", _KB_DATA),
        ):
            gateway.register(tool, action, payload)
        # Governance context tools are registered too, so their calls are
        # covered by the correlation-id check as well.
        gateway.register("data_governance_tool", "scan_audit", {
            "pass": True, "findings": [], "stats": {"errors": 0, "warnings": 0}, "recommendations": [],
        })
        gateway.register("cost_analyzer_tool", "anomalies", {"anomalies": [], "anomaly_count": 0})

        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=gateway):
            _run(compiled.ainvoke({
                "run_id": expected,
                "agent_id": "sofiia", "workspace_id": "d", "user_id": "u",
                "input": {"service": "router", "symptom": "errors"},
            }))

        # Every recorded gateway call must echo the graph's run_id.
        for call in gateway.calls:
            assert call["graph_run_id"] == expected, (
                f"Call {call['tool']}:{call['action']} missing graph_run_id={expected}"
            )
|
|
|
|
|
|
class TestPrivacyCostContext:
    """Tests for privacy_context and cost_context nodes.

    These nodes enrich the triage result with a ``context`` block built from
    data_governance_tool / cost_analyzer_tool responses; both are expected to
    be best-effort (non-fatal on gateway errors).
    """

    def test_incident_triage_includes_privacy_and_cost_context(self):
        """Full triage should include context.privacy and context.cost in result."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("observability_tool", "logs_query", _LOGS_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)

        # Privacy context: 2 findings (one warning + one error) so the graph
        # has governance material to fold into the result.
        mock_gw.register("data_governance_tool", "scan_audit", {
            "pass": True,
            "summary": "2 audit findings",
            "stats": {"errors": 1, "warnings": 1, "infos": 0},
            "findings": [
                {"id": "DG-AUD-101", "severity": "warning",
                 "title": "PII in audit meta", "category": "audit",
                 "evidence": {"details": "user***@***.com"}, "recommended_fix": "Use opaque IDs"},
                {"id": "DG-AUD-102", "severity": "error",
                 "title": "Large output detected", "category": "audit",
                 "evidence": {"details": "out_size=200000"}, "recommended_fix": "Enforce max_bytes_out"},
            ],
            "recommendations": ["Use opaque identifiers"],
        })

        # Cost context: one spike on observability_tool (5.2x baseline).
        mock_gw.register("cost_analyzer_tool", "anomalies", {
            "anomalies": [{
                "type": "cost_spike",
                "tool": "observability_tool",
                "ratio": 5.2,
                "window_calls": 200,
                "baseline_calls": 10,
                "recommendation": "Reduce polling frequency.",
            }],
            "anomaly_count": 1,
        })

        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_ctx_test_001",
                "agent_id": "sofiia", "workspace_id": "ws", "user_id": "u",
                "input": {"service": "router", "symptom": "errors + cost spike"},
            }))

        assert final["graph_status"] == "succeeded"
        result = final["result"]

        # context block must exist
        assert "context" in result
        privacy = result["context"]["privacy"]
        cost = result["context"]["cost"]

        # Both contexts ran (skipped=False) and mirror the registered payloads.
        assert privacy["findings_count"] == 2
        assert not privacy["skipped"]

        assert cost["anomaly_count"] == 1
        assert not cost["skipped"]
        assert len(cost["anomalies"]) == 1
        assert cost["anomalies"][0]["tool"] == "observability_tool"

        # Cost spike should enrich root_causes.  Causes are stringified via
        # str() because their exact shape (dict vs object) is not pinned here.
        causes_text = " ".join(str(c) for c in result["suspected_root_causes"])
        assert "observability_tool" in causes_text or "spike" in causes_text.lower()

        # Privacy error should also appear in root_causes
        assert any(
            "privacy" in str(c).lower() or "governance" in str(c).lower()
            for c in result["suspected_root_causes"]
        )

    def test_incident_triage_context_nonfatal_on_gateway_error(self):
        """privacy_context and cost_context failures are non-fatal — triage still succeeds."""
        from app.graphs.incident_triage_graph import build_incident_triage_graph

        mock_gw = MockGatewayClient()
        mock_gw.register("observability_tool", "service_overview", _OVERVIEW_DATA)
        mock_gw.register("observability_tool", "logs_query", _LOGS_DATA)
        mock_gw.register("oncall_tool", "service_health", _HEALTH_DATA)
        mock_gw.register("kb_tool", "search", _KB_DATA)
        # Both governance tools return errors — the graph must degrade, not fail.
        mock_gw.register("data_governance_tool", "scan_audit",
                         None, error="gateway timeout")
        mock_gw.register("cost_analyzer_tool", "anomalies",
                         None, error="rate limit exceeded")

        compiled = build_incident_triage_graph()
        with patch("app.graphs.incident_triage_graph.GatewayClient", return_value=mock_gw):
            final = _run(compiled.ainvoke({
                "run_id": "gr_ctx_fail_001",
                "agent_id": "sofiia", "workspace_id": "ws", "user_id": "u",
                "input": {"service": "router", "symptom": "errors"},
            }))

        # Triage must succeed despite governance context failures
        assert final["graph_status"] == "succeeded"
        result = final["result"]

        # context block present with skipped=True
        assert "context" in result
        assert result["context"]["privacy"]["skipped"] is True
        assert result["context"]["cost"]["skipped"] is True

        # Core triage fields still present
        assert "summary" in result
        assert "suspected_root_causes" in result