""" tests/test_llm_hardening.py Tests for LLM enrichment hardening guards in llm_enrichment.py: - model not in allowlist → skip - max_calls_per_digest enforced via call_counter - per_day_dedupe prevents second call for same (service, env) - all guards are non-fatal (never affect scores) """ import sys, os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../services/router")) import datetime import unittest from unittest.mock import patch, MagicMock import llm_enrichment def _policy( mode: str = "local", model: str = "llama3", allowlist=None, max_calls: int = 3, per_day_dedupe: bool = True, delta_warn: int = 10, band_in=None, ) -> dict: if allowlist is None: allowlist = ["llama3", "qwen2.5-coder:3b", "llama3.1:8b-instruct"] if band_in is None: band_in = ["high", "critical"] return { "defaults": { "llm_mode": mode, "llm_max_chars_in": 3500, "llm_max_chars_out": 800, }, "llm_triggers": { "risk_delta_warn": delta_warn, "band_in": band_in, }, "llm_local": { "endpoint": "http://localhost:11434/api/generate", "model": model, "timeout_seconds": 5, "model_allowlist": allowlist, "max_calls_per_digest": max_calls, "per_day_dedupe": per_day_dedupe, }, } def _risk_report(band: str = "high", delta: float = 15.0) -> dict: return {"service": "gw", "env": "prod", "band": band, "score": 75, "trend": {"delta_24h": delta}, "reasons": ["P0 incident open"]} def _attr_report(service: str = "gw", env: str = "prod", delta: float = 15.0) -> dict: return { "service": service, "env": env, "delta_24h": delta, "window_hours": 24, "causes": [{"type": "deploy", "score": 30, "confidence": "medium", "evidence": ["deploy alerts: 2"]}], "summary": "Likely: deploy.", } def _patched_llm(text: str = "LLM insight text."): """Return a patcher that makes _call_local_llm return the given text.""" return patch("llm_enrichment._call_local_llm", return_value=text) class TestModelAllowlist: def setup_method(self): llm_enrichment._clear_dedupe_store() def test_model_not_in_allowlist_skips(self): policy = _policy(model="unknown-model", allowlist=["llama3"]) with _patched_llm("should not appear") as mock_llm: result = llm_enrichment.maybe_enrich_attribution( _attr_report(), _risk_report(), policy ) # _call_local_llm is called but internally checks allowlist and returns None # so enabled should be False assert result["enabled"] is False def test_model_in_allowlist_proceeds(self): policy = _policy(model="llama3", allowlist=["llama3"]) with _patched_llm("Good insight."): result = llm_enrichment.maybe_enrich_attribution( _attr_report(), _risk_report(), policy ) assert result["enabled"] is True assert result["text"] == "Good insight." def test_empty_allowlist_allows_any(self): """Empty allowlist = no restriction.""" policy = _policy(model="custom-model", allowlist=[]) with _patched_llm("text"): result = llm_enrichment.maybe_enrich_attribution( _attr_report(), _risk_report(), policy ) assert result["enabled"] is True def test_is_model_allowed_true(self): policy = _policy(allowlist=["a", "b"]) assert llm_enrichment._is_model_allowed("a", policy) is True def test_is_model_allowed_false(self): policy = _policy(allowlist=["a", "b"]) assert llm_enrichment._is_model_allowed("c", policy) is False class TestMaxCallsPerDigest: def setup_method(self): llm_enrichment._clear_dedupe_store() def test_calls_stop_at_max(self): policy = _policy(max_calls=2) counter = {"count": 0} with _patched_llm("insight"): r1 = llm_enrichment.maybe_enrich_attribution( _attr_report("svc1"), _risk_report(), policy, call_counter=counter) with _patched_llm("insight"): r2 = llm_enrichment.maybe_enrich_attribution( _attr_report("svc2", env="staging"), _risk_report(), policy, call_counter=counter) # counter should be 2 now; next call should be skipped with _patched_llm("should be blocked"): r3 = llm_enrichment.maybe_enrich_attribution( _attr_report("svc3", env="dev"), _risk_report(), policy, call_counter=counter) assert r1["enabled"] is True assert r2["enabled"] is True assert r3["enabled"] is False assert "max_calls_per_digest" in r3.get("skipped_reason", "") assert counter["count"] == 2 def test_no_counter_allows_unlimited(self): policy = _policy(max_calls=1) with _patched_llm("text"): r1 = llm_enrichment.maybe_enrich_attribution( _attr_report(), _risk_report(), policy, call_counter=None) with _patched_llm("text"): r2 = llm_enrichment.maybe_enrich_attribution( _attr_report("svc2", env="staging"), _risk_report(), policy, call_counter=None) assert r1["enabled"] is True assert r2["enabled"] is True def test_counter_starts_at_zero(self): policy = _policy(max_calls=0) counter = {"count": 0} with _patched_llm("blocked"): result = llm_enrichment.maybe_enrich_attribution( _attr_report(), _risk_report(), policy, call_counter=counter) assert result["enabled"] is False assert "max_calls_per_digest" in result.get("skipped_reason", "") class TestPerDayDedupe: def setup_method(self): llm_enrichment._clear_dedupe_store() def test_second_call_same_service_env_is_deduped(self): policy = _policy(per_day_dedupe=True) with _patched_llm("first"): r1 = llm_enrichment.maybe_enrich_attribution( _attr_report("gw", "prod"), _risk_report(), policy) with _patched_llm("second"): r2 = llm_enrichment.maybe_enrich_attribution( _attr_report("gw", "prod"), _risk_report(), policy) assert r1["enabled"] is True assert r2["enabled"] is False assert "per_day_dedupe" in r2.get("skipped_reason", "") def test_different_service_not_deduped(self): policy = _policy(per_day_dedupe=True) with _patched_llm("gw insight"): r1 = llm_enrichment.maybe_enrich_attribution( _attr_report("gw", "prod"), _risk_report(), policy) with _patched_llm("router insight"): r2 = llm_enrichment.maybe_enrich_attribution( _attr_report("router", "prod"), _risk_report(), policy) assert r1["enabled"] is True assert r2["enabled"] is True def test_different_env_not_deduped(self): policy = _policy(per_day_dedupe=True) with _patched_llm("prod insight"): r1 = llm_enrichment.maybe_enrich_attribution( _attr_report("gw", "prod"), _risk_report(), policy) with _patched_llm("staging insight"): r2 = llm_enrichment.maybe_enrich_attribution( _attr_report("gw", "staging"), _risk_report(), policy) assert r1["enabled"] is True assert r2["enabled"] is True def test_dedupe_disabled_allows_second_call(self): policy = _policy(per_day_dedupe=False) with _patched_llm("first"): r1 = llm_enrichment.maybe_enrich_attribution( _attr_report("gw", "prod"), _risk_report(), policy) with _patched_llm("second"): r2 = llm_enrichment.maybe_enrich_attribution( _attr_report("gw", "prod"), _risk_report(), policy) assert r1["enabled"] is True assert r2["enabled"] is True def test_dedupe_does_not_affect_scores(self): """LLM output must never be present in risk report scoring.""" policy = _policy(per_day_dedupe=True) risk_report = _risk_report() original_score = risk_report["score"] with _patched_llm("some explanation"): llm_enrichment.maybe_enrich_attribution( _attr_report(), risk_report, policy) # score unchanged assert risk_report["score"] == original_score class TestLlmModeOff: def setup_method(self): llm_enrichment._clear_dedupe_store() def test_mode_off_never_calls_llm(self): policy = _policy(mode="off") with _patched_llm("should not appear") as mock_llm: result = llm_enrichment.maybe_enrich_attribution( _attr_report(), _risk_report(), policy) mock_llm.assert_not_called() assert result["enabled"] is False assert result["mode"] == "off" class TestTriggersNotMet: def setup_method(self): llm_enrichment._clear_dedupe_store() def test_low_band_low_delta_no_trigger(self): policy = _policy(delta_warn=10, band_in=["high", "critical"]) risk_report = _risk_report(band="low", delta=5.0) with _patched_llm("should not appear") as mock_llm: result = llm_enrichment.maybe_enrich_attribution( _attr_report(), risk_report, policy) mock_llm.assert_not_called() assert result["enabled"] is False assert "triggers not met" in result.get("skipped_reason", "")