New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
249 lines
9.4 KiB
Python
"""
|
|
tests/test_llm_hardening.py
|
|
|
|
Tests for LLM enrichment hardening guards in llm_enrichment.py:
|
|
- model not in allowlist → skip
|
|
- max_calls_per_digest enforced via call_counter
|
|
- per_day_dedupe prevents second call for same (service, env)
|
|
- all guards are non-fatal (never affect scores)
|
|
"""
|
|
import sys, os
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../services/router"))
|
|
|
|
import datetime
|
|
import unittest
|
|
from unittest.mock import patch, MagicMock
|
|
import llm_enrichment
|
|
|
|
|
|
def _policy(
|
|
mode: str = "local",
|
|
model: str = "llama3",
|
|
allowlist=None,
|
|
max_calls: int = 3,
|
|
per_day_dedupe: bool = True,
|
|
delta_warn: int = 10,
|
|
band_in=None,
|
|
) -> dict:
|
|
if allowlist is None:
|
|
allowlist = ["llama3", "qwen2.5-coder:3b", "llama3.1:8b-instruct"]
|
|
if band_in is None:
|
|
band_in = ["high", "critical"]
|
|
return {
|
|
"defaults": {
|
|
"llm_mode": mode,
|
|
"llm_max_chars_in": 3500,
|
|
"llm_max_chars_out": 800,
|
|
},
|
|
"llm_triggers": {
|
|
"risk_delta_warn": delta_warn,
|
|
"band_in": band_in,
|
|
},
|
|
"llm_local": {
|
|
"endpoint": "http://localhost:11434/api/generate",
|
|
"model": model,
|
|
"timeout_seconds": 5,
|
|
"model_allowlist": allowlist,
|
|
"max_calls_per_digest": max_calls,
|
|
"per_day_dedupe": per_day_dedupe,
|
|
},
|
|
}
|
|
|
|
|
|
def _risk_report(band: str = "high", delta: float = 15.0) -> dict:
|
|
return {"service": "gw", "env": "prod", "band": band, "score": 75,
|
|
"trend": {"delta_24h": delta}, "reasons": ["P0 incident open"]}
|
|
|
|
|
|
def _attr_report(service: str = "gw", env: str = "prod", delta: float = 15.0) -> dict:
|
|
return {
|
|
"service": service, "env": env, "delta_24h": delta, "window_hours": 24,
|
|
"causes": [{"type": "deploy", "score": 30, "confidence": "medium",
|
|
"evidence": ["deploy alerts: 2"]}],
|
|
"summary": "Likely: deploy.",
|
|
}
|
|
|
|
|
|
def _patched_llm(text: str = "LLM insight text."):
|
|
"""Return a patcher that makes _call_local_llm return the given text."""
|
|
return patch("llm_enrichment._call_local_llm", return_value=text)
|
|
|
|
|
|
class TestModelAllowlist:
    """Guard: the configured model must be on the policy allowlist."""

    def setup_method(self):
        # Fresh dedupe state so tests do not leak into one another.
        llm_enrichment._clear_dedupe_store()

    def test_model_not_in_allowlist_skips(self):
        policy = _policy(model="unknown-model", allowlist=["llama3"])
        with _patched_llm("should not appear"):
            result = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), policy
            )
        # The allowlist guard must disable enrichment; no stubbed LLM text
        # may surface in the result.
        assert result["enabled"] is False

    def test_model_in_allowlist_proceeds(self):
        policy = _policy(model="llama3", allowlist=["llama3"])
        with _patched_llm("Good insight."):
            result = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), policy
            )
        assert result["enabled"] is True
        assert result["text"] == "Good insight."

    def test_empty_allowlist_allows_any(self):
        """An empty allowlist means no restriction at all."""
        policy = _policy(model="custom-model", allowlist=[])
        with _patched_llm("text"):
            result = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), policy
            )
        assert result["enabled"] is True

    def test_is_model_allowed_true(self):
        assert llm_enrichment._is_model_allowed("a", _policy(allowlist=["a", "b"])) is True

    def test_is_model_allowed_false(self):
        assert llm_enrichment._is_model_allowed("c", _policy(allowlist=["a", "b"])) is False
|
|
|
|
|
|
class TestMaxCallsPerDigest:
    """Guard: a shared call_counter caps LLM calls within one digest run."""

    def setup_method(self):
        llm_enrichment._clear_dedupe_store()

    def test_calls_stop_at_max(self):
        policy = _policy(max_calls=2)
        counter = {"count": 0}
        with _patched_llm("insight"):
            first = llm_enrichment.maybe_enrich_attribution(
                _attr_report("svc1"), _risk_report(), policy, call_counter=counter)
        with _patched_llm("insight"):
            second = llm_enrichment.maybe_enrich_attribution(
                _attr_report("svc2", env="staging"), _risk_report(), policy,
                call_counter=counter)
        # The budget of 2 is now spent; the third request must be refused.
        with _patched_llm("should be blocked"):
            third = llm_enrichment.maybe_enrich_attribution(
                _attr_report("svc3", env="dev"), _risk_report(), policy,
                call_counter=counter)

        assert first["enabled"] is True
        assert second["enabled"] is True
        assert third["enabled"] is False
        assert "max_calls_per_digest" in third.get("skipped_reason", "")
        assert counter["count"] == 2

    def test_no_counter_allows_unlimited(self):
        # call_counter=None disables the budget entirely.
        policy = _policy(max_calls=1)
        with _patched_llm("text"):
            first = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), policy, call_counter=None)
        with _patched_llm("text"):
            second = llm_enrichment.maybe_enrich_attribution(
                _attr_report("svc2", env="staging"), _risk_report(), policy,
                call_counter=None)
        assert first["enabled"] is True
        assert second["enabled"] is True

    def test_counter_starts_at_zero(self):
        # max_calls=0 means the very first call is already over budget.
        policy = _policy(max_calls=0)
        counter = {"count": 0}
        with _patched_llm("blocked"):
            result = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), policy, call_counter=counter)
        assert result["enabled"] is False
        assert "max_calls_per_digest" in result.get("skipped_reason", "")
|
|
|
|
|
|
class TestPerDayDedupe:
    """Guard: per_day_dedupe suppresses repeat calls for the same (service, env)."""

    def setup_method(self):
        llm_enrichment._clear_dedupe_store()

    @staticmethod
    def _run(service, env, policy, stub):
        """Invoke enrichment once with the LLM stubbed to return *stub*."""
        with _patched_llm(stub):
            return llm_enrichment.maybe_enrich_attribution(
                _attr_report(service, env), _risk_report(), policy)

    def test_second_call_same_service_env_is_deduped(self):
        policy = _policy(per_day_dedupe=True)
        first = self._run("gw", "prod", policy, "first")
        second = self._run("gw", "prod", policy, "second")

        assert first["enabled"] is True
        assert second["enabled"] is False
        assert "per_day_dedupe" in second.get("skipped_reason", "")

    def test_different_service_not_deduped(self):
        policy = _policy(per_day_dedupe=True)
        assert self._run("gw", "prod", policy, "gw insight")["enabled"] is True
        assert self._run("router", "prod", policy, "router insight")["enabled"] is True

    def test_different_env_not_deduped(self):
        policy = _policy(per_day_dedupe=True)
        assert self._run("gw", "prod", policy, "prod insight")["enabled"] is True
        assert self._run("gw", "staging", policy, "staging insight")["enabled"] is True

    def test_dedupe_disabled_allows_second_call(self):
        policy = _policy(per_day_dedupe=False)
        assert self._run("gw", "prod", policy, "first")["enabled"] is True
        assert self._run("gw", "prod", policy, "second")["enabled"] is True

    def test_dedupe_does_not_affect_scores(self):
        """LLM output must never be present in risk report scoring."""
        policy = _policy(per_day_dedupe=True)
        risk_report = _risk_report()
        original_score = risk_report["score"]
        with _patched_llm("some explanation"):
            llm_enrichment.maybe_enrich_attribution(
                _attr_report(), risk_report, policy)
        # The report is untouched by enrichment.
        assert risk_report["score"] == original_score
|
|
|
|
|
|
class TestLlmModeOff:
    """Guard: llm_mode == "off" short-circuits before any LLM call."""

    def setup_method(self):
        llm_enrichment._clear_dedupe_store()

    def test_mode_off_never_calls_llm(self):
        policy = _policy(mode="off")
        with _patched_llm("should not appear") as mocked_call:
            result = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), policy)
            # The stub must never have been invoked.
            mocked_call.assert_not_called()
        assert result["enabled"] is False
        assert result["mode"] == "off"
|
|
|
|
|
|
class TestTriggersNotMet:
    """Guard: enrichment is skipped when neither band nor delta trigger fires."""

    def setup_method(self):
        llm_enrichment._clear_dedupe_store()

    def test_low_band_low_delta_no_trigger(self):
        policy = _policy(delta_warn=10, band_in=["high", "critical"])
        quiet_report = _risk_report(band="low", delta=5.0)
        with _patched_llm("should not appear") as mocked_call:
            result = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), quiet_report, policy)
            # Below both thresholds — the LLM must not even be consulted.
            mocked_call.assert_not_called()
        assert result["enabled"] is False
        assert "triggers not met" in result.get("skipped_reason", "")