Files
microdao-daarion/tests/test_llm_hardening.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

249 lines
9.4 KiB
Python

"""
tests/test_llm_hardening.py
Tests for LLM enrichment hardening guards in llm_enrichment.py:
- model not in allowlist → skip
- max_calls_per_digest enforced via call_counter
- per_day_dedupe prevents second call for same (service, env)
- all guards are non-fatal (never affect scores)
"""
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../services/router"))
import datetime
import unittest
from unittest.mock import patch, MagicMock
import llm_enrichment
def _policy(
mode: str = "local",
model: str = "llama3",
allowlist=None,
max_calls: int = 3,
per_day_dedupe: bool = True,
delta_warn: int = 10,
band_in=None,
) -> dict:
if allowlist is None:
allowlist = ["llama3", "qwen2.5-coder:3b", "llama3.1:8b-instruct"]
if band_in is None:
band_in = ["high", "critical"]
return {
"defaults": {
"llm_mode": mode,
"llm_max_chars_in": 3500,
"llm_max_chars_out": 800,
},
"llm_triggers": {
"risk_delta_warn": delta_warn,
"band_in": band_in,
},
"llm_local": {
"endpoint": "http://localhost:11434/api/generate",
"model": model,
"timeout_seconds": 5,
"model_allowlist": allowlist,
"max_calls_per_digest": max_calls,
"per_day_dedupe": per_day_dedupe,
},
}
def _risk_report(band: str = "high", delta: float = 15.0) -> dict:
return {"service": "gw", "env": "prod", "band": band, "score": 75,
"trend": {"delta_24h": delta}, "reasons": ["P0 incident open"]}
def _attr_report(service: str = "gw", env: str = "prod", delta: float = 15.0) -> dict:
return {
"service": service, "env": env, "delta_24h": delta, "window_hours": 24,
"causes": [{"type": "deploy", "score": 30, "confidence": "medium",
"evidence": ["deploy alerts: 2"]}],
"summary": "Likely: deploy.",
}
def _patched_llm(text: str = "LLM insight text."):
"""Return a patcher that makes _call_local_llm return the given text."""
return patch("llm_enrichment._call_local_llm", return_value=text)
class TestModelAllowlist:
    """Guard: a configured model outside the allowlist disables enrichment."""

    def setup_method(self):
        # Reset the per-day dedupe store so each test starts clean.
        llm_enrichment._clear_dedupe_store()

    def test_model_not_in_allowlist_skips(self):
        pol = _policy(model="unknown-model", allowlist=["llama3"])
        with _patched_llm("should not appear"):
            res = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), pol
            )
        # Disallowed model -> enrichment must come back disabled
        # (guard is non-fatal, it only suppresses the LLM output).
        assert res["enabled"] is False

    def test_model_in_allowlist_proceeds(self):
        pol = _policy(model="llama3", allowlist=["llama3"])
        with _patched_llm("Good insight."):
            res = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), pol
            )
        assert res["enabled"] is True
        assert res["text"] == "Good insight."

    def test_empty_allowlist_allows_any(self):
        """Empty allowlist = no restriction."""
        pol = _policy(model="custom-model", allowlist=[])
        with _patched_llm("text"):
            res = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), pol
            )
        assert res["enabled"] is True

    def test_is_model_allowed_true(self):
        assert llm_enrichment._is_model_allowed("a", _policy(allowlist=["a", "b"])) is True

    def test_is_model_allowed_false(self):
        assert llm_enrichment._is_model_allowed("c", _policy(allowlist=["a", "b"])) is False
class TestMaxCallsPerDigest:
    """Guard: the shared call_counter caps how many LLM calls a digest makes."""

    def setup_method(self):
        # Clean dedupe state so the budget guard is the only one in play.
        llm_enrichment._clear_dedupe_store()

    def test_calls_stop_at_max(self):
        pol = _policy(max_calls=2)
        budget = {"count": 0}
        outcomes = []
        # Three distinct (service, env) pairs so dedupe never triggers;
        # only the call budget should stop the third call.
        scenarios = (
            ("svc1", "prod", "insight"),
            ("svc2", "staging", "insight"),
            ("svc3", "dev", "should be blocked"),
        )
        for svc, env, stub_text in scenarios:
            with _patched_llm(stub_text):
                outcomes.append(llm_enrichment.maybe_enrich_attribution(
                    _attr_report(svc, env=env), _risk_report(), pol,
                    call_counter=budget))
        first, second, third = outcomes
        assert first["enabled"] is True
        assert second["enabled"] is True
        assert third["enabled"] is False
        assert "max_calls_per_digest" in third.get("skipped_reason", "")
        # Counter only records calls that actually went through.
        assert budget["count"] == 2

    def test_no_counter_allows_unlimited(self):
        pol = _policy(max_calls=1)
        with _patched_llm("text"):
            first = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), pol, call_counter=None)
        with _patched_llm("text"):
            second = llm_enrichment.maybe_enrich_attribution(
                _attr_report("svc2", env="staging"), _risk_report(), pol,
                call_counter=None)
        # Without a counter the budget guard cannot engage.
        assert first["enabled"] is True
        assert second["enabled"] is True

    def test_counter_starts_at_zero(self):
        # max_calls=0 means even the very first call is over budget.
        pol = _policy(max_calls=0)
        with _patched_llm("blocked"):
            outcome = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), pol,
                call_counter={"count": 0})
        assert outcome["enabled"] is False
        assert "max_calls_per_digest" in outcome.get("skipped_reason", "")
class TestPerDayDedupe:
    """Guard: per-day dedupe suppresses repeat calls for the same (service, env)."""

    def setup_method(self):
        llm_enrichment._clear_dedupe_store()

    def _enrich(self, pol, service, env, stub_text):
        # One guarded enrichment call with the LLM stubbed to return stub_text.
        with _patched_llm(stub_text):
            return llm_enrichment.maybe_enrich_attribution(
                _attr_report(service, env), _risk_report(), pol)

    def test_second_call_same_service_env_is_deduped(self):
        pol = _policy(per_day_dedupe=True)
        first = self._enrich(pol, "gw", "prod", "first")
        repeat = self._enrich(pol, "gw", "prod", "second")
        assert first["enabled"] is True
        assert repeat["enabled"] is False
        assert "per_day_dedupe" in repeat.get("skipped_reason", "")

    def test_different_service_not_deduped(self):
        pol = _policy(per_day_dedupe=True)
        # Dedupe keys on (service, env); a new service is a new key.
        assert self._enrich(pol, "gw", "prod", "gw insight")["enabled"] is True
        assert self._enrich(pol, "router", "prod", "router insight")["enabled"] is True

    def test_different_env_not_deduped(self):
        pol = _policy(per_day_dedupe=True)
        # Same service, different env -> different dedupe key.
        assert self._enrich(pol, "gw", "prod", "prod insight")["enabled"] is True
        assert self._enrich(pol, "gw", "staging", "staging insight")["enabled"] is True

    def test_dedupe_disabled_allows_second_call(self):
        pol = _policy(per_day_dedupe=False)
        assert self._enrich(pol, "gw", "prod", "first")["enabled"] is True
        assert self._enrich(pol, "gw", "prod", "second")["enabled"] is True

    def test_dedupe_does_not_affect_scores(self):
        """LLM output must never be present in risk report scoring."""
        pol = _policy(per_day_dedupe=True)
        risk = _risk_report()
        score_before = risk["score"]
        with _patched_llm("some explanation"):
            llm_enrichment.maybe_enrich_attribution(_attr_report(), risk, pol)
        # Enrichment is advisory only: the score must stay untouched.
        assert risk["score"] == score_before
class TestLlmModeOff:
    """Guard: llm_mode='off' short-circuits before any LLM call is attempted."""

    def setup_method(self):
        llm_enrichment._clear_dedupe_store()

    def test_mode_off_never_calls_llm(self):
        pol = _policy(mode="off")
        with _patched_llm("should not appear") as llm_stub:
            res = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), _risk_report(), pol)
            # The stubbed LLM must never be invoked in off mode.
            llm_stub.assert_not_called()
        assert res["enabled"] is False
        assert res["mode"] == "off"
class TestTriggersNotMet:
    """Guard: enrichment fires only when risk triggers (band/delta) are met."""

    def setup_method(self):
        # Reset dedupe state so earlier test classes cannot interfere.
        llm_enrichment._clear_dedupe_store()

    def test_low_band_low_delta_no_trigger(self):
        # Band "low" is outside band_in and delta 5.0 < risk_delta_warn=10,
        # so neither trigger condition is satisfied.
        policy = _policy(delta_warn=10, band_in=["high", "critical"])
        risk_report = _risk_report(band="low", delta=5.0)
        with _patched_llm("should not appear") as mock_llm:
            result = llm_enrichment.maybe_enrich_attribution(
                _attr_report(), risk_report, policy)
            # The stubbed LLM must never be invoked when triggers are not met.
            mock_llm.assert_not_called()
        assert result["enabled"] is False
        assert "triggers not met" in result.get("skipped_reason", "")