""" Voice routing policy contract tests. Rules enforced: 1. voice_fast_uk must NOT contain models with known p50 > 15000ms. 2. voice_fast_uk must contain at least one model with p50 ≤ 3000ms (fast gate). 3. voice_quality_uk deadline_ms ≤ 12000 (bounded — not unlimited chat). 4. voice_fast_uk deadline_ms ≤ 9000. 5. auto_promote.condition.p95_ratio_vs_next_model < 1.0 (must be faster, not slower). 6. exclude_models list must contain known-slow models. 7. voice_fast_uk max_tokens (num_predict equivalent) ≤ 256. 8. Audit JSON (if present): latest run must show no "FAIL" in TTS scenarios. 9. voice_guardrails: SOFIIA_VOICE_PROMPT_SUFFIX must contain hard constraints. 10. BFF: ChatSendBody must accept voice_profile field. """ import json import os import re from pathlib import Path import pytest import yaml REPO_ROOT = Path(__file__).parent.parent ROUTER_CONFIG = REPO_ROOT / "services" / "router" / "router-config.yml" AUDIT_DIR = REPO_ROOT / "ops" / "voice_audit_results" BFF_MAIN = REPO_ROOT / "services" / "sofiia-console" / "app" / "main.py" # Known-slow models: p50 > 15s in any voice audit run → banned from voice_fast_uk KNOWN_SLOW_MODELS = { "glm-4.7-flash:32k", "glm-4.7-flash", "deepseek-r1:70b", } # Hard deadline threshold for voice profiles (ms) VOICE_FAST_MAX_DEADLINE_MS = 9_000 VOICE_QUALITY_MAX_DEADLINE_MS = 12_000 # SLO: at least one prefer model must be historically ≤ this latency VOICE_FAST_GATE_MS = 6_000 # gemma3 measured at ~2.6s @pytest.fixture(scope="module") def router_config() -> dict: with open(ROUTER_CONFIG) as f: return yaml.safe_load(f) @pytest.fixture(scope="module") def voice_fast(router_config) -> dict: policies = router_config.get("selection_policies", {}) assert "voice_fast_uk" in policies, "selection_policies.voice_fast_uk not found in router-config.yml" return policies["voice_fast_uk"] @pytest.fixture(scope="module") def voice_quality(router_config) -> dict: policies = router_config.get("selection_policies", {}) assert "voice_quality_uk" in 
policies, "selection_policies.voice_quality_uk not found" return policies["voice_quality_uk"] @pytest.fixture(scope="module") def latest_audit() -> dict | None: """Load the most recent voice audit JSON if present.""" if not AUDIT_DIR.exists(): return None files = sorted(AUDIT_DIR.glob("audit_*.json"), reverse=True) if not files: return None with open(files[0]) as f: return json.load(f) # ── 1. voice_fast_uk: no slow models ───────────────────────────────────────── class TestVoiceFastUkPolicy: def test_no_known_slow_models_in_prefer(self, voice_fast): prefer = voice_fast.get("prefer_models", []) for model in prefer: assert model not in KNOWN_SLOW_MODELS, ( f"SLOW model '{model}' found in voice_fast_uk.prefer_models — " f"remove or move to voice_quality_uk" ) def test_known_slow_models_are_excluded(self, voice_fast): excluded = set(voice_fast.get("exclude_models", [])) missing = KNOWN_SLOW_MODELS - excluded assert not missing, ( f"Known-slow models not in voice_fast_uk.exclude_models: {missing}" ) def test_deadline_bounded(self, voice_fast): deadline = voice_fast.get("deadline_ms", 0) assert deadline > 0, "deadline_ms must be set" assert deadline <= VOICE_FAST_MAX_DEADLINE_MS, ( f"voice_fast_uk.deadline_ms={deadline} exceeds hard limit {VOICE_FAST_MAX_DEADLINE_MS}ms" ) def test_max_tokens_bounded(self, voice_fast): tokens = voice_fast.get("max_tokens", 999) assert tokens <= 256, ( f"voice_fast_uk.max_tokens={tokens} — too high for voice (≤256 required)" ) def test_prefer_has_fast_model(self, voice_fast): prefer = voice_fast.get("prefer_models", []) assert len(prefer) >= 1, "voice_fast_uk must have at least one prefer_model" # gemma3 (known fast) should be first or second fast_candidates = [m for m in prefer if "gemma3" in m or "8b" in m] assert fast_candidates, ( f"voice_fast_uk.prefer_models has no known-fast model (gemma3/8b). 
" f"Current list: {prefer}" ) def test_gemma3_is_first_or_second(self, voice_fast): prefer = voice_fast.get("prefer_models", []) gemma3_idx = next((i for i, m in enumerate(prefer) if "gemma3" in m), None) assert gemma3_idx is not None, "gemma3 must be in voice_fast_uk.prefer_models" assert gemma3_idx <= 1, ( f"gemma3 should be first or second in prefer_models (found at index {gemma3_idx})" ) def test_auto_promote_ratio_less_than_one(self, voice_fast): ap = voice_fast.get("auto_promote", {}) if not ap: pytest.skip("auto_promote not configured") ratio = ap.get("condition", {}).get("p95_ratio_vs_next_model", 1.0) assert ratio < 1.0, ( f"auto_promote.condition.p95_ratio_vs_next_model={ratio} must be < 1.0 " f"(candidate must be faster than next model)" ) def test_require_caps_includes_tts(self, voice_fast): caps = voice_fast.get("require_caps", []) assert "tts" in caps, "voice_fast_uk must require caps=[tts]" # ── 2. voice_quality_uk: bounded deadline ──────────────────────────────────── class TestVoiceQualityUkPolicy: def test_deadline_bounded(self, voice_quality): deadline = voice_quality.get("deadline_ms", 0) assert deadline <= VOICE_QUALITY_MAX_DEADLINE_MS, ( f"voice_quality_uk.deadline_ms={deadline} exceeds {VOICE_QUALITY_MAX_DEADLINE_MS}ms" ) def test_no_known_slow_in_prefer(self, voice_quality): prefer = voice_quality.get("prefer_models", []) for model in prefer: assert model not in KNOWN_SLOW_MODELS, ( f"SLOW model '{model}' in voice_quality_uk.prefer_models" ) def test_max_tokens_bounded(self, voice_quality): tokens = voice_quality.get("max_tokens", 999) assert tokens <= 320, ( f"voice_quality_uk.max_tokens={tokens} exceeds 320 — high for voice" ) # ── 3. 
Audit JSON: TTS scenarios must pass ─────────────────────────────────── class TestAuditResultsContract: def test_tts_scenarios_all_pass(self, latest_audit): if latest_audit is None: pytest.skip("No audit results found — run ops/voice_latency_audit.sh first") results = latest_audit.get("results", []) tts_results = [r for r in results if "TTS" in r.get("scenario", "")] assert tts_results, "No TTS scenarios in audit results" failed = [r for r in tts_results if r.get("status") != "ok"] assert not failed, ( f"TTS scenarios failed: {[r['scenario'] for r in failed]}" ) def test_tts_p95_under_slo(self, latest_audit): if latest_audit is None: pytest.skip("No audit results") results = latest_audit.get("results", []) tts_ms = [r["ms"] for r in results if "TTS" in r.get("scenario", "") and r.get("status") == "ok"] if not tts_ms: pytest.skip("No successful TTS results") tts_p95 = sorted(tts_ms)[int(len(tts_ms) * 0.95)] if len(tts_ms) > 1 else tts_ms[0] assert tts_p95 <= 2500, f"TTS p95={tts_p95}ms exceeds SLO of 2500ms" def test_no_glm_in_passing_chat_scenarios(self, latest_audit): if latest_audit is None: pytest.skip("No audit results") results = latest_audit.get("results", []) glm_ok = [r for r in results if "glm" in r.get("scenario", "").lower() and r.get("status") == "ok" and r.get("ms", 0) > 15000] assert not glm_ok, ( f"GLM model passed but took >15s — it must remain in exclude_models: {glm_ok}" ) # ── 4. 
# Voice guardrails in BFF ────────────────────────────────────────────────

class TestVoiceGuardrails:
    """Source-level checks on the BFF main.py voice guardrails.

    The tests read BFF_MAIN as text and inspect it; nothing is imported or run.
    """

    @staticmethod
    def _extract_voice_prompt_suffix(src):
        """Return the body of the triple-quoted SOFIIA_VOICE_PROMPT_SUFFIX.

        Returns None when no ``SOFIIA_VOICE_PROMPT_SUFFIX = \"\"\"...\"\"\"``
        assignment is found in ``src``.
        """
        m = re.search(
            r'SOFIIA_VOICE_PROMPT_SUFFIX\s*=\s*"""(.*?)"""',
            src,
            re.DOTALL,
        )
        return m.group(1) if m else None

    @staticmethod
    def _read_bff_source():
        """Read BFF main.py as UTF-8 (the tests look for Cyrillic text)."""
        return BFF_MAIN.read_text(encoding="utf-8")

    def _voice_prompt_suffix(self):
        """Return the suffix text, failing the test if it cannot be parsed."""
        content = self._extract_voice_prompt_suffix(self._read_bff_source())
        assert content is not None, "Could not parse SOFIIA_VOICE_PROMPT_SUFFIX"
        return content

    def test_voice_prompt_suffix_exists(self):
        src = self._read_bff_source()
        assert "SOFIIA_VOICE_PROMPT_SUFFIX" in src, (
            "SOFIIA_VOICE_PROMPT_SUFFIX not found in sofiia-console/app/main.py"
        )

    def test_voice_prompt_suffix_contains_max_sentences(self):
        content = self._voice_prompt_suffix()
        assert "2 речення" in content or "2 sentence" in content.lower(), (
            "SOFIIA_VOICE_PROMPT_SUFFIX must enforce max 2 sentence constraint"
        )

    def test_voice_prompt_suffix_bans_markdown(self):
        lowered = self._voice_prompt_suffix().lower()
        assert "markdown" in lowered or "bold" in lowered or "bullet" in lowered, (
            "SOFIIA_VOICE_PROMPT_SUFFIX should ban markdown/lists for voice"
        )

    def test_voice_prompt_suffix_bans_think_tags(self):
        content = self._voice_prompt_suffix()
        # The needle was previously an empty string, which every string
        # contains, so the check could never fail.
        assert "<think>" in content, (
            "SOFIIA_VOICE_PROMPT_SUFFIX should explicitly mention <think> ban"
        )

    def test_chat_send_body_has_voice_profile_field(self):
        src = self._read_bff_source()
        assert "voice_profile" in src, (
            "ChatSendBody must have voice_profile field in sofiia-console/app/main.py"
        )

    def test_voice_guardrails_applied_per_turn(self):
        src = self._read_bff_source()
        # Check that _system_prompt is used (not hardcoded SOFIIA_SYSTEM_PROMPT)
        # in ollama/glm/grok
        provider_uses_system_prompt = src.count('"content": _system_prompt')
        assert provider_uses_system_prompt >= 2, (
            f"Expected ≥2 providers to use _system_prompt (voice-aware), found {provider_uses_system_prompt}"
        )


# ── 5.
Router config structure ──────────────────────────────────────────────── class TestRouterConfigStructure: def test_voice_policies_section_exists(self, router_config): assert "selection_policies" in router_config, \ "router-config.yml must have selection_policies section" def test_agent_voice_profiles_sofiia(self, router_config): avp = router_config.get("agent_voice_profiles", {}) assert "sofiia" in avp, "agent_voice_profiles.sofiia must be defined" sofiia = avp["sofiia"] assert sofiia.get("voice_profile") == "voice_fast_uk" assert sofiia.get("quality_profile") == "voice_quality_uk" def test_policies_voice_section_exists(self, router_config): policies = router_config.get("policies", {}) assert "voice" in policies, "policies.voice section missing in router-config.yml" v = policies["voice"] assert v.get("deadline_ms_voice", 0) <= 9000 assert v.get("max_tokens_voice", 999) <= 256