# Prometheus alerting rules for the voice pipeline SLOs.
# Latency metrics (voice_*_ms) are histograms whose thresholds below are
# expressed in milliseconds; counters (*_total) are compared as per-second rates.
groups:
  - name: voice_slo
    # Evaluation interval should match Prometheus global evaluation_interval
    # (default 1m).
    # All thresholds align with config/slo_policy.yml voice_slo section.
    rules:
      # ── Alert 1: TTFA p95 breach ──────────────────────────────────────────────
      # Fires when Time-to-first-audio p95 exceeds the SLO for 10 consecutive
      # minutes (5000 ms for voice_fast_uk, 7000 ms for voice_quality_uk).
      # Root causes: slow LLM, Ollama overload, model cold-start.
      - alert: VoiceTTFA_P95_Breach_Fast
        expr: |
          histogram_quantile(0.95,
            rate(voice_ttfa_ms_bucket{voice_profile="voice_fast_uk"}[10m])
          ) > 5000
        for: 10m
        labels:
          severity: warning
          team: platform
          profile: voice_fast_uk
        annotations:
          summary: "Voice TTFA p95 breach (fast profile)"
          # $value is already in milliseconds. humanizeDuration expects seconds,
          # so the value is printed as-is, like the other latency alerts in
          # this group.
          description: >
            voice_fast_uk TTFA p95 = {{ $value }}ms > 5000ms SLO.
            Check: Ollama queue depth, gemma3 model availability, sofiia-console logs.
          runbook: "ops/runbook-alerts.md#voice-ttfa"
          dashboard: "grafana/d/voice-slo/voice-latency"

      - alert: VoiceTTFA_P95_Breach_Quality
        expr: |
          histogram_quantile(0.95,
            rate(voice_ttfa_ms_bucket{voice_profile="voice_quality_uk"}[10m])
          ) > 7000
        for: 10m
        labels:
          severity: warning
          team: platform
          profile: voice_quality_uk
        annotations:
          summary: "Voice TTFA p95 breach (quality profile)"
          description: >
            voice_quality_uk TTFA p95 = {{ $value }}ms > 7000ms SLO.
            Check: qwen3.5:35b-a3b availability, NODA2 GPU/CPU load.
          runbook: "ops/runbook-alerts.md#voice-ttfa"

      # ── Alert 2: Underflow spike ───────────────────────────────────────────────
      # Fires when queue starvation rate exceeds 1 event/min for 5 minutes.
      # rate() yields events per second, so the threshold 0.017/s is ~1/60.
      # Root cause: TTS synthesis slower than playback — LLM too slow, long chunks,
      # or network latency to memory-service.
      - alert: VoiceQueueUnderflow_Spike
        expr: |
          rate(voice_queue_underflows_total[5m]) > 0.017
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Voice queue starvation detected"
          description: >
            Queue underflow rate = {{ $value | humanize }}/s (>1/min).
            Audio playback is outrunning TTS synthesis — users hear silence gaps.
            Check: TTS latency (voice_tts_first_ms), chunk size, LLM total time.
          runbook: "ops/runbook-alerts.md#voice-underflow"

      # ── Alert 3: TTS synthesis degradation ────────────────────────────────────
      # Fires when first-sentence TTS p95 exceeds 2s for 10 minutes — indicates
      # edge-tts issues (403 auth errors, Microsoft endpoint throttling, network
      # degradation).
      - alert: VoiceTTS_P95_Degraded
        expr: |
          histogram_quantile(0.95,
            rate(voice_tts_first_ms_bucket[10m])
          ) > 2000
        for: 10m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Voice TTS synthesis degraded (p95 > 2s)"
          description: >
            voice_tts_first_ms p95 = {{ $value }}ms > 2000ms.
            Likely edge-tts 403 or Microsoft endpoint issue.
            Check: memory-service /voice/health, voice_tts_errors_total{error_type="403"}.
          runbook: "ops/runbook-alerts.md#voice-tts-degraded"

      # ── Alert 4: TTS error rate spike ─────────────────────────────────────────
      # Fires on elevated edge-tts error rate (403, network, synthesis failure).
      # The expression is not aggregated, so the 0.05/s (3/min) threshold applies
      # to each series separately; the description reports that series' engine
      # and error_type labels.
      - alert: VoiceTTS_ErrorRate_High
        expr: |
          rate(voice_tts_errors_total[5m]) > 0.05
        for: 3m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Voice TTS error rate elevated"
          description: >
            TTS errors = {{ $value | humanize }}/s.
            Engine: {{ $labels.engine }}, Error type: {{ $labels.error_type }}.
            Users may hear espeak fallback or silence.
          runbook: "ops/runbook-alerts.md#voice-tts-error"

      # ── Alert 5: E2E latency breach ───────────────────────────────────────────
      # Full round-trip SLO guard — catches combined LLM+TTS degradation.
      # Fires when voice_fast_uk E2E p95 stays above 9000 ms for 15 minutes.
      - alert: VoiceE2E_P95_Breach
        expr: |
          histogram_quantile(0.95,
            rate(voice_e2e_ms_bucket{voice_profile="voice_fast_uk"}[15m])
          ) > 9000
        for: 15m
        labels:
          severity: warning
          team: platform
          profile: voice_fast_uk
        annotations:
          summary: "Voice E2E latency p95 breach"
          description: >
            voice_fast_uk E2E p95 = {{ $value }}ms > 9000ms SLO.
            Full pipeline (STT+LLM+TTS) is degraded.
          runbook: "ops/runbook-alerts.md#voice-e2e"