# microdao-daarion/config/slo_policy.yml

# SLO Policy — DAARION.city
#
# Defines Service Level Objectives per service.
# Used by observability_tool.slo_snapshot and incident_triage_graph slo_context node.
#
# Fields:
#   error_rate_pct   — max allowed error rate (%)
#   latency_p95_ms   — max p95 latency (milliseconds)
#   window_minutes   — observation window in minutes (default: 60)

# Baseline thresholds. Key order mirrors the field list in the file header.
# NOTE(review): presumably applied to any service that omits a field —
# confirm against the loader.
defaults:
  error_rate_pct: 1.0
  latency_p95_ms: 300
  window_minutes: 60

# Per-service thresholds, keyed by service name.
# NOTE(review): no entry sets window_minutes; presumably
# defaults.window_minutes applies — confirm against the loader.
services:
  # Same thresholds as `defaults`.
  gateway:
    error_rate_pct: 1.0
    latency_p95_ms: 300
  # Stricter than `defaults` on both error rate and latency.
  router:
    error_rate_pct: 0.5
    latency_p95_ms: 200
  # Default error rate; latency budget relaxed to 400 ms.
  memory-service:
    error_rate_pct: 1.0
    latency_p95_ms: 400
  # Default error rate; latency budget relaxed to 500 ms.
  sofiia-supervisor:
    error_rate_pct: 1.0
    latency_p95_ms: 500

# ─── Voice SLO profiles ───────────────────────────────────────────────────────
# Two profiles aligned with router-config.yml selection_policies, plus a
# `canary` entry with runtime health-check thresholds.
# Measured via Prometheus metrics emitted by sofiia-console /api/telemetry/voice
# and memory-service voice_endpoints.py.
#
# Prometheus metrics:
#   voice_ttfa_ms{voice_profile}       — Time-to-first-audio (BFF → first playable)
#   voice_e2e_ms{voice_profile}        — User stops speaking → audio plays
#   voice_tts_first_ms{voice_profile}  — First-sentence TTS synthesis
#   voice_tts_compute_ms{engine,voice} — Memory-service internal TTS
#   voice_queue_underflows_total       — Playback starvation events
# Voice SLO targets keyed by profile name, plus a `canary` entry holding
# runtime health-check limits.
# NOTE(review): `canary` is a sibling of the profile entries; any consumer
# that iterates voice_slo as a list of profiles must skip it — confirm.
voice_slo:
  voice_fast_uk:
    description: 'Fast profile: gemma3 → qwen3.5 fallback'
    window_minutes: 10
    # Latency targets (p95, milliseconds).
    ttfa_ms_p95: 5000  # time-to-first-audio ≤ 5 s
    e2e_ms_p95: 9000  # end-to-end ≤ 9 s
    tts_first_ms_p95: 2000  # TTS synthesis ≤ 2 s
    # Rate targets (percent).
    underflow_rate_pct: 1.0  # starvation events per 100 voice turns ≤ 1%
    tts_error_rate_pct: 0.5  # edge-tts failures ≤ 0.5%

  voice_quality_uk:
    description: 'Quality profile: qwen3.5 → qwen3:14b fallback'
    window_minutes: 10
    # Latency targets (p95, milliseconds).
    ttfa_ms_p95: 7000
    e2e_ms_p95: 12000
    tts_first_ms_p95: 2000  # TTS itself is the same engine
    # Rate targets (percent).
    underflow_rate_pct: 2.0  # slightly relaxed (longer LLM → more gap risk)
    tts_error_rate_pct: 0.5

  # Canary thresholds (runtime health check, stricter)
  canary:
    tts_polina_max_ms: 3000  # live Polina synthesis ≤ 3 s
    tts_ostap_max_ms: 3000  # live Ostap synthesis ≤ 3 s
    min_audio_bytes: 1000  # valid audio is never empty/tiny