#!/usr/bin/env bash
# voice_latency_audit.sh — Voice pipeline latency audit
#
# Usage:
#   bash ops/voice_latency_audit.sh [BFF_URL] [MEMORY_URL]
#   bash ops/voice_latency_audit.sh http://localhost:8002 http://localhost:8000
#
# Scenarios (in execution order):
#   TTS direct (MEMORY_URL/voice/tts):  short default voice, medium Polina, Ostap
#   TTS via BFF (BFF_URL/api/voice/tts): short, medium
#   Chat via BFF (BFF_URL/api/chat/send): gemma3, qwen3:8b, qwen3:14b, qwen3.5:35b-a3b
#   Voice health (GET MEMORY_URL/voice/health)
#   Parallel TTS (2 concurrent requests)
#   STT smoke test (1 s silent WAV stub)
#
# Requires: bash, curl, python3, mktemp, stat.

set -euo pipefail

BFF_URL="${1:-http://localhost:8002}"
MEMORY_URL="${2:-http://localhost:8000}"
RESULTS_DIR="ops/voice_audit_results"
TS=$(date +%Y%m%d_%H%M%S)
RESULTS_FILE="${RESULTS_DIR}/audit_${TS}.json"

RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[0;33m'; CYAN='\033[0;36m'; NC='\033[0m'

pass() { echo -e " ${GREEN}PASS${NC} $1"; }
fail() { echo -e " ${RED}FAIL${NC} $1"; }
warn() { echo -e " ${YELLOW}WARN${NC} $1"; }
info() { echo -e " ${CYAN}INFO${NC} $1"; }

mkdir -p "$RESULTS_DIR"

# One JSON object (as a string) per executed scenario.
declare -a RESULTS=()

# All scratch files live in a private directory that is removed on any exit
# path, so an interrupted run leaves nothing behind and no predictable /tmp
# names are used.
AUDIT_TMP_DIR=$(mktemp -d)
trap 'rm -rf -- "${AUDIT_TMP_DIR:?}"' EXIT

# ── Helpers ────────────────────────────────────────────────────────────────────

# Print the current wall-clock time in milliseconds.
# Uses bash's EPOCHREALTIME when available so that no interpreter start-up time
# is added to the measured interval; falls back to python3 otherwise.
_now_ms() {
  if [[ -n "${EPOCHREALTIME:-}" ]]; then
    # EPOCHREALTIME is "<seconds><sep><6 digits>"; the separator depends on locale.
    local micros="${EPOCHREALTIME/[.,]/}"
    printf '%s\n' "${micros%???}"
  else
    python3 -c "import time; print(int(time.time()*1000))"
  fi
}

# Create a scratch file inside AUDIT_TMP_DIR and print its path.
_tmp_file() { mktemp "${AUDIT_TMP_DIR}/req.XXXXXX"; }

# Print the size of file $1 in bytes (BSD stat first, then GNU stat), or 0.
_file_size() {
  stat -f%z "$1" 2>/dev/null || stat -c%s "$1" 2>/dev/null || echo 0
}

# Run curl and print exactly one three-digit HTTP code.
# Arguments: $1 - file that receives the response body
#            $2 - --max-time value in seconds
#            $@ - remaining curl arguments (method, URL, headers, data)
# Outputs:   the HTTP status code; "000" for any transport-level failure.
# Returns:   always 0, so it is safe inside $(...) under `set -e`.
_http_code() {
  local out="$1" max_time="$2"
  shift 2
  local code rc=0
  code=$(curl -sf -o "$out" -w '%{http_code}' --max-time "$max_time" "$@" 2>/dev/null) || rc=$?
  # With -f, exit status 22 means "HTTP >= 400": curl has still printed the real
  # code, so keep it. Anything else (refused, timeout, DNS...) is reported as 000.
  if [[ ! "$code" =~ ^[0-9]{3}$ ]] || { (( rc != 0 )) && (( rc != 22 )); }; then
    code="000"
  fi
  printf '%s' "$code"
}

# Print "ok" for an HTTP code in 001..399, "error" otherwise (including "000").
_http_status() {
  local code="$1"
  if [[ "$code" =~ ^[0-9]+$ ]] && (( 10#$code >= 1 && 10#$code < 400 )); then
    echo "ok"
  else
    echo "error"
  fi
}

# Print FAST / OK / SLOW for latency $1 (ms) given thresholds $2 (fast) and $3 (ok).
_latency_label() {
  local ms="$1" fast_max="$2" ok_max="$3"
  if (( ms <= fast_max )); then
    echo "FAST"
  elif (( ms <= ok_max )); then
    echo "OK"
  else
    echo "SLOW"
  fi
}

# Print the colour escape (unexpanded, for later `echo -e`) matching label $1.
_label_color() {
  case "$1" in
    FAST) printf '%s' "$GREEN" ;;
    OK)   printf '%s' "$YELLOW" ;;
    *)    printf '%s' "$RED" ;;
  esac
}

# ── Helper: measure HTTP call ──────────────────────────────────────────────────
# Time a single HTTP request and print its result.
# Arguments: $1 - scenario name
#            $2 - method: POST_JSON or GET (anything else yields HTTP 000)
#            $3 - URL
#            $4 - accepted for call compatibility; not used
#            $5 - JSON body for POST_JSON (defaults to {})
# Outputs:   one JSON result line followed by one human-readable line on stdout.
measure() {
  local scenario="$1" method="$2" url="$3"
  # Not "${5:-{}}": bash ends that expansion at the first "}" and would append
  # a stray "}" to every caller-supplied body.
  local body="${5:-}"
  if [[ -z "$body" ]]; then
    body='{}'
  fi

  local t0 t1 ms http_code size tmp status
  tmp=$(_tmp_file)
  t0=$(_now_ms)
  case "$method" in
    POST_JSON)
      http_code=$(_http_code "$tmp" 15 -X POST "$url" \
        -H "Content-Type: application/json" \
        -d "$body")
      ;;
    GET)
      http_code=$(_http_code "$tmp" 10 -X GET "$url")
      ;;
    *)
      http_code="000"
      ;;
  esac
  t1=$(_now_ms)
  ms=$((t1 - t0))
  size=$(_file_size "$tmp")
  rm -f -- "$tmp"
  status=$(_http_status "$http_code")

  echo "{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}"

  if [[ "$status" == "ok" ]]; then
    local label color
    label=$(_latency_label "$ms" 2500 6000)
    color=$(_label_color "$label")
    echo -e " ${color}${label}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
  else
    echo -e " ${RED}FAIL${NC} ${scenario}: HTTP ${http_code} ${ms}ms"
  fi
}

# ── Helper: measure and collect result ─────────────────────────────────────────
# Time a single HTTP request, append its JSON result to RESULTS and print a
# one-line summary.
# Globals:   RESULTS (appended)
# Arguments: $1 - scenario name
#            $2 - method: POST_JSON, anything else is sent as a plain GET
#            $3 - URL
#            $4 - JSON body for POST_JSON
_m() {
  local scenario="$1" method="$2" url="$3" body="${4:-}"
  local t0 t1 ms http_code size tmp status

  tmp=$(_tmp_file)
  t0=$(_now_ms)
  if [[ "$method" == "POST_JSON" ]]; then
    http_code=$(_http_code "$tmp" 30 -X POST "$url" \
      -H "Content-Type: application/json" \
      -d "$body")
  else
    http_code=$(_http_code "$tmp" 20 "$url")
  fi
  t1=$(_now_ms)
  ms=$((t1 - t0))
  size=$(_file_size "$tmp")
  rm -f -- "$tmp"
  status=$(_http_status "$http_code")

  RESULTS+=("{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}")

  local lbl col
  if [[ "$status" == "ok" ]]; then
    lbl=$(_latency_label "$ms" 2500 6000)
    col=$(_label_color "$lbl")
    if [[ "$lbl" == "OK" ]]; then
      lbl="OK "
    fi
  else
    lbl="FAIL"
    col="$RED"
  fi
  echo -e " ${col}${lbl}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
}

# ── Scenario runner ────────────────────────────────────────────────────────────
# Run the preflight checks and every audit scenario against BFF_URL and
# MEMORY_URL, appending one JSON result per scenario to RESULTS.
# Globals:   BFF_URL, MEMORY_URL (read); RESULTS (appended)
# Outputs:   human-readable progress on stdout.
run_scenarios() {
  echo ""
  echo "╔════════════════════════════════════════════╗"
  echo "║ Voice Latency Audit — $(date +%H:%M:%S) ║"
  echo "╚════════════════════════════════════════════╝"
  echo ""

  # ── 0. Preflight ──────────────────────────────────────────────────────────
  # Failures are reported but do not stop the audit.
  echo "── Preflight ──"
  local bff_ok mem_ok
  bff_ok=$(curl -sf "$BFF_URL/api/memory/status" 2>/dev/null \
    | python3 -c "import json,sys;print(json.load(sys.stdin).get('ok',False))" 2>/dev/null \
    || echo "False")
  mem_ok=$(curl -sf "$MEMORY_URL/health" 2>/dev/null \
    | python3 -c "import json,sys;print(json.load(sys.stdin).get('status','error'))" 2>/dev/null \
    || echo "error")
  if [[ "$bff_ok" == "True" ]]; then
    pass "BFF ok ($BFF_URL)"
  else
    fail "BFF unreachable"
  fi
  if [[ "$mem_ok" == "healthy" ]]; then
    pass "Memory Service ok ($MEMORY_URL)"
  else
    fail "Memory Service: $mem_ok"
  fi

  # ── 1. TTS directly against the memory service ───────────────────────────
  echo ""
  echo "── TTS Scenarios (direct memory-service) ──"
  _m "TTS_short_polina" "POST_JSON" "$MEMORY_URL/voice/tts" \
    '{"text":"Привіт! Це тест голосу.","voice":"default"}'
  _m "TTS_medium_polina" "POST_JSON" "$MEMORY_URL/voice/tts" \
    '{"text":"Я Sofiia, головний AI-архітектор DAARION.city. Моя роль — розробка AI-рішень.","voice":"uk-UA-PolinaNeural"}'
  _m "TTS_ostap" "POST_JSON" "$MEMORY_URL/voice/tts" \
    '{"text":"Перевірка голосу Остап Нейрал. Технічний тест.","voice":"Ostap"}'

  # ── 2. TTS via BFF proxy ─────────────────────────────────────────────────
  echo ""
  echo "── TTS via BFF proxy ──"
  _m "TTS_bff_short" "POST_JSON" "$BFF_URL/api/voice/tts" \
    '{"text":"Перевірка через BFF.","voice":"default"}'
  _m "TTS_bff_medium" "POST_JSON" "$BFF_URL/api/voice/tts" \
    '{"text":"NODA2 voice pipeline: edge-tts 7.2.7, Polina Neural, Ukrainian. OK.","voice":"uk-UA-PolinaNeural"}'

  # ── 3. Chat: per-model LLM latency comparison ────────────────────────────
  echo ""
  echo "── Chat: LLM per-model latency (voice turn = 1 sentence) ──"
  # Shared JSON prefix; each call appends the model name and voice profile.
  local _q='{"message":"Одне речення (max 15 слів): що таке NODA2?","model":'
  _m "CHAT_gemma3" "POST_JSON" "$BFF_URL/api/chat/send" \
    "${_q}\"ollama:gemma3:latest\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen3_8b" "POST_JSON" "$BFF_URL/api/chat/send" \
    "${_q}\"ollama:qwen3:8b\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen3_14b" "POST_JSON" "$BFF_URL/api/chat/send" \
    "${_q}\"ollama:qwen3:14b\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen35_35b" "POST_JSON" "$BFF_URL/api/chat/send" \
    "${_q}\"ollama:qwen3.5:35b-a3b\",\"voice_profile\":\"voice_quality_uk\"}"

  # ── 4. Voice health check ────────────────────────────────────────────────
  echo ""
  echo "── Voice health (live Polina/Ostap synthesis) ──"
  _m "VOICE_HEALTH_live" "GET" "$MEMORY_URL/voice/health"

  # ── 5. Parallel TTS (simulate 2 concurrent users) ────────────────────────
  echo ""
  echo "── Parallel TTS (2 concurrent) ──"
  local par_out1 par_out2 t0_par t1_par par_ms s1 s2 par_status par_label par_color
  par_out1=$(_tmp_file)
  par_out2=$(_tmp_file)
  t0_par=$(_now_ms)
  curl -sf -X POST "$MEMORY_URL/voice/tts" \
    -H "Content-Type: application/json" \
    -d '{"text":"Перший паралельний запит.","voice":"default"}' \
    -o "$par_out1" --max-time 15 &
  curl -sf -X POST "$MEMORY_URL/voice/tts" \
    -H "Content-Type: application/json" \
    -d '{"text":"Другий паралельний запит.","voice":"Ostap"}' \
    -o "$par_out2" --max-time 15 &
  # Bare `wait` blocks until both jobs finish and returns 0 even if they failed;
  # success is judged from the response sizes below.
  wait
  t1_par=$(_now_ms)
  par_ms=$((t1_par - t0_par))
  s1=$(_file_size "$par_out1")
  s2=$(_file_size "$par_out2")
  rm -f -- "$par_out1" "$par_out2"

  # Both responses must be larger than 1000 bytes to count as success.
  if (( s1 > 1000 && s2 > 1000 )); then
    par_status="ok"
  else
    par_status="error"
  fi
  RESULTS+=("{\"scenario\":\"TTS_parallel_2x\",\"ms\":$par_ms,\"bytes\":$((s1 + s2)),\"status\":\"$par_status\"}")

  if [[ "$par_status" == "ok" ]]; then
    par_label=$(_latency_label "$par_ms" 4000 8000)
    par_color=$(_label_color "$par_label")
  else
    # A failed run must not be presented as FAST/OK.
    par_label="FAIL"
    par_color="$RED"
  fi
  echo -e " ${par_color}${par_label}${NC} TTS_parallel_2x: ${par_ms}ms (s1=${s1}B s2=${s2}B)"

  # ── 6. STT smoke test (synthetic WAV) ────────────────────────────────────
  echo ""
  echo "── STT smoke (silent WAV stub) ──"
  local stub_wav stt_out
  stub_wav=$(_tmp_file)
  stt_out=$(_tmp_file)
  # Write a 44-byte WAV header plus 1 s of 16 kHz mono int16 silence. If python3
  # fails, fall back to a one-byte file so the request is still attempted.
  python3 -c "
import struct, sys
# 44-byte WAV header for 1s silence at 16kHz mono int16
sample_rate = 16000
duration_s = 1
num_samples = sample_rate * duration_s
data_size = num_samples * 2
header = struct.pack('<4sI4s4sIHHIIHH4sI', b'RIFF', 36 + data_size, b'WAVE', b'fmt ', 16, 1, 1, sample_rate, sample_rate*2, 2, 16, b'data', data_size)
sys.stdout.buffer.write(header + bytes(data_size))
" > "$stub_wav" 2>/dev/null || echo "" > "$stub_wav"

  local t0_stt t1_stt stt_ms stt_code stt_status stt_label stt_color
  t0_stt=$(_now_ms)
  stt_code=$(_http_code "$stt_out" 30 -X POST "$MEMORY_URL/voice/stt" \
    -F "audio=@${stub_wav};filename=audio.wav;type=audio/wav" \
    -F "language=uk")
  t1_stt=$(_now_ms)
  stt_ms=$((t1_stt - t0_stt))
  rm -f -- "$stub_wav" "$stt_out"

  # Only an exact HTTP 200 counts as success for STT.
  if [[ "$stt_code" == "200" ]]; then
    stt_status="ok"
  else
    stt_status="error"
  fi
  RESULTS+=("{\"scenario\":\"STT_silence_stub\",\"ms\":$stt_ms,\"http\":\"$stt_code\",\"status\":\"$stt_status\"}")

  if [[ "$stt_status" == "ok" ]]; then
    stt_label=$(_latency_label "$stt_ms" 2000 5000)
    stt_color=$(_label_color "$stt_label")
  else
    stt_label="FAIL"
    stt_color="$RED"
  fi
  echo -e " ${stt_color}${stt_label}${NC} STT_silence_stub: ${stt_ms}ms (HTTP ${stt_code})"
}

# ── Compute p50/p95 ────────────────────────────────────────────────────────────
# Print the latency summary, per-model comparison and SLO check for RESULTS.
# Globals:   RESULTS (read) — one JSON object string per scenario
# Arguments: none (positional arguments are ignored)
# Outputs:   human-readable report on stdout; malformed results are reported
#            on stderr and skipped.
compute_stats() {
  # ${arr[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
  python3 - ${RESULTS[@]+"${RESULTS[@]}"} <<'PYEOF'
import json, sys

results = []
for arg in sys.argv[1:]:
    try:
        results.append(json.loads(arg))
    except Exception as exc:
        # Report instead of silently dropping a scenario from the statistics.
        print(f" WARN skipping malformed result ({exc}): {arg!r}", file=sys.stderr)

if not results:
    print("No results to analyze.")
    sys.exit(0)


def ok_ms(matches):
    """Latencies (ms) of successful results whose scenario name satisfies `matches`."""
    return [r["ms"] for r in results if r["status"] == "ok" and matches(r["scenario"])]


def p50(vals):
    s = sorted(vals)
    return s[len(s) // 2]


def p95(vals):
    s = sorted(vals)
    return s[int(len(s) * 0.95)] if len(s) > 1 else s[-1]


# Substring match: every scenario containing "TTS" (including TTS_parallel_2x) counts.
tts_ms = ok_ms(lambda name: "TTS" in name)
stt_ms = ok_ms(lambda name: "STT" in name)

# per-model chat latency
chat_scenarios = {
    "gemma3": "CHAT_gemma3",
    "qwen3:8b": "CHAT_qwen3_8b",
    "qwen3:14b": "CHAT_qwen3_14b",
    "qwen3.5:35b": "CHAT_qwen35_35b",
}
model_map = {
    model: ok_ms(lambda name, wanted=scenario: name == wanted)
    for model, scenario in chat_scenarios.items()
}
all_chat_ms = [ms for vals in model_map.values() for ms in vals]


def stats(label, vals):
    if not vals:
        print(f" {label}: no data")
        return
    print(f" {label}: p50={p50(vals)}ms p95={p95(vals)}ms min={min(vals)}ms max={max(vals)}ms n={len(vals)}")


print("")
print("── Latency Summary ──")
stats("TTS (memory-service + proxy)", tts_ms)
stats("Chat (all models)", all_chat_ms)
stats("STT (memory-service)", stt_ms)

print("")
print("── LLM Model Comparison (voice: 1 sentence) ──")
rows = []
for model, vals in model_map.items():
    if vals:
        rows.append((p50(vals), model, vals[0]))
    else:
        # Large rank value keeps models without data at the bottom.
        rows.append((99999, model, None))
rows.sort(key=lambda row: row[:2])
# Only a model that actually produced a measurement can be "fastest".
fastest = rows[0][1] if rows and rows[0][2] is not None else None
medals = {1: "🥇", 2: "🥈", 3: "🥉"}
for rank, (_, model, ms) in enumerate(rows, 1):
    has_data = ms is not None
    icon = medals.get(rank, " ") if has_data else " "
    note = " ← fastest (voice_fast default)" if model == fastest else ""
    ms_str = f"{ms}ms" if has_data else "no data"
    print(f" {icon} {model:20s} {ms_str}{note}")

# Bottleneck analysis
all_ok = [r for r in results if r["status"] == "ok"]
all_err = [r for r in results if r["status"] != "ok"]
print(f" Passed: {len(all_ok)}/{len(results)} scenarios")
if all_err:
    print(f" Failed: {[r['scenario'] for r in all_err]}")

# SLO check
print("")
print("── SLO Check (voice_fast_uk) ──")


def first(vals):
    return vals[0] if vals else None


def within(value, limit):
    return value is not None and value <= limit


def shown(value):
    return f"actual={value}ms" if value is not None else "no data"


tts_p95 = p95(tts_ms) if tts_ms else None
q14_ms = first(model_map["qwen3:14b"])
q35_ms = first(model_map["qwen3.5:35b"])
g3_ms = first(model_map["gemma3"])
# NOTE(review): qwen3:8b is not part of the "fastest model" check — confirm intended.
measured = [ms for ms in (q14_ms, q35_ms, g3_ms) if ms is not None]
fast_ms = min(measured) if measured else None
# Comparisons between the two qwen models are only meaningful when both ran.
comparable = q14_ms is not None and q35_ms is not None

checks = [
    ("TTS p95 ≤ 2500ms", within(tts_p95, 2500), shown(tts_p95)),
    ("Fastest model ≤ 9000ms", within(fast_ms, 9000), shown(fast_ms)),
    ("qwen3:14b ≤ 12000ms", within(q14_ms, 12000), shown(q14_ms)),
    ("qwen3.5 faster than 14b",
     comparable and q35_ms < q14_ms,
     f"35b={q35_ms}ms vs 14b={q14_ms}ms" if comparable else "no data"),
    ("qwen3.5 auto-promote worthy",
     comparable and q35_ms < q14_ms * 0.9,
     f"ratio={round(q35_ms/max(q14_ms,1),2)} (need <0.9)" if comparable else "no data"),
]
for label, ok, detail in checks:
    icon = "✅" if ok else "⚠️ "
    print(f" {icon} {label} ({detail})")

print("")
# Verdict on qwen3.5 promotion
if not comparable:
    print(" ⚠️ VERDICT: not enough data to compare qwen3.5 with qwen3:14b")
elif q35_ms < q14_ms * 0.9:
    print(" ✅ VERDICT: qwen3.5:35b-a3b qualifies for auto_promote in voice_fast_uk")
elif q35_ms < q14_ms:
    print(" ℹ️ VERDICT: qwen3.5 is faster but not by 10% — keep as 2nd in prefer list")
else:
    print(" ⚠️ VERDICT: qwen3.5 slower than qwen3:14b — move to voice_quality_uk only")
PYEOF
}

# ── Save results ──────────────────────────────────────────────────────────────
# Write all collected results to RESULTS_FILE as a single JSON document.
# Globals:   RESULTS (read), RESULTS_FILE (read) — output path
# Arguments: none (positional arguments are ignored)
# Outputs:   the saved path on stdout; malformed results are reported on stderr
#            and skipped.
save_results() {
  python3 - ${RESULTS[@]+"${RESULTS[@]}"} "$RESULTS_FILE" <<'PYEOF'
import json, sys
from datetime import datetime, timezone

# The last argument is the output path; everything before it is a result.
out_file = sys.argv[-1]
results = []
for arg in sys.argv[1:-1]:
    try:
        results.append(json.loads(arg))
    except Exception as exc:
        print(f" WARN skipping malformed result ({exc}): {arg!r}", file=sys.stderr)

doc = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "scenarios_total": len(results),
    "scenarios_passed": sum(1 for r in results if r.get("status") == "ok"),
    "results": results,
}
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(doc, f, indent=2, ensure_ascii=False)
print(f"\n Saved: {out_file}")
PYEOF
}

# ── Main ──────────────────────────────────────────────────────────────────────
run_scenarios
# Both functions read the global RESULTS array directly.
compute_stats
save_results
echo ""
echo "Done. Rerun: bash ops/voice_latency_audit.sh $BFF_URL $MEMORY_URL"