microdao-daarion/ops/voice_latency_audit.sh

#!/usr/bin/env bash
# voice_latency_audit.sh — Voice pipeline latency audit (12 measurements)
# Usage:
#   bash ops/voice_latency_audit.sh [BFF_URL] [MEMORY_URL]
#   bash ops/voice_latency_audit.sh http://localhost:8002 http://localhost:8000
#
# Scenarios (in execution order):
#   0      Preflight: BFF /api/memory/status and memory-service /health
#   1-3    TTS direct to memory-service: short / medium (Polina) / Ostap
#   4-5    TTS via the BFF proxy: short / medium
#   6-9    Chat, one sentence per model: gemma3 / qwen3:8b / qwen3:14b / qwen3.5:35b-a3b
#   10     Voice health check (GET /voice/health)
#   11     Parallel TTS (2 concurrent requests)
#   12     STT smoke test (1 s silent WAV stub)
# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Service endpoints; the first and second positional arguments override the
# localhost defaults.
BFF_URL=${1:-"http://localhost:8002"}
MEMORY_URL=${2:-"http://localhost:8000"}

# Where the JSON report is written (relative to the current directory). The
# file name carries the start time of this run.
RESULTS_DIR='ops/voice_audit_results'
TS="$(date '+%Y%m%d_%H%M%S')"
RESULTS_FILE="$RESULTS_DIR/audit_$TS.json"

# ANSI colour sequences, stored unexpanded; they are turned into real escape
# characters at print time (printf %b here, echo -e elsewhere in the script).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# Status-line printers: each writes "  <coloured TAG> <message>" to stdout.
# Backslash escapes in the message are expanded as well.
pass() {
    printf '  %b\n' "${GREEN}PASS${NC} $1"
}

fail() {
    printf '  %b\n' "${RED}FAIL${NC} $1"
}

warn() {
    printf '  %b\n' "${YELLOW}WARN${NC} $1"
}

info() {
    printf '  %b\n' "${CYAN}INFO${NC} $1"
}

# Ensure the report directory exists before anything tries to write into it.
mkdir -p "${RESULTS_DIR}"

# Collected measurements: one JSON object (as a string) per scenario.
RESULTS=()

# ── Helper: measure HTTP call ──────────────────────────────────────────────────
# _now_ms — print the current Unix time in whole milliseconds.
#
# Prefers bash's built-in EPOCHREALTIME (bash >= 5) so that reading the clock
# does not fork a process: starting an interpreter for every timestamp adds
# its start-up time to each measured latency. Falls back to python3, and
# finally to second-resolution `date`, so timing degrades instead of failing.
_now_ms() {
    if [[ -n "${EPOCHREALTIME:-}" ]]; then
        # EPOCHREALTIME is "<seconds><sep><6-digit microseconds>" and the
        # separator follows the locale, so strip every non-digit and drop the
        # last three digits (microseconds -> milliseconds).
        local micros="${EPOCHREALTIME//[!0-9]/}"
        printf '%s\n' "${micros%???}"
    elif command -v python3 >/dev/null 2>&1; then
        python3 -c "import time; print(int(time.time()*1000))"
    else
        printf '%s\n' "$(( $(date +%s) * 1000 ))"
    fi
}

# measure SCENARIO METHOD URL [OPTS] [BODY]
#
# Time a single HTTP request and report it on stdout: first a one-line JSON
# record ({"scenario","ms","http","bytes","status"}), then a coloured
# human-readable verdict line.
#
# Arguments:
#   SCENARIO  label copied into both output lines
#   METHOD    "POST_JSON" (15 s timeout) or "GET" (10 s timeout); any other
#             value is reported as HTTP 000 / error without contacting URL
#   URL       request target
#   OPTS      accepted for call compatibility; not used
#   BODY      JSON payload for POST_JSON (default "{}")
#
# Uses _now_ms and the colour variables GREEN, YELLOW, RED and NC.
measure() {
    local scenario="$1" method="$2" url="$3"
    local body="${5:-}"
    local t0 t1 ms http_code size tmp status label color

    # The default is applied separately: "${5:-{}}" parses as "${5:-{}"
    # followed by a literal "}", which appends a stray brace to every
    # caller-supplied body.
    [[ -n "$body" ]] || body='{}'

    tmp=$(mktemp)
    t0=$(_now_ms)

    # curl prints the status via -w even when it exits non-zero (000 when no
    # response was received), so a failure must not append a second code.
    case "$method" in
        POST_JSON)
            http_code=$(curl -sf -w "%{http_code}" -X POST "$url" \
                -H "Content-Type: application/json" \
                -d "$body" -o "$tmp" --max-time 15 2>/dev/null) || true
            ;;
        GET)
            http_code=$(curl -sf -w "%{http_code}" -X GET "$url" \
                -o "$tmp" --max-time 10 2>/dev/null) || true
            ;;
        *)
            http_code="000"
            ;;
    esac

    t1=$(_now_ms)
    ms=$((t1 - t0))

    # Anything that is not a three-digit code (e.g. curl missing) counts as
    # "no response".
    [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"

    size=$(wc -c < "$tmp" | tr -d '[:space:]')
    rm -f "$tmp"

    status="ok"
    if [[ "$http_code" == "000" ]] || (( 10#$http_code >= 400 )); then
        status="error"
    fi

    printf '{"scenario":"%s","ms":%s,"http":"%s","bytes":%s,"status":"%s"}\n' \
        "$scenario" "$ms" "$http_code" "$size" "$status"

    if [[ "$status" == "ok" ]]; then
        if (( ms <= 2500 )); then
            label="FAST"; color="$GREEN"
        elif (( ms <= 6000 )); then
            label="OK"; color="$YELLOW"
        else
            label="SLOW"; color="$RED"
        fi
        echo -e "  ${color}${label}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
    else
        echo -e "  ${RED}FAIL${NC} ${scenario}: HTTP ${http_code} ${ms}ms"
    fi
}

# ── Scenario runner ────────────────────────────────────────────────────────────

# run_scenarios — run the preflight checks and every audit scenario in order.
#
# Each measurement appends one JSON record to the global RESULTS array and
# prints a coloured one-line verdict on stdout. Failed measurements are shown
# as FAIL regardless of how quickly they failed.
#
# Globals read:    BFF_URL, MEMORY_URL, GREEN, YELLOW, RED, NC
# Globals written: RESULTS
# Helpers used:    _now_ms, pass, fail; (re)defines the helper _m
run_scenarios() {
    echo ""
    echo "╔════════════════════════════════════════════╗"
    echo "║   Voice Latency Audit — $(date +%H:%M:%S)          ║"
    echo "╚════════════════════════════════════════════╝"
    echo ""

    # ── 0. Preflight ──────────────────────────────────────────────────────────
    echo "── Preflight ──"
    local bff_ok mem_ok
    bff_ok=$(curl -sf "$BFF_URL/api/memory/status" 2>/dev/null | python3 -c "import json,sys;print(json.load(sys.stdin).get('ok',False))" 2>/dev/null || echo "False")
    mem_ok=$(curl -sf "$MEMORY_URL/health" 2>/dev/null | python3 -c "import json,sys;print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
    # Explicit if/else: with "test && pass || fail", fail would also run
    # whenever pass itself returned non-zero.
    if [ "$bff_ok" = "True" ]; then
        pass "BFF ok ($BFF_URL)"
    else
        fail "BFF unreachable"
    fi
    if [ "$mem_ok" = "healthy" ]; then
        pass "Memory Service ok ($MEMORY_URL)"
    else
        fail "Memory Service: $mem_ok"
    fi

    # ── Helper: measure and collect result ────────────────────────────────────
    # _m SCENARIO METHOD URL [BODY]
    #   METHOD "POST_JSON" posts BODY as JSON (30 s timeout); anything else is
    #   a plain GET (20 s timeout). Appends the JSON record to RESULTS.
    _m() {
        local scenario="$1" method="$2" url="$3" body="${4:-}"
        local t0 t1 ms http_code size tmp status json lbl col
        tmp=$(mktemp)
        t0=$(_now_ms)
        # curl prints the status via -w even when it exits non-zero (000 when
        # no response was received). Appending a fallback code on failure
        # would yield "000000"/"404000", and "000000" is neither equal to
        # "000" nor >= 400, so an outage would be recorded as "ok".
        if [ "$method" = "POST_JSON" ]; then
            http_code=$(curl -sf -w "%{http_code}" -X POST "$url" \
                -H "Content-Type: application/json" \
                -d "$body" -o "$tmp" --max-time 30 2>/dev/null) || true
        else
            http_code=$(curl -sf -w "%{http_code}" "$url" \
                -o "$tmp" --max-time 20 2>/dev/null) || true
        fi
        t1=$(_now_ms); ms=$((t1 - t0))
        # Anything that is not a three-digit code counts as "no response".
        [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"
        size=$(wc -c < "$tmp" | tr -d '[:space:]')
        rm -f "$tmp"
        status="ok"
        if [ "$http_code" = "000" ] || [ "$http_code" -ge 400 ]; then
            status="error"
        fi
        json="{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}"
        RESULTS+=("$json")
        if [ "$status" = "ok" ]; then
            if [ "$ms" -le 2500 ]; then lbl="FAST"; col="$GREEN"
            elif [ "$ms" -le 6000 ]; then lbl="OK  "; col="$YELLOW"
            else lbl="SLOW"; col="$RED"; fi
        else
            lbl="FAIL"; col="$RED"
        fi
        echo -e "  ${col}${lbl}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
    }

    # ── 1. TTS Scenario 1: short text → Polina ────────────────────────────────
    echo ""
    echo "── TTS Scenarios (direct memory-service) ──"
    _m "TTS_short_polina"  "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Привіт! Це тест голосу.","voice":"default"}'
    _m "TTS_medium_polina" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Я Sofiia, головний AI-архітектор DAARION.city. Моя роль — розробка AI-рішень.","voice":"uk-UA-PolinaNeural"}'
    _m "TTS_ostap"         "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Перевірка голосу Остап Нейрал. Технічний тест.","voice":"Ostap"}'

    # ── 2. TTS via BFF proxy ──────────────────────────────────────────────────
    echo ""
    echo "── TTS via BFF proxy ──"
    _m "TTS_bff_short"  "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"Перевірка через BFF.","voice":"default"}'
    _m "TTS_bff_medium" "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"NODA2 voice pipeline: edge-tts 7.2.7, Polina Neural, Ukrainian. OK.","voice":"uk-UA-PolinaNeural"}'

    # ── 3. Chat: per-model LLM latency comparison ────────────────────────────
    echo ""
    echo "── Chat: LLM per-model latency (voice turn = 1 sentence) ──"
    # Shared JSON prefix; each call appends the model name and voice profile.
    local _q='{"message":"Одне речення (max 15 слів): що таке NODA2?","model":'
    _m "CHAT_gemma3"        "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:gemma3:latest\",\"voice_profile\":\"voice_fast_uk\"}"
    _m "CHAT_qwen3_8b"      "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:8b\",\"voice_profile\":\"voice_fast_uk\"}"
    _m "CHAT_qwen3_14b"     "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:14b\",\"voice_profile\":\"voice_fast_uk\"}"
    _m "CHAT_qwen35_35b"    "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3.5:35b-a3b\",\"voice_profile\":\"voice_quality_uk\"}"

    # ── 4. Voice health check ─────────────────────────────────────────────────
    echo ""
    echo "── Voice health (live Polina/Ostap synthesis) ──"
    _m "VOICE_HEALTH_live" "GET" "$MEMORY_URL/voice/health"

    # ── 5. Parallel TTS (simulate 2 concurrent users) ────────────────────────
    echo ""
    echo "── Parallel TTS (2 concurrent) ──"
    local tmp1 tmp2 t0_par t1_par par_ms s1 s2 par_status par_label par_color
    tmp1=$(mktemp)
    tmp2=$(mktemp)
    t0_par=$(_now_ms)
    curl -sf -X POST "$MEMORY_URL/voice/tts" \
        -H "Content-Type: application/json" \
        -d '{"text":"Перший паралельний запит.","voice":"default"}' \
        -o "$tmp1" --max-time 15 &
    curl -sf -X POST "$MEMORY_URL/voice/tts" \
        -H "Content-Type: application/json" \
        -d '{"text":"Другий паралельний запит.","voice":"Ostap"}' \
        -o "$tmp2" --max-time 15 &
    # Barrier: the elapsed time is that of the slower of the two requests.
    wait
    t1_par=$(_now_ms)
    par_ms=$((t1_par - t0_par))
    s1=$(wc -c < "$tmp1" | tr -d '[:space:]')
    s2=$(wc -c < "$tmp2" | tr -d '[:space:]')
    rm -f "$tmp1" "$tmp2"
    # Success means both responses carry more than 1000 bytes of payload.
    if [ "$s1" -gt 1000 ] && [ "$s2" -gt 1000 ]; then
        par_status="ok"
    else
        par_status="error"
    fi
    RESULTS+=("{\"scenario\":\"TTS_parallel_2x\",\"ms\":$par_ms,\"bytes\":$((s1+s2)),\"status\":\"$par_status\"}")
    if [ "$par_status" != "ok" ]; then
        par_label="FAIL"; par_color="$RED"
    elif [ "$par_ms" -le 4000 ]; then
        par_label="FAST"; par_color="$GREEN"
    elif [ "$par_ms" -le 8000 ]; then
        par_label="OK"; par_color="$YELLOW"
    else
        par_label="SLOW"; par_color="$RED"
    fi
    echo -e "  ${par_color}${par_label}${NC} TTS_parallel_2x: ${par_ms}ms (s1=${s1}B s2=${s2}B)"

    # ── 6. STT smoke test (synthetic WAV) ─────────────────────────────────────
    echo ""
    echo "── STT smoke (silent WAV stub) ──"
    # Private temp files instead of fixed names in /tmp, so concurrent runs
    # cannot clobber each other's stub or response.
    local wav_stub stt_body
    wav_stub=$(mktemp)
    stt_body=$(mktemp)
    # Generate a 44-byte WAV header followed by one second of silence; if
    # python3 fails, fall back to a file holding a single newline.
    python3 -c "
import struct, sys
# 44-byte WAV header for 1s silence at 16kHz mono int16
sample_rate = 16000
duration_s = 1
num_samples = sample_rate * duration_s
data_size = num_samples * 2
header = struct.pack('<4sI4s4sIHHIIHH4sI',
    b'RIFF', 36 + data_size, b'WAVE',
    b'fmt ', 16, 1, 1, sample_rate, sample_rate*2, 2, 16,
    b'data', data_size)
sys.stdout.buffer.write(header + bytes(data_size))
" > "$wav_stub" 2>/dev/null || echo "" > "$wav_stub"

    local t0_stt t1_stt stt_ms stt_code stt_status stt_label stt_color
    t0_stt=$(_now_ms)
    stt_code=$(curl -sf -w "%{http_code}" -X POST "$MEMORY_URL/voice/stt" \
        -F "audio=@${wav_stub};filename=audio.wav;type=audio/wav" \
        -F "language=uk" \
        -o "$stt_body" \
        --max-time 30 2>/dev/null) || true
    t1_stt=$(_now_ms)
    stt_ms=$((t1_stt - t0_stt))
    [[ "$stt_code" =~ ^[0-9]{3}$ ]] || stt_code="000"
    if [ "$stt_code" = "200" ]; then
        stt_status="ok"
    else
        stt_status="error"
    fi
    RESULTS+=("{\"scenario\":\"STT_silence_stub\",\"ms\":$stt_ms,\"http\":\"$stt_code\",\"status\":\"$stt_status\"}")
    if [ "$stt_status" != "ok" ]; then
        stt_label="FAIL"; stt_color="$RED"
    elif [ "$stt_ms" -le 2000 ]; then
        stt_label="FAST"; stt_color="$GREEN"
    elif [ "$stt_ms" -le 5000 ]; then
        stt_label="OK"; stt_color="$YELLOW"
    else
        stt_label="SLOW"; stt_color="$RED"
    fi
    echo -e "  ${stt_color}${stt_label}${NC} STT_silence_stub: ${stt_ms}ms (HTTP ${stt_code})"

    rm -f "$wav_stub" "$stt_body"
}

# ── Compute p50/p95 ────────────────────────────────────────────────────────────
# compute_stats [JSON_RECORD...]
#
# Print the latency summary, per-model comparison, pass/fail count, SLO checks
# and the qwen3.5 promotion verdict for the given measurement records. Each
# argument is one JSON object as produced by run_scenarios; with no arguments
# the global RESULTS array is used. Arguments that are not JSON objects are
# ignored. Requires python3.
compute_stats() {
    if (( $# == 0 )); then
        # ${arr[@]+...} keeps an empty array from tripping "set -u" on older bash.
        set -- ${RESULTS[@]+"${RESULTS[@]}"}
    fi
    # The report contains non-ASCII characters; pin the output encoding so it
    # does not depend on the caller's locale.
    PYTHONIOENCODING=utf-8 python3 - "$@" <<'PYEOF'
import json, sys

# Keep only arguments that parse as JSON objects; anything else is skipped.
results = []
for arg in sys.argv[1:]:
    try:
        rec = json.loads(arg)
    except Exception:
        continue
    if isinstance(rec, dict):
        results.append(rec)

if not results:
    print("No results to analyze.")
    sys.exit(0)

def ok_ms(match):
    """Latencies of successful records whose scenario name satisfies match()."""
    return [r["ms"] for r in results
            if r.get("status") == "ok" and "ms" in r
            and match(str(r.get("scenario", "")))]

tts_ms = ok_ms(lambda name: "TTS" in name)
stt_ms = ok_ms(lambda name: "STT" in name)
# per-model chat latency
scenario_by_model = {
    "gemma3":      "CHAT_gemma3",
    "qwen3:8b":    "CHAT_qwen3_8b",
    "qwen3:14b":   "CHAT_qwen3_14b",
    "qwen3.5:35b": "CHAT_qwen35_35b",
}
model_map = {model: ok_ms(lambda name, want=scen: name == want)
             for model, scen in scenario_by_model.items()}
all_chat_ms = [ms for vals in model_map.values() for ms in vals]

def p50_of(vals):
    s = sorted(vals)
    return s[len(s)//2]

def p95_of(vals):
    s = sorted(vals)
    return s[int(len(s)*0.95)]

def fmt_ms(value):
    return f"{value}ms" if value is not None else "no data"

def stats(label, vals):
    if not vals:
        print(f"  {label}: no data")
        return
    print(f"  {label}: p50={p50_of(vals)}ms p95={p95_of(vals)}ms min={min(vals)}ms max={max(vals)}ms n={len(vals)}")

print("")
print("── Latency Summary ──")
stats("TTS (memory-service + proxy)", tts_ms)
stats("Chat (all models)", all_chat_ms)
stats("STT (memory-service)", stt_ms)

print("")
print("── LLM Model Comparison (voice: 1 sentence) ──")
# Models without a successful measurement sort last and are never ranked,
# so a run where every chat call failed does not crown a "fastest" model.
rows = []
for model, vals in model_map.items():
    if vals:
        rows.append((p50_of(vals), model, vals[0]))
    else:
        rows.append((float("inf"), model, None))
rows.sort(key=lambda row: (row[0], row[1]))
fastest = rows[0][1] if rows and rows[0][2] is not None else None
medals = {1: "🥇", 2: "🥈", 3: "🥉"}
for rank, (p50, model, ms) in enumerate(rows, 1):
    icon = medals.get(rank, "  ") if ms is not None else "  "
    note = " ← fastest (voice_fast default)" if model == fastest else ""
    print(f"  {icon} {model:20s} {fmt_ms(ms)}{note}")

# Bottleneck analysis
all_ok  = [r for r in results if r.get("status") == "ok"]
all_err = [r for r in results if r.get("status") != "ok"]
print(f"  Passed: {len(all_ok)}/{len(results)} scenarios")
if all_err:
    print(f"  Failed: {[r.get('scenario') for r in all_err]}")

# SLO check
print("")
print("── SLO Check (voice_fast_uk) ──")
# Missing measurements stay None instead of a numeric sentinel: a sentinel
# takes part in the comparisons below and yields verdicts about models that
# were never actually timed.
tts_p95 = p95_of(tts_ms) if tts_ms else None
q14_ms  = model_map["qwen3:14b"][0]   if model_map["qwen3:14b"]   else None
q35_ms  = model_map["qwen3.5:35b"][0] if model_map["qwen3.5:35b"] else None
# Same candidate set as the comparison table above (all measured models).
timed   = [vals[0] for vals in model_map.values() if vals]
fast_ms = min(timed) if timed else None
both    = q14_ms is not None and q35_ms is not None
ratio   = f"ratio={round(q35_ms/max(q14_ms,1),2)}" if both else "ratio=no data"

checks = [
    ("TTS p95 ≤ 2500ms",            tts_p95 is not None and tts_p95 <= 2500, f"actual={fmt_ms(tts_p95)}"),
    ("Fastest model ≤ 9000ms",      fast_ms is not None and fast_ms <= 9000, f"actual={fmt_ms(fast_ms)}"),
    ("qwen3:14b ≤ 12000ms",         q14_ms is not None and q14_ms <= 12000,  f"actual={fmt_ms(q14_ms)}"),
    ("qwen3.5 faster than 14b",     both and q35_ms < q14_ms,                f"35b={fmt_ms(q35_ms)} vs 14b={fmt_ms(q14_ms)}"),
    ("qwen3.5 auto-promote worthy", both and q35_ms < q14_ms * 0.9,          f"{ratio} (need <0.9)"),
]
for label, ok, detail in checks:
    icon = "✅" if ok else "⚠️ "
    print(f"  {icon} {label} ({detail})")

print("")
# Verdict on qwen3.5 promotion
if not both:
    missing = [name for name, value in (("qwen3:14b", q14_ms), ("qwen3.5:35b", q35_ms)) if value is None]
    print(f"  ⚠️  VERDICT: no latency data for {', '.join(missing)} — cannot evaluate qwen3.5 promotion")
elif q35_ms < q14_ms * 0.9:
    print("  ✅ VERDICT: qwen3.5:35b-a3b qualifies for auto_promote in voice_fast_uk")
elif q35_ms < q14_ms:
    print("  ℹ️  VERDICT: qwen3.5 is faster but not by 10% — keep as 2nd in prefer list")
else:
    print("  ⚠️  VERDICT: qwen3.5 slower than qwen3:14b — move to voice_quality_uk only")

PYEOF
}

# ── Save results ──────────────────────────────────────────────────────────────
# save_results [JSON_RECORD...]
#
# Write the measurement records to $RESULTS_FILE as one JSON document with a
# UTC timestamp, the total and passed scenario counts, and the records
# themselves, then print the path that was written. Each argument is one JSON
# object; with no arguments the global RESULTS array is used. Arguments that
# are not JSON objects are ignored. Requires python3.
save_results() {
    if (( $# == 0 )); then
        # ${arr[@]+...} keeps an empty array from tripping "set -u" on older bash.
        set -- ${RESULTS[@]+"${RESULTS[@]}"}
    fi
    # The output path goes first so the record list may be empty or of any length.
    python3 - "$RESULTS_FILE" "$@" <<'PYEOF'
import json, sys
from datetime import datetime, timezone

out_file = sys.argv[1]
results = []
for arg in sys.argv[2:]:
    try:
        rec = json.loads(arg)
    except Exception:
        continue
    if isinstance(rec, dict):
        results.append(rec)

doc = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "scenarios_total": len(results),
    "scenarios_passed": sum(1 for r in results if r.get("status") == "ok"),
    "results": results,
}
# ensure_ascii=False may emit non-ASCII text, so the file encoding is fixed
# rather than taken from the locale.
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(doc, f, indent=2, ensure_ascii=False)
print(f"\n  Saved: {out_file}")
PYEOF
}

# ── Main ──────────────────────────────────────────────────────────────────────
# main — run every scenario, print the statistics, write the JSON report, and
# finish with the command line that repeats this run against the same
# endpoints.
main() {
    run_scenarios
    compute_stats "${RESULTS[@]}"
    save_results "${RESULTS[@]}"
    printf '\n'
    printf '%s\n' "Done. Rerun: bash ops/voice_latency_audit.sh $BFF_URL $MEMORY_URL"
}

main