Files
microdao-daarion/ops/voice_latency_audit.sh
Apple 67225a39fa docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00

342 lines
16 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
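# voice_latency_audit.sh — Voice pipeline latency audit (12 timed scenarios)
# Usage:
#   bash ops/voice_latency_audit.sh [BFF_URL] [MEMORY_URL]
#   bash ops/voice_latency_audit.sh http://localhost:8002 http://localhost:8000
#
# Scenarios (in execution order, after a BFF / memory-service preflight):
#   1-3   TTS direct to memory-service: short (default voice), medium (Polina), Ostap
#   4-5   TTS via BFF proxy: short / medium
#   6-9   Chat latency per model: gemma3 / qwen3:8b / qwen3:14b / qwen3.5:35b-a3b
#   10    edge-tts voice health check (GET /voice/health; Polina + Ostap live)
#   11    High-load simulation: 2 parallel TTS requests
#   12    STT smoke test with a 1 s silent WAV stub
# Not implemented in this script (previously listed here): glm-4.7-flash run,
# TTS + STT roundtrip, cloud fallback simulation (ollama unavailable).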
# Strict mode: abort on errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# Service endpoints; both can be overridden positionally ($1 = BFF, $2 = memory-service).
BFF_URL=${1:-http://localhost:8002}
MEMORY_URL=${2:-http://localhost:8000}

# One timestamped JSON report is written per run.
RESULTS_DIR='ops/voice_audit_results'
TS="$(date '+%Y%m%d_%H%M%S')"
RESULTS_FILE="${RESULTS_DIR}/audit_${TS}.json"

# ANSI colour codes, stored as literal backslash sequences that the
# escape-interpreting print helpers expand at output time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
NC='\033[0m'
# Print a green "PASS" tag followed by the message in $1.
pass() {
  printf '%b\n' " ${GREEN}PASS${NC} $1"
}
# Print a red "FAIL" tag followed by the message in $1.
fail() {
  printf '%b\n' " ${RED}FAIL${NC} $1"
}
# Print a yellow "WARN" tag followed by the message in $1.
warn() {
  printf '%b\n' " ${YELLOW}WARN${NC} $1"
}
# Print a cyan "INFO" tag followed by the message in $1.
info() {
  printf '%b\n' " ${CYAN}INFO${NC} $1"
}
# Make sure the report directory exists before any scenario runs.
[[ -d "$RESULTS_DIR" ]] || mkdir -p "$RESULTS_DIR"

# Collects one JSON record (as a string) per executed scenario.
RESULTS=()
# ── Helper: measure HTTP call ──────────────────────────────────────────────────
# Print the current wall-clock time as integer milliseconds since the Unix epoch.
#
# On bash >= 5 the built-in EPOCHREALTIME variable is used, so taking a
# timestamp does not spawn a process and adds no interpreter start-up time to
# the latencies measured between two calls. Older shells fall back to python3.
# Outputs: the timestamp on stdout.
_now_ms() {
  local raw="${EPOCHREALTIME:-}"
  # EPOCHREALTIME is "<seconds><sep><6-digit microseconds>"; the separator
  # follows the locale, so accept both '.' and ','.
  if [[ "$raw" =~ ^([0-9]+)[.,]([0-9]{6})$ ]]; then
    # 10# forces base 10 so a value can never be misread as octal.
    echo $(( 10#${BASH_REMATCH[1]}${BASH_REMATCH[2]} / 1000 ))
  else
    python3 -c "import time; print(int(time.time()*1000))"
  fi
}
# Time a single HTTP request and report the outcome.
#
# Arguments:
#   $1 scenario - label used in the output
#   $2 method   - "POST_JSON" (POST with a JSON body) or "GET"; any other
#                 value is reported as a failed request (HTTP 000)
#   $3 url      - request URL
#   $4 opts     - accepted for backward compatibility; not used
#   $5 body     - JSON payload for POST_JSON (defaults to "{}")
# Globals:   GREEN, YELLOW, RED, NC (read); relies on _now_ms
# Outputs:   one JSON record line, then one coloured human-readable line
measure() {
  local scenario="$1" method="$2" url="$3"
  # NOTE: "${5:-{}}" would parse as "${5:-{}" followed by a literal "}",
  # appending a stray brace to every explicitly supplied body.
  local body="${5:-}"
  [[ -n "$body" ]] || body='{}'
  local t0 t1 ms http_code size tmp status label color

  tmp=$(mktemp)
  t0=$(_now_ms)
  # curl prints %{http_code} even when it exits non-zero (e.g. "000" on a
  # connection error, "404" with -f), so the exit status is ignored here
  # instead of appending a second fallback code to the captured output.
  case "$method" in
    POST_JSON)
      http_code=$(curl -sf -w '%{http_code}' -X POST "$url" \
        -H 'Content-Type: application/json' \
        -d "$body" -o "$tmp" --max-time 15 2>/dev/null) || true
      ;;
    GET)
      http_code=$(curl -sf -w '%{http_code}' -X GET "$url" \
        -o "$tmp" --max-time 10 2>/dev/null) || true
      ;;
    *)
      http_code="000"
      ;;
  esac
  t1=$(_now_ms)
  ms=$((t1 - t0))

  # Anything that is not a three-digit status (e.g. curl missing) counts as "no response".
  [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"

  # wc -c is portable across GNU and BSD userlands; $(( )) strips BSD padding.
  size=$(wc -c <"$tmp" 2>/dev/null) || size=0
  size=$(( size ))
  rm -f "$tmp"

  status="ok"
  if [[ "$http_code" == "000" ]] || (( 10#$http_code >= 400 )); then
    status="error"
  fi

  printf '{"scenario":"%s","ms":%s,"http":"%s","bytes":%s,"status":"%s"}\n' \
    "$scenario" "$ms" "$http_code" "$size" "$status"

  if [[ "$status" == "ok" ]]; then
    if (( ms <= 2500 )); then
      label="FAST"; color="$GREEN"
    elif (( ms <= 6000 )); then
      label="OK"; color="$YELLOW"
    else
      label="SLOW"; color="$RED"
    fi
    echo -e " ${color}${label}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
  else
    echo -e " ${RED}FAIL${NC} ${scenario}: HTTP ${http_code} ${ms}ms"
  fi
}
# ── Scenario runner ────────────────────────────────────────────────────────────
# Print the size of file $1 in bytes (0 if it cannot be read).
_file_size() {
  local n
  n=$(wc -c 2>/dev/null <"$1") || n=0
  echo $(( n ))  # arithmetic expansion strips the padding BSD wc emits
}

# Run one timed HTTP request, append its JSON record to the global RESULTS
# array and print a coloured summary line.
#
# Arguments:
#   $1 scenario - label stored in the record
#   $2 method   - "POST_JSON" sends $4 as a JSON POST; anything else is a plain GET
#   $3 url      - request URL
#   $4 body     - JSON payload (POST_JSON only)
# Globals: RESULTS (appended); GREEN, YELLOW, RED, NC (read); relies on _now_ms
_m() {
  local scenario="$1" method="$2" url="$3" body="${4:-}"
  local t0 t1 ms http_code size tmp status lbl col

  tmp=$(mktemp)
  t0=$(_now_ms)
  # curl prints %{http_code} even when it fails ("000" on connection errors,
  # the real status with -f), so its exit status is ignored rather than
  # appending a second fallback code to the captured output.
  if [[ "$method" == "POST_JSON" ]]; then
    http_code=$(curl -sf -w '%{http_code}' -X POST "$url" \
      -H 'Content-Type: application/json' \
      -d "$body" -o "$tmp" --max-time 30 2>/dev/null) || true
  else
    http_code=$(curl -sf -w '%{http_code}' "$url" \
      -o "$tmp" --max-time 20 2>/dev/null) || true
  fi
  t1=$(_now_ms)
  ms=$((t1 - t0))
  [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"
  size=$(_file_size "$tmp")
  rm -f "$tmp"

  status="ok"
  if [[ "$http_code" == "000" ]] || (( 10#$http_code >= 400 )); then
    status="error"
  fi
  RESULTS+=("{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}")

  if [[ "$status" != "ok" ]]; then
    lbl="FAIL"; col="$RED"
  elif (( ms <= 2500 )); then
    lbl="FAST"; col="$GREEN"
  elif (( ms <= 6000 )); then
    lbl="OK "; col="$YELLOW"
  else
    lbl="SLOW"; col="$RED"
  fi
  echo -e " ${col}${lbl}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
}

# Execute the whole audit: preflight checks, TTS (direct and via BFF), chat
# per model, voice health, two parallel TTS requests and an STT smoke test.
# Every timed scenario appends one JSON record to the global RESULTS array.
# Globals: BFF_URL, MEMORY_URL, colour variables (read); RESULTS (appended)
run_scenarios() {
  echo ""
  echo "╔════════════════════════════════════════════╗"
  echo "║ Voice Latency Audit — $(date +%H:%M:%S)"
  echo "╚════════════════════════════════════════════╝"
  echo ""

  # ── 0. Preflight ──────────────────────────────────────────────────────────
  echo "── Preflight ──"
  local bff_ok mem_ok
  bff_ok=$(curl -sf "$BFF_URL/api/memory/status" 2>/dev/null \
    | python3 -c "import json,sys;print(json.load(sys.stdin).get('ok',False))" 2>/dev/null \
    || echo "False")
  mem_ok=$(curl -sf "$MEMORY_URL/health" 2>/dev/null \
    | python3 -c "import json,sys;print(json.load(sys.stdin).get('status','error'))" 2>/dev/null \
    || echo "error")
  if [[ "$bff_ok" == "True" ]]; then
    pass "BFF ok ($BFF_URL)"
  else
    fail "BFF unreachable"
  fi
  if [[ "$mem_ok" == "healthy" ]]; then
    pass "Memory Service ok ($MEMORY_URL)"
  else
    fail "Memory Service: $mem_ok"
  fi

  # ── 1. TTS direct to memory-service ───────────────────────────────────────
  echo ""
  echo "── TTS Scenarios (direct memory-service) ──"
  _m "TTS_short_polina" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Привіт! Це тест голосу.","voice":"default"}'
  _m "TTS_medium_polina" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Я Sofiia, головний AI-архітектор DAARION.city. Моя роль — розробка AI-рішень.","voice":"uk-UA-PolinaNeural"}'
  _m "TTS_ostap" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Перевірка голосу Остап Нейрал. Технічний тест.","voice":"Ostap"}'

  # ── 2. TTS via BFF proxy ──────────────────────────────────────────────────
  echo ""
  echo "── TTS via BFF proxy ──"
  _m "TTS_bff_short" "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"Перевірка через BFF.","voice":"default"}'
  _m "TTS_bff_medium" "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"NODA2 voice pipeline: edge-tts 7.2.7, Polina Neural, Ukrainian. OK.","voice":"uk-UA-PolinaNeural"}'

  # ── 3. Chat: per-model LLM latency comparison ─────────────────────────────
  echo ""
  echo "── Chat: LLM per-model latency (voice turn = 1 sentence) ──"
  # Shared JSON prefix; each call appends the model name and voice profile.
  local _q='{"message":"Одне речення (max 15 слів): що таке NODA2?","model":'
  _m "CHAT_gemma3" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:gemma3:latest\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen3_8b" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:8b\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen3_14b" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:14b\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen35_35b" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3.5:35b-a3b\",\"voice_profile\":\"voice_quality_uk\"}"

  # ── 4. Voice health check ─────────────────────────────────────────────────
  echo ""
  echo "── Voice health (live Polina/Ostap synthesis) ──"
  _m "VOICE_HEALTH_live" "GET" "$MEMORY_URL/voice/health"

  # ── 5. Parallel TTS (simulate 2 concurrent users) ─────────────────────────
  echo ""
  echo "── Parallel TTS (2 concurrent) ──"
  local tmp1 tmp2 t0_par t1_par par_ms s1 s2 par_status par_label par_color
  tmp1=$(mktemp)
  tmp2=$(mktemp)
  t0_par=$(_now_ms)
  curl -sf -X POST "$MEMORY_URL/voice/tts" \
    -H "Content-Type: application/json" \
    -d '{"text":"Перший паралельний запит.","voice":"default"}' \
    -o "$tmp1" --max-time 15 &
  curl -sf -X POST "$MEMORY_URL/voice/tts" \
    -H "Content-Type: application/json" \
    -d '{"text":"Другий паралельний запит.","voice":"Ostap"}' \
    -o "$tmp2" --max-time 15 &
  wait
  t1_par=$(_now_ms)
  par_ms=$((t1_par - t0_par))
  s1=$(_file_size "$tmp1")
  s2=$(_file_size "$tmp2")
  rm -f "$tmp1" "$tmp2"
  # Success is judged by payload size: both responses must exceed 1000 bytes.
  if (( s1 > 1000 && s2 > 1000 )); then
    par_status="ok"
  else
    par_status="error"
  fi
  RESULTS+=("{\"scenario\":\"TTS_parallel_2x\",\"ms\":$par_ms,\"bytes\":$((s1 + s2)),\"status\":\"$par_status\"}")
  if [[ "$par_status" != "ok" ]]; then
    par_label="FAIL"; par_color="$RED"
  elif (( par_ms <= 4000 )); then
    par_label="FAST"; par_color="$GREEN"
  elif (( par_ms <= 8000 )); then
    par_label="OK"; par_color="$YELLOW"
  else
    par_label="SLOW"; par_color="$RED"
  fi
  echo -e " ${par_color}${par_label}${NC} TTS_parallel_2x: ${par_ms}ms (s1=${s1}B s2=${s2}B)"

  # ── 6. STT smoke test (synthetic WAV) ─────────────────────────────────────
  echo ""
  echo "── STT smoke (silent WAV stub) ──"
  local stub_wav stt_out t0_stt t1_stt stt_ms stt_code stt_status stt_label stt_color
  # Unpredictable temp names instead of fixed paths under /tmp.
  stub_wav=$(mktemp)
  stt_out=$(mktemp)
  # 44-byte WAV header followed by 1 s of silence (16 kHz, mono, int16).
  python3 -c "
import struct, sys
sample_rate = 16000
duration_s = 1
num_samples = sample_rate * duration_s
data_size = num_samples * 2
header = struct.pack('<4sI4s4sIHHIIHH4sI',
    b'RIFF', 36 + data_size, b'WAVE',
    b'fmt ', 16, 1, 1, sample_rate, sample_rate*2, 2, 16,
    b'data', data_size)
sys.stdout.buffer.write(header + bytes(data_size))
" > "$stub_wav" 2>/dev/null || echo "" > "$stub_wav"
  t0_stt=$(_now_ms)
  stt_code=$(curl -sf -w '%{http_code}' -X POST "$MEMORY_URL/voice/stt" \
    -F "audio=@${stub_wav};filename=audio.wav;type=audio/wav" \
    -F "language=uk" \
    -o "$stt_out" \
    --max-time 30 2>/dev/null) || true
  t1_stt=$(_now_ms)
  stt_ms=$((t1_stt - t0_stt))
  [[ "$stt_code" =~ ^[0-9]{3}$ ]] || stt_code="000"
  if [[ "$stt_code" == "200" ]]; then
    stt_status="ok"
  else
    stt_status="error"
  fi
  RESULTS+=("{\"scenario\":\"STT_silence_stub\",\"ms\":$stt_ms,\"http\":\"$stt_code\",\"status\":\"$stt_status\"}")
  if [[ "$stt_status" != "ok" ]]; then
    stt_label="FAIL"; stt_color="$RED"
  elif (( stt_ms <= 2000 )); then
    stt_label="FAST"; stt_color="$GREEN"
  elif (( stt_ms <= 5000 )); then
    stt_label="OK"; stt_color="$YELLOW"
  else
    stt_label="SLOW"; stt_color="$RED"
  fi
  echo -e " ${stt_color}${stt_label}${NC} STT_silence_stub: ${stt_ms}ms (HTTP ${stt_code})"
  rm -f "$stub_wav" "$stt_out"
}
# ── Compute p50/p95 ────────────────────────────────────────────────────────────
# Print a latency summary (p50/p95 per category), a per-model chat comparison,
# pass/fail counts and the voice_fast_uk SLO checks for the records collected
# in the global RESULTS array.
#
# Positional arguments are ignored; the records are always read from RESULTS.
# Models or categories without a successful run are reported as "no data"
# instead of being compared through a sentinel value.
compute_stats() {
  # ${arr[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
  python3 - ${RESULTS[@]+"${RESULTS[@]}"} <<'PYEOF'
import json
import sys

results = []
for arg in sys.argv[1:]:
    try:
        record = json.loads(arg)
    except ValueError:
        # Best-effort: one malformed record must not abort the whole summary.
        print(f" WARN skipping malformed result: {arg!r}", file=sys.stderr)
        continue
    if isinstance(record, dict):
        results.append(record)

if not results:
    print("No results to analyze.")
    sys.exit(0)


def ok_ms(match):
    """Latencies (ms) of successful records whose scenario name satisfies match()."""
    return [
        r["ms"]
        for r in results
        if r.get("status") == "ok" and match(r.get("scenario", ""))
    ]


def p50(sorted_vals):
    return sorted_vals[len(sorted_vals) // 2]


def p95(sorted_vals):
    if len(sorted_vals) > 1:
        return sorted_vals[int(len(sorted_vals) * 0.95)]
    return sorted_vals[-1]


tts_ms = ok_ms(lambda name: "TTS" in name)
stt_ms = ok_ms(lambda name: "STT" in name)

# Per-model chat latency, keyed by display name.
chat_scenarios = {
    "gemma3": "CHAT_gemma3",
    "qwen3:8b": "CHAT_qwen3_8b",
    "qwen3:14b": "CHAT_qwen3_14b",
    "qwen3.5:35b": "CHAT_qwen35_35b",
}
model_map = {
    model: ok_ms(lambda name, wanted=scenario: name == wanted)
    for model, scenario in chat_scenarios.items()
}
all_chat_ms = [ms for vals in model_map.values() for ms in vals]


def stats(label, vals):
    if not vals:
        print(f" {label}: no data")
        return
    s = sorted(vals)
    print(f" {label}: p50={p50(s)}ms p95={p95(s)}ms min={s[0]}ms max={s[-1]}ms n={len(s)}")


print("")
print("── Latency Summary ──")
stats("TTS (memory-service + proxy)", tts_ms)
stats("Chat (all models)", all_chat_ms)
stats("STT (memory-service)", stt_ms)

print("")
print("── LLM Model Comparison (voice: 1 sentence) ──")
rows = []
for model, vals in model_map.items():
    if vals:
        rows.append((p50(sorted(vals)), model, vals[0]))
    else:
        rows.append((None, model, None))
# Models with data first (fastest on top), models without data last.
rows.sort(key=lambda row: (row[0] is None, row[0] or 0, row[1]))
# Only a model that actually produced a result can be called the fastest.
fastest = rows[0][1] if rows and rows[0][0] is not None else None
medals = {1: "🥇", 2: "🥈", 3: "🥉"}
for rank, (model_p50, model, ms) in enumerate(rows, 1):
    icon = medals.get(rank, " ") if model_p50 is not None else " "
    note = " ← fastest (voice_fast default)" if model == fastest else ""
    ms_str = f"{ms}ms" if ms is not None else "no data"
    print(f" {icon} {model:20s} {ms_str}{note}")

# Bottleneck analysis
all_ok = [r for r in results if r.get("status") == "ok"]
all_err = [r for r in results if r.get("status") != "ok"]
print(f" Passed: {len(all_ok)}/{len(results)} scenarios")
if all_err:
    print(f" Failed: {[r.get('scenario') for r in all_err]}")

# SLO check
print("")
print("── SLO Check (voice_fast_uk) ──")


def first_ms(model):
    vals = model_map[model]
    return vals[0] if vals else None


tts_p95 = p95(sorted(tts_ms)) if tts_ms else None
q14_ms = first_ms("qwen3:14b")
q35_ms = first_ms("qwen3.5:35b")
g3_ms = first_ms("gemma3")
measured = [v for v in (q14_ms, q35_ms, g3_ms) if v is not None]
fast_ms = min(measured) if measured else None


def check(label, needed, passed, detail):
    """Evaluate one SLO; it fails with 'no data' when an input is missing."""
    if any(v is None for v in needed):
        return (label, False, "no data")
    return (label, passed(), detail())


checks = [
    check("TTS p95 ≤ 2500ms", [tts_p95],
          lambda: tts_p95 <= 2500, lambda: f"actual={tts_p95}ms"),
    check("Fastest model ≤ 9000ms", [fast_ms],
          lambda: fast_ms <= 9000, lambda: f"actual={fast_ms}ms"),
    check("qwen3:14b ≤ 12000ms", [q14_ms],
          lambda: q14_ms <= 12000, lambda: f"actual={q14_ms}ms"),
    check("qwen3.5 faster than 14b", [q35_ms, q14_ms],
          lambda: q35_ms < q14_ms, lambda: f"35b={q35_ms}ms vs 14b={q14_ms}ms"),
    check("qwen3.5 auto-promote worthy", [q35_ms, q14_ms],
          lambda: q35_ms < q14_ms * 0.9,
          lambda: f"ratio={round(q35_ms/max(q14_ms,1),2)} (need <0.9)"),
]
for label, ok, detail in checks:
    icon = "✅" if ok else "⚠️ "
    print(f" {icon} {label} ({detail})")
print("")

# Verdict on qwen3.5 promotion — only meaningful when both models were measured.
if q35_ms is None or q14_ms is None:
    print(" ⚠️ VERDICT: skipped — no successful run for qwen3:14b and/or qwen3.5:35b-a3b")
elif q35_ms < q14_ms * 0.9:
    print(" ✅ VERDICT: qwen3.5:35b-a3b qualifies for auto_promote in voice_fast_uk")
elif q35_ms < q14_ms:
    print(" VERDICT: qwen3.5 is faster but not by 10% — keep as 2nd in prefer list")
else:
    print(" ⚠️ VERDICT: qwen3.5 slower than qwen3:14b — move to voice_quality_uk only")
PYEOF
}
# ── Save results ──────────────────────────────────────────────────────────────
# Write the records collected in the global RESULTS array to RESULTS_FILE as a
# JSON report with a UTC timestamp and total / passed scenario counts.
#
# Positional arguments are ignored; input comes from RESULTS and RESULTS_FILE.
# The parent directory of RESULTS_FILE is created when missing.
save_results() {
  # ${arr[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
  python3 - ${RESULTS[@]+"${RESULTS[@]}"} "$RESULTS_FILE" <<'PYEOF'
import json
import os
import sys
from datetime import datetime, timezone

# The last argument is the output path; everything before it is one JSON record.
*raw_records, out_file = sys.argv[1:]

results = []
for arg in raw_records:
    try:
        record = json.loads(arg)
    except ValueError:
        # Best-effort: keep saving the remaining records.
        print(f" WARN skipping malformed result: {arg!r}", file=sys.stderr)
        continue
    if isinstance(record, dict):
        results.append(record)

doc = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "scenarios_total": len(results),
    "scenarios_passed": sum(1 for r in results if r.get("status") == "ok"),
    "results": results,
}

parent = os.path.dirname(out_file)
if parent:
    os.makedirs(parent, exist_ok=True)
# ensure_ascii=False emits raw non-ASCII text, so the encoding is pinned to UTF-8.
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(doc, f, indent=2, ensure_ascii=False)
print(f"\n Saved: {out_file}")
PYEOF
}
# ── Main ──────────────────────────────────────────────────────────────────────
# Entry point: run every scenario, print the statistics, persist the JSON
# report and finally show the command that repeats this run with the same
# endpoints.
main() {
  run_scenarios
  compute_stats "${RESULTS[@]}"
  save_results "${RESULTS[@]}"
  printf '\n'
  printf '%s\n' "Done. Rerun: bash ops/voice_latency_audit.sh ${BFF_URL} ${MEMORY_URL}"
}

main