Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout.
Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy.
Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot.
Made-with: Cursor
342 lines
16 KiB
Bash
Executable File
342 lines
16 KiB
Bash
Executable File
#!/usr/bin/env bash
# voice_latency_audit.sh — Voice pipeline latency audit (12 measurements)
#
# Usage:
#   bash ops/voice_latency_audit.sh [BFF_URL] [MEMORY_URL]
#   bash ops/voice_latency_audit.sh http://localhost:8002 http://localhost:8000
#
# Requires: bash, curl, python3.
#
# Scenarios (one recorded result each):
#   1-3   TTS direct to the memory service (short / medium / Ostap voice)
#   4-5   TTS through the BFF proxy (short / medium)
#   6-9   Chat latency per model: gemma3, qwen3:8b, qwen3:14b, qwen3.5:35b-a3b
#   10    Voice health endpoint (GET /voice/health)
#   11    Parallel TTS (2 concurrent requests)
#   12    STT smoke test (1 s silent WAV stub)
set -euo pipefail

# Service endpoints: first/second positional argument, else the local defaults.
BFF_URL="${1:-http://localhost:8002}"
MEMORY_URL="${2:-http://localhost:8000}"

# Results land in a path relative to the current working directory, in one
# timestamped JSON file per run.
RESULTS_DIR="ops/voice_audit_results"
TS=$(date +%Y%m%d_%H%M%S)
RESULTS_FILE="${RESULTS_DIR}/audit_${TS}.json"

# ANSI colour sequences, stored as backslash escapes; they are expanded at
# print time by `printf '%b'` / `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# Status loggers: print a coloured tag followed by the message in $1.
pass() { printf '%b\n' " ${GREEN}PASS${NC} $1"; }
fail() { printf '%b\n' " ${RED}FAIL${NC} $1"; }
warn() { printf '%b\n' " ${YELLOW}WARN${NC} $1"; }
info() { printf '%b\n' " ${CYAN}INFO${NC} $1"; }

# Make sure the output directory exists before any scenario runs.
mkdir -p -- "$RESULTS_DIR"

# One JSON object (as a string) is appended per measured scenario.
declare -a RESULTS=()

# ── Helper: measure HTTP call ──────────────────────────────────────────────────

# Print the current wall-clock time as whole milliseconds since the Unix epoch.
#
# Uses bash's EPOCHREALTIME when the running shell provides it, so no process
# is spawned between the two timestamps of a measurement; otherwise falls back
# to python3, whose start-up time then becomes part of every measured interval.
_now_ms() {
  if [[ -n "${EPOCHREALTIME:-}" ]]; then
    # EPOCHREALTIME is "<seconds><separator><6 digits>"; the separator depends
    # on the locale, so drop it to obtain microseconds.
    local usec="${EPOCHREALTIME/[.,]/}"
    printf '%s\n' "$((usec / 1000))"
  else
    python3 -c "import time; print(int(time.time()*1000))"
  fi
}

#######################################
# Time a single HTTP request and report it on stdout.
# Globals:
#   GREEN, YELLOW, RED, NC (read); _now_ms (called)
# Arguments:
#   $1 - scenario label used in the report
#   $2 - request kind: "POST_JSON" or "GET"; any other value is reported as
#        HTTP 000 / error without sending a request
#   $3 - URL
#   $4 - accepted for backward compatibility and ignored
#   $5 - JSON body for POST_JSON (default: {})
# Outputs:
#   line 1: JSON object with scenario, ms, http, bytes, status
#   line 2: coloured one-line summary (FAST <= 2500 ms, OK <= 6000 ms, else SLOW;
#           FAIL when the request did not succeed)
#######################################
measure() {
  local scenario="$1" method="$2" url="$3"
  local t0 t1 ms http_code size tmp
  local status="ok" label color

  tmp=$(mktemp)
  t0=$(_now_ms)

  case "$method" in
    POST_JSON)
      # Deliberately not "${5:-{}}": that expansion ends at the first "}" and
      # appends a stray "}" to every caller-supplied body.
      local body="${5:-}"
      [[ -n "$body" ]] || body='{}'
      http_code=$(curl -sf -w '%{http_code}' -X POST "$url" \
        -H "Content-Type: application/json" \
        -d "$body" -o "$tmp" --max-time 15 2>/dev/null) || true
      ;;
    GET)
      http_code=$(curl -sf -w '%{http_code}' -X GET "$url" \
        -o "$tmp" --max-time 10 2>/dev/null) || true
      ;;
    *)
      http_code="000"
      ;;
  esac

  t1=$(_now_ms)
  ms=$((t1 - t0))

  # curl prints the write-out value even when it exits non-zero, so the code
  # is taken from its output alone; anything that is not three digits (curl
  # missing, no output) is normalised to 000.
  [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"

  # BSD stat first, then GNU stat.
  size=$(stat -f%z "$tmp" 2>/dev/null || stat -c%s "$tmp" 2>/dev/null || echo 0)
  rm -f -- "$tmp"

  if [[ "$http_code" == "000" ]] || ((10#$http_code >= 400)); then
    status="error"
  fi

  printf '%s\n' "{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}"

  if [[ "$status" != "ok" ]]; then
    echo -e " ${RED}FAIL${NC} ${scenario}: HTTP ${http_code} ${ms}ms"
    return 0
  fi

  if ((ms <= 2500)); then
    label="FAST"
    color="$GREEN"
  elif ((ms <= 6000)); then
    label="OK"
    color="$YELLOW"
  else
    label="SLOW"
    color="$RED"
  fi
  echo -e " ${color}${label}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
}

# ── Scenario runner ────────────────────────────────────────────────────────────

# Print the size of file $1 in bytes (BSD stat first, then GNU stat; 0 if both fail).
_file_bytes() {
  stat -f%z "$1" 2>/dev/null || stat -c%s "$1" 2>/dev/null || echo 0
}

#######################################
# Time one HTTP request, append its JSON record to RESULTS and print a summary.
# Globals:
#   RESULTS (appended); GREEN, YELLOW, RED, NC (read); _now_ms (called)
# Arguments:
#   $1 - scenario label
#   $2 - "POST_JSON" for a JSON POST; any other value performs a GET
#   $3 - URL
#   $4 - JSON body for POST_JSON (default: empty)
# Outputs:
#   one coloured summary line (FAST <= 2500 ms, OK <= 6000 ms, else SLOW;
#   FAIL when the request did not succeed)
#######################################
_m() {
  local scenario="$1" method="$2" url="$3" body="${4:-}"
  local t0 t1 ms http_code size tmp
  local status="ok" lbl col

  tmp=$(mktemp)
  t0=$(_now_ms)
  if [[ "$method" == "POST_JSON" ]]; then
    http_code=$(curl -sf -w '%{http_code}' -X POST "$url" \
      -H "Content-Type: application/json" \
      -d "$body" -o "$tmp" --max-time 30 2>/dev/null) || true
  else
    http_code=$(curl -sf -w '%{http_code}' "$url" \
      -o "$tmp" --max-time 20 2>/dev/null) || true
  fi
  t1=$(_now_ms)
  ms=$((t1 - t0))

  # curl prints the write-out value even when it exits non-zero, so the code
  # comes from its output alone; appending a fallback "000" to that output
  # would yield "000000"/"404000" and let a dead service pass as ok.
  [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"

  size=$(_file_bytes "$tmp")
  rm -f -- "$tmp"

  if [[ "$http_code" == "000" ]] || ((10#$http_code >= 400)); then
    status="error"
  fi

  RESULTS+=("{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}")

  if [[ "$status" != "ok" ]]; then
    lbl="FAIL"
    col="$RED"
  elif ((ms <= 2500)); then
    lbl="FAST"
    col="$GREEN"
  elif ((ms <= 6000)); then
    lbl="OK "
    col="$YELLOW"
  else
    lbl="SLOW"
    col="$RED"
  fi
  echo -e " ${col}${lbl}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
}

#######################################
# Run the preflight checks and every audit scenario in order.
# Globals:
#   BFF_URL, MEMORY_URL (read); RESULTS (appended, one record per scenario);
#   colour variables (read); pass, fail, _now_ms, _m (called)
# Arguments:
#   none
# Outputs:
#   human-readable progress on stdout
#######################################
run_scenarios() {
  echo ""
  echo "╔════════════════════════════════════════════╗"
  echo "║ Voice Latency Audit — $(date +%H:%M:%S) ║"
  echo "╚════════════════════════════════════════════╝"
  echo ""

  # ── 0. Preflight ──────────────────────────────────────────────────────────
  # Informational only: a failed preflight is reported, the scenarios still run.
  echo "── Preflight ──"
  local bff_ok mem_ok
  bff_ok=$(curl -sf "$BFF_URL/api/memory/status" 2>/dev/null | python3 -c "import json,sys;print(json.load(sys.stdin).get('ok',False))" 2>/dev/null || echo "False")
  mem_ok=$(curl -sf "$MEMORY_URL/health" 2>/dev/null | python3 -c "import json,sys;print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
  if [[ "$bff_ok" == "True" ]]; then
    pass "BFF ok ($BFF_URL)"
  else
    fail "BFF unreachable"
  fi
  if [[ "$mem_ok" == "healthy" ]]; then
    pass "Memory Service ok ($MEMORY_URL)"
  else
    fail "Memory Service: $mem_ok"
  fi

  # ── 1. TTS directly against the memory service ────────────────────────────
  echo ""
  echo "── TTS Scenarios (direct memory-service) ──"
  _m "TTS_short_polina" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Привіт! Це тест голосу.","voice":"default"}'
  _m "TTS_medium_polina" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Я Sofiia, головний AI-архітектор DAARION.city. Моя роль — розробка AI-рішень.","voice":"uk-UA-PolinaNeural"}'
  _m "TTS_ostap" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Перевірка голосу Остап Нейрал. Технічний тест.","voice":"Ostap"}'

  # ── 2. TTS via BFF proxy ──────────────────────────────────────────────────
  echo ""
  echo "── TTS via BFF proxy ──"
  _m "TTS_bff_short" "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"Перевірка через BFF.","voice":"default"}'
  _m "TTS_bff_medium" "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"NODA2 voice pipeline: edge-tts 7.2.7, Polina Neural, Ukrainian. OK.","voice":"uk-UA-PolinaNeural"}'

  # ── 3. Chat: per-model LLM latency comparison ────────────────────────────
  echo ""
  echo "── Chat: LLM per-model latency (voice turn = 1 sentence) ──"
  # Shared request prefix; each call appends the model name and voice profile.
  local _q='{"message":"Одне речення (max 15 слів): що таке NODA2?","model":'
  _m "CHAT_gemma3" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:gemma3:latest\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen3_8b" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:8b\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen3_14b" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:14b\",\"voice_profile\":\"voice_fast_uk\"}"
  _m "CHAT_qwen35_35b" "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3.5:35b-a3b\",\"voice_profile\":\"voice_quality_uk\"}"

  # ── 4. Voice health check ─────────────────────────────────────────────────
  echo ""
  echo "── Voice health (live Polina/Ostap synthesis) ──"
  _m "VOICE_HEALTH_live" "GET" "$MEMORY_URL/voice/health"

  # ── 5. Parallel TTS (simulate 2 concurrent users) ────────────────────────
  echo ""
  echo "── Parallel TTS (2 concurrent) ──"
  local tmp1 tmp2 t0_par t1_par par_ms s1 s2
  local par_status="error" par_label par_color
  tmp1=$(mktemp)
  tmp2=$(mktemp)
  t0_par=$(_now_ms)
  curl -sf -X POST "$MEMORY_URL/voice/tts" \
    -H "Content-Type: application/json" \
    -d '{"text":"Перший паралельний запит.","voice":"default"}' \
    -o "$tmp1" --max-time 15 &
  curl -sf -X POST "$MEMORY_URL/voice/tts" \
    -H "Content-Type: application/json" \
    -d '{"text":"Другий паралельний запит.","voice":"Ostap"}' \
    -o "$tmp2" --max-time 15 &
  # Bare `wait` blocks until both jobs finish and always returns 0, so a failed
  # request cannot abort the script under `set -e`; success is judged by size.
  wait
  t1_par=$(_now_ms)
  par_ms=$((t1_par - t0_par))
  s1=$(_file_bytes "$tmp1")
  s2=$(_file_bytes "$tmp2")
  rm -f -- "$tmp1" "$tmp2"

  # Both responses must exceed 1000 bytes to count as successful.
  if ((s1 > 1000 && s2 > 1000)); then
    par_status="ok"
  fi
  RESULTS+=("{\"scenario\":\"TTS_parallel_2x\",\"ms\":$par_ms,\"bytes\":$((s1 + s2)),\"status\":\"$par_status\"}")

  if [[ "$par_status" != "ok" ]]; then
    par_label="FAIL"
    par_color="$RED"
  elif ((par_ms <= 4000)); then
    par_label="FAST"
    par_color="$GREEN"
  elif ((par_ms <= 8000)); then
    par_label="OK"
    par_color="$YELLOW"
  else
    par_label="SLOW"
    par_color="$RED"
  fi
  echo -e " ${par_color}${par_label}${NC} TTS_parallel_2x: ${par_ms}ms (s1=${s1}B s2=${s2}B)"

  # ── 6. STT smoke test (synthetic WAV) ─────────────────────────────────────
  echo ""
  echo "── STT smoke (silent WAV stub) ──"
  # Unpredictable temp names instead of fixed paths under /tmp.
  local stub_wav stt_out
  stub_wav=$(mktemp)
  stt_out=$(mktemp)
  # If python3 cannot produce the WAV, upload a one-byte placeholder instead.
  python3 -c "
import struct, sys
# 1 s of silence at 16 kHz, mono, int16: 44-byte WAV header + zeroed samples
sample_rate = 16000
duration_s = 1
num_samples = sample_rate * duration_s
data_size = num_samples * 2
header = struct.pack('<4sI4s4sIHHIIHH4sI',
    b'RIFF', 36 + data_size, b'WAVE',
    b'fmt ', 16, 1, 1, sample_rate, sample_rate*2, 2, 16,
    b'data', data_size)
sys.stdout.buffer.write(header + bytes(data_size))
" > "$stub_wav" 2>/dev/null || echo "" > "$stub_wav"

  local t0_stt t1_stt stt_ms stt_code stt_status="error" stt_label stt_color
  t0_stt=$(_now_ms)
  stt_code=$(curl -sf -w '%{http_code}' -X POST "$MEMORY_URL/voice/stt" \
    -F "audio=@${stub_wav};filename=audio.wav;type=audio/wav" \
    -F "language=uk" \
    -o "$stt_out" \
    --max-time 30 2>/dev/null) || true
  t1_stt=$(_now_ms)
  stt_ms=$((t1_stt - t0_stt))
  [[ "$stt_code" =~ ^[0-9]{3}$ ]] || stt_code="000"
  rm -f -- "$stub_wav" "$stt_out"

  if [[ "$stt_code" == "200" ]]; then
    stt_status="ok"
  fi
  RESULTS+=("{\"scenario\":\"STT_silence_stub\",\"ms\":$stt_ms,\"http\":\"$stt_code\",\"status\":\"$stt_status\"}")

  if [[ "$stt_status" != "ok" ]]; then
    stt_label="FAIL"
    stt_color="$RED"
  elif ((stt_ms <= 2000)); then
    stt_label="FAST"
    stt_color="$GREEN"
  elif ((stt_ms <= 5000)); then
    stt_label="OK"
    stt_color="$YELLOW"
  else
    stt_label="SLOW"
    stt_color="$RED"
  fi
  echo -e " ${stt_color}${stt_label}${NC} STT_silence_stub: ${stt_ms}ms (HTTP ${stt_code})"
}

# ── Compute p50/p95 ────────────────────────────────────────────────────────────

#######################################
# Print latency statistics, a per-model comparison and SLO checks.
# Globals:
#   RESULTS (read) - JSON strings, one per scenario
# Arguments:
#   ignored; the records are always taken from RESULTS
# Outputs:
#   report on stdout; unparsable records are reported on stderr and skipped
#######################################
compute_stats() {
  # ${RESULTS[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
  python3 - ${RESULTS[@]+"${RESULTS[@]}"} <<'PYEOF'
import json
import sys

results = []
for arg in sys.argv[1:]:
    try:
        results.append(json.loads(arg))
    except ValueError as exc:
        print(f" WARN skipping unparsable result ({exc}): {arg!r}", file=sys.stderr)

if not results:
    print("No results to analyze.")
    sys.exit(0)


def ok_ms(match):
    """Latencies of successful results whose scenario name satisfies `match`."""
    return [r["ms"] for r in results if r["status"] == "ok" and match(r["scenario"])]


def p50(vals):
    s = sorted(vals)
    return s[len(s) // 2]


def p95(vals):
    s = sorted(vals)
    return s[int(len(s) * 0.95)]


def fmt_ms(ms):
    return f"{ms}ms" if ms is not None else "no data"


tts_ms = ok_ms(lambda name: "TTS" in name)
stt_ms = ok_ms(lambda name: "STT" in name)

# per-model chat latency
model_scenarios = {
    "gemma3": "CHAT_gemma3",
    "qwen3:8b": "CHAT_qwen3_8b",
    "qwen3:14b": "CHAT_qwen3_14b",
    "qwen3.5:35b": "CHAT_qwen35_35b",
}
model_map = {
    model: ok_ms(lambda name, wanted=wanted: name == wanted)
    for model, wanted in model_scenarios.items()
}
all_chat_ms = [ms for vals in model_map.values() for ms in vals]
# One figure per model (median of its successful runs); None means no data.
model_ms = {model: (p50(vals) if vals else None) for model, vals in model_map.items()}


def stats(label, vals):
    if not vals:
        print(f" {label}: no data")
        return
    print(f" {label}: p50={p50(vals)}ms p95={p95(vals)}ms min={min(vals)}ms max={max(vals)}ms n={len(vals)}")


print("")
print("── Latency Summary ──")
stats("TTS (memory-service + proxy)", tts_ms)
stats("Chat (all models)", all_chat_ms)
stats("STT (memory-service)", stt_ms)

print("")
print("── LLM Model Comparison (voice: 1 sentence) ──")
# Models with data first (fastest first), models without data last.
ranked = sorted(
    ((ms, model) for model, ms in model_ms.items()),
    key=lambda row: (row[0] is None, row[0] or 0, row[1]),
)
fastest = ranked[0][1] if ranked and ranked[0][0] is not None else None
medals = {1: "🥇", 2: "🥈", 3: "🥉"}
for rank, (ms, model) in enumerate(ranked, 1):
    # Medals and the "fastest" note are only given to models that produced data.
    icon = medals.get(rank, " ") if ms is not None else " "
    note = " ← fastest (voice_fast default)" if model == fastest else ""
    print(f" {icon} {model:20s} {fmt_ms(ms)}{note}")

# Pass/fail overview
all_ok = [r for r in results if r["status"] == "ok"]
all_err = [r for r in results if r["status"] != "ok"]
print(f" Passed: {len(all_ok)}/{len(results)} scenarios")
if all_err:
    print(f" Failed: {[r['scenario'] for r in all_err]}")

# SLO check
print("")
print("── SLO Check (voice_fast_uk) ──")
tts_p95 = p95(tts_ms) if tts_ms else None
q14_ms = model_ms["qwen3:14b"]
q35_ms = model_ms["qwen3.5:35b"]
measured = [ms for ms in model_ms.values() if ms is not None]
fast_ms = min(measured) if measured else None
# A comparison is only meaningful when both models actually answered.
comparable = q14_ms is not None and q35_ms is not None
ratio = f"{round(q35_ms / max(q14_ms, 1), 2)}" if comparable else "n/a"

checks = [
    ("TTS p95 ≤ 2500ms", tts_p95 is not None and tts_p95 <= 2500, f"actual={fmt_ms(tts_p95)}"),
    ("Fastest model ≤ 9000ms", fast_ms is not None and fast_ms <= 9000, f"actual={fmt_ms(fast_ms)}"),
    ("qwen3:14b ≤ 12000ms", q14_ms is not None and q14_ms <= 12000, f"actual={fmt_ms(q14_ms)}"),
    ("qwen3.5 faster than 14b", comparable and q35_ms < q14_ms, f"35b={fmt_ms(q35_ms)} vs 14b={fmt_ms(q14_ms)}"),
    ("qwen3.5 auto-promote worthy", comparable and q35_ms < q14_ms * 0.9, f"ratio={ratio} (need <0.9)"),
]
for label, ok, detail in checks:
    icon = "✅" if ok else "⚠️ "
    print(f" {icon} {label} ({detail})")

print("")
# Verdict on qwen3.5 promotion
if not comparable:
    print(" ⚠️ VERDICT: insufficient data — qwen3.5 and qwen3:14b must both succeed to compare")
elif q35_ms < q14_ms * 0.9:
    print(" ✅ VERDICT: qwen3.5:35b-a3b qualifies for auto_promote in voice_fast_uk")
elif q35_ms < q14_ms:
    print(" ℹ️ VERDICT: qwen3.5 is faster but not by 10% — keep as 2nd in prefer list")
else:
    print(" ⚠️ VERDICT: qwen3.5 slower than qwen3:14b — move to voice_quality_uk only")
PYEOF
}

# ── Save results ──────────────────────────────────────────────────────────────

#######################################
# Write all collected results to RESULTS_FILE as one JSON document.
# Globals:
#   RESULTS (read) - JSON strings, one per scenario
#   RESULTS_FILE (read) - output path
# Arguments:
#   ignored; the records are always taken from RESULTS
# Outputs:
#   "Saved: <path>" on stdout; unparsable records are reported on stderr
#   and left out of the document
#######################################
save_results() {
  # ${RESULTS[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
  python3 - ${RESULTS[@]+"${RESULTS[@]}"} "$RESULTS_FILE" <<'PYEOF'
import json
import sys
from datetime import datetime, timezone

# The last argument is the output path; everything before it is a record.
out_file = sys.argv[-1]
results = []
for arg in sys.argv[1:-1]:
    try:
        results.append(json.loads(arg))
    except ValueError as exc:
        print(f" WARN skipping unparsable result ({exc}): {arg!r}", file=sys.stderr)

doc = {
    "timestamp": datetime.now(timezone.utc).isoformat(),
    "scenarios_total": len(results),
    "scenarios_passed": sum(1 for r in results if r["status"] == "ok"),
    "results": results,
}
# ensure_ascii=False may emit non-ASCII text, so pin the file encoding.
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(doc, f, indent=2, ensure_ascii=False)
print(f"\n Saved: {out_file}")
PYEOF
}

# ── Main ──────────────────────────────────────────────────────────────────────
# compute_stats and save_results read the global RESULTS array filled by
# run_scenarios, so no arguments are passed.
run_scenarios
compute_stats
save_results

echo ""
echo "Done. Rerun: bash ops/voice_latency_audit.sh $BFF_URL $MEMORY_URL"