docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions
--- a/ops/voice_latency_audit.sh
+++ b/ops/voice_latency_audit.sh
@@ -0,0 +1,341 @@
+#!/usr/bin/env bash
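+# voice_latency_audit.sh — Voice pipeline latency audit (12 scenarios)
+# Usage:
+#   bash ops/voice_latency_audit.sh [BFF_URL] [MEMORY_URL]
+#   bash ops/voice_latency_audit.sh http://localhost:8002 http://localhost:8000
+#
+# Requires: bash, curl, python3.
+# Output:   one status line per scenario on stdout, a latency/SLO summary, and a
+#           JSON report in ops/voice_audit_results/audit_<timestamp>.json (the
+#           path is relative to the current directory).
+#
+# Scenarios (in execution order):
+#   1-3    TTS direct to memory-service: short, medium (Polina), Ostap
+#   4-5    TTS via the BFF proxy: short, medium
+#   6-9    Chat latency per model: gemma3, qwen3:8b, qwen3:14b, qwen3.5:35b-a3b
+#   10     Voice health check (GET /voice/health)
+#   11     Parallel TTS (2 concurrent requests)
+#   12     STT smoke test (1 s silent WAV)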
+set -euo pipefail
+
+BFF_URL="${1:-http://localhost:8002}"
+MEMORY_URL="${2:-http://localhost:8000}"
+RESULTS_DIR="ops/voice_audit_results"
+TS=$(date +%Y%m%d_%H%M%S)
+RESULTS_FILE="${RESULTS_DIR}/audit_${TS}.json"
+
+RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[0;33m'; CYAN='\033[0;36m'; NC='\033[0m'
+pass() { echo -e "  ${GREEN}PASS${NC} $1"; }
+fail() { echo -e "  ${RED}FAIL${NC} $1"; }
+warn() { echo -e "  ${YELLOW}WARN${NC} $1"; }
+info() { echo -e "  ${CYAN}INFO${NC} $1"; }
+
+mkdir -p "$RESULTS_DIR"
+
+declare -a RESULTS=()
+
+# ── Helper: measure HTTP call ──────────────────────────────────────────────────
+_now_ms() { python3 -c "import time; print(int(time.time()*1000))"; }
+
+measure() {
+    local scenario="$1" method="$2" url="$3" opts="${4:-}"
+    local t0 t1 ms http_code size ct
+    t0=$(_now_ms)
+    local tmp; tmp=$(mktemp)
+
+    if [ "$method" = "POST_JSON" ]; then
+        local body="${5:-{}}"
+        http_code=$(curl -sf -w "%{http_code}" -X POST "$url" \
+            -H "Content-Type: application/json" \
+            -d "$body" -o "$tmp" --max-time 15 2>/dev/null || echo "000")
+    elif [ "$method" = "GET" ]; then
+        http_code=$(curl -sf -w "%{http_code}" -X GET "$url" \
+            -o "$tmp" --max-time 10 2>/dev/null || echo "000")
+    else
+        http_code="000"
+    fi
+
+    t1=$(_now_ms)
+    ms=$((t1 - t0))
+    size=$(stat -f%z "$tmp" 2>/dev/null || stat -c%s "$tmp" 2>/dev/null || echo 0)
+    ct=$(file "$tmp" 2>/dev/null | head -1 || echo "unknown")
+    rm -f "$tmp"
+
+    local status="ok"
+    [ "$http_code" = "000" ] || [ "$http_code" -ge 400 ] && status="error"
+
+    echo "{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}"
+
+    if [ "$status" = "ok" ]; then
+        local label; label=$([ $ms -le 2500 ] && echo "FAST" || ([ $ms -le 6000 ] && echo "OK" || echo "SLOW"))
+        local color; color=$([ "$label" = "FAST" ] && echo "$GREEN" || ([ "$label" = "OK" ] && echo "$YELLOW" || echo "$RED"))
+        echo -e "  ${color}${label}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
+    else
+        echo -e "  ${RED}FAIL${NC} ${scenario}: HTTP ${http_code} ${ms}ms"
+    fi
+}
+
+# ── Scenario runner ────────────────────────────────────────────────────────────
+
+# Run every audit scenario in order, printing one status line per scenario and
+# appending one JSON object per scenario to the global RESULTS array.
+# Globals: BFF_URL, MEMORY_URL and the colour variables (read); RESULTS (appended).
+# Side effect: defines the helper functions _fsize and _m.
+run_scenarios() {
+    echo ""
+    echo "╔════════════════════════════════════════════╗"
+    echo "║   Voice Latency Audit — $(date +%H:%M:%S)          ║"
+    echo "╚════════════════════════════════════════════╝"
+    echo ""
+
+    # ── 0. Preflight ──────────────────────────────────────────────────────────
+    # Informational only: the scenarios below run even when a service is down.
+    echo "── Preflight ──"
+    local bff_ok mem_ok
+    bff_ok=$(curl -sf "$BFF_URL/api/memory/status" 2>/dev/null | python3 -c "import json,sys;print(json.load(sys.stdin).get('ok',False))" 2>/dev/null || echo "False")
+    mem_ok=$(curl -sf "$MEMORY_URL/health" 2>/dev/null | python3 -c "import json,sys;print(json.load(sys.stdin).get('status','error'))" 2>/dev/null || echo "error")
+    if [[ "$bff_ok" == "True" ]]; then
+        pass "BFF ok ($BFF_URL)"
+    else
+        fail "BFF unreachable"
+    fi
+    if [[ "$mem_ok" == "healthy" ]]; then
+        pass "Memory Service ok ($MEMORY_URL)"
+    else
+        fail "Memory Service: $mem_ok"
+    fi
+
+    # ── Helper: file size in bytes (BSD stat, then GNU stat, else 0) ──────────
+    _fsize() {
+        stat -f%z "$1" 2>/dev/null || stat -c%s "$1" 2>/dev/null || echo 0
+    }
+
+    # ── Helper: measure one request and collect the result ────────────────────
+    # _m SCENARIO METHOD URL [BODY]; METHOD is "POST_JSON" or anything else for
+    # a plain GET. Appends the JSON record to RESULTS and prints a status line.
+    _m() {
+        local scenario="$1" method="$2" url="$3" body="${4:-}"
+        local t0 t1 ms http_code="" size tmp status="ok" json lbl col
+        tmp=$(mktemp)
+        t0=$(_now_ms)
+        # curl prints %{http_code} even when it exits non-zero ("000" when there
+        # was no response). Appending a fallback "000" on failure would produce
+        # values such as "000000", which are neither "000" nor >= 400 and would
+        # be recorded as a success.
+        if [[ "$method" == "POST_JSON" ]]; then
+            http_code=$(curl -sf -w "%{http_code}" -X POST "$url" \
+                -H "Content-Type: application/json" \
+                -d "$body" -o "$tmp" --max-time 30 2>/dev/null) || true
+        else
+            http_code=$(curl -sf -w "%{http_code}" "$url" \
+                -o "$tmp" --max-time 20 2>/dev/null) || true
+        fi
+        t1=$(_now_ms)
+        ms=$((t1 - t0))
+        [[ "$http_code" =~ ^[0-9]{3}$ ]] || http_code="000"
+        size=$(_fsize "$tmp")
+        rm -f "$tmp"
+        if [[ "$http_code" == "000" ]] || (( 10#$http_code >= 400 )); then
+            status="error"
+        fi
+        json="{\"scenario\":\"$scenario\",\"ms\":$ms,\"http\":\"$http_code\",\"bytes\":$size,\"status\":\"$status\"}"
+        RESULTS+=("$json")
+        if [[ "$status" != "ok" ]]; then
+            lbl="FAIL"; col="$RED"
+        elif (( ms <= 2500 )); then
+            lbl="FAST"; col="$GREEN"
+        elif (( ms <= 6000 )); then
+            lbl="OK  "; col="$YELLOW"
+        else
+            lbl="SLOW"; col="$RED"
+        fi
+        echo -e "  ${col}${lbl}${NC} ${scenario}: ${ms}ms (HTTP ${http_code}, ${size}B)"
+    }
+
+    # ── 1. TTS directly against the memory service ────────────────────────────
+    echo ""
+    echo "── TTS Scenarios (direct memory-service) ──"
+    _m "TTS_short_polina"  "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Привіт! Це тест голосу.","voice":"default"}'
+    _m "TTS_medium_polina" "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Я Sofiia, головний AI-архітектор DAARION.city. Моя роль — розробка AI-рішень.","voice":"uk-UA-PolinaNeural"}'
+    _m "TTS_ostap"         "POST_JSON" "$MEMORY_URL/voice/tts" '{"text":"Перевірка голосу Остап Нейрал. Технічний тест.","voice":"Ostap"}'
+
+    # ── 2. TTS via BFF proxy ──────────────────────────────────────────────────
+    echo ""
+    echo "── TTS via BFF proxy ──"
+    _m "TTS_bff_short"  "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"Перевірка через BFF.","voice":"default"}'
+    _m "TTS_bff_medium" "POST_JSON" "$BFF_URL/api/voice/tts" '{"text":"NODA2 voice pipeline: edge-tts 7.2.7, Polina Neural, Ukrainian. OK.","voice":"uk-UA-PolinaNeural"}'
+
+    # ── 3. Chat: per-model LLM latency comparison ────────────────────────────
+    echo ""
+    echo "── Chat: LLM per-model latency (voice turn = 1 sentence) ──"
+    # Shared JSON prefix; each call appends the model name and voice profile.
+    local _q='{"message":"Одне речення (max 15 слів): що таке NODA2?","model":'
+    _m "CHAT_gemma3"        "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:gemma3:latest\",\"voice_profile\":\"voice_fast_uk\"}"
+    _m "CHAT_qwen3_8b"      "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:8b\",\"voice_profile\":\"voice_fast_uk\"}"
+    _m "CHAT_qwen3_14b"     "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3:14b\",\"voice_profile\":\"voice_fast_uk\"}"
+    _m "CHAT_qwen35_35b"    "POST_JSON" "$BFF_URL/api/chat/send" "${_q}\"ollama:qwen3.5:35b-a3b\",\"voice_profile\":\"voice_quality_uk\"}"
+
+    # ── 4. Voice health check ─────────────────────────────────────────────────
+    echo ""
+    echo "── Voice health (live Polina/Ostap synthesis) ──"
+    _m "VOICE_HEALTH_live" "GET" "$MEMORY_URL/voice/health"
+
+    # ── 5. Parallel TTS (simulate 2 concurrent users) ────────────────────────
+    echo ""
+    echo "── Parallel TTS (2 concurrent) ──"
+    local tmp1 tmp2 t0_par t1_par par_ms s1 s2
+    local par_status="error" par_json par_label par_color
+    tmp1=$(mktemp)
+    tmp2=$(mktemp)
+    t0_par=$(_now_ms)
+    curl -sf -X POST "$MEMORY_URL/voice/tts" \
+        -H "Content-Type: application/json" \
+        -d '{"text":"Перший паралельний запит.","voice":"default"}' \
+        -o "$tmp1" --max-time 15 &
+    curl -sf -X POST "$MEMORY_URL/voice/tts" \
+        -H "Content-Type: application/json" \
+        -d '{"text":"Другий паралельний запит.","voice":"Ostap"}' \
+        -o "$tmp2" --max-time 15 &
+    wait
+    t1_par=$(_now_ms)
+    par_ms=$((t1_par - t0_par))
+    s1=$(_fsize "$tmp1")
+    s2=$(_fsize "$tmp2")
+    rm -f "$tmp1" "$tmp2"
+    # No HTTP status is captured for the background requests; both response
+    # bodies must exceed 1000 bytes for the scenario to count as ok.
+    if (( s1 > 1000 && s2 > 1000 )); then
+        par_status="ok"
+    fi
+    par_json="{\"scenario\":\"TTS_parallel_2x\",\"ms\":$par_ms,\"bytes\":$((s1+s2)),\"status\":\"$par_status\"}"
+    RESULTS+=("$par_json")
+    # A failed scenario is shown as FAIL regardless of how quickly it returned.
+    if [[ "$par_status" != "ok" ]]; then
+        par_label="FAIL"; par_color="$RED"
+    elif (( par_ms <= 4000 )); then
+        par_label="FAST"; par_color="$GREEN"
+    elif (( par_ms <= 8000 )); then
+        par_label="OK"; par_color="$YELLOW"
+    else
+        par_label="SLOW"; par_color="$RED"
+    fi
+    echo -e "  ${par_color}${par_label}${NC} TTS_parallel_2x: ${par_ms}ms (s1=${s1}B s2=${s2}B)"
+
+    # ── 6. STT smoke test (synthetic WAV) ─────────────────────────────────────
+    echo ""
+    echo "── STT smoke (silent WAV stub) ──"
+    # Unpredictable temp files instead of fixed names under /tmp.
+    local stub_wav stt_out
+    stub_wav=$(mktemp)
+    stt_out=$(mktemp)
+    # Generate a 44-byte WAV header followed by 1 s of zero samples; if python3
+    # fails, fall back to a file holding a single newline.
+    python3 -c "
+import struct, sys
+# 44-byte WAV header for 1s silence at 16kHz mono int16
+sample_rate = 16000
+duration_s = 1
+num_samples = sample_rate * duration_s
+data_size = num_samples * 2
+header = struct.pack('<4sI4s4sIHHIIHH4sI',
+    b'RIFF', 36 + data_size, b'WAVE',
+    b'fmt ', 16, 1, 1, sample_rate, sample_rate*2, 2, 16,
+    b'data', data_size)
+sys.stdout.buffer.write(header + bytes(data_size))
+" > "$stub_wav" 2>/dev/null || echo "" > "$stub_wav"
+
+    local t0_stt t1_stt stt_ms stt_code="" stt_status="error"
+    local stt_json stt_label stt_color
+    t0_stt=$(_now_ms)
+    # See _m: curl already prints the status code on failure, so nothing is
+    # appended to it here.
+    stt_code=$(curl -sf -w "%{http_code}" -X POST "$MEMORY_URL/voice/stt" \
+        -F "audio=@${stub_wav};filename=audio.wav;type=audio/wav" \
+        -F "language=uk" \
+        -o "$stt_out" \
+        --max-time 30 2>/dev/null) || true
+    t1_stt=$(_now_ms)
+    stt_ms=$((t1_stt - t0_stt))
+    [[ "$stt_code" =~ ^[0-9]{3}$ ]] || stt_code="000"
+    if [[ "$stt_code" == "200" ]]; then
+        stt_status="ok"
+    fi
+    stt_json="{\"scenario\":\"STT_silence_stub\",\"ms\":$stt_ms,\"http\":\"$stt_code\",\"status\":\"$stt_status\"}"
+    RESULTS+=("$stt_json")
+    if [[ "$stt_status" != "ok" ]]; then
+        stt_label="FAIL"; stt_color="$RED"
+    elif (( stt_ms <= 2000 )); then
+        stt_label="FAST"; stt_color="$GREEN"
+    elif (( stt_ms <= 5000 )); then
+        stt_label="OK"; stt_color="$YELLOW"
+    else
+        stt_label="SLOW"; stt_color="$RED"
+    fi
+    echo -e "  ${stt_color}${stt_label}${NC} STT_silence_stub: ${stt_ms}ms (HTTP ${stt_code})"
+
+    rm -f "$stub_wav" "$stt_out"
+}
+
+# ── Compute p50/p95 ────────────────────────────────────────────────────────────
+compute_stats() {
+    python3 - "${RESULTS[@]}" <<'PYEOF'
+import json, sys, statistics
+
+results = []
+for arg in sys.argv[1:]:
+    try:
+        results.append(json.loads(arg))
+    except Exception:
+        pass
+
+if not results:
+    print("No results to analyze.")
+    sys.exit(0)
+
+tts_ms  = [r["ms"] for r in results if "TTS" in r["scenario"] and r["status"] == "ok"]
+stt_ms  = [r["ms"] for r in results if "STT" in r["scenario"] and r["status"] == "ok"]
+# per-model chat latency
+model_map = {
+    "gemma3":     [r["ms"] for r in results if r["scenario"] == "CHAT_gemma3"    and r["status"] == "ok"],
+    "qwen3:8b":   [r["ms"] for r in results if r["scenario"] == "CHAT_qwen3_8b"  and r["status"] == "ok"],
+    "qwen3:14b":  [r["ms"] for r in results if r["scenario"] == "CHAT_qwen3_14b" and r["status"] == "ok"],
+    "qwen3.5:35b":[r["ms"] for r in results if r["scenario"] == "CHAT_qwen35_35b"and r["status"] == "ok"],
+}
+all_chat_ms = [ms for vals in model_map.values() for ms in vals]
+
+def stats(label, vals):
+    if not vals:
+        print(f"  {label}: no data")
+        return
+    s = sorted(vals)
+    p50 = s[len(s)//2]
+    p95 = s[int(len(s)*0.95)] if len(s) > 1 else s[-1]
+    print(f"  {label}: p50={p50}ms p95={p95}ms min={min(s)}ms max={max(s)}ms n={len(s)}")
+
+print("")
+print("── Latency Summary ──")
+stats("TTS (memory-service + proxy)", tts_ms)
+stats("Chat (all models)", all_chat_ms)
+stats("STT (memory-service)", stt_ms)
+
+print("")
+print("── LLM Model Comparison (voice: 1 sentence) ──")
+rows = []
+for model, vals in model_map.items():
+    if vals:
+        s = sorted(vals)
+        p50 = s[len(s)//2]
+        rows.append((p50, model, vals[0]))
+    else:
+        rows.append((99999, model, None))
+rows.sort()
+fastest = rows[0][1] if rows else "?"
+for rank, (p50, model, ms) in enumerate(rows, 1):
+    icon = "🥇" if rank == 1 else ("🥈" if rank == 2 else ("🥉" if rank == 3 else "  "))
+    note = " ← fastest (voice_fast default)" if model == fastest else ""
+    ms_str = f"{ms}ms" if ms else "no data"
+    print(f"  {icon} {model:20s} {ms_str}{note}")
+
+# Bottleneck analysis
+all_ok  = [r for r in results if r["status"] == "ok"]
+all_err = [r for r in results if r["status"] != "ok"]
+print(f"  Passed: {len(all_ok)}/{len(results)} scenarios")
+if all_err:
+    print(f"  Failed: {[r['scenario'] for r in all_err]}")
+
+# SLO check
+print("")
+print("── SLO Check (voice_fast_uk) ──")
+tts_p95 = sorted(tts_ms)[int(len(tts_ms)*0.95)] if len(tts_ms) > 1 else (tts_ms[0] if tts_ms else 9999)
+q14_ms   = model_map["qwen3:14b"][0]  if model_map["qwen3:14b"]   else 9999
+q35_ms   = model_map["qwen3.5:35b"][0] if model_map["qwen3.5:35b"] else 9999
+g3_ms    = model_map["gemma3"][0]     if model_map["gemma3"]       else 9999
+fast_ms  = min(q14_ms, q35_ms, g3_ms)
+
+checks = [
+    ("TTS p95 ≤ 2500ms",           tts_p95 <= 2500,          f"actual={tts_p95}ms"),
+    ("Fastest model ≤ 9000ms",     fast_ms  <= 9000,          f"actual={fast_ms}ms"),
+    ("qwen3:14b ≤ 12000ms",        q14_ms  <= 12000,          f"actual={q14_ms}ms"),
+    ("qwen3.5 faster than 14b",    q35_ms  < q14_ms,          f"35b={q35_ms}ms vs 14b={q14_ms}ms"),
+    ("qwen3.5 auto-promote worthy", q35_ms < q14_ms * 0.9,    f"ratio={round(q35_ms/max(q14_ms,1),2)} (need <0.9)"),
+]
+for label, ok, detail in checks:
+    icon = "✅" if ok else "⚠️ "
+    print(f"  {icon} {label} ({detail})")
+
+print("")
+# Verdict on qwen3.5 promotion
+if q35_ms < q14_ms * 0.9:
+    print("  ✅ VERDICT: qwen3.5:35b-a3b qualifies for auto_promote in voice_fast_uk")
+elif q35_ms < q14_ms:
+    print("  ℹ️  VERDICT: qwen3.5 is faster but not by 10% — keep as 2nd in prefer list")
+else:
+    print("  ⚠️  VERDICT: qwen3.5 slower than qwen3:14b — move to voice_quality_uk only")
+
+PYEOF
+}
+
+# ── Save results ──────────────────────────────────────────────────────────────
+# Write the collected results as a JSON report to RESULTS_FILE and print the
+# path. Reads the globals RESULTS and RESULTS_FILE; positional arguments are
+# ignored.
+save_results() {
+    python3 - "${RESULTS[@]}" "$RESULTS_FILE" <<'PYEOF'
+import json
+import sys
+from datetime import datetime, timezone
+
+
+def parse_all(raw_items):
+    """Decode each JSON string, silently dropping any that fail to parse."""
+    decoded = []
+    for item in raw_items:
+        try:
+            decoded.append(json.loads(item))
+        except Exception:
+            continue
+    return decoded
+
+
+# The last argument is the output path; everything before it is a result.
+raw_results, out_file = sys.argv[1:-1], sys.argv[-1]
+results = parse_all(raw_results)
+passed = [r for r in results if r["status"] == "ok"]
+
+doc = {
+    "timestamp": datetime.now(timezone.utc).isoformat(),
+    "scenarios_total": len(results),
+    "scenarios_passed": len(passed),
+    "results": results,
+}
+with open(out_file, "w") as f:
+    json.dump(doc, f, indent=2, ensure_ascii=False)
+print(f"\n  Saved: {out_file}")
+PYEOF
+}
+
+# ── Main ──────────────────────────────────────────────────────────────────────
+run_scenarios
+compute_stats "${RESULTS[@]}"
+save_results "${RESULTS[@]}"
+echo ""
+echo "Done. Rerun: bash ops/voice_latency_audit.sh $BFF_URL $MEMORY_URL"