#!/usr/bin/env bash
# Fabric Preflight — verify all nodes before changes/deploys.
# Saves a snapshot, compares it with the previous one, and exits non-zero
# when any critical check failed.
#
# Usage:
#   bash ops/fabric_preflight.sh [NCS_URL] [ROUTER_URL] [MEMORY_URL] [OLLAMA_URL]
#   bash ops/fabric_preflight.sh
#     # defaults: 127.0.0.1:8099, 127.0.0.1:9102, 127.0.0.1:8000, 127.0.0.1:11434
#
# Requires: bash, curl, python3.
# Exit status: 0 when every critical check passed, 1 otherwise.

set -euo pipefail

NODA_NCS="${1:-http://127.0.0.1:8099}"
ROUTER_URL="${2:-http://127.0.0.1:9102}"
MEMORY_URL="${3:-http://127.0.0.1:8000}"
OLLAMA_URL="${4:-http://127.0.0.1:11434}"

# ANSI colours, stored as real escape bytes so they can be printed with %s.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[0;33m'
CYAN=$'\033[0;36m'
NC=$'\033[0m'

# Number of critical failures recorded so far; incremented by fail().
ERRORS=0

# Reporting helpers. The message is printed with %s so backslashes coming
# from remote data are shown literally instead of being interpreted.
pass() { printf ' %sPASS%s %s\n' "$GREEN" "$NC" "$1"; }
warn() { printf ' %sWARN%s %s\n' "$YELLOW" "$NC" "$1"; }
fail() { printf ' %sFAIL%s %s\n' "$RED" "$NC" "$1"; ERRORS=$((ERRORS + 1)); }
info() { printf ' %sINFO%s %s\n' "$CYAN" "$NC" "$1"; }

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SNAPSHOT_DIR="${SCRIPT_DIR}/preflight_snapshots"
mkdir -p "$SNAPSHOT_DIR"

# State shared between the checks and save_and_diff.
NCS_RAW="{}"
NCS_NODE_ID="unknown"
ROUTER_MODELS="{}"

# Voice policy: models required/preferred for voice_fast_uk / voice_quality_uk
VOICE_REQUIRED_MODELS="gemma3:latest"
VOICE_PREFERRED_MODELS="qwen3.5:35b-a3b qwen3:14b"
VOICE_EXCLUDED_MODELS="glm-4.7-flash:32k glm-4.7-flash"

# ── Helpers ───────────────────────────────────────────────────────────────────

#######################################
# Print one value from a JSON document.
# Arguments:
#   $1 - JSON text
#   $2 - dot-separated key path, e.g. "node_load.mem_pressure"
#   $3 - value printed when the path is missing (default "?")
#   $4 - value printed when the text is not valid JSON (default: $3)
# Outputs: the value as Python's print() renders it (booleans are True/False).
# Returns: always 0, so a bare `var=$(json_get ...)` cannot trip `set -e`.
#######################################
json_get() {
  local json="$1" path="$2" missing="${3-?}"
  local invalid="${4-$missing}"
  printf '%s' "$json" | python3 -c '
import json
import sys

path, missing, invalid = sys.argv[1:4]
try:
    node = json.load(sys.stdin)
except Exception:
    print(invalid)
    sys.exit(0)
for key in path.split("."):
    if isinstance(node, dict) and key in node:
        node = node[key]
    else:
        node = missing
        break
print(node)
' "$path" "$missing" "$invalid" 2>/dev/null || printf '%s\n' "$invalid"
}

#######################################
# Succeed when $1 is a decimal integer greater than zero.
#######################################
is_positive_int() {
  if [[ "$1" =~ ^[0-9]+$ ]] && ((10#$1 > 0)); then
    return 0
  fi
  return 1
}

#######################################
# Succeed when a model, or any tag of the same model family, is installed.
# Comparison is literal (no regex) and case-insensitive.
# Arguments:
#   $1 - wanted model, e.g. "gemma3:latest"
#   $2 - newline-separated, lower-cased list of installed model names
# NOTE(review): any tag of the family counts as a match (so "qwen3:8b"
# satisfies "qwen3:14b"). This mirrors the previous grep pattern; confirm it
# is the intended policy.
#######################################
model_installed() {
  local wanted family name
  wanted=$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')
  family="${wanted%%:*}"
  while IFS= read -r name; do
    if [[ -z "$name" ]]; then
      continue
    fi
    if [[ "$name" == "$wanted" || "$name" == "${family}:"* ]]; then
      return 0
    fi
  done <<<"$2"
  return 1
}

# ── NCS check ─────────────────────────────────────────────────────────────────

#######################################
# Query <url>/capabilities and report pool size, swapper state, capabilities,
# load, vision models and STT/TTS availability.
# Globals:   NCS_RAW, NCS_NODE_ID (written), ERRORS (via fail)
# Arguments: $1 - label for the section header, $2 - NCS base URL
#######################################
check_ncs() {
  local label="$1" url="$2"
  echo "── $label ($url) ──"

  local raw
  raw=$(curl -sf "$url/capabilities" 2>/dev/null) \
    || { fail "NCS unreachable at $url"; return 0; }

  local node_id served installed swapper_status
  node_id=$(json_get "$raw" node_id '?')
  served=$(json_get "$raw" served_count 0)
  installed=$(json_get "$raw" installed_count 0)
  swapper_status=$(json_get "$raw" runtimes.swapper.status '?')

  if is_positive_int "$served"; then
    pass "node=$node_id served=$served installed=$installed"
  else
    fail "node=$node_id served=$served (empty pool!)"
  fi

  if [[ "$swapper_status" == "disabled" ]]; then
    pass "swapper=disabled"
  else
    warn "swapper=$swapper_status"
  fi

  # Every capability flag except the nested "providers" map, as "k=v k=v".
  # Empty output means no flags (or an unparsable response).
  local caps
  caps=$(printf '%s' "$raw" | python3 -c '
import json
import sys

caps = json.load(sys.stdin).get("capabilities", {})
print(" ".join(f"{k}={v}" for k, v in caps.items() if k != "providers"))
' 2>/dev/null) || caps=""
  if [[ -z "$caps" ]]; then
    warn "capabilities: (none — P3.5 not deployed?)"
  else
    pass "capabilities: $caps"
  fi

  local mem_p inflight
  mem_p=$(json_get "$raw" node_load.mem_pressure '?')
  inflight=$(json_get "$raw" node_load.inflight_jobs 0 '?')
  if [[ "$mem_p" == "high" ]]; then
    warn "mem_pressure=$mem_p inflight=$inflight"
  else
    pass "mem_pressure=$mem_p inflight=$inflight"
  fi

  local vision_count
  vision_count=$(printf '%s' "$raw" | python3 -c '
import json
import sys

models = json.load(sys.stdin).get("served_models", [])
print(sum(1 for m in models if m.get("type") == "vision"))
' 2>/dev/null) || vision_count=0
  if is_positive_int "$vision_count"; then
    pass "vision models: $vision_count"
  else
    warn "no vision models served"
  fi

  # Phase 1: explicit STT/TTS capability check
  local stt_cap tts_cap stt_provider tts_provider
  stt_cap=$(json_get "$raw" capabilities.stt '?')
  tts_cap=$(json_get "$raw" capabilities.tts '?')
  stt_provider=$(json_get "$raw" capabilities.providers.stt '?')
  tts_provider=$(json_get "$raw" capabilities.providers.tts '?')

  if [[ "$stt_cap" == "True" || "$stt_cap" == "true" ]]; then
    pass "stt=true provider=$stt_provider"
  else
    warn "stt=false (provider=$stt_provider) — STT not available on this node"
  fi
  if [[ "$tts_cap" == "True" || "$tts_cap" == "true" ]]; then
    pass "tts=true provider=$tts_provider"
  else
    warn "tts=false (provider=$tts_provider) — TTS not available on this node"
  fi

  NCS_RAW="$raw"
  NCS_NODE_ID="$node_id"
}

# ── Memory Service health check ───────────────────────────────────────────────

#######################################
# Probe <url>/health and <url>/voice/status. Problems are warnings only.
# Arguments: $1 - label for the section header, $2 - Memory Service base URL
#######################################
check_memory_service() {
  local label="$1" url="$2"
  echo "── $label ($url/health) ──"

  local health
  health=$(curl -sf "$url/health" 2>/dev/null) \
    || { warn "Memory Service unreachable at $url (STT/TTS may fail)"; return 0; }

  # The endpoint answered 2xx; a non-JSON body is still reported as "ok".
  local status
  status=$(json_get "$health" status '?' 'ok')
  pass "memory-service health=$status"

  local voice_status
  voice_status=$(curl -sf "$url/voice/status" 2>/dev/null) \
    || { warn "voice/status unreachable"; return 0; }

  local tts_engine stt_engine
  tts_engine=$(json_get "$voice_status" tts_engine '?')
  stt_engine=$(json_get "$voice_status" stt_engine '?')
  pass "voice: tts=$tts_engine stt=$stt_engine"
}

# ── Router check ──────────────────────────────────────────────────────────────

#######################################
# Check router health, the global model pool and per-node capabilities.
# Globals:   ROUTER_MODELS (written), ERRORS (via fail)
# Arguments: $1 - label for the section header, $2 - Router base URL
#######################################
check_router() {
  local label="$1" url="$2"
  echo "── $label ($url) ──"

  local health
  health=$(curl -sf "$url/health" 2>/dev/null) \
    || { fail "Router unreachable at $url"; return 0; }

  local status
  status=$(json_get "$health" status '?')
  if [[ "$status" == "ok" ]]; then
    pass "health=$status"
  else
    fail "health=$status"
  fi

  # Fetched once; the same payload feeds the check and the snapshot.
  local models_json models_total
  models_json=$(curl -sf "$url/v1/models" 2>/dev/null) || models_json='{}'
  models_total=$(json_get "$models_json" total 0)
  if is_positive_int "$models_total"; then
    pass "global pool: $models_total models"
  else
    fail "global pool empty"
  fi

  # A missing or failing /v1/capabilities is reported as "(none)".
  local caps_json caps_nodes
  caps_json=$(curl -sf "$url/v1/capabilities" 2>/dev/null) || caps_json='{}'
  caps_nodes=$(printf '%s' "$caps_json" | python3 -c '
import json
import sys

nodes = list(json.load(sys.stdin).get("capabilities_by_node", {}).keys())
print("%d node(s): %s" % (len(nodes), " ".join(nodes)) if nodes else "")
' 2>/dev/null) || caps_nodes=""
  if [[ -z "$caps_nodes" ]]; then
    warn "capabilities_by_node: (none)"
  else
    pass "capabilities_by_node: $caps_nodes"
  fi

  ROUTER_MODELS="$models_json"
}

# ── Snapshot + diff ───────────────────────────────────────────────────────────

#######################################
# Write a JSON snapshot of this run and print what changed compared with the
# most recent earlier snapshot of the same node.
# Globals: NCS_RAW, NCS_NODE_ID, ROUTER_MODELS, ERRORS, SNAPSHOT_DIR (read)
# Outputs: diff summary and snapshot path on stdout; the snapshot file.
#######################################
save_and_diff() {
  local ts snap_file
  ts=$(date +%Y-%m-%d_%H%M%S)
  snap_file="${SNAPSHOT_DIR}/${NCS_NODE_ID}_${ts}.json"

  # Data is handed over through the environment and the program through a
  # quoted here-document, so remote JSON is never spliced into Python source.
  PF_NCS_RAW="$NCS_RAW" \
  PF_ROUTER_MODELS="$ROUTER_MODELS" \
  PF_NODE_ID="$NCS_NODE_ID" \
  PF_ERRORS="$ERRORS" \
  PF_SNAP_FILE="$snap_file" \
  PF_SNAPSHOT_DIR="$SNAPSHOT_DIR" \
    python3 - <<'PY' 2>/dev/null || echo "(snapshot diff failed)"
import glob
import json
import os
from datetime import datetime, timezone


def load(name):
    """Parse a JSON object from the environment; anything else becomes {}."""
    try:
        value = json.loads(os.environ.get(name, "") or "{}")
    except ValueError:
        return {}
    return value if isinstance(value, dict) else {}


ncs = load("PF_NCS_RAW")
router = load("PF_ROUTER_MODELS")
node_id = os.environ["PF_NODE_ID"]
errors = int(os.environ["PF_ERRORS"])
snap_file = os.environ["PF_SNAP_FILE"]
snap_dir = os.environ["PF_SNAPSHOT_DIR"]

caps = ncs.get("capabilities", {})
now = datetime.now(timezone.utc).replace(tzinfo=None)
snapshot = {
    "timestamp": now.isoformat() + "Z",
    "node_id": ncs.get("node_id", node_id),
    "errors": errors,
    "passed": errors == 0,
    "served_count": ncs.get("served_count", 0),
    "installed_count": ncs.get("installed_count", 0),
    "capabilities": {k: v for k, v in caps.items() if k != "providers"},
    "providers": caps.get("providers", {}),
    "node_load": ncs.get("node_load", {}),
    "router_models_total": router.get("total", 0),
    "capabilities_by_node": router.get("capabilities_by_node", {}),
}

with open(snap_file, "w") as f:
    json.dump(snapshot, f, indent=2, ensure_ascii=False)

# Earlier snapshots of this node. The prefix keeps the node id's original
# case (it must match the file name written above) and is glob-escaped; the
# timestamp pattern keeps "node_" from matching files of "node_b".
stamp = "[0-9]" * 4 + "-" + "[0-9]" * 2 + "-" + "[0-9]" * 2 + "_" + "[0-9]" * 6
pattern = os.path.join(
    glob.escape(snap_dir), glob.escape(node_id + "_") + stamp + ".json"
)
current = os.path.abspath(snap_file)
earlier = [
    p for p in sorted(glob.glob(pattern), reverse=True)
    if os.path.abspath(p) != current
]

prev = None
if earlier:
    with open(earlier[0]) as f:
        prev = json.load(f)

if prev:
    diffs = []
    for key in ("served_count", "installed_count", "router_models_total"):
        ov, nv = prev.get(key, "?"), snapshot.get(key, "?")
        if ov != nv:
            diffs.append(f" {key}: {ov} → {nv}")
    old_caps = prev.get("capabilities", {})
    new_caps = snapshot.get("capabilities", {})
    for k in sorted(set(old_caps) | set(new_caps)):
        ov, nv = old_caps.get(k, "?"), new_caps.get(k, "?")
        if ov != nv:
            diffs.append(f" caps.{k}: {ov} → {nv}")
    if diffs:
        print("Changes vs previous:")
        for d in diffs:
            print(d)
    else:
        print("(no changes vs previous snapshot)")
else:
    print("(first snapshot for this node)")
PY

  info "Snapshot: $snap_file"
}

# ── Ollama model availability check ───────────────────────────────────────────
# Voice routing policy depends on specific models; 502 from BFF = model absent.
# This check probes /api/tags (Ollama REST) to list installed models and
# emits NCS-compatible "installed=false" warnings so Router can exclude them.

#######################################
# Compare the models listed by Ollama with the voice policy lists.
# Globals:   VOICE_REQUIRED_MODELS, VOICE_PREFERRED_MODELS,
#            VOICE_EXCLUDED_MODELS, OLLAMA_URL (read); ERRORS (via fail)
# Arguments: $1 - Ollama base URL (default: $OLLAMA_URL)
# Returns:   always 0; problems are reported through fail/warn.
#######################################
check_ollama_voice_models() {
  local ollama_url="${1:-$OLLAMA_URL}"
  echo "── Ollama voice model availability ($ollama_url) ──"

  local tags_raw
  tags_raw=$(curl -sf "${ollama_url}/api/tags" 2>/dev/null) \
    || { warn "Ollama unreachable at ${ollama_url} — model check skipped"; return 0; }

  # One lower-cased model name per line.
  local installed_names
  installed_names=$(printf '%s' "$tags_raw" | python3 -c '
import json
import sys

for model in json.load(sys.stdin).get("models", []):
    print(str(model.get("name", "")).lower())
' 2>/dev/null) || installed_names=""

  local installed_total=0 name
  while IFS= read -r name; do
    if [[ -n "$name" ]]; then
      installed_total=$((installed_total + 1))
    fi
  done <<<"$installed_names"
  info "Ollama installed: ${installed_total} model(s)"

  local model

  # Check required voice models
  # shellcheck disable=SC2086 — the policy lists are space-separated on purpose
  for model in $VOICE_REQUIRED_MODELS; do
    if model_installed "$model" "$installed_names"; then
      pass "voice_required: ${model} = installed"
    else
      fail "voice_required: ${model} = MISSING — voice_fast_uk will degrade to fallback"
    fi
  done

  # Check preferred voice models (warn not fail)
  # shellcheck disable=SC2086
  for model in $VOICE_PREFERRED_MODELS; do
    if model_installed "$model" "$installed_names"; then
      pass "voice_preferred: ${model} = installed"
    else
      warn "voice_preferred: ${model} = not installed — will be skipped by router"
    fi
  done

  # Check that excluded models are NOT serving voice
  # shellcheck disable=SC2086
  for model in $VOICE_EXCLUDED_MODELS; do
    if model_installed "$model" "$installed_names"; then
      warn "voice_excluded: ${model} is installed — ensure router excludes from voice profiles"
    else
      pass "voice_excluded: ${model} = absent (correct)"
    fi
  done

  # qwen3:8b specific check — known 502 source
  local qwen3_8b_ok=0
  if grep -qxF -- 'qwen3:8b' <<<"$installed_names"; then
    # Extra: try a minimal generation to detect "loaded but broken".
    # No -f here: curl must report the real HTTP status through -w. When the
    # transfer itself fails, the code is normalised to a single "000".
    local gen_code
    gen_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 15 \
      -X POST "${ollama_url}/api/generate" \
      -H "Content-Type: application/json" \
      -d '{"model":"qwen3:8b","prompt":"ping","stream":false,"options":{"num_predict":1}}' \
      2>/dev/null) || gen_code="000"
    if [[ "$gen_code" == "200" ]]; then
      pass "qwen3:8b = installed and serves (HTTP 200)"
      qwen3_8b_ok=1
    else
      warn "qwen3:8b = installed but generate returned HTTP ${gen_code} — exclude from voice_fast_uk prefer list"
    fi
  else
    warn "qwen3:8b = not installed — mark as unavailable in NCS"
  fi

  # An explicit `if` keeps the function's status at 0 when the model is
  # healthy; a trailing `[ ... ] && info` would return 1 and trip `set -e`.
  if ((qwen3_8b_ok == 0)); then
    info "ACTION: remove qwen3:8b from voice_fast_uk.prefer_models until 502 resolved"
  fi
  return 0
}

# ── Main ──────────────────────────────────────────────────────────────────────

#######################################
# Run every check, the voice canary and the snapshot, then print the verdict.
# Returns: exits 1 when at least one critical check failed.
#######################################
main() {
  echo "╔══════════════════════════════════════╗"
  echo "║ Fabric Preflight Check ║"
  echo "╚══════════════════════════════════════╝"
  echo ""

  check_ncs "NCS" "$NODA_NCS"
  echo ""
  check_router "Router" "$ROUTER_URL"
  echo ""
  check_memory_service "Memory Service" "$MEMORY_URL"
  echo ""
  check_ollama_voice_models "$OLLAMA_URL"
  echo ""

  # ── Voice Canary: live synthesis test (hard-fail on voice failure) ──────────
  echo "── Voice Canary (live synthesis) ──────────────────────────────────────"
  local canary_script="${SCRIPT_DIR}/scripts/voice_canary.py"
  if [[ -f "$canary_script" ]] && command -v python3 >/dev/null 2>&1; then
    # Capture the status through `||` so a failing canary is counted and
    # reported instead of aborting the script under `set -e`.
    local canary_exit=0
    MEMORY_SERVICE_URL="$MEMORY_URL" python3 "$canary_script" --mode preflight \
      || canary_exit=$?
    if ((canary_exit != 0)); then
      fail "Voice canary: synthesis test failed (Polina/Ostap not working)"
    fi
  else
    echo " [SKIP] voice_canary.py not found or python3 unavailable"
  fi
  echo ""

  save_and_diff
  echo ""

  if ((ERRORS > 0)); then
    printf '%sPreflight FAILED: %s error(s)%s\n' "$RED" "$ERRORS" "$NC"
    printf '%sBLOCKED: no changes allowed until all errors resolved%s\n' "$RED" "$NC"
    exit 1
  fi
  printf '%sPreflight PASSED — all voice canaries green — changes allowed%s\n' "$GREEN" "$NC"
}

main "$@"