Includes updates across gateway, router, node-worker, memory-service, aurora-service, swapper, sofiia-console UI and node2 infrastructure: - gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service - services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval, offload_client, prompt_builder - services/node-worker: worker.py, main.py, config.py, fabric_metrics - services/memory-service: Dockerfile, database.py, main.py, requirements - services/aurora-service: main.py (+399), kling.py, quality_report.py - services/swapper-service: main.py, swapper_config_node2.yaml - services/sofiia-console: static/index.html (console UI update) - config: agent_registry, crewai_agents/teams, router_agents - ops/fabric_preflight.sh: updated preflight checks - router-config.yml, docker-compose.node2.yml: infra updates - docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated Made-with: Cursor
325 lines
15 KiB
Bash
Executable File
325 lines
15 KiB
Bash
Executable File
#!/usr/bin/env bash
# Fabric Preflight — verify all nodes before changes/deploys.
# Saves snapshot, compares with previous, fails hard on critical issues.
#
# Usage:
#   bash ops/fabric_preflight.sh [NCS_URL] [ROUTER_URL] [MEMORY_URL] [OLLAMA_URL]
#   bash ops/fabric_preflight.sh   # defaults: 127.0.0.1:8099, 127.0.0.1:9102,
#                                  #           127.0.0.1:8000, 127.0.0.1:11434
set -euo pipefail

# Positional arguments (all optional); each falls back to a loopback default.
# The fourth argument (Ollama URL) is read further down, next to the Ollama
# voice-model check.
NODA_NCS="${1:-http://127.0.0.1:8099}"    # $1 — NCS base URL
ROUTER_URL="${2:-http://127.0.0.1:9102}"  # $2 — Router base URL
MEMORY_URL="${3:-http://127.0.0.1:8000}"  # $3 — Memory Service base URL

# ANSI colour escapes. They are stored as literal backslash sequences, so they
# must be printed with `echo -e` or printf's %b to take effect.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
NC='\033[0m'  # reset
|
|
|
|
# Status-line helpers. Each prints " <coloured TAG> <message>" on stdout.
# Arguments: $1 - message text
# The colour tag goes through %b so its \033 escapes are expanded; the message
# goes through %s so backslashes in caller-supplied text are printed verbatim.
pass() { printf ' %b %s\n' "${GREEN}PASS${NC}" "$1"; }

warn() { printf ' %b %s\n' "${YELLOW}WARN${NC}" "$1"; }

# Like the others, but also increments the global ERRORS counter.
fail() {
  printf ' %b %s\n' "${RED}FAIL${NC}" "$1"
  ERRORS=$((ERRORS + 1))
}

info() { printf ' %b %s\n' "${CYAN}INFO${NC}" "$1"; }
|
|
|
|
# Number of failed checks so far; fail() and the voice canary step add to it.
ERRORS=0

# Snapshots are stored in a directory next to this script.
# CDPATH is cleared and cd output discarded so that only pwd's result is
# captured; `--` protects against paths that start with a dash.
SCRIPT_DIR="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
SNAPSHOT_DIR="${SCRIPT_DIR}/preflight_snapshots"
mkdir -p -- "$SNAPSHOT_DIR"
|
|
|
|
# ── NCS check ─────────────────────────────────────────────────────────────────
|
|
|
|
# Print the value stored at a dotted key path of the JSON object on stdin.
# Arguments: $1 - dotted key path, e.g. "node_load.mem_pressure"
#            $2 - text printed when the path is missing or stdin is not JSON
# Outputs:   the value as rendered by Python's print() (JSON true -> "True")
# Returns:   always 0, so a bad payload cannot trip `set -e` in the caller
_ncs_json_get() {
  local path="$1" default="$2"
  python3 -c '
import json, sys
path, default = sys.argv[1], sys.argv[2]
try:
    node = json.load(sys.stdin)
    for key in path.split("."):
        node = node[key]
except Exception:
    node = default
print(node)
' "$path" "$default" 2>/dev/null || printf '%s\n' "$default"
}

# Probe an NCS node's /capabilities endpoint and report on its model pool,
# swapper runtime, capability flags, load, vision models and STT/TTS support.
# Globals:   NCS_RAW, NCS_NODE_ID (written when the endpoint answered);
#            ERRORS (incremented through fail)
# Arguments: $1 - label used in the section header
#            $2 - NCS base URL
# Outputs:   PASS/WARN/FAIL lines on stdout
# Returns:   0 always; problems are recorded through fail()/warn()
check_ncs() {
  local label="$1" url="$2"
  echo "── $label ($url) ──"

  local raw
  raw=$(curl -sf "$url/capabilities" 2>/dev/null) \
    || { fail "NCS unreachable at $url"; return 0; }

  local node_id served installed swapper_status
  node_id=$(_ncs_json_get "node_id" "?" <<<"$raw")
  served=$(_ncs_json_get "served_count" "0" <<<"$raw")
  installed=$(_ncs_json_get "installed_count" "0" <<<"$raw")
  swapper_status=$(_ncs_json_get "runtimes.swapper.status" "?" <<<"$raw")

  # A non-numeric value makes the test itself error out, which lands in the
  # "empty pool" branch; its diagnostic is deliberately discarded.
  if [ "$served" -gt 0 ] 2>/dev/null; then
    pass "node=$node_id served=$served installed=$installed"
  else
    fail "node=$node_id served=$served (empty pool!)"
  fi

  if [[ "$swapper_status" == "disabled" ]]; then
    pass "swapper=disabled"
  else
    warn "swapper=$swapper_status"
  fi

  # Capability flags as "key=value" pairs, leaving out the nested providers map.
  local caps none_marker="(none — P3.5 not deployed?)"
  caps=$(python3 -c '
import json, sys
c = json.load(sys.stdin).get("capabilities", {})
parts = [f"{k}={v}" for k, v in c.items() if k != "providers"]
print(" ".join(parts) if parts else sys.argv[1])
' "$none_marker" <<<"$raw" 2>/dev/null) || caps="$none_marker"
  if [[ "$caps" == "$none_marker" ]]; then
    warn "capabilities: $caps"
  else
    pass "capabilities: $caps"
  fi

  local mem_p inflight
  mem_p=$(_ncs_json_get "node_load.mem_pressure" "?" <<<"$raw")
  inflight=$(_ncs_json_get "node_load.inflight_jobs" "0" <<<"$raw")
  if [[ "$mem_p" == "high" ]]; then
    warn "mem_pressure=$mem_p inflight=$inflight"
  else
    pass "mem_pressure=$mem_p inflight=$inflight"
  fi

  local vision_count
  vision_count=$(python3 -c '
import json, sys
models = json.load(sys.stdin).get("served_models", [])
print(sum(1 for m in models if m.get("type") == "vision"))
' <<<"$raw" 2>/dev/null) || vision_count=0
  if [ "$vision_count" -gt 0 ] 2>/dev/null; then
    pass "vision models: $vision_count"
  else
    warn "no vision models served"
  fi

  # Phase 1: explicit STT/TTS capability check
  local stt_cap tts_cap stt_provider tts_provider
  stt_cap=$(_ncs_json_get "capabilities.stt" "?" <<<"$raw")
  tts_cap=$(_ncs_json_get "capabilities.tts" "?" <<<"$raw")
  stt_provider=$(_ncs_json_get "capabilities.providers.stt" "?" <<<"$raw")
  tts_provider=$(_ncs_json_get "capabilities.providers.tts" "?" <<<"$raw")

  # Python prints JSON true as "True"; a string "true" is accepted as well.
  if [[ "$stt_cap" == "True" || "$stt_cap" == "true" ]]; then
    pass "stt=true provider=$stt_provider"
  else
    warn "stt=false (provider=$stt_provider) — STT not available on this node"
  fi
  if [[ "$tts_cap" == "True" || "$tts_cap" == "true" ]]; then
    pass "tts=true provider=$tts_provider"
  else
    warn "tts=false (provider=$tts_provider) — TTS not available on this node"
  fi

  # Publish the payload and node id for the snapshot step.
  NCS_RAW="$raw"
  NCS_NODE_ID="$node_id"
  return 0
}
|
|
|
|
# ── Memory Service health check ────────────────────────────────────────────────
|
|
|
|
# Probe the Memory Service /health and /voice/status endpoints.
# Arguments: $1 - label used in the section header
#            $2 - Memory Service base URL
# Outputs:   PASS/WARN lines on stdout
# Returns:   0 always; an unreachable service is only a warning here
check_memory_service() {
  local label="$1" url="$2"
  echo "── $label ($url/health) ──"

  # Prints one top-level key of the JSON object on stdin, or a default.
  local py_get='import json,sys; print(json.load(sys.stdin).get(sys.argv[1], sys.argv[2]))'

  local health
  health=$(curl -sf "$url/health" 2>/dev/null) \
    || { warn "Memory Service unreachable at $url (STT/TTS may fail)"; return 0; }

  # The endpoint answered, so a body that is not JSON is still reported as
  # "ok". NOTE(review): any status value, even a non-ok one, is shown as PASS.
  local status
  status=$(python3 -c "$py_get" "status" "?" <<<"$health" 2>/dev/null) || status="ok"
  pass "memory-service health=$status"

  local voice_status
  voice_status=$(curl -sf "$url/voice/status" 2>/dev/null) \
    || { warn "voice/status unreachable"; return 0; }

  # Fall back to "?" instead of letting a parse error abort the whole script.
  local tts_engine stt_engine
  tts_engine=$(python3 -c "$py_get" "tts_engine" "?" <<<"$voice_status" 2>/dev/null) \
    || tts_engine="?"
  stt_engine=$(python3 -c "$py_get" "stt_engine" "?" <<<"$voice_status" 2>/dev/null) \
    || stt_engine="?"
  pass "voice: tts=$tts_engine stt=$stt_engine"
  return 0
}
|
|
|
|
# ── Router check ──────────────────────────────────────────────────────────────
|
|
|
|
# Probe the Router: health, size of the global model pool, and which nodes it
# has capabilities for.
# Globals:   ROUTER_MODELS (written: raw /v1/models body, or '{}' when that
#            request fails); ERRORS (incremented through fail)
# Arguments: $1 - label used in the section header
#            $2 - Router base URL
# Outputs:   PASS/WARN/FAIL lines on stdout
# Returns:   0 always; problems are recorded through fail()/warn()
check_router() {
  local label="$1" url="$2"
  echo "── $label ($url) ──"

  local health
  health=$(curl -sf "$url/health" 2>/dev/null) \
    || { fail "Router unreachable at $url"; return 0; }

  # An unparseable health body becomes status "?" and is reported as a failure
  # instead of aborting the script through errexit.
  local status
  status=$(python3 -c 'import json,sys; print(json.load(sys.stdin).get("status","?"))' \
    <<<"$health" 2>/dev/null) || status="?"
  if [[ "$status" == "ok" ]]; then
    pass "health=$status"
  else
    fail "health=$status"
  fi

  # Fetch the model list once; it feeds both the pool check and the snapshot.
  ROUTER_MODELS=$(curl -sf "$url/v1/models" 2>/dev/null) || ROUTER_MODELS='{}'

  local models_total
  models_total=$(python3 -c 'import json,sys; print(json.load(sys.stdin).get("total",0))' \
    <<<"$ROUTER_MODELS" 2>/dev/null) || models_total=0
  if [ "$models_total" -gt 0 ] 2>/dev/null; then
    pass "global pool: $models_total models"
  else
    fail "global pool empty"
  fi

  # With pipefail a failing curl fails the whole pipeline; treat that the same
  # as an empty node list.
  local caps_nodes
  caps_nodes=$(curl -sf "$url/v1/capabilities" 2>/dev/null | python3 -c '
import json, sys
d = json.load(sys.stdin)
nodes = list(d.get("capabilities_by_node", {}).keys())
names = " ".join(nodes)
print(f"{len(nodes)} node(s): {names}" if nodes else "(none)")
' 2>/dev/null) || caps_nodes="(none)"
  if [[ "$caps_nodes" == "(none)" ]]; then
    warn "capabilities_by_node: $caps_nodes"
  else
    pass "capabilities_by_node: $caps_nodes"
  fi
  return 0
}
|
|
|
|
# ── Snapshot + diff ───────────────────────────────────────────────────────────
|
|
|
|
# State shared between the check functions and the snapshot step. These are
# the values used when a check never got a response.
NCS_RAW="{}"
NCS_NODE_ID="unknown"
ROUTER_MODELS="{}"

# Write a JSON snapshot of the current preflight result and print what changed
# compared with the previous snapshot of the same node.
# Globals:   NCS_RAW, ROUTER_MODELS, NCS_NODE_ID, ERRORS, SNAPSHOT_DIR (read)
# Outputs:   diff summary and an INFO line with the snapshot path on stdout
# Returns:   0; a failing snapshot step prints "(snapshot diff failed)"
#
# Payloads reach Python through environment variables rather than being spliced
# into the program text, so quotes and backslashes in the JSON survive intact
# and payload content can never run as code.
save_and_diff() {
  local ts snap_file
  ts=$(date +%Y-%m-%d_%H%M%S)
  snap_file="${SNAPSHOT_DIR}/${NCS_NODE_ID}_${ts}.json"

  PREFLIGHT_NCS_RAW="$NCS_RAW" \
  PREFLIGHT_ROUTER_MODELS="$ROUTER_MODELS" \
  PREFLIGHT_NODE_ID="$NCS_NODE_ID" \
  PREFLIGHT_ERRORS="$ERRORS" \
  PREFLIGHT_SNAP_FILE="$snap_file" \
  PREFLIGHT_SNAPSHOT_DIR="$SNAPSHOT_DIR" \
  PYTHONIOENCODING="utf-8" \
    python3 -c '
import glob
import json
import os
from datetime import datetime, timezone

env = os.environ


def load(name):
    # Unparseable or non-object payloads degrade to an empty section.
    try:
        value = json.loads(env.get(name, "") or "{}")
    except ValueError:
        return {}
    return value if isinstance(value, dict) else {}


ncs = load("PREFLIGHT_NCS_RAW")
router = load("PREFLIGHT_ROUTER_MODELS")
node_id = env["PREFLIGHT_NODE_ID"]
errors = int(env["PREFLIGHT_ERRORS"])
snap_file = env["PREFLIGHT_SNAP_FILE"]
snap_dir = env["PREFLIGHT_SNAPSHOT_DIR"]

caps = ncs.get("capabilities", {})
if not isinstance(caps, dict):
    caps = {}

snapshot = {
    # Naive UTC time plus a literal Z suffix.
    "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "node_id": ncs.get("node_id", node_id),
    "errors": errors,
    "passed": errors == 0,
    "served_count": ncs.get("served_count", 0),
    "installed_count": ncs.get("installed_count", 0),
    "capabilities": {k: v for k, v in caps.items() if k != "providers"},
    "providers": caps.get("providers", {}),
    "node_load": ncs.get("node_load", {}),
    "router_models_total": router.get("total", 0),
    "capabilities_by_node": router.get("capabilities_by_node", {}),
}

with open(snap_file, "w", encoding="utf-8") as f:
    json.dump(snapshot, f, indent=2, ensure_ascii=False)

# Find previous snapshot for diff. The prefix is the node id exactly as used
# in the file name above; timestamps in the names sort chronologically, so
# after a reverse sort index 0 is the file just written and index 1 the
# previous run.
pattern = glob.escape(os.path.join(snap_dir, node_id + "_")) + "*.json"
snaps = sorted(glob.glob(pattern), reverse=True)
prev = None
if len(snaps) >= 2:
    with open(snaps[1], encoding="utf-8") as f:
        prev = json.load(f)

if prev:
    diffs = []
    for key in ("served_count", "installed_count", "router_models_total"):
        ov, nv = prev.get(key, "?"), snapshot.get(key, "?")
        if ov != nv:
            diffs.append(f" {key}: {ov} → {nv}")
    old_caps = prev.get("capabilities", {})
    new_caps = snapshot.get("capabilities", {})
    for k in sorted(set(old_caps) | set(new_caps)):
        ov, nv = old_caps.get(k, "?"), new_caps.get(k, "?")
        if ov != nv:
            diffs.append(f" caps.{k}: {ov} → {nv}")
    if diffs:
        print("Changes vs previous:")
        for d in diffs:
            print(d)
    else:
        print("(no changes vs previous snapshot)")
else:
    print("(first snapshot for this node)")
' || echo "(snapshot diff failed)"

  info "Snapshot: $snap_file"
}
|
|
|
|
# ── Ollama model availability check ──────────────────────────────────────────
|
|
# Voice routing policy depends on specific models; 502 from BFF = model absent.
|
|
# This check probes /api/tags (Ollama REST) to list installed models and
|
|
# emits NCS-compatible "installed=false" warnings so Router can exclude them.
|
|
|
|
# $4 — Ollama base URL (optional).
OLLAMA_URL="${4:-http://127.0.0.1:11434}"

# Voice policy: models required/preferred for voice_fast_uk / voice_quality_uk
# Each list is a space-separated string of Ollama model names.
VOICE_REQUIRED_MODELS="gemma3:latest"
VOICE_PREFERRED_MODELS="qwen3.5:35b-a3b qwen3:14b"
VOICE_EXCLUDED_MODELS="glm-4.7-flash:32k glm-4.7-flash"

# Tell whether a model counts as installed.
# A model matches when an installed name equals it, or when an installed name
# belongs to the same family (the part before ":") under any tag. Comparison
# is case-insensitive and literal: "." in a model name is not a wildcard.
# NOTE(review): the family rule means "qwen3:14b" is reported as installed when
# only another qwen3 tag is present; kept as-is, confirm that is intended.
# Arguments: $1 - model name to look for
#            $2 - space-separated list of installed model names
# Returns:   0 when matched, 1 otherwise
_voice_model_installed() {
  local model="$1" installed="$2"
  printf '%s\n' "$installed" | tr ' ' '\n' | awk -v want="$model" '
    BEGIN {
      want = tolower(want)
      family = want
      sub(/:.*/, "", family)
      family = family ":"
      hit = 0
    }
    {
      name = tolower($0)
      if (name == want || index(name, family) == 1) hit = 1
    }
    END { exit(hit ? 0 : 1) }
  '
}

# Check which voice-policy models Ollama has installed and whether qwen3:8b
# can actually generate.
# Globals:   OLLAMA_URL, VOICE_REQUIRED_MODELS, VOICE_PREFERRED_MODELS,
#            VOICE_EXCLUDED_MODELS (read); ERRORS (incremented through fail)
# Arguments: $1 - Ollama base URL (defaults to OLLAMA_URL)
# Outputs:   PASS/WARN/FAIL/INFO lines on stdout
# Returns:   0 always, so that calling it under `set -e` never ends the script;
#            problems are recorded through fail()/warn()
check_ollama_voice_models() {
  local ollama_url="${1:-$OLLAMA_URL}"
  echo "── Ollama voice model availability ($ollama_url) ──"

  local tags_raw
  tags_raw=$(curl -sf "${ollama_url}/api/tags" 2>/dev/null) \
    || { warn "Ollama unreachable at ${ollama_url} — model check skipped"; return 0; }

  local installed_names
  installed_names=$(python3 -c '
import json, sys
models = json.load(sys.stdin).get("models", [])
print(" ".join(m.get("name", "") for m in models))
' <<<"$tags_raw" 2>/dev/null) || installed_names=""

  # grep -c already prints 0 (and exits 1) when nothing matches, so only its
  # exit status is neutralised; echoing another 0 would double the output.
  local installed_count
  installed_count=$(printf '%s\n' "$installed_names" | tr ' ' '\n' | grep -c . || true)
  info "Ollama installed: ${installed_count} model(s)"

  # The loops below rely on word splitting of the space-separated lists.
  local model

  # Check required voice models
  for model in $VOICE_REQUIRED_MODELS; do
    if _voice_model_installed "$model" "$installed_names"; then
      pass "voice_required: ${model} = installed"
    else
      fail "voice_required: ${model} = MISSING — voice_fast_uk will degrade to fallback"
    fi
  done

  # Check preferred voice models (warn not fail)
  for model in $VOICE_PREFERRED_MODELS; do
    if _voice_model_installed "$model" "$installed_names"; then
      pass "voice_preferred: ${model} = installed"
    else
      warn "voice_preferred: ${model} = not installed — will be skipped by router"
    fi
  done

  # Check that excluded models are NOT serving voice
  for model in $VOICE_EXCLUDED_MODELS; do
    if _voice_model_installed "$model" "$installed_names"; then
      warn "voice_excluded: ${model} is installed — ensure router excludes from voice profiles"
    else
      pass "voice_excluded: ${model} = absent (correct)"
    fi
  done

  # qwen3:8b specific check — known 502 source
  local qwen3_8b_ok=0
  if printf '%s\n' "$installed_names" | tr ' ' '\n' | grep -qixF -- "qwen3:8b"; then
    # Extra: try a minimal generation to detect "loaded but broken".
    # No --fail here: curl then exits 0 on HTTP errors and -w reports the real
    # status code; only transport failures fall back to "000".
    local gen_code
    gen_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 15 \
      -X POST "${ollama_url}/api/generate" \
      -H "Content-Type: application/json" \
      -d '{"model":"qwen3:8b","prompt":"ping","stream":false,"options":{"num_predict":1}}' \
      2>/dev/null) || gen_code="000"
    if [[ "$gen_code" == "200" ]]; then
      pass "qwen3:8b = installed and serves (HTTP 200)"
      qwen3_8b_ok=1
    else
      warn "qwen3:8b = installed but generate returned HTTP ${gen_code} — exclude from voice_fast_uk prefer list"
    fi
  else
    warn "qwen3:8b = not installed — mark as unavailable in NCS"
  fi

  # A trailing `[ ... ] && info` would make the function return 1 in the
  # healthy case, which ends the script when it runs under `set -e`.
  if (( qwen3_8b_ok == 0 )); then
    info "ACTION: remove qwen3:8b from voice_fast_uk.prefer_models until 502 resolved"
  fi
  return 0
}
|
|
|
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
|
|
|
echo "╔══════════════════════════════════════╗"
echo "║ Fabric Preflight Check ║"
echo "╚══════════════════════════════════════╝"
echo ""

# Each check reports problems through fail()/warn() and the ERRORS counter,
# not through its exit status. The `|| true` guards keep a stray non-zero
# return (or an internal command failure under `set -e`) from ending the run
# before the snapshot and the final verdict are printed.
check_ncs "NCS" "$NODA_NCS" || true
echo ""
check_router "Router" "$ROUTER_URL" || true
echo ""
check_memory_service "Memory Service" "$MEMORY_URL" || true
echo ""
check_ollama_voice_models "$OLLAMA_URL" || true
echo ""
|
|
|
|
# ── Voice Canary: live synthesis test (hard-fail on voice failure) ────────────

# Run the voice canary script in preflight mode and record a failure when it
# exits non-zero.
# Globals:   MEMORY_URL (read; passed to the canary as MEMORY_SERVICE_URL),
#            CANARY_SCRIPT, CANARY_EXIT (written), ERRORS (via fail)
# Arguments: $1 - path to the canary script (optional; defaults to
#                 scripts/voice_canary.py next to this script)
# Returns:   0 always
run_voice_canary() {
  CANARY_SCRIPT="${1:-${SCRIPT_DIR:-$(dirname "$0")}/scripts/voice_canary.py}"
  CANARY_EXIT=0
  if [[ -f "$CANARY_SCRIPT" ]] && command -v python3 >/dev/null 2>&1; then
    # Capture the status with `||`: under `set -e` a bare failing command would
    # end the script before the failure could be counted and reported.
    MEMORY_SERVICE_URL="$MEMORY_URL" python3 "$CANARY_SCRIPT" --mode preflight \
      || CANARY_EXIT=$?
    if (( CANARY_EXIT != 0 )); then
      fail "Voice canary: synthesis test failed (Polina/Ostap not working)"
    fi
  else
    echo " [SKIP] voice_canary.py not found or python3 unavailable"
  fi
  return 0
}

echo "── Voice Canary (live synthesis) ──────────────────────────────────────"
run_voice_canary
echo ""
|
|
|
|
save_and_diff
echo ""

# Final verdict: any recorded error blocks the change and exits with status 1.
if (( ERRORS > 0 )); then
  echo -e "${RED}Preflight FAILED: $ERRORS error(s)${NC}"
  echo -e "${RED}BLOCKED: no changes allowed until all errors resolved${NC}"
  exit 1
else
  echo -e "${GREEN}Preflight PASSED — all voice canaries green — changes allowed${NC}"
fi
|