Files
microdao-daarion/ops/fabric_preflight.sh
Apple 9a36020316 P3.5-P3.7: 2-layer inventory, capability routing, STT/TTS adapters, Dev Contract
NCS:
- _collect_worker_caps() fetches capability flags from node-worker /caps
- _derive_capabilities() merges served model types + worker provider flags
- installed_artifacts replaces inventory_only (disk scan with DISK_SCAN_PATHS env)
- New endpoints: /capabilities/caps, /capabilities/installed

Node Worker:
- STT_PROVIDER, TTS_PROVIDER, OCR_PROVIDER, IMAGE_PROVIDER env flags
- /caps endpoint returns capabilities + providers for NCS aggregation
- STT adapter (providers/stt_mlx_whisper.py) — remote + local mode
- TTS adapter (providers/tts_mlx_kokoro.py) — remote + local mode
- OCR handler via vision_prompted (ollama_vision with OCR prompt)
- NATS subjects: node.{id}.stt/tts/ocr/image.request

Router:
- POST /v1/capability/{stt,tts,ocr,image} — capability-based offload routing
- GET /v1/capabilities — global view with capabilities_by_node
- require_fresh_caps(ttl) preflight guard
- find_nodes_with_capability(cap) + load-based node selection

Ops:
- ops/fabric_snapshot.py — full runtime snapshot collector
- ops/fabric_preflight.sh — quick check + snapshot save + diff
- docs/fabric_contract.md — Dev Contract v0.1 (preflight-first)
- tests/test_fabric_contract.py — CI enforcement (6 tests)

Made-with: Cursor
2026-02-27 05:24:09 -08:00

187 lines
8.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# Fabric Preflight — verify all nodes before changes/deploys.
# Saves snapshot, compares with previous, fails hard on critical issues.
#
# Usage:
#   bash ops/fabric_preflight.sh [NCS_URL] [ROUTER_URL]
#   bash ops/fabric_preflight.sh   # defaults: NCS http://127.0.0.1:8099, router http://127.0.0.1:9102
# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -o errexit -o nounset -o pipefail

# Endpoints under test: first argument is the NCS base URL, second the router
# base URL; both fall back to the local defaults.
NODA_NCS=${1:-http://127.0.0.1:8099}
ROUTER_URL=${2:-http://127.0.0.1:9102}

# ANSI colour codes, stored as literal backslash sequences (expanded at print
# time by the output helpers).
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
CYAN='\033[0;36m'
# ── Output helpers ────────────────────────────────────────────────────────────
# Each helper prints one status line: a coloured tag followed by the message
# given as $1. The colour variables hold literal backslash escapes, so they are
# expanded with %b; the message is printed with %s so that backslashes inside
# it (e.g. values copied from a JSON payload) are shown verbatim instead of
# being interpreted as escape sequences.
pass() { printf ' %b%s%b %s\n' "$GREEN" "PASS" "$NC" "$1"; }
warn() { printf ' %b%s%b %s\n' "$YELLOW" "WARN" "$NC" "$1"; }
# fail additionally bumps the global ERRORS counter that decides the final
# exit status of the script.
fail() { printf ' %b%s%b %s\n' "$RED" "FAIL" "$NC" "$1"; ERRORS=$((ERRORS+1)); }
info() { printf ' %b%s%b %s\n' "$CYAN" "INFO" "$NC" "$1"; }
# Number of failed checks so far; incremented by fail().
ERRORS=0

# Snapshots live in a "preflight_snapshots" directory next to this script,
# regardless of the caller's working directory.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
SCRIPT_DIR=$(cd "$SCRIPT_DIR" && pwd)
SNAPSHOT_DIR="$SCRIPT_DIR/preflight_snapshots"
mkdir -p "$SNAPSHOT_DIR"
# ── NCS check ─────────────────────────────────────────────────────────────────
# check_ncs LABEL URL
#   Fetches URL/capabilities and reports, via pass/warn/fail:
#     - served/installed model counts (FAIL when nothing is served)
#     - swapper runtime status (WARN unless "disabled")
#     - capability flags (WARN when none are published)
#     - memory pressure and in-flight jobs (WARN on "high" pressure)
#     - number of served vision models (WARN when zero)
#   Globals written: NCS_RAW (raw payload) and NCS_NODE_ID, only when the
#   payload was fetched and parsed. ERRORS is incremented through fail().
#   Returns 0 in all cases; failures are recorded in ERRORS.
check_ncs() {
  local label="$1" url="$2"
  echo "── $label ($url) ──"

  local raw
  raw=$(curl -sf "$url/capabilities" 2>/dev/null) || { fail "NCS unreachable at $url"; return; }

  # Parse the payload once and emit one field per line. A parse failure is
  # reported as a FAIL instead of letting `set -e` kill the script silently.
  local fields
  fields=$(printf '%s' "$raw" | PYTHONIOENCODING=utf-8 python3 -c '
import json, sys

doc = json.load(sys.stdin)
if not isinstance(doc, dict):
    sys.exit(1)

def section(parent, key):
    value = parent.get(key) if isinstance(parent, dict) else None
    return value if isinstance(value, dict) else {}

caps = section(doc, "capabilities")
load = section(doc, "node_load")
models = doc.get("served_models")
if not isinstance(models, list):
    models = []

fields = (
    doc.get("node_id", "?"),
    doc.get("served_count", 0),
    doc.get("installed_count", 0),
    section(section(doc, "runtimes"), "swapper").get("status", "?"),
    " ".join(f"{k}={v}" for k, v in caps.items() if k != "providers"),
    load.get("mem_pressure", "?"),
    load.get("inflight_jobs", 0),
    sum(1 for m in models if isinstance(m, dict) and m.get("type") == "vision"),
)
for value in fields:
    # One field per line: flatten any embedded line breaks.
    print(str(value).replace("\r", " ").replace("\n", " "))
' 2>/dev/null) || { fail "NCS returned an unparsable /capabilities payload at $url"; return; }

  local node_id served installed swapper_status caps mem_p inflight vision_count
  {
    IFS= read -r node_id
    IFS= read -r served
    IFS= read -r installed
    IFS= read -r swapper_status
    IFS= read -r caps
    IFS= read -r mem_p
    IFS= read -r inflight
    IFS= read -r vision_count
  } <<<"$fields" || true

  # Non-numeric counts make the test error out, which lands in the else branch.
  if [ "$served" -gt 0 ] 2>/dev/null; then
    pass "node=$node_id served=$served installed=$installed"
  else
    fail "node=$node_id served=$served (empty pool!)"
  fi

  if [ "$swapper_status" = "disabled" ]; then
    pass "swapper=disabled"
  else
    warn "swapper=$swapper_status"
  fi

  # An empty flag list means the node publishes no capabilities at all.
  if [ -z "$caps" ]; then
    warn "capabilities: (none — P3.5 not deployed?)"
  else
    pass "capabilities: $caps"
  fi

  if [ "$mem_p" = "high" ]; then
    warn "mem_pressure=$mem_p inflight=$inflight"
  else
    pass "mem_pressure=$mem_p inflight=$inflight"
  fi

  if [ "$vision_count" -gt 0 ] 2>/dev/null; then
    pass "vision models: $vision_count"
  else
    warn "no vision models served"
  fi

  NCS_RAW="$raw"
  NCS_NODE_ID="$node_id"
}
# ── Router check ──────────────────────────────────────────────────────────────
# check_router LABEL URL
#   Verifies the router at URL:
#     - GET /health must answer with status "ok" (FAIL otherwise)
#     - GET /v1/models must report total > 0 (FAIL otherwise)
#     - GET /v1/capabilities lists the nodes under capabilities_by_node
#       (WARN when the endpoint is missing or lists no nodes)
#   Globals written: ROUTER_MODELS, the /v1/models payload with
#   capabilities_by_node merged in when the models payload lacks that key, so
#   the snapshot step can record it. ERRORS is incremented through fail().
#   Returns 0 in all cases; failures are recorded in ERRORS.
check_router() {
  local label="$1" url="$2"
  echo "── $label ($url) ──"

  local health
  health=$(curl -sf "$url/health" 2>/dev/null) || { fail "Router unreachable at $url"; return; }

  # A malformed health body is reported as health=? rather than aborting the
  # script through `set -e`.
  local status
  status=$(printf '%s' "$health" \
    | python3 -c 'import json,sys;print(json.load(sys.stdin).get("status","?"))' 2>/dev/null) || status="?"
  if [ "$status" = "ok" ]; then
    pass "health=$status"
  else
    fail "health=$status"
  fi

  # Fetch each endpoint once; an unavailable endpoint degrades to an empty
  # object so the checks below turn into FAIL/WARN lines.
  local models_json caps_json
  models_json=$(curl -sf "$url/v1/models" 2>/dev/null) || models_json='{}'
  caps_json=$(curl -sf "$url/v1/capabilities" 2>/dev/null) || caps_json='{}'

  # The payloads travel through the environment so their content is never
  # interpreted as shell or Python source.
  local summary models_total=0 caps_nodes="(none)" merged=""
  if summary=$(FP_MODELS="$models_json" FP_CAPS="$caps_json" python3 -c '
import json, os

def load(name):
    try:
        value = json.loads(os.environ.get(name) or "{}")
    except ValueError:
        return {}
    return value if isinstance(value, dict) else {}

models, caps = load("FP_MODELS"), load("FP_CAPS")
by_node = caps.get("capabilities_by_node")
if not isinstance(by_node, dict):
    by_node = {}
nodes = list(by_node)

print(models.get("total", 0))
print("%d node(s): %s" % (len(nodes), " ".join(nodes)) if nodes else "(none)")
if by_node and "capabilities_by_node" not in models:
    models["capabilities_by_node"] = by_node
print(json.dumps(models))
' 2>/dev/null); then
    {
      IFS= read -r models_total
      IFS= read -r caps_nodes
      IFS= read -r merged
    } <<<"$summary" || true
  fi

  if [ "$models_total" -gt 0 ] 2>/dev/null; then
    pass "global pool: $models_total models"
  else
    fail "global pool empty"
  fi

  if [ "$caps_nodes" = "(none)" ]; then
    warn "capabilities_by_node: $caps_nodes"
  else
    pass "capabilities_by_node: $caps_nodes"
  fi

  # Fall back to the raw models payload if the summary step produced nothing.
  ROUTER_MODELS="${merged:-$models_json}"
}
# ── Snapshot + diff ───────────────────────────────────────────────────────────
# Defaults used when a check returned before publishing its results.
NCS_RAW="{}"
NCS_NODE_ID="unknown"
ROUTER_MODELS="{}"

# save_and_diff
#   Writes a JSON snapshot of the current run to
#   $SNAPSHOT_DIR/<node>_<timestamp>.json and prints what changed compared to
#   the most recent earlier snapshot of the same node.
#   Globals read: NCS_RAW, ROUTER_MODELS, NCS_NODE_ID, ERRORS, SNAPSHOT_DIR.
#   Best effort: a failure prints "(snapshot diff failed)" and does not change
#   ERRORS or the exit status.
save_and_diff() {
  local ts
  ts=$(date +%Y-%m-%d_%H%M%S)

  # Keep the file name filesystem-safe: a node id such as "?" (reported when
  # the payload has no node_id) or one containing "/" must not act as a glob
  # or a path separator.
  local safe_id="${NCS_NODE_ID//[^A-Za-z0-9._-]/_}"
  local snap_file="${SNAPSHOT_DIR}/${safe_id}_${ts}.json"

  # All inputs are handed to Python through the environment and the program
  # text comes from a quoted here-document, so payload content (quotes,
  # backslashes, apostrophes) is never interpreted as shell or Python source.
  FP_NCS_RAW="$NCS_RAW" \
  FP_ROUTER_MODELS="$ROUTER_MODELS" \
  FP_NODE_ID="$NCS_NODE_ID" \
  FP_FILE_PREFIX="${safe_id}_" \
  FP_ERRORS="$ERRORS" \
  FP_SNAP_FILE="$snap_file" \
  FP_SNAPSHOT_DIR="$SNAPSHOT_DIR" \
  PYTHONIOENCODING=utf-8 \
    python3 - <<'PY' || echo "(snapshot diff failed)"
import json
import os
from datetime import datetime, timezone

env = os.environ


def load_obj(name):
    """Parse a JSON object from an environment variable; anything else is {}."""
    try:
        value = json.loads(env.get(name) or "{}")
    except ValueError:
        return {}
    return value if isinstance(value, dict) else {}


ncs = load_obj("FP_NCS_RAW")
router = load_obj("FP_ROUTER_MODELS")
errors = int(env.get("FP_ERRORS") or 0)
caps = ncs.get("capabilities")
if not isinstance(caps, dict):
    caps = {}

snapshot = {
    # Naive UTC timestamp with a trailing "Z", without the deprecated utcnow().
    "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "node_id": ncs.get("node_id", env["FP_NODE_ID"]),
    "errors": errors,
    "passed": errors == 0,
    "served_count": ncs.get("served_count", 0),
    "installed_count": ncs.get("installed_count", 0),
    "capabilities": {k: v for k, v in caps.items() if k != "providers"},
    "providers": caps.get("providers", {}),
    "node_load": ncs.get("node_load", {}),
    "router_models_total": router.get("total", 0),
    "capabilities_by_node": router.get("capabilities_by_node", {}),
}

snap_file = env["FP_SNAP_FILE"]
with open(snap_file, "w", encoding="utf-8") as f:
    json.dump(snapshot, f, indent=2, ensure_ascii=False)

# Find the previous snapshot of this node: same prefix as the file just
# written (compared case-insensitively), excluding the new file itself.
snap_dir = env["FP_SNAPSHOT_DIR"]
prefix = env["FP_FILE_PREFIX"].lower()
current = os.path.basename(snap_file)
earlier = sorted(
    (
        name
        for name in os.listdir(snap_dir)
        if name != current
        and name.lower().startswith(prefix)
        and name.lower().endswith(".json")
    ),
    key=str.lower,
)

if not earlier:
    print("(first snapshot for this node)")
else:
    prev = None
    try:
        with open(os.path.join(snap_dir, earlier[-1]), encoding="utf-8") as f:
            prev = json.load(f)
    except (OSError, ValueError):
        prev = None
    if not isinstance(prev, dict):
        print("(previous snapshot unreadable: %s)" % earlier[-1])
    else:
        diffs = []
        for key in ("served_count", "installed_count", "router_models_total"):
            ov, nv = prev.get(key, "?"), snapshot.get(key, "?")
            if ov != nv:
                diffs.append(" %s: %s \u2192 %s" % (key, ov, nv))
        old_caps = prev.get("capabilities")
        if not isinstance(old_caps, dict):
            old_caps = {}
        new_caps = snapshot["capabilities"]
        for k in sorted(set(old_caps) | set(new_caps)):
            ov, nv = old_caps.get(k, "?"), new_caps.get(k, "?")
            if ov != nv:
                diffs.append(" caps.%s: %s \u2192 %s" % (k, ov, nv))
        if diffs:
            print("Changes vs previous:")
            for line in diffs:
                print(line)
        else:
            print("(no changes vs previous snapshot)")
PY

  # Only advertise the snapshot path when the file was actually written.
  if [ -f "$snap_file" ]; then
    info "Snapshot: $snap_file"
  fi
}
# ── Main ──────────────────────────────────────────────────────────────────────
# Banner, then the two checks and the snapshot step, each followed by a blank
# line; the accumulated ERRORS count decides the exit status.
printf '%s\n' \
  "╔══════════════════════════════════════╗" \
  "║ Fabric Preflight Check ║" \
  "╚══════════════════════════════════════╝" \
  ""

check_ncs "NCS" "$NODA_NCS"
echo ""

check_router "Router" "$ROUTER_URL"
echo ""

save_and_diff
echo ""

if [ "$ERRORS" -eq 0 ]; then
  echo -e "${GREEN}Preflight PASSED — changes allowed${NC}"
else
  echo -e "${RED}Preflight FAILED: $ERRORS error(s)${NC}"
  echo -e "${RED}BLOCKED: no changes allowed until all errors resolved${NC}"
  exit 1
fi