### Backend (city-service)
- Node Registry + Self-Healing API (migration 039)
- Improved get_all_nodes() with robust fallback for node_registry/node_cache
- Agent Prompts Runtime API for DAGI Router integration
- DAGI Router Audit endpoints (phantom/stale detection)
- Node Agents API (Guardian/Steward)
- Node metrics extended (CPU/GPU/RAM/Disk)

### Frontend (apps/web)
- Node Directory with improved error handling
- Node Cabinet with metrics cards
- DAGI Router Card component
- Node Metrics Card component
- useDAGIAudit hook

### Scripts
- check-invariants.py - deploy verification
- node-bootstrap.sh - node self-registration
- node-guardian-loop.py - continuous self-healing
- dagi_agent_audit.py - DAGI audit utility

### Migrations
- 034: Agent prompts seed
- 035: Agent DAGI audit
- 036: Node metrics extended
- 037: Node agents complete
- 038: Agent prompts full coverage
- 039: Node registry self-healing

### Tests
- test_infra_smoke.py
- test_agent_prompts_runtime.py
- test_dagi_router_api.py

### Documentation
- DEPLOY_CHECKLIST_2024_11_30.md
- Multiple TASK_PHASE docs
156 lines
4.2 KiB
Bash
Executable File
156 lines
4.2 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# DAARION Node Bootstrap Script
# Performs node self-registration at startup.
#
# Usage:
#   ./scripts/node-bootstrap.sh
#
# Environment variables:
#   CITY_SERVICE_URL  - city-service URL (default: http://localhost:7001)
#   NODE_ID           - Node identifier (required)
#   NODE_NAME         - Node name (required)
#   NODE_ENVIRONMENT  - production|development|staging (default: development)
#   NODE_HOSTNAME     - Hostname (optional)
#   NODE_ROLES        - Comma-separated roles: gpu,ai_runtime,storage (default: gpu,ai_runtime)
#   NODE_DESCRIPTION  - Node description (optional)

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Colors
# ANSI escape sequences used for the log prefixes. They are kept as literal
# backslash sequences and are expanded at print time (the log helpers print
# them with escape interpretation enabled).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # "no color": resets the terminal attributes
readonly RED GREEN YELLOW BLUE NC

#######################################
# Print an informational message with a blue [NODE-BOOTSTRAP] prefix.
# Globals:   BLUE, NC (read; treated as empty when unset)
# Arguments: $1 - message text
# Outputs:   one line on stdout
#######################################
log_info() {
  # Only the color codes go through %b (escape expansion); the message is
  # printed verbatim with %s so backslashes inside it are not reinterpreted.
  printf '%b[NODE-BOOTSTRAP]%b %s\n' "${BLUE:-}" "${NC:-}" "${1:-}"
}

#######################################
# Print a success message with a green [NODE-BOOTSTRAP] prefix and a check mark.
# Globals:   GREEN, NC (read; treated as empty when unset)
# Arguments: $1 - message text
# Outputs:   one line on stdout
#######################################
log_success() {
  # Only the color codes go through %b (escape expansion); the message is
  # printed verbatim with %s so backslashes inside it are not reinterpreted.
  printf '%b[NODE-BOOTSTRAP]%b ✅ %s\n' "${GREEN:-}" "${NC:-}" "${1:-}"
}

#######################################
# Print an error message with a red [NODE-BOOTSTRAP] prefix and a cross mark.
# Globals:   RED, NC (read; treated as empty when unset)
# Arguments: $1 - message text
# Outputs:   one line on stderr, so errors stay visible when stdout is
#            captured or redirected
#######################################
log_error() {
  # Only the color codes go through %b (escape expansion); the message is
  # printed verbatim with %s so backslashes inside it are not reinterpreted.
  printf '%b[NODE-BOOTSTRAP]%b ❌ %s\n' "${RED:-}" "${NC:-}" "${1:-}" >&2
}

#######################################
# Print a warning message with a yellow [NODE-BOOTSTRAP] prefix and a warning sign.
# Globals:   YELLOW, NC (read; treated as empty when unset)
# Arguments: $1 - message text
# Outputs:   one line on stderr, so diagnostics stay separate from stdout
#######################################
log_warning() {
  # Only the color codes go through %b (escape expansion); the message is
  # printed verbatim with %s so backslashes inside it are not reinterpreted.
  printf '%b[NODE-BOOTSTRAP]%b ⚠️ %s\n' "${YELLOW:-}" "${NC:-}" "${1:-}" >&2
}

# Configuration
# Every value can be supplied through the environment; the defaults below
# apply when a variable is unset or empty.
CITY_SERVICE_URL="${CITY_SERVICE_URL:-http://localhost:7001}"
# Drop one trailing slash so the endpoint paths appended later do not
# produce a double slash in the request URL.
CITY_SERVICE_URL="${CITY_SERVICE_URL%/}"
NODE_ID="${NODE_ID:-}"
NODE_NAME="${NODE_NAME:-}"
NODE_ENVIRONMENT="${NODE_ENVIRONMENT:-development}"
# Fall back to the system hostname; stays empty when `hostname` is missing
# or fails.
NODE_HOSTNAME="${NODE_HOSTNAME:-$(hostname 2>/dev/null || true)}"
NODE_ROLES="${NODE_ROLES:-gpu,ai_runtime}"
NODE_DESCRIPTION="${NODE_DESCRIPTION:-}"

# Retry settings
MAX_RETRIES=5  # total number of registration attempts
RETRY_DELAY=5  # seconds to sleep between attempts

# Validate required params
# Every missing variable is reported before exiting, so the operator sees
# all problems in a single run instead of fixing them one at a time.
missing_required=0

if [[ -z "${NODE_ID:-}" ]]; then
  log_error "NODE_ID is required"
  missing_required=1
fi

if [[ -z "${NODE_NAME:-}" ]]; then
  log_error "NODE_NAME is required"
  missing_required=1
fi

if (( missing_required )); then
  exit 1
fi

#######################################
# Escape a string for embedding inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
#######################################
json_escape() {
  local s=${1-}
  s=${s//\\/\\\\} # backslash first, so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

#######################################
# Convert a comma-separated role list into a JSON array of strings.
# Surrounding whitespace is trimmed from each role and empty entries are
# skipped, so "gpu, storage," yields ["gpu", "storage"].
# Arguments: $1 - comma-separated roles
# Outputs:   JSON array on stdout (no trailing newline)
#######################################
roles_csv_to_json() {
  local csv=${1-} role out="" i
  local -a parts=()

  IFS=',' read -ra parts <<< "$csv"
  # Index loop: avoids expanding an empty array, which older bash versions
  # reject under `set -u`.
  for (( i = 0; i < ${#parts[@]}; i++ )); do
    role=${parts[i]}
    role="${role#"${role%%[![:space:]]*}"}" # trim leading whitespace
    role="${role%"${role##*[![:space:]]}"}" # trim trailing whitespace
    [[ -n "$role" ]] || continue
    out+="${out:+, }\"$(json_escape "$role")\""
  done

  printf '[%s]' "$out"
}

# Convert roles to JSON array
roles_json=$(roles_csv_to_json "${NODE_ROLES:-}")

# Build payload
# Every string value is JSON-escaped so quotes or backslashes in a name or
# description cannot produce an invalid request body.
printf -v payload '{\n  "id": "%s",\n  "name": "%s",\n  "hostname": "%s",\n  "environment": "%s",\n  "roles": %s,\n  "description": "%s"\n}' \
  "$(json_escape "${NODE_ID:-}")" \
  "$(json_escape "${NODE_NAME:-}")" \
  "$(json_escape "${NODE_HOSTNAME:-}")" \
  "$(json_escape "${NODE_ENVIRONMENT:-}")" \
  "$roles_json" \
  "$(json_escape "${NODE_DESCRIPTION:-}")"

# Log the effective settings before contacting the city service.
log_info "Starting node self-registration..."
log_info "  Node ID: $NODE_ID"
log_info "  Node Name: $NODE_NAME"
log_info "  Environment: $NODE_ENVIRONMENT"
log_info "  Hostname: $NODE_HOSTNAME"
log_info "  Roles: $NODE_ROLES"
log_info "  City Service: $CITY_SERVICE_URL"

# Self-registration with retries
# POST the payload to the city service up to MAX_RETRIES times, sleeping
# RETRY_DELAY seconds between attempts. The script exits 0 as soon as the
# service answers HTTP 200 with "success": true in the body; otherwise the
# loop falls through to the code that follows it.
attempt=1
while (( attempt <= MAX_RETRIES )); do
  log_info "Registration attempt $attempt/$MAX_RETRIES..."

  # -w appends the HTTP status on its own final line so the body and the
  # status can be separated afterwards. The exit status is captured
  # explicitly so a failing curl does not abort the script under `set -e`.
  curl_status=0
  response=$(curl -s -w '\n%{http_code}' \
    -X POST "${CITY_SERVICE_URL}/internal/nodes/register-or-update" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    --max-time 10 \
    2>/dev/null) || curl_status=$?

  if (( curl_status == 0 )); then
    http_code=${response##*$'\n'} # last line: status code
    body=${response%$'\n'*}       # everything before the last line
  else
    # Any transport failure (timeout, refused connection, curl missing) is
    # reported as status 000 and its partial output is discarded.
    http_code="000"
    body=""
  fi

  # The response is matched textually; [[:space:]] is used instead of \s
  # because \s is not part of POSIX regular expressions.
  if [[ "$http_code" == "200" ]] \
    && grep -q '"success":[[:space:]]*true' <<< "$body"; then
    if grep -q '"is_new":[[:space:]]*true' <<< "$body"; then
      log_success "Node registered successfully (new registration)"
    else
      log_success "Node updated successfully"
    fi

    # Optional: run initial heartbeat. Best effort only - a failure here is
    # deliberately ignored and does not affect the exit status.
    # NOTE(review): this path uses /internal/node/ (singular) while the
    # registration endpoint uses /internal/nodes/ - confirm against the API.
    log_info "Sending initial heartbeat..."
    curl -s -X POST "${CITY_SERVICE_URL}/internal/node/${NODE_ID}/heartbeat" \
      -H "Content-Type: application/json" \
      -d '{"metrics": {}}' \
      --max-time 5 > /dev/null 2>&1 || true

    log_success "Node bootstrap completed"
    exit 0
  fi

  log_warning "Registration failed (HTTP $http_code)"

  # No point sleeping after the final attempt.
  if (( attempt < MAX_RETRIES )); then
    log_info "Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  fi

  attempt=$((attempt + 1))
done

# Reached only when every registration attempt failed: print troubleshooting
# hints and exit with a non-zero status.
log_error "Node registration failed after $MAX_RETRIES attempts"
log_error "Please check:"
log_error "  1. City service is running at $CITY_SERVICE_URL"
log_error "  2. Migration 039_node_registry_self_healing.sql is applied"
log_error "  3. Network connectivity to city service"
exit 1