#!/bin/bash
# Monitor database stability and auto-recover if needed.
# Run this periodically (e.g., every 5 minutes via cron).
#
# IMPORTANT: This script has two recovery modes:
#   1. FULL RECOVERY (DROP DATABASE) - only when MicroDAOs < 5 (critical data loss)
#   2. SOFT RECOVERY (add NODE2 agents) - when MicroDAOs exist but agents missing
#
# Safety rules enforced below:
#   - A failed count query is reported as 0, so before trusting the counts (and
#     again right before DROP DATABASE) the server must answer "SELECT 1".
#     Otherwise an unreachable server would look like data loss.
#   - The database is only dropped when a non-empty backup file is available.
#   - DROP/CREATE failures abort the full recovery instead of being ignored.
#
# Exit status: 0 on success (including "no backup found"), non-zero when the
# server cannot be reached/created or a full recovery had to be aborted.

set -e

LOG_FILE="/var/log/db-stability-monitor.log"
readonly PROJECT_DIR="/opt/microdao-daarion"
readonly BACKUP_DIR="${PROJECT_DIR}/db_backups"
readonly PG_CONTAINER="daarion-postgres"
readonly NODE2_ID="node-2-macbook-m4max"
readonly MIN_MICRODAOS=5
readonly MIN_NODE2_AGENTS=45

#######################################
# Print a timestamped message to stdout and append it to LOG_FILE.
# The timestamp is taken per call so long runs are logged accurately.
# Arguments: $1 - message text
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

#######################################
# Normalise a value to a non-negative decimal integer.
# Arguments: $1 - raw text (whitespace is stripped)
# Outputs:   the integer, or 0 when the text is empty or not purely digits
#######################################
to_int() {
  local raw=${1//[[:space:]]/}
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$raw"
  else
    printf '0\n'
  fi
}

#######################################
# Check that psql inside the container can run a trivial query.
# Arguments: $1 - database name to connect to
# Returns:   0 when "SELECT 1" succeeds, non-zero otherwise
#######################################
db_reachable() {
  docker exec "$PG_CONTAINER" psql -U postgres -d "$1" -tAc "SELECT 1" \
    >/dev/null 2>&1
}

#######################################
# Poll until the server answers queries or the attempts run out.
# Arguments: $1 - maximum number of one-second attempts (default 30)
# Returns:   0 once reachable, 1 on timeout
#######################################
wait_for_postgres() {
  local tries=${1:-30}
  while [ "$tries" -gt 0 ]; do
    if db_reachable postgres; then
      return 0
    fi
    sleep 1
    tries=$((tries - 1))
  done
  return 1
}

#######################################
# Tell whether a database named exactly "daarion" exists.
# Returns: 0 when it exists, non-zero otherwise (including query failure)
#######################################
database_exists() {
  docker exec "$PG_CONTAINER" psql -U postgres -tAc \
    "SELECT 1 FROM pg_database WHERE datname = 'daarion'" | grep -qx 1
}

#######################################
# Run a COUNT-style query against the daarion database.
# Arguments: $1 - SQL statement returning a single integer
# Outputs:   the integer; 0 when the query fails or returns non-numeric text
#######################################
count_rows() {
  local raw
  raw=$(docker exec "$PG_CONTAINER" psql -U postgres -d daarion -t -c "$1" \
    2>/dev/null) || raw=""
  to_int "$raw"
}

#######################################
# Run one SQL statement via psql's default database and echo its output
# minus the lines matching a filter.
# Arguments: $1 - SQL statement
#            $2 - grep basic-regex of output lines to hide
# Returns:   the exit status of docker exec / psql
#######################################
admin_sql() {
  local sql=$1 hide=$2 out rc=0
  out=$(docker exec "$PG_CONTAINER" psql -U postgres -c "$sql" 2>&1) || rc=$?
  if [[ -n "$out" ]]; then
    # grep -v exits 1 when every line is filtered; that is not an error here.
    printf '%s\n' "$out" | grep -v -- "$hide" || true
  fi
  return "$rc"
}

#######################################
# Feed a SQL file to psql (database daarion), hiding the expected
# "already exists" / "does not exist" noise. Best effort: never fails,
# but a non-zero psql/docker status is logged.
# Arguments: $1 - path to the SQL file
#######################################
apply_sql_file() {
  local sql_file=$1
  local -a stage_status
  docker exec -i "$PG_CONTAINER" psql -U postgres -d daarion < "$sql_file" 2>&1 \
    | { grep -v "already exists\|does not exist" || true; }
  stage_status=("${PIPESTATUS[@]}")
  if [ "${stage_status[0]}" -ne 0 ]; then
    log "⚠️ psql exited with status ${stage_status[0]} while applying $sql_file"
  fi
  return 0
}

#######################################
# Pick the most recently modified regular file among the arguments.
# An unmatched glob arrives as a literal pattern and is skipped by -f.
# Arguments: candidate paths
# Outputs:   the newest path, or nothing when no candidate is a file
#######################################
newest_file() {
  local candidate newest=""
  for candidate in "$@"; do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

#######################################
# Post-restore steps shared by both recovery modes: sync NODE2 agents,
# remove test agents, fix asset URLs. Each step is best effort and only
# the tail of its output is shown. Must be run from PROJECT_DIR.
#######################################
sync_and_fix() {
  log "🤖 Syncing NODE2 agents..."
  python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true

  bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true

  log "🖼️ Fixing asset URLs..."
  bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true
}

#######################################
# CASE 1: drop the database and rebuild it from the newest backup
# (preferring backups that already contain NODE2 agents), then apply
# migrations and the shared sync/cleanup steps.
# Returns: 0 when recovery completed or no backup exists,
#          1 when recovery had to be aborted
#######################################
full_recovery() {
  local backup migration agents_after

  backup=$(newest_file "$BACKUP_DIR"/full_backup_with_node2*.sql)
  if [[ -z "$backup" ]]; then
    backup=$(newest_file "$BACKUP_DIR"/*.sql)
  fi
  if [[ -z "$backup" ]]; then
    log "❌ No backup found for full recovery"
    return 0
  fi
  if [[ ! -s "$backup" || ! -r "$backup" ]]; then
    log "❌ Backup $backup is empty or unreadable; refusing to drop the database"
    return 1
  fi

  log "📦 Found backup: $backup"
  log "🔄 Starting full restoration..."

  # Last safety check before the destructive step.
  if ! db_reachable daarion; then
    log "❌ PostgreSQL stopped answering queries; aborting before DROP DATABASE"
    return 1
  fi

  log "🔒 Terminating active connections..."
  admin_sql "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = 'daarion' AND pid <> pg_backend_pid();" \
    'terminate_backend\|^$' || true
  sleep 2

  if ! admin_sql "DROP DATABASE IF EXISTS daarion;" "DROP DATABASE"; then
    log "❌ DROP DATABASE failed; leaving the existing database untouched"
    return 1
  fi
  if ! admin_sql "CREATE DATABASE daarion;" "CREATE DATABASE"; then
    log "❌ Failed to create database"
    return 1
  fi

  apply_sql_file "$backup"

  if ! cd "$PROJECT_DIR"; then
    log "❌ Cannot cd to $PROJECT_DIR; migrations and agent sync skipped"
    return 1
  fi
  for migration in migrations/*.sql; do
    [[ -e "$migration" ]] || continue # glob matched nothing
    apply_sql_file "$migration"
  done

  sync_and_fix

  agents_after=$(count_rows "SELECT COUNT(*) FROM agents;")
  log "📊 Agents after full recovery: $agents_after"
  log "✅ Full recovery complete"
}

#######################################
# CASE 2: MicroDAOs exist but NODE2 agents are missing.
# DO NOT DROP DATABASE - only re-run the sync/cleanup steps.
# Returns: 0 on completion, 1 when PROJECT_DIR is not accessible
#######################################
soft_recovery() {
  local node2_after agents_after

  if ! cd "$PROJECT_DIR"; then
    log "❌ Cannot cd to $PROJECT_DIR; soft recovery skipped"
    return 1
  fi

  sync_and_fix

  node2_after=$(count_rows "SELECT COUNT(*) FROM agents WHERE node_id = '${NODE2_ID}';")
  agents_after=$(count_rows "SELECT COUNT(*) FROM agents;")
  log "📊 After soft recovery: Agents=$agents_after, NODE2=$node2_after"
  log "✅ Soft recovery complete"
}

#######################################
# Entry point: ensure the container and database exist, check data
# integrity, run the matching recovery mode, then report log errors and
# the container restart count (informational only).
# Returns: 0 normally, the recovery's status when a recovery was aborted
#######################################
main() {
  local microdao_count agent_count node2_count
  local error_count restart_count recovery_rc=0

  log "🔍 Starting database stability check..."

  # Match the container name exactly rather than any substring of `docker ps`.
  if ! docker ps --format '{{.Names}}' | grep -qxF -- "$PG_CONTAINER"; then
    log "❌ PostgreSQL container is not running! Starting..."
    cd "$PROJECT_DIR"
    docker compose -f docker-compose.db.yml up -d db
    if ! wait_for_postgres 30; then
      log "❌ PostgreSQL did not become ready within 30s after start"
      exit 1
    fi
  fi

  if ! database_exists; then
    log "⚠️ Database 'daarion' does not exist, creating..."
    docker exec "$PG_CONTAINER" psql -U postgres -c "CREATE DATABASE daarion;" || {
      log "❌ Failed to create database"
      exit 1
    }
  fi

  # Count queries report 0 on failure, so make sure a failure would not be
  # caused by the server itself being unavailable.
  if ! db_reachable daarion; then
    log "❌ PostgreSQL is not answering queries on 'daarion'; skipping integrity check to avoid a false recovery"
    exit 1
  fi

  microdao_count=$(count_rows "SELECT COUNT(*) FROM microdaos;")
  agent_count=$(count_rows "SELECT COUNT(*) FROM agents;")
  node2_count=$(count_rows "SELECT COUNT(*) FROM agents WHERE node_id = '${NODE2_ID}';")

  log "📊 Data check: MicroDAOs=$microdao_count, Agents=$agent_count, NODE2=$node2_count"

  if [ "$microdao_count" -lt "$MIN_MICRODAOS" ]; then
    log "🔴 CRITICAL: MicroDAOs missing ($microdao_count < $MIN_MICRODAOS). Starting FULL RECOVERY..."
    full_recovery || recovery_rc=$?
  elif [ "$node2_count" -lt "$MIN_NODE2_AGENTS" ]; then
    log "🟡 WARNING: NODE2 agents missing ($node2_count < $MIN_NODE2_AGENTS). Starting SOFT RECOVERY (no DROP)..."
    soft_recovery || recovery_rc=$?
  else
    log "✅ Data integrity OK"
  fi

  # Check PostgreSQL logs for errors (informational only).
  # grep -c exits 1 on a zero count, hence the `|| true`.
  error_count=$(docker logs "$PG_CONTAINER" --since 5m 2>&1 \
    | grep -i "fatal\|error\|panic" \
    | grep -vc "already exists\|does not exist\|column.*does not exist" || true)
  error_count=$(to_int "$error_count")
  if [ "$error_count" -gt 5 ]; then
    log "⚠️ Found $error_count significant errors in PostgreSQL logs in last 5 minutes"
  fi

  # Check container restart count.
  restart_count=$(docker inspect "$PG_CONTAINER" --format='{{.RestartCount}}' \
    2>/dev/null || true)
  restart_count=$(to_int "$restart_count")
  if [ "$restart_count" -gt 10 ]; then
    log "⚠️ PostgreSQL has restarted $restart_count times - possible stability issue"
  fi

  log "✅ Stability check complete"
  return "$recovery_rc"
}

main "$@"