From b2caee4e0e22042f3585c41d3a34904a3cafd909 Mon Sep 17 00:00:00 2001
From: Apple
Date: Fri, 5 Dec 2025 02:41:43 -0800
Subject: [PATCH] fix: CRITICAL - Prevent infinite DROP DATABASE loop

ROOT CAUSE: Monitor was doing DROP DATABASE when NODE2 agents were missing,
but the backup didn't have NODE2 agents, causing an infinite loop.

FIX:
- FULL RECOVERY (DROP DATABASE) only when MicroDAOs < 5 (critical data loss)
- SOFT RECOVERY (just sync agents) when MicroDAOs exist but agents missing
- Prefer backup with NODE2 agents (full_backup_with_node2*.sql)
- Never DROP DATABASE if MicroDAOs exist

This prevents the daily data loss issue.
---
 scripts/monitor-db-stability.sh | 77 ++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 20 deletions(-)

diff --git a/scripts/monitor-db-stability.sh b/scripts/monitor-db-stability.sh
index 8df354eb..b40ad02b 100755
--- a/scripts/monitor-db-stability.sh
+++ b/scripts/monitor-db-stability.sh
@@ -1,6 +1,10 @@
 #!/bin/bash
 # Monitor database stability and auto-recover if needed
 # Run this periodically (e.g., every 5 minutes via cron)
+#
+# IMPORTANT: This script has two recovery modes:
+# 1. FULL RECOVERY (DROP DATABASE) - only when MicroDAOs < 5 (critical data loss)
+# 2. SOFT RECOVERY (add NODE2 agents) - when MicroDAOs exist but agents missing
 
 set -e
 
@@ -42,17 +46,22 @@ NODE2_AGENT_COUNT=${NODE2_AGENT_COUNT:-0}
 
 log "📊 Data check: MicroDAOs=$MICRODAO_COUNT, Agents=$AGENT_COUNT, NODE2=$NODE2_AGENT_COUNT"
 
-# If data is missing, try to restore
-# Need at least 5 MicroDAOs, 50 total agents, and 45+ NODE2 agents
-if [ "$MICRODAO_COUNT" -lt 5 ] || [ "$AGENT_COUNT" -lt 50 ] || [ "$NODE2_AGENT_COUNT" -lt 45 ]; then
-    log "⚠️ Data loss detected! Attempting recovery..."
+# ============================================================================
+# CASE 1: CRITICAL DATA LOSS - MicroDAOs missing (< 5)
+# Only in this case we do FULL RECOVERY with DROP DATABASE
+# ============================================================================
+if [ "$MICRODAO_COUNT" -lt 5 ]; then
+    log "🔴 CRITICAL: MicroDAOs missing ($MICRODAO_COUNT < 5). Starting FULL RECOVERY..."
 
-    # Check for recent backup
-    LATEST_BACKUP=$(ls -t /opt/microdao-daarion/db_backups/*.sql 2>/dev/null | head -1)
+    # Check for recent backup (prefer backup with NODE2 agents)
+    LATEST_BACKUP=$(ls -t /opt/microdao-daarion/db_backups/full_backup_with_node2*.sql 2>/dev/null | head -1)
+    if [ -z "$LATEST_BACKUP" ]; then
+        LATEST_BACKUP=$(ls -t /opt/microdao-daarion/db_backups/*.sql 2>/dev/null | head -1)
+    fi
 
     if [ -n "$LATEST_BACKUP" ]; then
         log "📦 Found backup: $LATEST_BACKUP"
-        log "🔄 Restoring from backup..."
+        log "🔄 Starting full restoration..."
 
         # Terminate all connections to the database first
         log "🔒 Terminating active connections..."
@@ -72,31 +81,60 @@ if [ "$MICRODAO_COUNT" -lt 5 ] || [ "$AGENT_COUNT" -lt 50 ] || [ "$NODE2_AGENT_C
             docker exec -i daarion-postgres psql -U postgres -d daarion < "$f" 2>&1 | grep -v "already exists\|does not exist" || true
         done
 
-        # Sync NODE2 agents (force sync even if they exist)
+        # Sync NODE2 agents
         log "🤖 Syncing NODE2 agents..."
         python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true
 
-        # Verify agent count after sync
-        AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || echo "0")
-        log "📊 Agents after sync: $AGENT_COUNT_AFTER"
-
         # Remove test agents
         bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true
 
-        # Fix asset URLs (logos and banners)
+        # Fix asset URLs
         log "🖼️ Fixing asset URLs..."
         bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true
 
-        log "✅ Recovery complete"
+        # Verify
+        AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || echo "0")
+        log "📊 Agents after full recovery: $AGENT_COUNT_AFTER"
+
+        log "✅ Full recovery complete"
     else
-        log "❌ No backup found for recovery"
+        log "❌ No backup found for full recovery"
     fi
+
+# ============================================================================
+# CASE 2: SOFT RECOVERY - MicroDAOs exist but NODE2 agents missing
+# DO NOT DROP DATABASE - just sync NODE2 agents
+# ============================================================================
+elif [ "$NODE2_AGENT_COUNT" -lt 45 ]; then
+    log "🟡 WARNING: NODE2 agents missing ($NODE2_AGENT_COUNT < 45). Starting SOFT RECOVERY (no DROP)..."
+
+    cd /opt/microdao-daarion
+
+    # Just sync NODE2 agents - NO DROP DATABASE!
+    log "🤖 Syncing NODE2 agents..."
+    python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true
+
+    # Remove test agents if any
+    bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true
+
+    # Fix asset URLs
+    log "🖼️ Fixing asset URLs..."
+    bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true
+
+    # Verify
+    NODE2_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents WHERE node_id = 'node-2-macbook-m4max';" 2>/dev/null | tr -d ' \n' || echo "0")
+    AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || echo "0")
+    log "📊 After soft recovery: Agents=$AGENT_COUNT_AFTER, NODE2=$NODE2_AFTER"
+
+    log "✅ Soft recovery complete"
+else
+    log "✅ Data integrity OK"
 fi
 
-# Check PostgreSQL logs for errors
-ERROR_COUNT=$(docker logs daarion-postgres --since 5m 2>&1 | grep -i "fatal\|error\|panic" | wc -l)
-if [ "$ERROR_COUNT" -gt 0 ]; then
-    log "⚠️ Found $ERROR_COUNT errors in PostgreSQL logs in last 5 minutes"
+# Check PostgreSQL logs for errors (informational only)
+ERROR_COUNT=$(docker logs daarion-postgres --since 5m 2>&1 | grep -i "fatal\|error\|panic" | grep -v "already exists\|does not exist\|column.*does not exist" | wc -l)
+if [ "$ERROR_COUNT" -gt 5 ]; then
+    log "⚠️ Found $ERROR_COUNT significant errors in PostgreSQL logs in last 5 minutes"
 fi
 
 # Check container restart count
@@ -106,4 +144,3 @@ if [ "$RESTART_COUNT" -gt 10 ]; then
 fi
 
 log "✅ Stability check complete"
-