fix: CRITICAL - Prevent infinite DROP DATABASE loop
ROOT CAUSE: The monitor was running DROP DATABASE whenever NODE2 agents were missing, but the backup it restored from did not contain NODE2 agents either, causing an infinite loop.

FIX:
- FULL RECOVERY (DROP DATABASE) only when MicroDAOs < 5 (critical data loss)
- SOFT RECOVERY (just sync agents) when MicroDAOs exist but agents are missing
- Prefer a backup that includes NODE2 agents (full_backup_with_node2*.sql)
- Never DROP DATABASE if MicroDAOs exist

This prevents the daily data loss issue.
This commit is contained in:
@@ -1,6 +1,10 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Monitor database stability and auto-recover if needed
|
# Monitor database stability and auto-recover if needed
|
||||||
# Run this periodically (e.g., every 5 minutes via cron)
|
# Run this periodically (e.g., every 5 minutes via cron)
|
||||||
|
#
|
||||||
|
# IMPORTANT: This script has two recovery modes:
|
||||||
|
# 1. FULL RECOVERY (DROP DATABASE) - only when MicroDAOs < 5 (critical data loss)
|
||||||
|
# 2. SOFT RECOVERY (add NODE2 agents) - when MicroDAOs exist but agents missing
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
@@ -42,17 +46,22 @@ NODE2_AGENT_COUNT=${NODE2_AGENT_COUNT:-0}
|
|||||||
|
|
||||||
log "📊 Data check: MicroDAOs=$MICRODAO_COUNT, Agents=$AGENT_COUNT, NODE2=$NODE2_AGENT_COUNT"
|
log "📊 Data check: MicroDAOs=$MICRODAO_COUNT, Agents=$AGENT_COUNT, NODE2=$NODE2_AGENT_COUNT"
|
||||||
|
|
||||||
# If data is missing, try to restore
|
# ============================================================================
|
||||||
# Need at least 5 MicroDAOs, 50 total agents, and 45+ NODE2 agents
|
# CASE 1: CRITICAL DATA LOSS - MicroDAOs missing (< 5)
|
||||||
if [ "$MICRODAO_COUNT" -lt 5 ] || [ "$AGENT_COUNT" -lt 50 ] || [ "$NODE2_AGENT_COUNT" -lt 45 ]; then
|
# Only in this case we do FULL RECOVERY with DROP DATABASE
|
||||||
log "⚠️ Data loss detected! Attempting recovery..."
|
# ============================================================================
|
||||||
|
if [ "$MICRODAO_COUNT" -lt 5 ]; then
|
||||||
|
log "🔴 CRITICAL: MicroDAOs missing ($MICRODAO_COUNT < 5). Starting FULL RECOVERY..."
|
||||||
|
|
||||||
# Check for recent backup
|
# Check for recent backup (prefer backup with NODE2 agents)
|
||||||
LATEST_BACKUP=$(ls -t /opt/microdao-daarion/db_backups/*.sql 2>/dev/null | head -1)
|
LATEST_BACKUP=$(ls -t /opt/microdao-daarion/db_backups/full_backup_with_node2*.sql 2>/dev/null | head -1)
|
||||||
|
if [ -z "$LATEST_BACKUP" ]; then
|
||||||
|
LATEST_BACKUP=$(ls -t /opt/microdao-daarion/db_backups/*.sql 2>/dev/null | head -1)
|
||||||
|
fi
|
||||||
|
|
||||||
if [ -n "$LATEST_BACKUP" ]; then
|
if [ -n "$LATEST_BACKUP" ]; then
|
||||||
log "📦 Found backup: $LATEST_BACKUP"
|
log "📦 Found backup: $LATEST_BACKUP"
|
||||||
log "🔄 Restoring from backup..."
|
log "🔄 Starting full restoration..."
|
||||||
|
|
||||||
# Terminate all connections to the database first
|
# Terminate all connections to the database first
|
||||||
log "🔒 Terminating active connections..."
|
log "🔒 Terminating active connections..."
|
||||||
@@ -72,31 +81,60 @@ if [ "$MICRODAO_COUNT" -lt 5 ] || [ "$AGENT_COUNT" -lt 50 ] || [ "$NODE2_AGENT_C
|
|||||||
docker exec -i daarion-postgres psql -U postgres -d daarion < "$f" 2>&1 | grep -v "already exists\|does not exist" || true
|
docker exec -i daarion-postgres psql -U postgres -d daarion < "$f" 2>&1 | grep -v "already exists\|does not exist" || true
|
||||||
done
|
done
|
||||||
|
|
||||||
# Sync NODE2 agents (force sync even if they exist)
|
# Sync NODE2 agents
|
||||||
log "🤖 Syncing NODE2 agents..."
|
log "🤖 Syncing NODE2 agents..."
|
||||||
python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true
|
python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true
|
||||||
|
|
||||||
# Verify agent count after sync
|
|
||||||
AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || echo "0")
|
|
||||||
log "📊 Agents after sync: $AGENT_COUNT_AFTER"
|
|
||||||
|
|
||||||
# Remove test agents
|
# Remove test agents
|
||||||
bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true
|
bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true
|
||||||
|
|
||||||
# Fix asset URLs (logos and banners)
|
# Fix asset URLs
|
||||||
log "🖼️ Fixing asset URLs..."
|
log "🖼️ Fixing asset URLs..."
|
||||||
bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true
|
bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true
|
||||||
|
|
||||||
log "✅ Recovery complete"
|
# Verify
|
||||||
|
AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || echo "0")
|
||||||
|
log "📊 Agents after full recovery: $AGENT_COUNT_AFTER"
|
||||||
|
|
||||||
|
log "✅ Full recovery complete"
|
||||||
else
|
else
|
||||||
log "❌ No backup found for recovery"
|
log "❌ No backup found for full recovery"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# CASE 2: SOFT RECOVERY - MicroDAOs exist but NODE2 agents missing
|
||||||
|
# DO NOT DROP DATABASE - just sync NODE2 agents
|
||||||
|
# ============================================================================
|
||||||
|
elif [ "$NODE2_AGENT_COUNT" -lt 45 ]; then
|
||||||
|
log "🟡 WARNING: NODE2 agents missing ($NODE2_AGENT_COUNT < 45). Starting SOFT RECOVERY (no DROP)..."
|
||||||
|
|
||||||
|
cd /opt/microdao-daarion
|
||||||
|
|
||||||
|
# Just sync NODE2 agents - NO DROP DATABASE!
|
||||||
|
log "🤖 Syncing NODE2 agents..."
|
||||||
|
python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true
|
||||||
|
|
||||||
|
# Remove test agents if any
|
||||||
|
bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true
|
||||||
|
|
||||||
|
# Fix asset URLs
|
||||||
|
log "🖼️ Fixing asset URLs..."
|
||||||
|
bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true
|
||||||
|
|
||||||
|
# Verify
|
||||||
|
NODE2_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents WHERE node_id = 'node-2-macbook-m4max';" 2>/dev/null | tr -d ' \n' || echo "0")
|
||||||
|
AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || echo "0")
|
||||||
|
log "📊 After soft recovery: Agents=$AGENT_COUNT_AFTER, NODE2=$NODE2_AFTER"
|
||||||
|
|
||||||
|
log "✅ Soft recovery complete"
|
||||||
|
else
|
||||||
|
log "✅ Data integrity OK"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check PostgreSQL logs for errors
|
# Check PostgreSQL logs for errors (informational only)
|
||||||
ERROR_COUNT=$(docker logs daarion-postgres --since 5m 2>&1 | grep -i "fatal\|error\|panic" | wc -l)
|
ERROR_COUNT=$(docker logs daarion-postgres --since 5m 2>&1 | grep -i "fatal\|error\|panic" | grep -v "already exists\|does not exist\|column.*does not exist" | wc -l)
|
||||||
if [ "$ERROR_COUNT" -gt 0 ]; then
|
if [ "$ERROR_COUNT" -gt 5 ]; then
|
||||||
log "⚠️ Found $ERROR_COUNT errors in PostgreSQL logs in last 5 minutes"
|
log "⚠️ Found $ERROR_COUNT significant errors in PostgreSQL logs in last 5 minutes"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check container restart count
|
# Check container restart count
|
||||||
@@ -106,4 +144,3 @@ if [ "$RESTART_COUNT" -gt 10 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
log "✅ Stability check complete"
|
log "✅ Stability check complete"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user