Files
microdao-daarion/scripts/monitor-db-stability.sh
Apple b2caee4e0e fix: CRITICAL - Prevent infinite DROP DATABASE loop
ROOT CAUSE: Monitor was doing DROP DATABASE when NODE2 agents were missing,
but the backup didn't have NODE2 agents, causing an infinite loop.

FIX:
- FULL RECOVERY (DROP DATABASE) only when MicroDAOs < 5 (critical data loss)
- SOFT RECOVERY (just sync agents) when MicroDAOs exist but agents missing
- Prefer backup with NODE2 agents (full_backup_with_node2*.sql)
- Never DROP DATABASE if MicroDAOs exist

This prevents the daily data loss issue.
2025-12-05 02:41:43 -08:00

147 lines
6.2 KiB
Bash
Executable File

#!/bin/bash
# Monitor database stability and auto-recover if needed
# Run this periodically (e.g., every 5 minutes via cron)
#
# IMPORTANT: This script has two recovery modes:
# 1. FULL RECOVERY (DROP DATABASE) - only when MicroDAOs < 5 (critical data loss)
# 2. SOFT RECOVERY (add NODE2 agents) - when MicroDAOs exist but agents missing
# Abort on the first unhandled command failure.
set -o errexit

# File that every log message is appended to.
LOG_FILE='/var/log/db-stability-monitor.log'
# Wall-clock time captured once, when the script starts.
TIMESTAMP="$(date +'%Y-%m-%d %H:%M:%S')"
#######################################
# Print one timestamped message to stdout and append it to $LOG_FILE.
# The timestamp is taken at call time, so lines written after a sleep or a
# long recovery show when they actually happened rather than when the script
# started.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in $LOG_FILE
# Returns:   always 0; tee reports an unwritable log file on stderr itself,
#            and that must not abort the monitor under `set -e`
#######################################
log() {
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$now" "${1-}" | tee -a "$LOG_FILE" || true
}
log "🔍 Starting database stability check..."

# Check if PostgreSQL container is running.
# Only the container names are listed and the name must match a whole line:
# grepping the full `docker ps` table would also succeed for any other
# container, image name or command that merely contains "daarion-postgres".
if ! docker ps --format '{{.Names}}' | grep -Fxq daarion-postgres; then
  log "❌ PostgreSQL container is not running! Starting..."
  cd /opt/microdao-daarion
  docker compose -f docker-compose.db.yml up -d db
  # Fixed pause after starting the container, before the checks below query it.
  sleep 10
fi
# Check if database exists.
# The catalog is asked for an exact name match. Parsing `psql -l` with
# `grep -w` also accepts names such as "daarion-old", because "-" counts as a
# word boundary. A failed lookup leaves db_present empty, which falls through
# to the CREATE attempt below and its explicit error handling.
db_present=$(docker exec daarion-postgres psql -U postgres -At -c "SELECT 1 FROM pg_database WHERE datname = 'daarion';") || db_present=""
if [ "$db_present" != "1" ]; then
  log "⚠️ Database 'daarion' does not exist, creating..."
  docker exec daarion-postgres psql -U postgres -c "CREATE DATABASE daarion;" || {
    log "❌ Failed to create database"
    exit 1
  }
fi
# Check data integrity

# Refuse to judge integrity when the database cannot be queried at all.
# Every failed count below is reported as 0, and a MicroDAO count of 0 selects
# the DROP DATABASE recovery path, so an unreachable or still-starting server
# must stop the run here instead of being mistaken for data loss.
db_reachable=0
for _attempt in 1 2 3 4 5; do
  if docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT 1;" >/dev/null 2>&1; then
    db_reachable=1
    break
  fi
  sleep 2
done
if [ "$db_reachable" -ne 1 ]; then
  log "❌ Cannot query database 'daarion' - skipping integrity check and recovery"
  exit 1
fi

#######################################
# Run a counting query against the daarion database.
# Arguments: $1 - SQL statement that returns a single number
# Outputs:   the number on stdout; 0 when the query fails (for example the
#            table does not exist) or the output is not a plain integer
#######################################
count_rows() {
  local raw
  # The pipeline status is tr's, so a failed query shows up as empty or
  # non-numeric text and is handled by the validation below.
  raw=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "$1" 2>/dev/null | tr -d ' \n') || true
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$raw"
  else
    printf '0\n'
  fi
}

MICRODAO_COUNT=$(count_rows "SELECT COUNT(*) FROM microdaos;")
AGENT_COUNT=$(count_rows "SELECT COUNT(*) FROM agents;")
NODE2_AGENT_COUNT=$(count_rows "SELECT COUNT(*) FROM agents WHERE node_id = 'node-2-macbook-m4max';")

log "📊 Data check: MicroDAOs=$MICRODAO_COUNT, Agents=$AGENT_COUNT, NODE2=$NODE2_AGENT_COUNT"
#######################################
# Print the newest non-empty regular path among the arguments.
# Arguments: $@ - candidate file paths (an unmatched literal glob is fine)
# Outputs:   the chosen path on stdout, or nothing when no candidate exists
#            or all of them are empty
# Returns:   always 0
#######################################
newest_nonempty_file() {
  local candidate
  # With no arguments `ls` would list the current directory instead.
  [ "$#" -gt 0 ] || return 0
  while IFS= read -r candidate; do
    if [ -s "$candidate" ]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done < <(ls -td -- "$@" 2>/dev/null)
  return 0
}

# ============================================================================
# CASE 1: CRITICAL DATA LOSS - MicroDAOs missing (< 5)
# Only in this case we do FULL RECOVERY with DROP DATABASE
# ============================================================================
if [ "$MICRODAO_COUNT" -lt 5 ]; then
  log "🔴 CRITICAL: MicroDAOs missing ($MICRODAO_COUNT < 5). Starting FULL RECOVERY..."

  # Check for recent backup (prefer backup with NODE2 agents).
  # Empty files are skipped so the database is never dropped in order to
  # "restore" from a backup that contains nothing.
  LATEST_BACKUP=$(newest_nonempty_file /opt/microdao-daarion/db_backups/full_backup_with_node2*.sql)
  if [ -z "$LATEST_BACKUP" ]; then
    LATEST_BACKUP=$(newest_nonempty_file /opt/microdao-daarion/db_backups/*.sql)
  fi

  if [ -n "$LATEST_BACKUP" ]; then
    log "📦 Found backup: $LATEST_BACKUP"
    log "🔄 Starting full restoration..."

    # Terminate all connections to the database first
    log "🔒 Terminating active connections..."
    docker exec daarion-postgres psql -U postgres -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = 'daarion' AND pid <> pg_backend_pid();" 2>&1 | grep -v "terminate_backend\|^$" || true
    sleep 2

    # Drop and recreate database.
    # grep -v hides the success tag and exits 1 when nothing is left to print,
    # hence the `|| true`.
    docker exec daarion-postgres psql -U postgres -c "DROP DATABASE IF EXISTS daarion;" 2>&1 | grep -v "DROP DATABASE" || true
    docker exec daarion-postgres psql -U postgres -c "CREATE DATABASE daarion;" 2>&1 | grep -v "CREATE DATABASE" || true

    # Restore from backup
    docker exec -i daarion-postgres psql -U postgres -d daarion < "$LATEST_BACKUP" 2>&1 | grep -v "already exists\|does not exist" || true

    # Apply migrations
    cd /opt/microdao-daarion
    for f in migrations/*.sql; do
      # An unmatched glob stays literal; skip it instead of redirecting from a
      # file that does not exist.
      [ -e "$f" ] || continue
      docker exec -i daarion-postgres psql -U postgres -d daarion < "$f" 2>&1 | grep -v "already exists\|does not exist" || true
    done

    # Sync NODE2 agents
    log "🤖 Syncing NODE2 agents..."
    python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true

    # Remove test agents
    bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true

    # Fix asset URLs
    log "🖼️ Fixing asset URLs..."
    bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true

    # Verify: re-count and report success only if the MicroDAOs are back.
    # Anything that is not a plain integer (failed query, missing table) is
    # treated as 0.
    AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || true)
    AGENT_COUNT_AFTER=${AGENT_COUNT_AFTER:-0}
    MICRODAO_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM microdaos;" 2>/dev/null | tr -d ' \n' || true)
    case "$MICRODAO_COUNT_AFTER" in
      '' | *[!0-9]*) MICRODAO_COUNT_AFTER=0 ;;
    esac
    log "📊 Agents after full recovery: $AGENT_COUNT_AFTER"
    if [ "$MICRODAO_COUNT_AFTER" -ge 5 ]; then
      log "✅ Full recovery complete"
    else
      log "❌ Full recovery finished but MicroDAOs are still missing ($MICRODAO_COUNT_AFTER < 5) - manual intervention required"
    fi
  else
    log "❌ No backup found for full recovery"
  fi

# ============================================================================
# CASE 2: SOFT RECOVERY - MicroDAOs exist but NODE2 agents missing
# DO NOT DROP DATABASE - just sync NODE2 agents
# ============================================================================
elif [ "$NODE2_AGENT_COUNT" -lt 45 ]; then
  log "🟡 WARNING: NODE2 agents missing ($NODE2_AGENT_COUNT < 45). Starting SOFT RECOVERY (no DROP)..."
  cd /opt/microdao-daarion

  # Just sync NODE2 agents - NO DROP DATABASE!
  log "🤖 Syncing NODE2 agents..."
  python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true

  # Remove test agents if any
  bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true

  # Fix asset URLs
  log "🖼️ Fixing asset URLs..."
  bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true

  # Verify: report success only if the NODE2 agents reached the threshold.
  NODE2_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents WHERE node_id = 'node-2-macbook-m4max';" 2>/dev/null | tr -d ' \n' || true)
  case "$NODE2_AFTER" in
    '' | *[!0-9]*) NODE2_AFTER=0 ;;
  esac
  AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n' || true)
  AGENT_COUNT_AFTER=${AGENT_COUNT_AFTER:-0}
  log "📊 After soft recovery: Agents=$AGENT_COUNT_AFTER, NODE2=$NODE2_AFTER"
  if [ "$NODE2_AFTER" -ge 45 ]; then
    log "✅ Soft recovery complete"
  else
    log "⚠️ Soft recovery finished but NODE2 agents are still missing ($NODE2_AFTER < 45)"
  fi
else
  log "✅ Data integrity OK"
fi
# Informational only: count the PostgreSQL log lines from the last 5 minutes
# that mention fatal/error/panic, excluding lines matched by the second filter,
# and warn when there are more than 5 of them.
ERROR_COUNT=$(
  docker logs daarion-postgres --since 5m 2>&1 \
    | grep -i "fatal\|error\|panic" \
    | grep -v "already exists\|does not exist\|column.*does not exist" \
    | wc -l
)
if [[ "$ERROR_COUNT" -gt 5 ]]; then
  log "⚠️ Found $ERROR_COUNT significant errors in PostgreSQL logs in last 5 minutes"
fi
# Read the container's restart counter from `docker inspect`; when the command
# fails, "0" is emitted instead. Warn once the counter exceeds 10.
RESTART_COUNT=$(
  docker inspect daarion-postgres --format='{{.RestartCount}}' 2>/dev/null \
    || echo "0"
)
if [[ "$RESTART_COUNT" -gt 10 ]]; then
  log "⚠️ PostgreSQL has restarted $RESTART_COUNT times - possible stability issue"
fi

# Final marker line for this run.
log "✅ Stability check complete"