- Check for at least 45 NODE2 agents (out of 50 expected)
- This prevents false positives when only core agents exist
- Better detection of actual data loss
110 lines
4.4 KiB
Bash
Executable File
110 lines
4.4 KiB
Bash
Executable File
#!/bin/bash
#
# Monitor database stability and auto-recover if needed.
# Run this periodically (e.g., every 5 minutes via cron).
#
# Environment:
#   LOG_FILE  Optional. File that log lines are appended to.
#             Defaults to /var/log/db-stability-monitor.log.

# Abort on the first unhandled command failure.
set -e

# The log destination may be overridden from the environment (useful for
# testing or for hosts where /var/log is not writable); the default is the
# path this script has always used.
LOG_FILE="${LOG_FILE:-/var/log/db-stability-monitor.log}"

# Wall-clock time at script start, formatted as "YYYY-MM-DD HH:MM:SS".
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

#######################################
# Print a timestamped message and append it to the log file.
# Globals:   LOG_FILE (read) - path of the file to append to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOG_FILE
# Returns:   always 0, so a logging problem never aborts the caller
#######################################
log() {
  local stamp
  # Take the time on every call: the script sleeps and may run a lengthy
  # restore, so a timestamp captured once at start-up would be stale.
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  # tee exits non-zero when LOG_FILE cannot be opened; under `set -e` that
  # would kill the whole script, so the failure is deliberately ignored.
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE" || true
}

log "🔍 Starting database stability check..."

# Check if PostgreSQL container is running.
# Compare whole container names: a plain substring grep over `docker ps`
# output would also match e.g. "daarion-postgres-exporter" or an image name.
if ! docker ps --format '{{.Names}}' | grep -qxF 'daarion-postgres'; then
  log "❌ PostgreSQL container is not running! Starting..."

  # The subshell keeps the directory change local to the compose call, and the
  # explicit check turns a silent `set -e` abort into a logged failure.
  if ! (cd /opt/microdao-daarion && docker compose -f docker-compose.db.yml up -d db); then
    log "❌ Failed to start PostgreSQL container"
    exit 1
  fi

  # Poll for readiness (up to 30 s) instead of sleeping a fixed 10 s: the
  # checks below query the server and misbehave if it is still starting up.
  db_ready=0
  for _ in {1..30}; do
    if docker exec daarion-postgres pg_isready -U postgres -q >/dev/null 2>&1; then
      db_ready=1
      break
    fi
    sleep 1
  done
  if [ "$db_ready" -ne 1 ]; then
    # Best effort only: continue so the later checks can report what they see.
    log "⚠️ PostgreSQL did not report ready within 30 seconds, continuing anyway"
  fi
fi

# Check if database exists.
# Ask the catalog for an exact name match. Grepping the `psql -l` listing with
# `grep -w` also accepts names such as "daarion-old", because "-" counts as a
# word boundary.
if ! docker exec daarion-postgres psql -U postgres -tA \
    -c "SELECT 1 FROM pg_database WHERE datname = 'daarion';" | grep -qx '1'; then
  log "⚠️ Database 'daarion' does not exist, creating..."
  if ! docker exec daarion-postgres psql -U postgres -c "CREATE DATABASE daarion;"; then
    log "❌ Failed to create database"
    exit 1
  fi
fi

# Check data integrity

# First make sure the database answers at all. Without this, a server that is
# still starting (or otherwise unreachable) is indistinguishable from empty
# tables, and the script would go on to drop and restore a healthy database.
if ! docker exec daarion-postgres psql -U postgres -d daarion -tA -c "SELECT 1;" >/dev/null 2>&1; then
  log "❌ Cannot query database 'daarion'; skipping integrity check to avoid a destructive restore"
  exit 1
fi

#######################################
# Run a single-value COUNT query against the daarion database.
# Arguments: $1 - SQL statement expected to return one integer
# Outputs:   the integer on stdout; "0" when the query fails (for example
#            because the table does not exist) or returns something non-numeric
# Returns:   0
#######################################
count_rows() {
  local result
  result=$(docker exec daarion-postgres psql -U postgres -d daarion -tA -c "$1" 2>/dev/null) || result=""
  # Drop any stray whitespace before validating.
  result=${result//[[:space:]]/}
  case "$result" in
    '' | *[!0-9]*) echo "0" ;;
    *) echo "$result" ;;
  esac
}

# Ensure we have valid integers (count_rows never prints anything else)
MICRODAO_COUNT=$(count_rows "SELECT COUNT(*) FROM microdaos;")
AGENT_COUNT=$(count_rows "SELECT COUNT(*) FROM agents;")
NODE2_AGENT_COUNT=$(count_rows "SELECT COUNT(*) FROM agents WHERE node_id = 'node-2-macbook-m4max';")

log "📊 Data check: MicroDAOs=$MICRODAO_COUNT, Agents=$AGENT_COUNT, NODE2=$NODE2_AGENT_COUNT"

# If data is missing, try to restore
# Need at least 5 MicroDAOs, 50 total agents, and 45+ NODE2 agents
if [ "$MICRODAO_COUNT" -lt 5 ] || [ "$AGENT_COUNT" -lt 50 ] || [ "$NODE2_AGENT_COUNT" -lt 45 ]; then
  log "⚠️ Data loss detected! Attempting recovery..."

  # Check for recent backup (newest *.sql by modification time)
  LATEST_BACKUP=$(ls -t /opt/microdao-daarion/db_backups/*.sql 2>/dev/null | head -1)

  if [ -z "$LATEST_BACKUP" ]; then
    log "❌ No backup found for recovery"
  elif [ ! -s "$LATEST_BACKUP" ]; then
    # Never drop the live database for a backup that cannot restore anything.
    log "❌ Backup $LATEST_BACKUP is empty, refusing to drop the database"
  else
    log "📦 Found backup: $LATEST_BACKUP"
    log "🔄 Restoring from backup..."

    # Enter the project directory before anything destructive happens, so a
    # missing directory aborts the run while the database is still intact.
    cd /opt/microdao-daarion || {
      log "❌ Cannot enter /opt/microdao-daarion, aborting recovery"
      exit 1
    }

    # Terminate all connections to the database first
    log "🔒 Terminating active connections..."
    docker exec daarion-postgres psql -U postgres -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = 'daarion' AND pid <> pg_backend_pid();" 2>&1 | grep -v "terminate_backend\|^$" || true
    sleep 2

    # Drop and recreate database
    # (expected status lines are filtered out; anything else is shown)
    docker exec daarion-postgres psql -U postgres -c "DROP DATABASE IF EXISTS daarion;" 2>&1 | grep -v "DROP DATABASE" || true
    docker exec daarion-postgres psql -U postgres -c "CREATE DATABASE daarion;" 2>&1 | grep -v "CREATE DATABASE" || true

    # Restore from backup
    docker exec -i daarion-postgres psql -U postgres -d daarion < "$LATEST_BACKUP" 2>&1 | grep -v "already exists\|does not exist" || true

    # Apply migrations
    for f in migrations/*.sql; do
      # An unmatched glob stays literal; skip it instead of redirecting from
      # a file that does not exist.
      [ -e "$f" ] || continue
      docker exec -i daarion-postgres psql -U postgres -d daarion < "$f" 2>&1 | grep -v "already exists\|does not exist" || true
    done

    # Sync NODE2 agents (force sync even if they exist)
    log "🤖 Syncing NODE2 agents..."
    python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -10 || true

    # Verify agent count after sync
    AGENT_COUNT_AFTER=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n')
    case "$AGENT_COUNT_AFTER" in
      '' | *[!0-9]*) AGENT_COUNT_AFTER=0 ;;
    esac
    log "📊 Agents after sync: $AGENT_COUNT_AFTER"

    # Remove test agents
    bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true

    # Fix asset URLs (logos and banners)
    log "🖼️ Fixing asset URLs..."
    bash scripts/fix-asset-urls.sh 2>&1 | tail -5 || true

    # Every step above is best-effort and swallows failures, so report
    # success only if the agents table really is back above the threshold.
    FINAL_AGENT_COUNT=$(docker exec daarion-postgres psql -U postgres -d daarion -t -c "SELECT COUNT(*) FROM agents;" 2>/dev/null | tr -d ' \n')
    case "$FINAL_AGENT_COUNT" in
      '' | *[!0-9]*) FINAL_AGENT_COUNT=0 ;;
    esac
    if [ "$FINAL_AGENT_COUNT" -ge 50 ]; then
      log "✅ Recovery complete"
    else
      log "❌ Recovery finished but only $FINAL_AGENT_COUNT agents are present (expected at least 50)"
    fi
  fi
fi

# Check PostgreSQL logs for errors
# Counts the container log lines from the last 5 minutes (stdout and stderr)
# that mention fatal, error or panic, ignoring case.
# grep -c exits non-zero when the count is 0, hence the `|| true` under set -e.
ERROR_COUNT=$(docker logs daarion-postgres --since 5m 2>&1 | grep -ci "fatal\|error\|panic" || true)
if [ "$ERROR_COUNT" -gt 0 ]; then
  log "⚠️ Found $ERROR_COUNT errors in PostgreSQL logs in last 5 minutes"
fi

# Check container restart count
# Reads the RestartCount field Docker reports for the container; falls back to
# "0" when the inspect call fails. More than 10 restarts produces a warning.
RESTART_COUNT="$(docker inspect --format '{{.RestartCount}}' daarion-postgres 2>/dev/null || echo "0")"
if [ "$RESTART_COUNT" -gt 10 ]; then
  log "⚠️ PostgreSQL has restarted $RESTART_COUNT times - possible stability issue"
fi

# Final marker so each run has a clear end in the log.
log "✅ Stability check complete"