fix: Add database stability monitoring and improve PostgreSQL config

- Add monitor-db-stability.sh for automatic recovery
- Improve PostgreSQL shutdown settings to prevent data loss
- Add checkpoint and WAL settings for better persistence
This commit is contained in:
Apple
2025-12-03 09:59:41 -08:00
parent 0c75ded63a
commit 19e8436a02
2 changed files with 92 additions and 0 deletions

View File

@@ -10,6 +10,8 @@ services:
POSTGRES_DB: daarion
POSTGRES_USER: postgres
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
# initdb options: UTF-8 encoding with the C locale (only applied when the data directory is first initialized)
POSTGRES_INITDB_ARGS: "-E UTF8 --locale=C"
ports:
- "5432:5432"
volumes:
@@ -24,6 +26,8 @@ services:
timeout: 5s
retries: 5
start_period: 30s
# Override server settings: shared buffers, connection limit, checkpoint interval, WAL level and maximum WAL size
command: postgres -c shared_buffers=256MB -c max_connections=200 -c checkpoint_timeout=15min -c wal_level=replica -c max_wal_size=1GB
# Automatic database backups
db-backup:

88
scripts/monitor-db-stability.sh Executable file
View File

@@ -0,0 +1,88 @@
#!/bin/bash
#
# Database stability monitor with automatic recovery.
# Intended to be scheduled periodically (e.g., every 5 minutes via cron).

# Abort on the first unhandled command failure.
set -o errexit

# Start-of-run timestamp (captured once) and the log file destination.
TIMESTAMP="$(date +'%Y-%m-%d %H:%M:%S')"
LOG_FILE='/var/log/db-stability-monitor.log'
#######################################
# Write one timestamped line to stdout and append it to the log file.
# Globals:   LOG_FILE (read) - path of the file the line is appended to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in $LOG_FILE
#######################################
log() {
  # Take the timestamp on every call: a value captured once at startup would
  # be stale for entries written after long-running steps (sleep, restore).
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$now" "$1" | tee -a "$LOG_FILE"
}
log "🔍 Starting database stability check..."

# Check if PostgreSQL container is running.
# Match the exact container name: a substring grep over the full `docker ps`
# table would also accept other containers, images or commands that merely
# contain "daarion-postgres".
if ! docker ps --format '{{.Names}}' | grep -Fqx 'daarion-postgres'; then
  log "❌ PostgreSQL container is not running! Starting..."
  # Report failures explicitly; otherwise `set -e` would end the run without
  # leaving any trace in the log.
  cd /opt/microdao-daarion || {
    log "❌ Cannot change directory to /opt/microdao-daarion"
    exit 1
  }
  docker compose -f docker-compose.db.yml up -d db || {
    log "❌ Failed to start PostgreSQL container"
    exit 1
  }
  # Give the server time to start accepting connections.
  sleep 10
fi
# Check if database exists.
# Query pg_database for the exact name instead of grepping `psql -l` output:
# `grep -w daarion` also matches names such as "daarion-old", which would
# hide a missing database.
# A failed query leaves db_present empty, so the create step below runs and
# reports the failure.
db_present=$(docker exec daarion-postgres psql -U postgres -tA \
  -c "SELECT 1 FROM pg_database WHERE datname = 'daarion';") || db_present=""
if [[ "$db_present" != "1" ]]; then
  log "⚠️ Database 'daarion' does not exist, creating..."
  docker exec daarion-postgres psql -U postgres -c "CREATE DATABASE daarion;" || {
    log "❌ Failed to create database"
    exit 1
  }
fi
# Check data integrity

#######################################
# Print the row count of a table in the daarion database.
# Prints 0 when the query fails (e.g. the table is missing) or when the
# output is not a plain non-negative integer, so the result can always be
# compared numerically.
# Arguments: $1 - table name (hard-coded by the callers below)
# Outputs:   the count on stdout
#######################################
_table_row_count() {
  local table=$1
  local raw
  # The fallback is attached to the command substitution itself; appended to
  # a `... | tr` pipeline it would never fire, because the pipeline's status
  # is that of `tr`.
  raw=$(docker exec daarion-postgres psql -U postgres -d daarion -t \
    -c "SELECT COUNT(*) FROM ${table};" 2>/dev/null) || raw=""
  raw=${raw//[[:space:]]/}
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$raw"
  else
    printf '0\n'
  fi
}

MICRODAO_COUNT=$(_table_row_count microdaos)
AGENT_COUNT=$(_table_row_count agents)
log "📊 Data check: MicroDAOs=$MICRODAO_COUNT, Agents=$AGENT_COUNT"

# If data is missing, try to restore
if [ "$MICRODAO_COUNT" -lt 5 ] || [ "$AGENT_COUNT" -lt 10 ]; then
  log "⚠️ Data loss detected! Attempting recovery..."

  # Pick the newest non-empty backup by modification time without parsing
  # `ls` output. Empty dumps are skipped so the database is never dropped
  # in favour of a zero-byte file.
  LATEST_BACKUP=""
  for candidate in /opt/microdao-daarion/db_backups/*.sql; do
    [[ -s "$candidate" ]] || continue
    if [[ -z "$LATEST_BACKUP" || "$candidate" -nt "$LATEST_BACKUP" ]]; then
      LATEST_BACKUP=$candidate
    fi
  done

  if [ -n "$LATEST_BACKUP" ]; then
    log "📦 Found backup: $LATEST_BACKUP"
    log "🔄 Restoring from backup..."

    # Drop and recreate database. Failures are logged before exiting so a
    # half-finished recovery is visible in the log.
    docker exec daarion-postgres psql -U postgres -c "DROP DATABASE IF EXISTS daarion;" || {
      log "❌ Failed to drop database before restore"
      exit 1
    }
    docker exec daarion-postgres psql -U postgres -c "CREATE DATABASE daarion;" || {
      log "❌ Failed to create database"
      exit 1
    }

    # Restore from backup (best effort; "already exists"/"does not exist"
    # noise is filtered out of the output).
    docker exec -i daarion-postgres psql -U postgres -d daarion < "$LATEST_BACKUP" 2>&1 \
      | grep -v "already exists\|does not exist" || true

    # Apply migrations
    cd /opt/microdao-daarion || {
      log "❌ Cannot change directory to /opt/microdao-daarion"
      exit 1
    }
    for f in migrations/*.sql; do
      # When nothing matches, the glob stays literal; skip it instead of
      # failing on a non-existent input file.
      [[ -e "$f" ]] || continue
      docker exec -i daarion-postgres psql -U postgres -d daarion < "$f" 2>&1 \
        | grep -v "already exists\|does not exist" || true
    done

    # Sync NODE2 agents (best effort)
    python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -5 || true

    # Remove test agents (best effort)
    bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true

    log "✅ Recovery complete"
  else
    log "❌ No backup found for recovery"
  fi
fi
# Count lines mentioning fatal/error/panic (case-insensitive) in the
# container output of the last five minutes.
ERROR_COUNT=$(
  docker logs daarion-postgres --since 5m 2>&1 \
    | grep -i "fatal\|error\|panic" \
    | wc -l
)
if test "$ERROR_COUNT" -gt 0; then
  log "⚠️ Found $ERROR_COUNT errors in PostgreSQL logs in last 5 minutes"
fi

# Read the container's restart counter; "0" is appended to the output when
# the inspect call fails.
RESTART_COUNT=$(
  docker inspect daarion-postgres --format='{{.RestartCount}}' 2>/dev/null \
    || echo "0"
)
if test "$RESTART_COUNT" -gt 10; then
  log "⚠️ PostgreSQL has restarted $RESTART_COUNT times - possible stability issue"
fi

log "✅ Stability check complete"