From 19e8436a029ea3421af285ae16f16c64b680562b Mon Sep 17 00:00:00 2001
From: Apple
Date: Wed, 3 Dec 2025 09:59:41 -0800
Subject: [PATCH] fix: Add database stability monitoring and improve PostgreSQL config

- Add monitor-db-stability.sh for automatic recovery
- Improve PostgreSQL shutdown settings to prevent data loss
- Add checkpoint and WAL settings for better persistence
---
 docker-compose.db.yml           |   4 +
 scripts/monitor-db-stability.sh | 253 ++++++++++++++++++++++++++++++++
 2 files changed, 257 insertions(+)
 create mode 100755 scripts/monitor-db-stability.sh

diff --git a/docker-compose.db.yml b/docker-compose.db.yml
index a1bb409b..a5fb36ca 100644
--- a/docker-compose.db.yml
+++ b/docker-compose.db.yml
@@ -10,6 +10,8 @@ services:
       POSTGRES_DB: daarion
       POSTGRES_USER: postgres
       POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
+      # initdb options (encoding/locale); only used when the data directory is first initialized
+      POSTGRES_INITDB_ARGS: "-E UTF8 --locale=C"
     ports:
       - "5432:5432"
     volumes:
@@ -24,6 +26,8 @@ services:
       timeout: 5s
       retries: 5
       start_period: 30s
+    # Server tuning: buffer size, connection limit, checkpoint interval and WAL settings
+    command: postgres -c shared_buffers=256MB -c max_connections=200 -c checkpoint_timeout=15min -c wal_level=replica -c max_wal_size=1GB

   # Automatic database backups
   db-backup:
diff --git a/scripts/monitor-db-stability.sh b/scripts/monitor-db-stability.sh
new file mode 100755
index 00000000..e1cdc240
--- /dev/null
+++ b/scripts/monitor-db-stability.sh
@@ -0,0 +1,253 @@
+#!/bin/bash
+# Monitor database stability and auto-recover if needed.
+# Run this periodically (e.g., every 5 minutes via cron).
+#
+# Steps:
+#   1. Make sure the PostgreSQL container is running and accepting connections.
+#   2. Make sure the application database exists.
+#   3. Compare row counts of key tables against minimum thresholds; when they
+#      are too low, restore the newest SQL backup and re-apply migrations.
+#   4. Report recent errors in the PostgreSQL logs and the restart count.
+#
+# Environment:
+#   LOG_FILE   log destination (default: /var/log/db-stability-monitor.log)
+#   LOCK_FILE  lock preventing overlapping runs
+#              (default: /var/lock/db-stability-monitor.lock)
+
+set -euo pipefail
+
+readonly LOG_FILE="${LOG_FILE:-/var/log/db-stability-monitor.log}"
+readonly LOCK_FILE="${LOCK_FILE:-/var/lock/db-stability-monitor.lock}"
+readonly PROJECT_DIR="/opt/microdao-daarion"
+readonly BACKUP_DIR="${PROJECT_DIR}/db_backups"
+readonly COMPOSE_FILE="docker-compose.db.yml"
+readonly CONTAINER="daarion-postgres"
+readonly DB_NAME="daarion"
+readonly DB_USER="postgres"
+readonly MIN_MICRODAOS=5
+readonly MIN_AGENTS=10
+readonly MAX_RESTARTS=10
+readonly READY_ATTEMPTS=30
+
+# Print a timestamped message to stdout and append it to LOG_FILE.
+log() {
+  # The timestamp is taken per message so long recoveries can be traced.
+  # Logging is best-effort: an unwritable log file must not abort the check.
+  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" \
+    | tee -a "$LOG_FILE" || true
+}
+
+# Run psql inside the container as DB_USER; all arguments go to psql.
+pg() {
+  docker exec "$CONTAINER" psql -U "$DB_USER" "$@"
+}
+
+# Take an exclusive lock so overlapping cron runs cannot both drop and
+# restore the database. Skipped when flock(1) is not installed.
+acquire_lock() {
+  command -v flock >/dev/null 2>&1 || return 0
+  if ! exec 9>"$LOCK_FILE"; then
+    log "⚠️ Cannot open lock file $LOCK_FILE, continuing without lock"
+    return 0
+  fi
+  if ! flock -n 9; then
+    log "⏳ Another stability check is still running, skipping this run"
+    exit 0
+  fi
+}
+
+# Succeed when the PostgreSQL container exists and is running.
+container_running() {
+  local state
+  state=$(docker inspect --format '{{.State.Running}}' "$CONTAINER" 2>/dev/null) \
+    || return 1
+  [[ "$state" == "true" ]]
+}
+
+# Poll pg_isready until the server accepts connections (about 60s at most).
+wait_for_postgres() {
+  local attempt
+  for (( attempt = 1; attempt <= READY_ATTEMPTS; attempt++ )); do
+    if docker exec "$CONTAINER" pg_isready -q -U "$DB_USER" 2>/dev/null; then
+      return 0
+    fi
+    sleep 2
+  done
+  return 1
+}
+
+# Start the container when it is down, then wait for the server. Exits when
+# PostgreSQL stays unreachable, so that a slow start is never mistaken for
+# data loss by the later row-count check.
+ensure_postgres_ready() {
+  if ! container_running; then
+    log "❌ PostgreSQL container is not running! Starting..."
+    if ! (cd "$PROJECT_DIR" && docker compose -f "$COMPOSE_FILE" up -d db); then
+      log "❌ Failed to start PostgreSQL container"
+      exit 1
+    fi
+  fi
+  if ! wait_for_postgres; then
+    log "❌ PostgreSQL is not accepting connections, aborting check"
+    exit 1
+  fi
+}
+
+# Succeed when DB_NAME is listed in pg_database.
+database_exists() {
+  local found
+  found=$(pg -tA -c "SELECT 1 FROM pg_database WHERE datname = '${DB_NAME}';" \
+    2>/dev/null) || return 1
+  [[ "$found" == "1" ]]
+}
+
+# Create the application database when it is missing.
+ensure_database_exists() {
+  database_exists && return 0
+  log "⚠️ Database '${DB_NAME}' does not exist, creating..."
+  if ! pg -c "CREATE DATABASE ${DB_NAME};"; then
+    log "❌ Failed to create database"
+    exit 1
+  fi
+}
+
+# Print the row count of table $1. Prints 0 when the query fails (for example
+# because the table is missing) so that a lost table counts as lost data.
+count_rows() {
+  local table=$1 rows
+  rows=$(pg -d "$DB_NAME" -tA -c "SELECT COUNT(*) FROM ${table};" 2>/dev/null) \
+    || rows=""
+  if [[ "$rows" =~ ^[0-9]+$ ]]; then
+    printf '%s\n' "$rows"
+  else
+    printf '0\n'
+  fi
+}
+
+# Print the path of the newest non-empty *.sql file in BACKUP_DIR, or nothing.
+latest_backup() {
+  local candidate newest=""
+  for candidate in "$BACKUP_DIR"/*.sql; do
+    [[ -f "$candidate" && -s "$candidate" ]] || continue
+    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
+      newest=$candidate
+    fi
+  done
+  printf '%s' "$newest"
+}
+
+# Feed SQL file $1 to psql in DB_NAME. Best-effort: lines containing
+# "already exists" or "does not exist" are hidden and the exit status is
+# ignored; the caller verifies the outcome by re-counting rows.
+apply_sql_file() {
+  docker exec -i "$CONTAINER" psql -U "$DB_USER" -d "$DB_NAME" < "$1" 2>&1 \
+    | grep -v "already exists\|does not exist" || true
+}
+
+# Drop and recreate DB_NAME, load backup file $1, re-apply migrations and run
+# the post-restore helper scripts. Returns non-zero when the database could
+# not be dropped or recreated.
+restore_from_backup() {
+  local backup=$1 migration
+  log "📦 Found backup: $backup"
+  log "🔄 Restoring from backup..."
+
+  # Drop and recreate database
+  if ! pg -c "DROP DATABASE IF EXISTS ${DB_NAME};"; then
+    log "❌ Failed to drop database (are clients still connected?)"
+    return 1
+  fi
+  if ! pg -c "CREATE DATABASE ${DB_NAME};"; then
+    log "❌ Failed to create database"
+    return 1
+  fi
+
+  # Restore from backup
+  apply_sql_file "$backup"
+
+  # Apply migrations
+  for migration in "$PROJECT_DIR"/migrations/*.sql; do
+    [[ -f "$migration" ]] || continue
+    apply_sql_file "$migration"
+  done
+
+  # The helper scripts are addressed relative to the project root.
+  cd "$PROJECT_DIR" || return 1
+
+  # Sync NODE2 agents
+  python3 scripts/sync-node2-dagi-agents.py 2>&1 | tail -5 || true
+
+  # Remove test agents
+  bash scripts/remove-test-agents.sh 2>&1 | tail -3 || true
+}
+
+# Check row counts and restore from the newest backup when they are below the
+# thresholds. Returns non-zero when data is missing and could not be restored.
+check_data() {
+  local microdao_count agent_count backup
+  microdao_count=$(count_rows microdaos)
+  agent_count=$(count_rows agents)
+  log "📊 Data check: MicroDAOs=$microdao_count, Agents=$agent_count"
+
+  if (( microdao_count >= MIN_MICRODAOS && agent_count >= MIN_AGENTS )); then
+    return 0
+  fi
+
+  log "⚠️ Data loss detected! Attempting recovery..."
+  backup=$(latest_backup)
+  if [[ -z "$backup" ]]; then
+    log "❌ No backup found for recovery"
+    return 1
+  fi
+  restore_from_backup "$backup" || return 1
+
+  # Verify instead of assuming success: the restore itself is best-effort.
+  microdao_count=$(count_rows microdaos)
+  agent_count=$(count_rows agents)
+  if (( microdao_count < MIN_MICRODAOS || agent_count < MIN_AGENTS )); then
+    log "❌ Recovery incomplete: MicroDAOs=$microdao_count, Agents=$agent_count"
+    return 1
+  fi
+  log "✅ Recovery complete"
+}
+
+# Warn when the PostgreSQL logs of the last 5 minutes contain error lines.
+report_recent_errors() {
+  local error_count
+  # grep -c prints 0 and exits 1 when nothing matches; that is the good case.
+  error_count=$(docker logs --since 5m "$CONTAINER" 2>&1 \
+    | grep -ci "fatal\|error\|panic" || true)
+  if [[ "$error_count" =~ ^[0-9]+$ ]] && (( error_count > 0 )); then
+    log "⚠️ Found $error_count errors in PostgreSQL logs in last 5 minutes"
+  fi
+}
+
+# Warn when Docker has restarted the container more than MAX_RESTARTS times.
+report_restart_count() {
+  local restart_count
+  restart_count=$(docker inspect --format '{{.RestartCount}}' "$CONTAINER" \
+    2>/dev/null) || restart_count=0
+  if [[ "$restart_count" =~ ^[0-9]+$ ]] && (( restart_count > MAX_RESTARTS )); then
+    log "⚠️ PostgreSQL has restarted $restart_count times - possible stability issue"
+  fi
+}
+
+# Run all checks in order; the exit status is non-zero when data loss was
+# detected and could not be repaired.
+main() {
+  local status=0
+
+  acquire_lock
+  log "🔍 Starting database stability check..."
+
+  ensure_postgres_ready
+  ensure_database_exists
+  check_data || status=1
+  report_recent_errors
+  report_restart_count
+
+  log "✅ Stability check complete"
+  return "$status"
+}
+
+main "$@"