#!/bin/bash
#
# Chaos test suite for the dagi staging stack.
#
# Runs four destructive tests against a docker-compose deployment:
#   A — restart the crewai worker mid-burst
#   B — restart the router mid-burst
#   C — stop Postgres for 60s mid-burst
#   D — force a job into the DLQ, replay it, and verify completion
# and appends a summary row per test to docs/CHAOS_TEST_REPORT.md.
#
# Required env: ENV=staging|dev, ALLOW_DESTRUCTIVE_TESTS=true, CONFIRM_CHAOS=YES
# Optional env: CHAOS_FORCE_DLQ=true, MESSAGES, DUPLICATES, BURST_TIME,
#               AGENT_ID, DLQ_SUBJECT, DLQ_STREAM, AGENT_RUNS_STREAM,
#               COMPLETED_SUBJECT, NATS_USER, NATS_PASS, COMPOSE_PROJECT_NAME
set -euo pipefail

# --- Safety guard: refuse to run anywhere but staging/dev -------------------
: "${ENV:=}"
: "${ALLOW_DESTRUCTIVE_TESTS:=false}"
: "${CONFIRM_CHAOS:=}"
: "${CHAOS_FORCE_DLQ:=false}"

if [ "$ENV" != "staging" ] && [ "$ENV" != "dev" ]; then
  echo "ERROR: ENV must be staging/dev. Current ENV='$ENV'. Refusing to run." >&2
  exit 1
fi

if [ "$ALLOW_DESTRUCTIVE_TESTS" != "true" ] || [ "$CONFIRM_CHAOS" != "YES" ]; then
  echo "ERROR: Set ALLOW_DESTRUCTIVE_TESTS=true and CONFIRM_CHAOS=YES to proceed." >&2
  exit 1
fi

# --- Paths & compose configuration ------------------------------------------
ROOT="/opt/microdao-daarion"
COMPOSE_PROJECT_NAME=${COMPOSE_PROJECT_NAME:-dagi-staging}
export COMPOSE_PROJECT_NAME
LOG_DIR="$ROOT/logs/chaos"
# Array so the -f pair expands safely without relying on word-splitting.
COMPOSE_FILES=(-f "$ROOT/docker-compose.staging.yml")
STAGING_NETWORK="${COMPOSE_PROJECT_NAME}_dagi-staging-network"
REPORT="$ROOT/docs/CHAOS_TEST_REPORT.md"
TS=$(date -u +"%Y%m%dT%H%M%SZ")

# BUGFIX: the log directory was never created, so the first `tee` into
# $LOG_DIR failed on a fresh host. Ensure both output dirs exist up front.
mkdir -p "$LOG_DIR" "$(dirname "$REPORT")"

# Single place for every `docker compose` invocation (project + files).
compose_cmd() {
  docker compose -p "$COMPOSE_PROJECT_NAME" "${COMPOSE_FILES[@]}" "$@"
}

# --- Chaos parameters (env-overridable) --------------------------------------
MESSAGES=${MESSAGES:-50}
DUPLICATES=${DUPLICATES:-10}
BURST_TIME=${BURST_TIME:-5}
AGENT_ID=${AGENT_ID:-helion}
DLQ_SUBJECT=${DLQ_SUBJECT:-agent.run.failed.dlq}
DLQ_STREAM=${DLQ_STREAM:-AUDIT}
AGENT_RUNS_STREAM=${AGENT_RUNS_STREAM:-AGENT_RUNS}
COMPLETED_SUBJECT=${COMPLETED_SUBJECT:-agent.run.completed.${AGENT_ID}}

# NATS/Redis endpoints for the throwaway runner container.
# shellcheck disable=SC2034  # kept for parity with sibling scripts
NATS_HOST="dagi-staging-nats"
RUNNER_IMAGE="python:3.11-slim"
# BUGFIX: pip output must go to stderr — dlq_pending() parses the runner's
# stdout as a bare integer, and pip warnings on stdout corrupted the value.
RUNNER_CMD_PREFIX="pip -q install nats-py httpx redis 1>&2"

# --- Resolve container IPs (DNS inside the runner can be flaky) --------------
NATS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-nats 2>/dev/null || true)
ROUTER_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-router 2>/dev/null || true)
REDIS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-redis 2>/dev/null || true)
# Fall back to container names when inspect yields nothing.
: "${NATS_IP:=dagi-staging-nats}"
: "${ROUTER_IP:=dagi-staging-router}"
: "${REDIS_IP:=dagi-staging-redis}"

# SECURITY NOTE: default NATS credentials were hardcoded here; they remain as
# defaults for backward compatibility but can now be overridden via env.
# Rotate the "change-me" secret on any long-lived environment.
: "${NATS_USER:=router}"
: "${NATS_PASS:=router-secret-change-me}"
export NATS_URL="nats://${NATS_USER}:${NATS_PASS}@${NATS_IP}:4222"
export NATS_MONITOR_URL="http://${NATS_IP}:8222"
export ROUTER_URL="http://${ROUTER_IP}:9102"
export REDIS_URL="redis://${REDIS_IP}:6379"

#######################################
# Run one load burst in a throwaway runner container; tee output to a log.
# Arguments: $1 - label used in the log filename and console prefix
#######################################
run_burst() {
  local label="$1"
  local logfile="$LOG_DIR/${TS}_${label}.log"
  echo "[${label}] Starting burst: messages=${MESSAGES}, duplicates=${DUPLICATES}, burst_time=${BURST_TIME}s"
  docker run --rm --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e NATS_MONITOR_URL="$NATS_MONITOR_URL" \
    -e ROUTER_URL="$ROUTER_URL" \
    -e REDIS_URL="$REDIS_URL" \
    -v "$ROOT/scripts:/scripts" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/load/burst_100.py --messages ${MESSAGES} --duplicates ${DUPLICATES} --burst-time ${BURST_TIME}" | tee "$logfile"
}

#######################################
# Print the number of pending DLQ messages by creating a temporary
# JetStream consumer, reading num_pending, and deleting the consumer.
# Arguments: $1 - subject, $2 - stream
# Outputs:   pending count (integer) on stdout
#######################################
dlq_pending() {
  local subject="$1"
  local stream="$2"
  # Unique durable name so parallel checks never collide.
  local durable="dlq_check_$(date +%s%N)"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e DLQ_SUBJECT="$subject" \
    -e DLQ_STREAM="$stream" \
    -e DLQ_DURABLE="$durable" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/dlq_check.py && python /tmp/dlq_check.py" << 'PY_DLQ'
import asyncio, os
from nats.aio.client import Client as NATS
from nats.js.api import ConsumerConfig, AckPolicy, DeliverPolicy

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    stream = os.environ["DLQ_STREAM"]
    subject = os.environ["DLQ_SUBJECT"]
    durable = os.environ["DLQ_DURABLE"]
    cfg = ConsumerConfig(
        durable_name=durable,
        filter_subject=subject,
        ack_policy=AckPolicy.EXPLICIT,
        deliver_policy=DeliverPolicy.ALL,
    )
    try:
        await js.add_consumer(stream, cfg)
        info = await js.consumer_info(stream, durable)
        print(info.num_pending)
    finally:
        try:
            await js.delete_consumer(stream, durable)
        except Exception:
            pass
        await nc.close()

asyncio.run(main())
PY_DLQ
}

#######################################
# Publish an agent.run.requested message with force_fail=True so the
# pipeline routes it to the DLQ.
# Arguments: $1 - job id to embed in the payload
#######################################
publish_forced_fail() {
  local job_id="$1"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e AGENT_ID="$AGENT_ID" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/force_fail.py && python /tmp/force_fail.py" << 'PY_FAIL'
import asyncio, os, json
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    payload = {
        "task_id": job_id,
        "job_id": job_id,
        "workflow_type": "test",
        "agent_id": os.environ.get("AGENT_ID", "helion"),
        "trace_id": f"trace-{job_id}",
        "user_id": "tg:test_user",
        "test_mode": True,
        "force_fail": True,
        "payload": {"prompt": "DLQ forced fail", "test": True},
    }
    await js.publish("agent.run.requested", json.dumps(payload).encode())
    await nc.close()

asyncio.run(main())
print("published", os.environ["JOB_ID"])
PY_FAIL
}

#######################################
# Poll the completion stream until a message for job_id arrives or the
# timeout expires. Prints completed_job_id / completed_subject / replayed /
# replay_count key=value lines; exits non-zero on timeout.
# Arguments: $1 - job id, $2 - subject, $3 - stream, $4 - timeout seconds (60)
#######################################
wait_for_completion() {
  local job_id="$1"
  local subject="$2"
  local stream="$3"
  local timeout="${4:-60}"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e COMPLETED_SUBJECT="$subject" \
    -e COMPLETED_STREAM="$stream" \
    -e COMPLETION_TIMEOUT="$timeout" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/wait_completion.py && python /tmp/wait_completion.py" << 'PY_DONE'
import asyncio, os, json, time
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    subject = os.environ["COMPLETED_SUBJECT"]
    stream = os.environ["COMPLETED_STREAM"]
    timeout = int(os.environ.get("COMPLETION_TIMEOUT", "60"))
    deadline = time.time() + timeout
    sub = await js.pull_subscribe(subject, durable=None, stream=stream)
    found = False
    replayed = None
    replay_count = None
    completed_subject = subject
    while time.time() < deadline:
        try:
            batch = await sub.fetch(1, timeout=1)
        except Exception:
            # fetch timeout — keep polling until the deadline
            continue
        if batch:
            msg = batch[0]
            data = json.loads(msg.data.decode())
            if data.get("job_id") == job_id:
                found = True
                completed_subject = msg.subject
                if msg.headers:
                    replayed = msg.headers.get("replayed")
                    replay_count = msg.headers.get("replay_count")
            await msg.ack()
        if found:
            break
    await nc.close()
    if not found:
        raise SystemExit(1)
    print(f"completed_job_id={job_id}")
    print(f"completed_subject={completed_subject}")
    print(f"replayed={replayed if replayed is not None else 'n/a'}")
    print(f"replay_count={replay_count if replay_count is not None else 'n/a'}")

asyncio.run(main())
PY_DONE
}

# Create the markdown report with its table header on first use only.
append_report_header() {
  if [ ! -f "$REPORT" ]; then
    cat > "$REPORT" << "MD"
# Chaos Test Report

| Test | Start/End (UTC) | Max Lag | DLQ Peak | p95 Latency | Unique Success | Notes |
|---|---|---|---|---|---|---|
MD
  fi
}

# Append one markdown table row: test, start, end, max_lag, dlq, p95, success, notes.
append_report_row() {
  local test="$1"
  local start="$2"
  local end="$3"
  local max_lag="$4"
  local dlq="$5"
  local p95="$6"
  local success="$7"
  local notes="$8"
  echo "| ${test} | ${start} – ${end} | ${max_lag} | ${dlq} | ${p95} | ${success} | ${notes} |" >> "$REPORT"
}

#######################################
# Pull the four summary metrics out of a burst log.
# Arguments: $1 - log file path
# Outputs:   "max_lag|dlq|p95|success" on stdout (fields may be empty)
#############################################
extract_summary() {
  local logfile="$1"
  local max_lag dlq p95 success
  # Each grep may legitimately find nothing (e.g. burst aborted mid-test);
  # `|| true` keeps set -e/pipefail from killing the whole suite.
  max_lag=$(grep -m1 "Max during test" "$logfile" | awk '{print $NF}') || true
  dlq=$(grep -m1 "DLQ depth" "$logfile" | awk '{print $NF}') || true
  p95=$(grep -m1 "p95:" "$logfile" | awk '{print $2}') || true
  success=$(grep -m1 "Success rate" "$logfile" | awk '{print $NF}' | tr -d ')%') || true
  echo "$max_lag|$dlq|$p95|$success"
}

append_report_header

# --- Test A — Kill Worker ----------------------------------------------------
START_A=$(date -u +"%Y-%m-%d %H:%M:%S")
run_burst "A_kill_worker" &
PID=$!
sleep 5
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-crewai-worker"; then
  compose_cmd restart crewai-worker
fi
wait "$PID" || true
END_A=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_A=$(extract_summary "$LOG_DIR/${TS}_A_kill_worker.log")
IFS='|' read -r MAXLAG_A DLQ_A P95_A SUCCESS_A <<< "$SUM_A"
append_report_row "A – Kill Worker" "$START_A" "$END_A" "${MAXLAG_A:-n/a}" "${DLQ_A:-n/a}" "${P95_A:-n/a}" "${SUCCESS_A:-n/a}%" "restart crewai-worker"

# --- Test B — Kill Router ----------------------------------------------------
START_B=$(date -u +"%Y-%m-%d %H:%M:%S")
run_burst "B_kill_router" &
PID=$!
sleep 5
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-router"; then
  compose_cmd restart router
fi
wait "$PID" || true
END_B=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_B=$(extract_summary "$LOG_DIR/${TS}_B_kill_router.log")
IFS='|' read -r MAXLAG_B DLQ_B P95_B SUCCESS_B <<< "$SUM_B"
append_report_row "B – Kill Router" "$START_B" "$END_B" "${MAXLAG_B:-n/a}" "${DLQ_B:-n/a}" "${P95_B:-n/a}" "${SUCCESS_B:-n/a}%" "restart router"

# --- Test C — Block Postgres (stop 60s) --------------------------------------
START_C=$(date -u +"%Y-%m-%d %H:%M:%S")
run_burst "C_block_postgres" &
PID=$!
sleep 5
# NOTE(review): docker ps matches container "dagi-staging-postgres" while the
# compose service is "dagi-postgres" — confirm the service name is correct.
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-postgres"; then
  compose_cmd stop dagi-postgres
  sleep 60
  compose_cmd start dagi-postgres
fi
wait "$PID" || true
END_C=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_C=$(extract_summary "$LOG_DIR/${TS}_C_block_postgres.log")
IFS='|' read -r MAXLAG_C DLQ_C P95_C SUCCESS_C <<< "$SUM_C"
append_report_row "C – Block Postgres" "$START_C" "$END_C" "${MAXLAG_C:-n/a}" "${DLQ_C:-n/a}" "${P95_C:-n/a}" "${SUCCESS_C:-n/a}%" "stop/start postgres 60s"

# --- Test D — DLQ Replay -----------------------------------------------------
START_D=$(date -u +"%Y-%m-%d %H:%M:%S")
DLQ_LOG="$LOG_DIR/${TS}_D_dlq_replay.log"
DLQ_NOTES="dlq_replay.py"
DLQ_STATUS="n/a"
DLQ_JOB_ID=""
DLQ_WAIT_S="0"
REPLAY_WAIT_S="0"
COMPLETION_WAIT_S="0"
COMPLETED_SUBJECT_ACTUAL="$COMPLETED_SUBJECT"
REPLAY_COUNT_VALUE="n/a"
REPLAYED_VALUE="n/a"

if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
  # BUGFIX: this line previously claimed "forcing job_id=" before any job
  # existed; it is a pre-check, not a forced publish.
  echo "D[DLQ]: checking subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
  DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
  if [ "$DLQ_COUNT" = "0" ]; then
    # DLQ is empty: inject a guaranteed failure so there is something to replay.
    DLQ_JOB_ID="dlq-test-$(date +%s)"
    echo "D[DLQ]: forcing job_id=${DLQ_JOB_ID} subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
    publish_forced_fail "$DLQ_JOB_ID"
    DLQ_WAIT_START=$(date +%s)
    READY=false
    # Poll up to 60s (30 × 2s) for the forced failure to land in the DLQ.
    for _ in {1..30}; do
      DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
      if [ "$DLQ_COUNT" != "0" ]; then
        READY=true
        break
      fi
      sleep 2
    done
    DLQ_WAIT_END=$(date +%s)
    DLQ_WAIT_S=$((DLQ_WAIT_END - DLQ_WAIT_START))
    if [ "$READY" != "true" ]; then
      echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting dlq"
      DLQ_NOTES="forced-fail timeout"
      DLQ_STATUS="FAIL"
      append_report_row "D – DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
      exit 1
    fi
  else
    echo "D[DLQ]: pending=${DLQ_COUNT}, running replay (dry-run->real)"
  fi
fi

# Replay: dry-run first (inventory), then the real replay; both are logged.
REPLAY_START=$(date +%s)
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM} --dry-run" | tee "$DLQ_LOG"
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM}" | tee -a "$DLQ_LOG"
REPLAY_END=$(date +%s)
REPLAY_WAIT_S=$((REPLAY_END - REPLAY_START))

if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -z "$DLQ_JOB_ID" ]; then
  # BUGFIX: without `|| true` a non-matching grep aborted the script here
  # under set -e before the FAIL branch below could report anything.
  DLQ_JOB_ID=$(grep -m1 "replayed_job_id=" "$DLQ_LOG" | awk -F= '{print $2}' | tr -d '\r') || true
fi

if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -n "$DLQ_JOB_ID" ]; then
  COMPLETION_START=$(date +%s)
  # BUGFIX: `OUTPUT=$(wait…); RC=$?` dies under set -e on timeout before RC is
  # captured, making the FAIL path unreachable. Capture the status inline.
  COMPLETION_RC=0
  COMPLETION_OUTPUT=$(wait_for_completion "$DLQ_JOB_ID" "$COMPLETED_SUBJECT_ACTUAL" "$AGENT_RUNS_STREAM" 90) || COMPLETION_RC=$?
  COMPLETION_END=$(date +%s)
  COMPLETION_WAIT_S=$((COMPLETION_END - COMPLETION_START))
  if [ "$COMPLETION_RC" -eq 0 ]; then
    DLQ_STATUS="completed"
    COMPLETED_SUBJECT_ACTUAL=$(echo "$COMPLETION_OUTPUT" | awk -F= '/completed_subject=/{print $2}' | tail -1)
    REPLAYED_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replayed=/{print $2}' | tail -1)
    REPLAY_COUNT_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replay_count=/{print $2}' | tail -1)
    DLQ_NOTES="forced fail + replay, job_id=${DLQ_JOB_ID}, subject=${COMPLETED_SUBJECT_ACTUAL}, replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: completed job_id=${DLQ_JOB_ID} completed_subject=${COMPLETED_SUBJECT_ACTUAL} replayed=${REPLAYED_VALUE} replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
  else
    DLQ_STATUS="FAIL"
    DLQ_NOTES="replay done, completion timeout, job_id=${DLQ_JOB_ID}"
    echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting completion"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
    append_report_row "D – DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "1→0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
else
  if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
    echo "D[DLQ]: FAIL job_id= reason=no replayed_job_id in log"
    DLQ_STATUS="FAIL"
    DLQ_NOTES="no replayed_job_id found"
    append_report_row "D – DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "see log" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
fi

END_D=$(date -u +"%Y-%m-%d %H:%M:%S")
append_report_row "D – DLQ Replay" "$START_D" "$END_D" "n/a" "see log" "n/a" "${DLQ_STATUS}" "${DLQ_NOTES}"

echo "✅ Chaos suite complete. Logs: $LOG_DIR"
echo "✅ Report updated: $REPORT"