Files
microdao-daarion/scripts/chaos/run_chaos_suite.sh
Apple ef3473db21 snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.

Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles

Excluded from snapshot: venv/, .env, data/, backups, .tgz archives

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-09 08:46:46 -08:00

391 lines
14 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Chaos test suite runner for the staging stack: runs four scenarios
# (A: kill worker, B: kill router, C: block postgres, D: DLQ replay)
# against docker-compose services and appends results to a markdown report.
set -euo pipefail
# Safety guard: refuse to run on prod by default
: "${ENV:=}"
: "${ALLOW_DESTRUCTIVE_TESTS:=false}"
: "${CONFIRM_CHAOS:=}"
: "${CHAOS_FORCE_DLQ:=false}"
# Only staging/dev are allowed targets; any other ENV aborts immediately.
if [ "$ENV" != "staging" ] && [ "$ENV" != "dev" ]; then
echo "ERROR: ENV must be staging/dev. Current ENV='$ENV'. Refusing to run."
exit 1
fi
# Double opt-in: both flags must be explicitly set before destructive tests run.
if [ "$ALLOW_DESTRUCTIVE_TESTS" != "true" ] || [ "$CONFIRM_CHAOS" != "YES" ]; then
echo "ERROR: Set ALLOW_DESTRUCTIVE_TESTS=true and CONFIRM_CHAOS=YES to proceed."
exit 1
fi
# Paths and compose identifiers for the staging deployment.
ROOT="/opt/microdao-daarion"
COMPOSE_PROJECT_NAME=${COMPOSE_PROJECT_NAME:-dagi-staging}
LOG_DIR="$ROOT/logs/chaos"
COMPOSE_FILES="-f $ROOT/docker-compose.staging.yml"
STAGING_NETWORK="${COMPOSE_PROJECT_NAME}_dagi-staging-network"
REPORT="$ROOT/docs/CHAOS_TEST_REPORT.md"
# Single timestamp shared by all log file names of this run.
TS=$(date -u +"%Y%m%dT%H%M%SZ")
# Chaos parameters
MESSAGES=${MESSAGES:-50}
DUPLICATES=${DUPLICATES:-10}
BURST_TIME=${BURST_TIME:-5}
AGENT_ID=${AGENT_ID:-helion}
DLQ_SUBJECT=${DLQ_SUBJECT:-agent.run.failed.dlq}
DLQ_STREAM=${DLQ_STREAM:-AUDIT}
AGENT_RUNS_STREAM=${AGENT_RUNS_STREAM:-AGENT_RUNS}
COMPLETED_SUBJECT=${COMPLETED_SUBJECT:-agent.run.completed.${AGENT_ID}}
# NATS/Redis endpoints for runner container
NATS_HOST="dagi-staging-nats"
# Throwaway runner image; dependencies are pip-installed on every invocation.
RUNNER_IMAGE="python:3.11-slim"
RUNNER_CMD_PREFIX="pip -q install nats-py httpx redis"
# Resolve container IPs (DNS inside runner can be flaky)
# '|| true' keeps set -e happy when a container does not exist yet.
NATS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-nats 2>/dev/null || true)
ROUTER_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-router 2>/dev/null || true)
REDIS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-redis 2>/dev/null || true)
# Fall back to container DNS names when inspect returned an empty string.
: "${NATS_IP:=dagi-staging-nats}"
: "${ROUTER_IP:=dagi-staging-router}"
: "${REDIS_IP:=dagi-staging-redis}"
# NOTE(review): NATS credentials are hardcoded here ("change-me" default) —
# consider sourcing them from the environment; verify the staging deploy
# actually rotates this value.
export NATS_URL="nats://router:router-secret-change-me@${NATS_IP}:4222"
export NATS_MONITOR_URL="http://${NATS_IP}:8222"
export ROUTER_URL="http://${ROUTER_IP}:9102"
export REDIS_URL="redis://${REDIS_IP}:6379"
#######################################
# Fire a load burst against the staging stack from a throwaway runner
# container, teeing the runner output to a per-test log file.
# Globals:   LOG_DIR, TS, MESSAGES, DUPLICATES, BURST_TIME, STAGING_NETWORK,
#            NATS_URL, NATS_MONITOR_URL, ROUTER_URL, REDIS_URL, ROOT,
#            RUNNER_IMAGE, RUNNER_CMD_PREFIX (all read)
# Arguments: $1 - label used for log messages and the log file name
# Outputs:   runner stdout, also written to "$LOG_DIR/${TS}_${label}.log"
#######################################
run_burst() {
  local label="$1"
  local logfile="$LOG_DIR/${TS}_${label}.log"
  # The chaos log dir may not exist on a fresh host; tee would fail without it.
  mkdir -p "$LOG_DIR"
  echo "[${label}] Starting burst: messages=${MESSAGES}, duplicates=${DUPLICATES}, burst_time=${BURST_TIME}s"
  docker run --rm --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e NATS_MONITOR_URL="$NATS_MONITOR_URL" \
    -e ROUTER_URL="$ROUTER_URL" \
    -e REDIS_URL="$REDIS_URL" \
    -v "$ROOT/scripts:/scripts" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/load/burst_100.py --messages ${MESSAGES} --duplicates ${DUPLICATES} --burst-time ${BURST_TIME}" | tee "$logfile"
}
#######################################
# Print the number of pending messages on a DLQ subject. Creates a temporary
# JetStream consumer filtered on the subject, reads num_pending, then deletes
# the consumer again.
# Globals:   STAGING_NETWORK, NATS_URL, RUNNER_IMAGE, RUNNER_CMD_PREFIX (read)
# Arguments: $1 - DLQ subject to filter on
#            $2 - JetStream stream name
# Outputs:   pending-message count on stdout
#######################################
dlq_pending() {
  local subject="$1"
  local stream="$2"
  local durable
  # Unique durable name so concurrent checks cannot collide.
  durable="dlq_check_$(date +%s%N)"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e DLQ_SUBJECT="$subject" \
    -e DLQ_STREAM="$stream" \
    -e DLQ_DURABLE="$durable" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/dlq_check.py && python /tmp/dlq_check.py" << 'PY_DLQ'
import asyncio, os
from nats.aio.client import Client as NATS
from nats.js.api import ConsumerConfig, AckPolicy, DeliverPolicy

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    stream = os.environ["DLQ_STREAM"]
    subject = os.environ["DLQ_SUBJECT"]
    durable = os.environ["DLQ_DURABLE"]
    cfg = ConsumerConfig(
        durable_name=durable,
        filter_subject=subject,
        ack_policy=AckPolicy.EXPLICIT,
        deliver_policy=DeliverPolicy.ALL,
    )
    try:
        await js.add_consumer(stream, cfg)
        info = await js.consumer_info(stream, durable)
        # num_pending = messages matching the filter not yet delivered.
        print(info.num_pending)
    finally:
        # Best-effort cleanup: the consumer is throwaway.
        try:
            await js.delete_consumer(stream, durable)
        except Exception:
            pass
        await nc.close()

asyncio.run(main())
PY_DLQ
}
#######################################
# Publish an agent.run.requested job with force_fail=true so the worker sends
# it to the DLQ, seeding Test D.
# Globals:   STAGING_NETWORK, NATS_URL, AGENT_ID, RUNNER_IMAGE,
#            RUNNER_CMD_PREFIX (read)
# Arguments: $1 - job id to embed in the payload
# Outputs:   "published <job_id>" on stdout
#######################################
publish_forced_fail() {
  local job_id="$1"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e AGENT_ID="$AGENT_ID" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/force_fail.py && python /tmp/force_fail.py" << 'PY_FAIL'
import asyncio, os, json
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    # force_fail/test_mode flags tell the worker to fail this run on purpose.
    payload = {
        "task_id": job_id,
        "job_id": job_id,
        "workflow_type": "test",
        "agent_id": os.environ.get("AGENT_ID", "helion"),
        "trace_id": f"trace-{job_id}",
        "user_id": "tg:test_user",
        "test_mode": True,
        "force_fail": True,
        "payload": {"prompt": "DLQ forced fail", "test": True},
    }
    await js.publish("agent.run.requested", json.dumps(payload).encode())
    await nc.close()

asyncio.run(main())
print("published", os.environ["JOB_ID"])
PY_FAIL
}
#######################################
# Poll a completion subject until a message for the given job id arrives or
# the timeout expires. Prints the completed subject and any replay headers.
# Globals:   STAGING_NETWORK, NATS_URL, RUNNER_IMAGE, RUNNER_CMD_PREFIX (read)
# Arguments: $1 - job id to wait for
#            $2 - completion subject to subscribe to
#            $3 - JetStream stream the subject lives on
#            $4 - timeout in seconds (default 60)
# Outputs:   completed_job_id=/completed_subject=/replayed=/replay_count= lines
# Returns:   non-zero if the job did not complete within the timeout
#######################################
wait_for_completion() {
  local job_id="$1"
  local subject="$2"
  local stream="$3"
  local timeout="${4:-60}"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e COMPLETED_SUBJECT="$subject" \
    -e COMPLETED_STREAM="$stream" \
    -e COMPLETION_TIMEOUT="$timeout" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/wait_completion.py && python /tmp/wait_completion.py" << 'PY_DONE'
import asyncio, os, json, time
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    subject = os.environ["COMPLETED_SUBJECT"]
    stream = os.environ["COMPLETED_STREAM"]
    timeout = int(os.environ.get("COMPLETION_TIMEOUT", "60"))
    deadline = time.time() + timeout
    sub = await js.pull_subscribe(subject, durable=None, stream=stream)
    found = False
    replayed = None
    replay_count = None
    completed_subject = subject
    while time.time() < deadline:
        try:
            batch = await sub.fetch(1, timeout=1)
        except Exception:
            # fetch timeout (no message yet) — keep polling until deadline
            continue
        if batch:
            msg = batch[0]
            data = json.loads(msg.data.decode())
            if data.get("job_id") == job_id:
                found = True
                completed_subject = msg.subject
                if msg.headers:
                    replayed = msg.headers.get("replayed")
                    replay_count = msg.headers.get("replay_count")
            # Ack every message so it is not redelivered to this consumer.
            await msg.ack()
        if found:
            break
    await nc.close()
    if not found:
        raise SystemExit(1)
    print(f"completed_job_id={job_id}")
    print(f"completed_subject={completed_subject}")
    print(f"replayed={replayed if replayed is not None else 'n/a'}")
    print(f"replay_count={replay_count if replay_count is not None else 'n/a'}")

asyncio.run(main())
PY_DONE
}
#######################################
# Create the markdown report with its table header if it does not exist yet.
# Idempotent: an existing report is left untouched.
# Globals:   REPORT (read) - path to the report file
#######################################
append_report_header() {
  # docs/ may not exist on a fresh host; cat > would fail without it.
  mkdir -p "$(dirname "$REPORT")"
  if [ ! -f "$REPORT" ]; then
    cat > "$REPORT" << "MD"
# Chaos Test Report
| Test | Start/End (UTC) | Max Lag | DLQ Peak | p95 Latency | Unique Success | Notes |
|---|---|---|---|---|---|---|
MD
  fi
}
#######################################
# Append one result row to the markdown report table.
# Globals:   REPORT (read) - path to the report file
# Arguments: $1 test name, $2 start ts, $3 end ts, $4 max lag, $5 DLQ peak,
#            $6 p95 latency, $7 unique-success figure, $8 free-form notes
#######################################
append_report_row() {
  local name="$1" begin="$2" finish="$3"
  local lag="$4" dlq_depth="$5" latency="$6" ok="$7" remark="$8"
  printf '| %s | %s %s | %s | %s | %s | %s | %s |\n' \
    "$name" "$begin" "$finish" "$lag" "$dlq_depth" "$latency" "$ok" "$remark" >> "$REPORT"
}
#######################################
# Extract headline metrics from a burst log file.
# Arguments: $1 - path to a burst log
# Outputs:   "max_lag|dlq|p95|success" on stdout; a field is empty when its
#            marker line is absent from the log
#######################################
extract_summary() {
  local logfile="$1"
  local max_lag dlq p95 success
  # Declaration is separated from assignment so failures are not masked by
  # 'local'; '|| true' then keeps a missing marker from aborting the whole
  # script under set -e/pipefail (grep exits non-zero when it finds nothing).
  max_lag=$(grep -m1 "Max during test" "$logfile" | awk '{print $NF}') || true
  dlq=$(grep -m1 "DLQ depth" "$logfile" | awk '{print $NF}') || true
  p95=$(grep -m1 "p95:" "$logfile" | awk '{print $2}') || true
  success=$(grep -m1 "Success rate" "$logfile" | awk '{print $NF}' | tr -d ')%') || true
  echo "$max_lag|$dlq|$p95|$success"
}
append_report_header
# Test A — Kill Worker: run a burst in the background, restart the CrewAI
# worker mid-burst, then record the burst summary in the report.
START_A=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "A_kill_worker") &
PID=$!
# Give the burst a head start before injecting the failure.
sleep 5
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-crewai-worker"; then
COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES restart crewai-worker
fi
# '|| true': a failed burst is reported via its log, not via the exit code.
wait $PID || true
END_A=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_A=$(extract_summary "$LOG_DIR/${TS}_A_kill_worker.log")
# Split the "max_lag|dlq|p95|success" summary into per-column variables.
IFS='|' read -r MAXLAG_A DLQ_A P95_A SUCCESS_A <<< "$SUM_A"
append_report_row "A Kill Worker" "$START_A" "$END_A" "${MAXLAG_A:-n/a}" "${DLQ_A:-n/a}" "${P95_A:-n/a}" "${SUCCESS_A:-n/a}%" "restart crewai-worker"
# Test B — Kill Router: same shape as Test A, but the router service is
# restarted while the burst is in flight.
START_B=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "B_kill_router") &
PID=$!
# Give the burst a head start before injecting the failure.
sleep 5
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-router"; then
COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES restart router
fi
# '|| true': a failed burst is reported via its log, not via the exit code.
wait $PID || true
END_B=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_B=$(extract_summary "$LOG_DIR/${TS}_B_kill_router.log")
IFS='|' read -r MAXLAG_B DLQ_B P95_B SUCCESS_B <<< "$SUM_B"
append_report_row "B Kill Router" "$START_B" "$END_B" "${MAXLAG_B:-n/a}" "${DLQ_B:-n/a}" "${P95_B:-n/a}" "${SUCCESS_B:-n/a}%" "restart router"
# Test C — Block Postgres (stop 60s): stop the database for a minute during
# the burst, then bring it back and record the summary.
START_C=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "C_block_postgres") &
PID=$!
sleep 5
# NOTE(review): the guard checks container "dagi-staging-postgres" but compose
# stops service "dagi-postgres" — confirm the service/container names match
# the staging compose file.
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-postgres"; then
COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES stop dagi-postgres
sleep 60
COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES start dagi-postgres
fi
# '|| true': a failed burst is reported via its log, not via the exit code.
wait $PID || true
END_C=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_C=$(extract_summary "$LOG_DIR/${TS}_C_block_postgres.log")
IFS='|' read -r MAXLAG_C DLQ_C P95_C SUCCESS_C <<< "$SUM_C"
append_report_row "C Block Postgres" "$START_C" "$END_C" "${MAXLAG_C:-n/a}" "${DLQ_C:-n/a}" "${P95_C:-n/a}" "${SUCCESS_C:-n/a}%" "stop/start postgres 60s"
# Test D — DLQ Replay: optionally force a failing job onto the DLQ
# (CHAOS_FORCE_DLQ=true), replay the DLQ (dry-run then real), and when a
# forced job is known, verify it completes end-to-end.
START_D=$(date -u +"%Y-%m-%d %H:%M:%S")
DLQ_LOG="$LOG_DIR/${TS}_D_dlq_replay.log"
DLQ_NOTES="dlq_replay.py"
DLQ_STATUS="n/a"
DLQ_JOB_ID=""
DLQ_WAIT_S="0"
REPLAY_WAIT_S="0"
COMPLETION_WAIT_S="0"
COMPLETED_SUBJECT_ACTUAL="$COMPLETED_SUBJECT"
REPLAY_COUNT_VALUE="n/a"
REPLAYED_VALUE="n/a"
if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
  echo "D[DLQ]: forcing job_id=<pending> subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
  DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
  if [ "$DLQ_COUNT" = "0" ]; then
    # DLQ is empty: publish a forced-fail job, then poll until it shows up.
    DLQ_JOB_ID="dlq-test-$(date +%s)"
    echo "D[DLQ]: forcing job_id=${DLQ_JOB_ID} subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
    publish_forced_fail "$DLQ_JOB_ID"
    DLQ_WAIT_START=$(date +%s)
    READY=false
    # Poll for up to ~60s (30 attempts x 2s) for the job to land in the DLQ.
    for _ in $(seq 1 30); do
      DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
      if [ "$DLQ_COUNT" != "0" ]; then
        READY=true
        break
      fi
      sleep 2
    done
    DLQ_WAIT_END=$(date +%s)
    DLQ_WAIT_S=$((DLQ_WAIT_END-DLQ_WAIT_START))
    if [ "$READY" != "true" ]; then
      echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting dlq"
      DLQ_NOTES="forced-fail timeout"
      DLQ_STATUS="FAIL"
      append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
      exit 1
    fi
  else
    echo "D[DLQ]: pending=${DLQ_COUNT}, running replay (dry-run->real)"
  fi
fi
REPLAY_START=$(date +%s)
# Replay twice: a dry-run for visibility first, then the real replay.
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM} --dry-run" | tee "$DLQ_LOG"
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM}" | tee -a "$DLQ_LOG"
REPLAY_END=$(date +%s)
REPLAY_WAIT_S=$((REPLAY_END-REPLAY_START))
if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -z "$DLQ_JOB_ID" ]; then
  # The DLQ already had messages, so no job id was generated here; recover it
  # from the replay log. '|| true' keeps a missing marker from aborting the
  # script under set -e/pipefail — the "no replayed_job_id" branch below
  # handles that case.
  DLQ_JOB_ID=$(grep -m1 "replayed_job_id=" "$DLQ_LOG" | awk -F= '{print $2}' | tr -d '\r') || true
fi
if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -n "$DLQ_JOB_ID" ]; then
  COMPLETION_START=$(date +%s)
  # Capture the exit status explicitly: the previous VAR=$(cmd); RC=$? form
  # aborted the whole script under set -e before the FAIL branch could run.
  COMPLETION_RC=0
  COMPLETION_OUTPUT=$(wait_for_completion "$DLQ_JOB_ID" "$COMPLETED_SUBJECT_ACTUAL" "$AGENT_RUNS_STREAM" 90) || COMPLETION_RC=$?
  COMPLETION_END=$(date +%s)
  COMPLETION_WAIT_S=$((COMPLETION_END-COMPLETION_START))
  if [ "$COMPLETION_RC" -eq 0 ]; then
    DLQ_STATUS="completed"
    # Parse the key=value lines emitted by wait_for_completion.
    COMPLETED_SUBJECT_ACTUAL=$(echo "$COMPLETION_OUTPUT" | awk -F= '/completed_subject=/{print $2}' | tail -1)
    REPLAYED_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replayed=/{print $2}' | tail -1)
    REPLAY_COUNT_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replay_count=/{print $2}' | tail -1)
    DLQ_NOTES="forced fail + replay, job_id=${DLQ_JOB_ID}, subject=${COMPLETED_SUBJECT_ACTUAL}, replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: completed job_id=${DLQ_JOB_ID} completed_subject=${COMPLETED_SUBJECT_ACTUAL} replayed=${REPLAYED_VALUE} replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
  else
    DLQ_STATUS="FAIL"
    DLQ_NOTES="replay done, completion timeout, job_id=${DLQ_JOB_ID}"
    echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting completion"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
    append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "1→0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
else
  if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
    echo "D[DLQ]: FAIL job_id=<none> reason=no replayed_job_id in log"
    DLQ_STATUS="FAIL"
    DLQ_NOTES="no replayed_job_id found"
    append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "see log" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
fi
END_D=$(date -u +"%Y-%m-%d %H:%M:%S")
append_report_row "D DLQ Replay" "$START_D" "$END_D" "n/a" "see log" "n/a" "${DLQ_STATUS}" "${DLQ_NOTES}"
echo "✅ Chaos suite complete. Logs: $LOG_DIR"
echo "✅ Report updated: $REPORT"