Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
391 lines · 14 KiB · Bash · Executable File
#!/bin/bash
# Chaos test suite for the DAGI staging stack.
# Destructive by design: requires explicit opt-in via environment variables.
set -euo pipefail

# --- Safety interlocks: never run against prod by accident -------------------
ENV="${ENV:-}"
ALLOW_DESTRUCTIVE_TESTS="${ALLOW_DESTRUCTIVE_TESTS:-false}"
CONFIRM_CHAOS="${CONFIRM_CHAOS:-}"
CHAOS_FORCE_DLQ="${CHAOS_FORCE_DLQ:-false}"

case "$ENV" in
  staging|dev) ;;
  *)
    echo "ERROR: ENV must be staging/dev. Current ENV='$ENV'. Refusing to run."
    exit 1
    ;;
esac

if [ "$ALLOW_DESTRUCTIVE_TESTS" != "true" ] || [ "$CONFIRM_CHAOS" != "YES" ]; then
  echo "ERROR: Set ALLOW_DESTRUCTIVE_TESTS=true and CONFIRM_CHAOS=YES to proceed."
  exit 1
fi
# --- Paths, compose context, and report location -----------------------------
ROOT="/opt/microdao-daarion"
COMPOSE_PROJECT_NAME="${COMPOSE_PROJECT_NAME:-dagi-staging}"
LOG_DIR="$ROOT/logs/chaos"
COMPOSE_FILES="-f $ROOT/docker-compose.staging.yml"
STAGING_NETWORK="${COMPOSE_PROJECT_NAME}_dagi-staging-network"
REPORT="$ROOT/docs/CHAOS_TEST_REPORT.md"
TS=$(date -u +"%Y%m%dT%H%M%SZ")

# --- Chaos parameters (all overridable from the environment) -----------------
: "${MESSAGES:=50}"
: "${DUPLICATES:=10}"
: "${BURST_TIME:=5}"
: "${AGENT_ID:=helion}"
: "${DLQ_SUBJECT:=agent.run.failed.dlq}"
: "${DLQ_STREAM:=AUDIT}"
: "${AGENT_RUNS_STREAM:=AGENT_RUNS}"
: "${COMPLETED_SUBJECT:=agent.run.completed.${AGENT_ID}}"

# --- Runner container setup --------------------------------------------------
# NATS/Redis endpoints for the throwaway runner container.
NATS_HOST="dagi-staging-nats"

RUNNER_IMAGE="python:3.11-slim"
RUNNER_CMD_PREFIX="pip -q install nats-py httpx redis"

# Resolve container IPs up front (DNS inside the runner can be flaky).
# Prints an empty string — rather than failing — when the container is absent.
container_ip() {
  docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$1" 2>/dev/null || true
}

NATS_IP=$(container_ip dagi-staging-nats)
ROUTER_IP=$(container_ip dagi-staging-router)
REDIS_IP=$(container_ip dagi-staging-redis)

# Fall back to container names when inspect produced nothing.
: "${NATS_IP:=dagi-staging-nats}"
: "${ROUTER_IP:=dagi-staging-router}"
: "${REDIS_IP:=dagi-staging-redis}"

# Endpoints exported into every runner container.
# NOTE(review): NATS credentials are hard-coded ("change-me") — confirm they
# are rotated per environment.
export NATS_URL="nats://router:router-secret-change-me@${NATS_IP}:4222"
export NATS_MONITOR_URL="http://${NATS_IP}:8222"
export ROUTER_URL="http://${ROUTER_IP}:9102"
export REDIS_URL="redis://${REDIS_IP}:6379"
# Run one burst-load test inside a throwaway runner container on the staging
# network, teeing the runner's output to $LOG_DIR/<TS>_<label>.log.
# $1 = label, used both in log output and in the log file name.
run_burst() {
  local label="$1"
  local logfile="$LOG_DIR/${TS}_${label}.log"

  # BUGFIX: nothing else creates $LOG_DIR; `tee` would fail on first use and,
  # with `pipefail`, sink the whole pipeline.
  mkdir -p "$LOG_DIR"

  echo "[${label}] Starting burst: messages=${MESSAGES}, duplicates=${DUPLICATES}, burst_time=${BURST_TIME}s"

  # Quoted expansions (network/image) to be safe against word splitting.
  docker run --rm --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e NATS_MONITOR_URL="$NATS_MONITOR_URL" \
    -e ROUTER_URL="$ROUTER_URL" \
    -e REDIS_URL="$REDIS_URL" \
    -v "$ROOT/scripts:/scripts" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/load/burst_100.py --messages ${MESSAGES} --duplicates ${DUPLICATES} --burst-time ${BURST_TIME}" | tee "$logfile"
}
||
# Count pending (unconsumed) messages on a DLQ subject.
# Creates a throwaway JetStream consumer filtered to the subject, reads its
# num_pending, deletes the consumer again, and prints the count to stdout.
# $1 = DLQ subject, $2 = JetStream stream name.
dlq_pending() {
  local subject="$1"
  local stream="$2"
  # Unique durable name per call so repeated checks never collide.
  local durable="dlq_check_$(date +%s%N)"

  # The Python probe is fed via heredoc -> cat -> /tmp inside the runner
  # container; `-i` keeps stdin open so the heredoc reaches `cat`.
  docker run --rm -i --network $STAGING_NETWORK \
    -e NATS_URL="$NATS_URL" \
    -e DLQ_SUBJECT="$subject" \
    -e DLQ_STREAM="$stream" \
    -e DLQ_DURABLE="$durable" \
    $RUNNER_IMAGE bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/dlq_check.py && python /tmp/dlq_check.py" << 'PY_DLQ'
import asyncio, os
from nats.aio.client import Client as NATS
from nats.js.api import ConsumerConfig, AckPolicy, DeliverPolicy

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    stream = os.environ["DLQ_STREAM"]
    subject = os.environ["DLQ_SUBJECT"]
    durable = os.environ["DLQ_DURABLE"]

    # DeliverPolicy.ALL so num_pending reflects every retained message on the
    # filtered subject, not just new arrivals.
    cfg = ConsumerConfig(
        durable_name=durable,
        filter_subject=subject,
        ack_policy=AckPolicy.EXPLICIT,
        deliver_policy=DeliverPolicy.ALL,
    )
    try:
        await js.add_consumer(stream, cfg)
        info = await js.consumer_info(stream, durable)
        # Sole stdout line: the caller parses this as the DLQ depth.
        print(info.num_pending)
    finally:
        # Best-effort cleanup; a leftover consumer is harmless but noisy.
        try:
            await js.delete_consumer(stream, durable)
        except Exception:
            pass
        await nc.close()

asyncio.run(main())
PY_DLQ
}
|
||
|
||
# Publish a synthetic agent job flagged to fail, to seed the DLQ.
# $1 = job id embedded in the payload (also used as task_id and trace suffix).
# NOTE(review): presumably the worker routes force_fail jobs to the DLQ —
# confirm against the worker implementation.
publish_forced_fail() {
  local job_id="$1"

  docker run --rm -i --network $STAGING_NETWORK \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e AGENT_ID="$AGENT_ID" \
    $RUNNER_IMAGE bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/force_fail.py && python /tmp/force_fail.py" << 'PY_FAIL'
import asyncio, os, json
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    payload = {
        "task_id": job_id,
        "job_id": job_id,
        "workflow_type": "test",
        "agent_id": os.environ.get("AGENT_ID", "helion"),
        "trace_id": f"trace-{job_id}",
        "user_id": "tg:test_user",
        # force_fail/test_mode mark this as a synthetic, must-fail job.
        "test_mode": True,
        "force_fail": True,
        "payload": {"prompt": "DLQ forced fail", "test": True},
    }
    await js.publish("agent.run.requested", json.dumps(payload).encode())
    await nc.close()

asyncio.run(main())
print("published", os.environ["JOB_ID"])
PY_FAIL
}
|
||
|
||
# Block until a completion event for the given job id appears, or time out.
# $1 = job id, $2 = completion subject, $3 = stream, $4 = timeout secs (60).
# On success prints completed_job_id / completed_subject / replayed /
# replay_count lines (parsed by the caller); exits 1 on timeout.
wait_for_completion() {
  local job_id="$1"
  local subject="$2"
  local stream="$3"
  local timeout="${4:-60}"

  docker run --rm -i --network $STAGING_NETWORK \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e COMPLETED_SUBJECT="$subject" \
    -e COMPLETED_STREAM="$stream" \
    -e COMPLETION_TIMEOUT="$timeout" \
    $RUNNER_IMAGE bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/wait_completion.py && python /tmp/wait_completion.py" << 'PY_DONE'
import asyncio, os, json, time
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    subject = os.environ["COMPLETED_SUBJECT"]
    stream = os.environ["COMPLETED_STREAM"]
    timeout = int(os.environ.get("COMPLETION_TIMEOUT", "60"))
    deadline = time.time() + timeout

    # NOTE(review): durable=None requests an ephemeral pull consumer —
    # confirm the pinned nats-py version supports ephemeral pull_subscribe.
    sub = await js.pull_subscribe(subject, durable=None, stream=stream)
    found = False
    replayed = None
    replay_count = None
    completed_subject = subject
    while time.time() < deadline:
        try:
            batch = await sub.fetch(1, timeout=1)
        except Exception:
            # fetch raises when no message arrives within 1s; keep polling
            # until the deadline.
            continue
        if batch:
            msg = batch[0]
            data = json.loads(msg.data.decode())
            if data.get("job_id") == job_id:
                found = True
                completed_subject = msg.subject
                if msg.headers:
                    # Replay metadata, when present, arrives as headers.
                    replayed = msg.headers.get("replayed")
                    replay_count = msg.headers.get("replay_count")
            # Ack every pulled message, matching or not, so it is not
            # redelivered to this probe.
            await msg.ack()
            if found:
                break
    await nc.close()
    if not found:
        raise SystemExit(1)
    print(f"completed_job_id={job_id}")
    print(f"completed_subject={completed_subject}")
    print(f"replayed={replayed if replayed is not None else 'n/a'}")
    print(f"replay_count={replay_count if replay_count is not None else 'n/a'}")

asyncio.run(main())
PY_DONE
}
|
||
|
||
# Create the markdown report with its table header, exactly once.
# No-op when the report file already exists (rows accumulate across runs).
append_report_header() {
  if [ -f "$REPORT" ]; then
    return 0
  fi
  {
    echo '# Chaos Test Report'
    echo ''
    echo '| Test | Start/End (UTC) | Max Lag | DLQ Peak | p95 Latency | Unique Success | Notes |'
    echo '|---|---|---|---|---|---|---|'
  } > "$REPORT"
}
|
||
|
||
# Append one result row to the markdown report table.
# $1=test name  $2=start  $3=end  $4=max lag  $5=DLQ peak
# $6=p95 latency  $7=unique success  $8=free-form notes
append_report_row() {
  local test="$1" start="$2" end="$3"
  local max_lag="$4" dlq="$5" p95="$6" success="$7" notes="$8"
  printf '| %s | %s – %s | %s | %s | %s | %s | %s |\n' \
    "$test" "$start" "$end" "$max_lag" "$dlq" "$p95" "$success" "$notes" >> "$REPORT"
}
|
||
|
||
# Pull the headline metrics out of a burst log.
# $1 = logfile; prints "max_lag|dlq|p95|success" (a field is empty when the
# corresponding line is absent from the log).
extract_summary() {
  local logfile="$1"
  local max_lag dlq p95 success
  # Declare and assign separately: `local var=$(cmd)` masks the command's exit
  # status (ShellCheck SC2155). With the declaration split out, a no-match
  # grep would kill the script under `set -e`/`pipefail`, so `|| true` keeps
  # the original best-effort semantics (missing metric -> empty field).
  max_lag=$(grep -m1 "Max during test" "$logfile" | awk '{print $NF}') || true
  dlq=$(grep -m1 "DLQ depth" "$logfile" | awk '{print $NF}') || true
  p95=$(grep -m1 "p95:" "$logfile" | awk '{print $2}') || true
  success=$(grep -m1 "Success rate" "$logfile" | awk '{print $NF}' | tr -d ')%') || true
  echo "$max_lag|$dlq|$p95|$success"
}
|
||
|
||
# Write the report header once before any test rows are appended.
append_report_header

# --- Test A — Kill Worker: restart the crewai worker mid-burst ---------------
START_A=$(date -u +"%Y-%m-%d %H:%M:%S")
# Burst runs in the background so the fault can be injected while in flight.
(run_burst "A_kill_worker") &
PID=$!
sleep 5
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-crewai-worker"; then
  COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES restart crewai-worker
fi
# `|| true`: a failed burst must not abort the suite; metrics still get reported.
wait $PID || true
END_A=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_A=$(extract_summary "$LOG_DIR/${TS}_A_kill_worker.log")
IFS='|' read -r MAXLAG_A DLQ_A P95_A SUCCESS_A <<< "$SUM_A"
append_report_row "A – Kill Worker" "$START_A" "$END_A" "${MAXLAG_A:-n/a}" "${DLQ_A:-n/a}" "${P95_A:-n/a}" "${SUCCESS_A:-n/a}%" "restart crewai-worker"
|
||
|
||
# --- Test B — Kill Router: restart the router mid-burst ----------------------
START_B=$(date -u +"%Y-%m-%d %H:%M:%S")
# Burst runs in the background so the fault can be injected while in flight.
(run_burst "B_kill_router") &
PID=$!
sleep 5
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-router"; then
  COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES restart router
fi
# `|| true`: a failed burst must not abort the suite; metrics still get reported.
wait $PID || true
END_B=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_B=$(extract_summary "$LOG_DIR/${TS}_B_kill_router.log")
IFS='|' read -r MAXLAG_B DLQ_B P95_B SUCCESS_B <<< "$SUM_B"
append_report_row "B – Kill Router" "$START_B" "$END_B" "${MAXLAG_B:-n/a}" "${DLQ_B:-n/a}" "${P95_B:-n/a}" "${SUCCESS_B:-n/a}%" "restart router"
|
||
|
||
# --- Test C — Block Postgres: stop the database for 60s mid-burst ------------
START_C=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "C_block_postgres") &
PID=$!
sleep 5
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-postgres"; then
  # NOTE(review): the container match is "dagi-staging-postgres" but the
  # compose service used below is "dagi-postgres" — confirm the service name.
  COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES stop dagi-postgres
  sleep 60
  COMPOSE_PROJECT_NAME=$COMPOSE_PROJECT_NAME docker compose -p $COMPOSE_PROJECT_NAME $COMPOSE_FILES start dagi-postgres
fi
# `|| true`: a failed burst must not abort the suite; metrics still get reported.
wait $PID || true
END_C=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_C=$(extract_summary "$LOG_DIR/${TS}_C_block_postgres.log")
IFS='|' read -r MAXLAG_C DLQ_C P95_C SUCCESS_C <<< "$SUM_C"
append_report_row "C – Block Postgres" "$START_C" "$END_C" "${MAXLAG_C:-n/a}" "${DLQ_C:-n/a}" "${P95_C:-n/a}" "${SUCCESS_C:-n/a}%" "stop/start postgres 60s"
|
||
|
||
# --- Test D — DLQ Replay: setup / forcing phase ------------------------------
# With CHAOS_FORCE_DLQ=true and an empty DLQ, publish a forced-fail job and
# wait for it to land in the DLQ; the replay itself runs further below.
START_D=$(date -u +"%Y-%m-%d %H:%M:%S")
DLQ_LOG="$LOG_DIR/${TS}_D_dlq_replay.log"

# Defaults for every value the final report row interpolates, so the row is
# well-formed even on paths that skip parts of the test.
DLQ_NOTES="dlq_replay.py"
DLQ_STATUS="n/a"
DLQ_JOB_ID=""
DLQ_WAIT_S="0"
REPLAY_WAIT_S="0"
COMPLETION_WAIT_S="0"
COMPLETED_SUBJECT_ACTUAL="$COMPLETED_SUBJECT"
REPLAY_COUNT_VALUE="n/a"
REPLAYED_VALUE="n/a"

if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
  echo "D[DLQ]: forcing job_id=<pending> subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
  # `|| echo "0"` keeps a probe failure from aborting the suite under set -e.
  DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
  if [ "$DLQ_COUNT" = "0" ]; then
    # DLQ is empty: publish a job flagged force_fail so there is something to
    # replay, then poll for it (30 tries x 2s = ~60s).
    DLQ_JOB_ID="dlq-test-$(date +%s)"
    echo "D[DLQ]: forcing job_id=${DLQ_JOB_ID} subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
    publish_forced_fail "$DLQ_JOB_ID"

    DLQ_WAIT_START=$(date +%s)
    READY=false
    for _ in $(seq 1 30); do
      DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
      if [ "$DLQ_COUNT" != "0" ]; then
        READY=true
        break
      fi
      sleep 2
    done
    DLQ_WAIT_END=$(date +%s)
    DLQ_WAIT_S=$((DLQ_WAIT_END-DLQ_WAIT_START))

    if [ "$READY" != "true" ]; then
      # Forced job never reached the DLQ: record the failure and stop.
      echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting dlq"
      DLQ_NOTES="forced-fail timeout"
      DLQ_STATUS="FAIL"
      append_report_row "D – DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
      exit 1
    fi
  else
    echo "D[DLQ]: pending=${DLQ_COUNT}, running replay (dry-run->real)"
  fi
fi
|
||
|
||
# --- Test D (cont.) — replay DLQ messages and verify end-to-end completion ---

REPLAY_START=$(date +%s)
# First pass is a dry run (inventory only); the second pass republishes.
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM} --dry-run" | tee "$DLQ_LOG"

docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM}" | tee -a "$DLQ_LOG"
REPLAY_END=$(date +%s)
REPLAY_WAIT_S=$((REPLAY_END-REPLAY_START))

# If we did not force a job ourselves, recover the job id from the replay log.
# BUGFIX: `|| true` — under `set -euo pipefail` a no-match grep aborted the
# script here, so the explicit "no replayed_job_id" FAIL branch below was
# unreachable.
if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -z "$DLQ_JOB_ID" ]; then
  DLQ_JOB_ID=$(grep -m1 "replayed_job_id=" "$DLQ_LOG" | awk -F= '{print $2}' | tr -d '\r') || true
fi

if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -n "$DLQ_JOB_ID" ]; then
  COMPLETION_START=$(date +%s)
  # BUGFIX: reading $? on the line after the command substitution never saw a
  # failure — under `set -e` a non-zero wait_for_completion aborted the script
  # before COMPLETION_RC was assigned, making the FAIL branch unreachable.
  # `|| COMPLETION_RC=$?` both suppresses set -e and records the status.
  COMPLETION_RC=0
  COMPLETION_OUTPUT=$(wait_for_completion "$DLQ_JOB_ID" "$COMPLETED_SUBJECT_ACTUAL" "$AGENT_RUNS_STREAM" 90) || COMPLETION_RC=$?
  COMPLETION_END=$(date +%s)
  COMPLETION_WAIT_S=$((COMPLETION_END-COMPLETION_START))

  if [ "$COMPLETION_RC" -eq 0 ]; then
    DLQ_STATUS="completed"
    # Parse the key=value lines printed by wait_for_completion.
    COMPLETED_SUBJECT_ACTUAL=$(echo "$COMPLETION_OUTPUT" | awk -F= '/completed_subject=/{print $2}' | tail -1)
    REPLAYED_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replayed=/{print $2}' | tail -1)
    REPLAY_COUNT_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replay_count=/{print $2}' | tail -1)
    DLQ_NOTES="forced fail + replay, job_id=${DLQ_JOB_ID}, subject=${COMPLETED_SUBJECT_ACTUAL}, replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: completed job_id=${DLQ_JOB_ID} completed_subject=${COMPLETED_SUBJECT_ACTUAL} replayed=${REPLAYED_VALUE} replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
  else
    DLQ_STATUS="FAIL"
    DLQ_NOTES="replay done, completion timeout, job_id=${DLQ_JOB_ID}"
    echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting completion"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
    append_report_row "D – DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "1→0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
else
  if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
    echo "D[DLQ]: FAIL job_id=<none> reason=no replayed_job_id in log"
    DLQ_STATUS="FAIL"
    DLQ_NOTES="no replayed_job_id found"
    append_report_row "D – DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "see log" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
fi

END_D=$(date -u +"%Y-%m-%d %H:%M:%S")
append_report_row "D – DLQ Replay" "$START_D" "$END_D" "n/a" "see log" "n/a" "${DLQ_STATUS}" "${DLQ_NOTES}"

echo "✅ Chaos suite complete. Logs: $LOG_DIR"
echo "✅ Report updated: $REPORT"
|