snapshot: NODE1 production state 2026-02-09

Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.

Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles

Excluded from snapshot: venv/, .env, data/, backups, .tgz archives

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Apple
2026-02-09 08:46:46 -08:00
parent 134c044c21
commit ef3473db21
9473 changed files with 408933 additions and 2769877 deletions

scripts/chaos/run_chaos_suite.sh — new executable file, 390 lines.

@@ -0,0 +1,390 @@
#!/bin/bash
# Chaos test suite for the dagi staging stack.
#
# Runs four destructive tests (kill worker, kill router, stop postgres,
# DLQ replay) against a docker-compose staging deployment and appends the
# results to docs/CHAOS_TEST_REPORT.md.
#
# Required env to run at all:
#   ENV=staging|dev  ALLOW_DESTRUCTIVE_TESTS=true  CONFIRM_CHAOS=YES
# Optional tuning: MESSAGES, DUPLICATES, BURST_TIME, AGENT_ID,
#   CHAOS_FORCE_DLQ, NATS_USER, NATS_PASS, COMPOSE_PROJECT_NAME.
set -euo pipefail

# Safety guard: refuse to run on prod by default.
: "${ENV:=}"
: "${ALLOW_DESTRUCTIVE_TESTS:=false}"
: "${CONFIRM_CHAOS:=}"
: "${CHAOS_FORCE_DLQ:=false}"
if [ "$ENV" != "staging" ] && [ "$ENV" != "dev" ]; then
  echo "ERROR: ENV must be staging/dev. Current ENV='$ENV'. Refusing to run." >&2
  exit 1
fi
if [ "$ALLOW_DESTRUCTIVE_TESTS" != "true" ] || [ "$CONFIRM_CHAOS" != "YES" ]; then
  echo "ERROR: Set ALLOW_DESTRUCTIVE_TESTS=true and CONFIRM_CHAOS=YES to proceed." >&2
  exit 1
fi

ROOT="/opt/microdao-daarion"
COMPOSE_PROJECT_NAME=${COMPOSE_PROJECT_NAME:-dagi-staging}
LOG_DIR="$ROOT/logs/chaos"
# Kept as a single string ("-f <path>"); call sites expand it unquoted on purpose.
COMPOSE_FILES="-f $ROOT/docker-compose.staging.yml"
STAGING_NETWORK="${COMPOSE_PROJECT_NAME}_dagi-staging-network"
REPORT="$ROOT/docs/CHAOS_TEST_REPORT.md"
TS=$(date -u +"%Y%m%dT%H%M%SZ")
# Create output locations up front so later `tee`/appends never fail on a
# missing directory (previously LOG_DIR was assumed to exist).
mkdir -p "$LOG_DIR" "$(dirname "$REPORT")"

# Chaos parameters (env-overridable).
MESSAGES=${MESSAGES:-50}
DUPLICATES=${DUPLICATES:-10}
BURST_TIME=${BURST_TIME:-5}
AGENT_ID=${AGENT_ID:-helion}
DLQ_SUBJECT=${DLQ_SUBJECT:-agent.run.failed.dlq}
DLQ_STREAM=${DLQ_STREAM:-AUDIT}
AGENT_RUNS_STREAM=${AGENT_RUNS_STREAM:-AGENT_RUNS}
COMPLETED_SUBJECT=${COMPLETED_SUBJECT:-agent.run.completed.${AGENT_ID}}

# NATS/Redis endpoints for the throwaway runner container.
NATS_HOST="dagi-staging-nats"
RUNNER_IMAGE="python:3.11-slim"
RUNNER_CMD_PREFIX="pip -q install nats-py httpx redis"

# Resolve container IPs (DNS inside the runner can be flaky).
# NOTE(review): if a container sits on several networks this template
# concatenates the IPs — assumes one network per container; confirm.
NATS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-nats 2>/dev/null || true)
ROUTER_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-router 2>/dev/null || true)
REDIS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-redis 2>/dev/null || true)
# Fall back to container names when inspect produced nothing.
: "${NATS_IP:=dagi-staging-nats}"
: "${ROUTER_IP:=dagi-staging-router}"
: "${REDIS_IP:=dagi-staging-redis}"

# SECURITY: the default credentials are a staging placeholder baked into the
# script; override via NATS_USER/NATS_PASS env rather than editing the file.
NATS_USER=${NATS_USER:-router}
NATS_PASS=${NATS_PASS:-router-secret-change-me}
export NATS_URL="nats://${NATS_USER}:${NATS_PASS}@${NATS_IP}:4222"
export NATS_MONITOR_URL="http://${NATS_IP}:8222"
export ROUTER_URL="http://${ROUTER_IP}:9102"
export REDIS_URL="redis://${REDIS_IP}:6379"
# Run the burst-load generator inside a throwaway container on the staging
# network and tee its output to a per-test log file.
# Arguments: $1 - label used in the log filename.
# Globals read: LOG_DIR, TS, MESSAGES, DUPLICATES, BURST_TIME,
#   STAGING_NETWORK, RUNNER_IMAGE, RUNNER_CMD_PREFIX, NATS_URL,
#   NATS_MONITOR_URL, ROUTER_URL, REDIS_URL, ROOT.
run_burst() {
  local label="$1"
  local logfile="$LOG_DIR/${TS}_${label}.log"
  echo "[${label}] Starting burst: messages=${MESSAGES}, duplicates=${DUPLICATES}, burst_time=${BURST_TIME}s"
  # Under pipefail a failing burst propagates; callers run this in a
  # background subshell and `wait … || true`.
  docker run --rm --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e NATS_MONITOR_URL="$NATS_MONITOR_URL" \
    -e ROUTER_URL="$ROUTER_URL" \
    -e REDIS_URL="$REDIS_URL" \
    -v "$ROOT/scripts:/scripts" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/load/burst_100.py --messages ${MESSAGES} --duplicates ${DUPLICATES} --burst-time ${BURST_TIME}" | tee "$logfile"
}
# Print the number of pending messages on a DLQ subject by creating (and
# then deleting) a short-lived JetStream consumer inside a runner container.
# Arguments: $1 - subject filter, $2 - stream name.
# Output: a single integer on stdout — callers compare it against "0", so
#   pip's install output is redirected away from stdout to keep it clean.
dlq_pending() {
  local subject="$1"
  local stream="$2"
  local durable="dlq_check_$(date +%s%N)"  # unique throwaway consumer name
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e DLQ_SUBJECT="$subject" \
    -e DLQ_STREAM="$stream" \
    -e DLQ_DURABLE="$durable" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX >/dev/null && cat > /tmp/dlq_check.py && python /tmp/dlq_check.py" << 'PY_DLQ'
import asyncio, os
from nats.aio.client import Client as NATS
from nats.js.api import ConsumerConfig, AckPolicy, DeliverPolicy

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    stream = os.environ["DLQ_STREAM"]
    subject = os.environ["DLQ_SUBJECT"]
    durable = os.environ["DLQ_DURABLE"]
    # Throwaway durable consumer, created only to read num_pending.
    cfg = ConsumerConfig(
        durable_name=durable,
        filter_subject=subject,
        ack_policy=AckPolicy.EXPLICIT,
        deliver_policy=DeliverPolicy.ALL,
    )
    try:
        await js.add_consumer(stream, cfg)
        info = await js.consumer_info(stream, durable)
        print(info.num_pending)
    finally:
        try:
            await js.delete_consumer(stream, durable)
        except Exception:
            pass  # best-effort cleanup of the throwaway consumer
        await nc.close()

asyncio.run(main())
PY_DLQ
}
# Publish a job to agent.run.requested with force_fail=True so the worker
# fails it and (per pipeline design) routes it to the DLQ.
# Arguments: $1 - job id embedded in the payload (task_id/job_id/trace_id).
publish_forced_fail() {
  local job_id="$1"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e AGENT_ID="$AGENT_ID" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/force_fail.py && python /tmp/force_fail.py" << 'PY_FAIL'
import asyncio, os, json
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    payload = {
        "task_id": job_id,
        "job_id": job_id,
        "workflow_type": "test",
        "agent_id": os.environ.get("AGENT_ID", "helion"),
        "trace_id": f"trace-{job_id}",
        "user_id": "tg:test_user",
        "test_mode": True,
        "force_fail": True,
        "payload": {"prompt": "DLQ forced fail", "test": True},
    }
    await js.publish("agent.run.requested", json.dumps(payload).encode())
    await nc.close()

asyncio.run(main())
print("published", os.environ["JOB_ID"])
PY_FAIL
}
# Block until a completion event for the given job id appears on a stream
# subject, or the timeout elapses.
# Arguments: $1 job id, $2 completed subject, $3 stream, $4 timeout secs (60).
# Output: key=value lines (completed_job_id, completed_subject, replayed,
#   replay_count) on stdout. Returns non-zero on timeout.
wait_for_completion() {
  local job_id="$1"
  local subject="$2"
  local stream="$3"
  local timeout="${4:-60}"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e COMPLETED_SUBJECT="$subject" \
    -e COMPLETED_STREAM="$stream" \
    -e COMPLETION_TIMEOUT="$timeout" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/wait_completion.py && python /tmp/wait_completion.py" << 'PY_DONE'
import asyncio, os, json, time
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    subject = os.environ["COMPLETED_SUBJECT"]
    stream = os.environ["COMPLETED_STREAM"]
    timeout = int(os.environ.get("COMPLETION_TIMEOUT", "60"))
    deadline = time.time() + timeout
    # Ephemeral pull consumer over the completed-events subject.
    sub = await js.pull_subscribe(subject, durable=None, stream=stream)
    found = False
    replayed = None
    replay_count = None
    completed_subject = subject
    while time.time() < deadline:
        try:
            batch = await sub.fetch(1, timeout=1)
        except Exception:
            continue  # fetch timeout — keep polling until the deadline
        if batch:
            msg = batch[0]
            data = json.loads(msg.data.decode())
            # Fix: ack EVERY fetched message, matching or not. Leaving
            # non-matching messages unacked caused them to be redelivered
            # to this consumer after the ack wait, which could starve the
            # real completion event within the deadline.
            await msg.ack()
            if data.get("job_id") == job_id:
                found = True
                completed_subject = msg.subject
                if msg.headers:
                    replayed = msg.headers.get("replayed")
                    replay_count = msg.headers.get("replay_count")
        if found:
            break
    await nc.close()
    if not found:
        raise SystemExit(1)
    print(f"completed_job_id={job_id}")
    print(f"completed_subject={completed_subject}")
    print(f"replayed={replayed if replayed is not None else 'n/a'}")
    print(f"replay_count={replay_count if replay_count is not None else 'n/a'}")

asyncio.run(main())
PY_DONE
}
# Initialise the markdown report with its table header.
# Does nothing when the report file already exists (append-safe, and safe
# under `set -e` because the existence check uses a plain `if`).
append_report_header() {
  if [ -f "$REPORT" ]; then
    return 0
  fi
  {
    printf '%s\n' '# Chaos Test Report'
    printf '%s\n' '| Test | Start/End (UTC) | Max Lag | DLQ Peak | p95 Latency | Unique Success | Notes |'
    printf '%s\n' '|---|---|---|---|---|---|---|'
  } > "$REPORT"
}
# Append one result row to the markdown report table.
# Arguments: $1 test name, $2 start ts, $3 end ts, $4 max lag, $5 dlq peak,
#   $6 p95 latency, $7 success, $8 free-form notes.
append_report_row() {
  local name="$1" begin="$2" finish="$3"
  local lag="$4" dlq_peak="$5" latency="$6" ok="$7" remarks="$8"
  printf '| %s | %s %s | %s | %s | %s | %s | %s |\n' \
    "$name" "$begin" "$finish" "$lag" "$dlq_peak" "$latency" "$ok" "$remarks" >> "$REPORT"
}
# Parse a burst log and emit "max_lag|dlq|p95|success" on stdout.
# Each field comes from the first matching line; a missing marker yields an
# empty field (callers default empties to "n/a").
extract_summary() {
  local src="$1"
  local lag dlq latency ok
  lag=$(awk '/Max during test/ { print $NF; exit }' "$src")
  dlq=$(awk '/DLQ depth/ { print $NF; exit }' "$src")
  latency=$(awk '/p95:/ { print $2; exit }' "$src")
  ok=$(awk '/Success rate/ { print $NF; exit }' "$src" | tr -d ')%')
  printf '%s|%s|%s|%s\n' "$lag" "$dlq" "$latency" "$ok"
}
append_report_header

# Test A — Kill Worker: restart the CrewAI worker mid-burst and verify the
# burst still completes; results are summarised into the report.
START_A=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "A_kill_worker") &
PID=$!
sleep 5   # let the burst get in flight before injecting the fault
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-crewai-worker"; then
  # COMPOSE_FILES intentionally unquoted: it holds "-f <path>" as two words.
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES restart crewai-worker
fi
wait "$PID" || true   # burst may fail while the worker is down; report captures it
END_A=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_A=$(extract_summary "$LOG_DIR/${TS}_A_kill_worker.log")
IFS='|' read -r MAXLAG_A DLQ_A P95_A SUCCESS_A <<< "$SUM_A"
append_report_row "A Kill Worker" "$START_A" "$END_A" "${MAXLAG_A:-n/a}" "${DLQ_A:-n/a}" "${P95_A:-n/a}" "${SUCCESS_A:-n/a}%" "restart crewai-worker"
# Test B — Kill Router: restart the router mid-burst and verify recovery.
START_B=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "B_kill_router") &
PID=$!
sleep 5   # let the burst get in flight before injecting the fault
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-router"; then
  # COMPOSE_FILES intentionally unquoted: it holds "-f <path>" as two words.
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES restart router
fi
wait "$PID" || true   # burst may fail while the router restarts; report captures it
END_B=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_B=$(extract_summary "$LOG_DIR/${TS}_B_kill_router.log")
IFS='|' read -r MAXLAG_B DLQ_B P95_B SUCCESS_B <<< "$SUM_B"
append_report_row "B Kill Router" "$START_B" "$END_B" "${MAXLAG_B:-n/a}" "${DLQ_B:-n/a}" "${P95_B:-n/a}" "${SUCCESS_B:-n/a}%" "restart router"
# Test C — Block Postgres: stop the database for 60s mid-burst, then restart
# it and verify the pipeline drains.
START_C=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "C_block_postgres") &
PID=$!
sleep 5   # let the burst get in flight before injecting the fault
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-postgres"; then
  # NOTE(review): the container is named "dagi-staging-postgres" but the
  # compose service addressed here is "dagi-postgres" — verify the service
  # name matches docker-compose.staging.yml.
  # COMPOSE_FILES intentionally unquoted: it holds "-f <path>" as two words.
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES stop dagi-postgres
  sleep 60
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES start dagi-postgres
fi
wait "$PID" || true   # burst may fail while postgres is down; report captures it
END_C=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_C=$(extract_summary "$LOG_DIR/${TS}_C_block_postgres.log")
IFS='|' read -r MAXLAG_C DLQ_C P95_C SUCCESS_C <<< "$SUM_C"
append_report_row "C Block Postgres" "$START_C" "$END_C" "${MAXLAG_C:-n/a}" "${DLQ_C:-n/a}" "${P95_C:-n/a}" "${SUCCESS_C:-n/a}%" "stop/start postgres 60s"
# Test D — DLQ Replay: optionally force a failing job into the DLQ
# (CHAOS_FORCE_DLQ=true), replay the DLQ (dry-run first, then for real),
# and confirm the replayed job reaches a completed event.
START_D=$(date -u +"%Y-%m-%d %H:%M:%S")
DLQ_LOG="$LOG_DIR/${TS}_D_dlq_replay.log"
DLQ_NOTES="dlq_replay.py"
DLQ_STATUS="n/a"
DLQ_JOB_ID=""
DLQ_WAIT_S="0"
REPLAY_WAIT_S="0"
COMPLETION_WAIT_S="0"
COMPLETED_SUBJECT_ACTUAL="$COMPLETED_SUBJECT"
REPLAY_COUNT_VALUE="n/a"
REPLAYED_VALUE="n/a"

if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
  echo "D[DLQ]: forcing job_id=<pending> subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
  DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
  if [ "$DLQ_COUNT" = "0" ]; then
    # DLQ empty: publish a guaranteed failure and wait for it to land.
    DLQ_JOB_ID="dlq-test-$(date +%s)"
    echo "D[DLQ]: forcing job_id=${DLQ_JOB_ID} subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
    publish_forced_fail "$DLQ_JOB_ID"
    DLQ_WAIT_START=$(date +%s)
    READY=false
    # Poll for up to ~60s (30 x 2s) for the forced failure to hit the DLQ.
    for _ in $(seq 1 30); do
      DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
      if [ "$DLQ_COUNT" != "0" ]; then
        READY=true
        break
      fi
      sleep 2
    done
    DLQ_WAIT_END=$(date +%s)
    DLQ_WAIT_S=$((DLQ_WAIT_END-DLQ_WAIT_START))
    if [ "$READY" != "true" ]; then
      echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting dlq"
      DLQ_NOTES="forced-fail timeout"
      DLQ_STATUS="FAIL"
      append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
      exit 1
    fi
  else
    echo "D[DLQ]: pending=${DLQ_COUNT}, running replay (dry-run->real)"
  fi
fi

# Replay: dry-run first for visibility, then the real pass. Both are logged.
REPLAY_START=$(date +%s)
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM} --dry-run" | tee "$DLQ_LOG"
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM}" | tee -a "$DLQ_LOG"
REPLAY_END=$(date +%s)
REPLAY_WAIT_S=$((REPLAY_END-REPLAY_START))

if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -z "$DLQ_JOB_ID" ]; then
  # Fix: `|| true` — with `set -euo pipefail`, grep finding no marker made
  # the pipeline fail and killed the whole script here; an empty result is
  # handled by the "no replayed_job_id" branch below instead.
  DLQ_JOB_ID=$(grep -m1 "replayed_job_id=" "$DLQ_LOG" | awk -F= '{print $2}' | tr -d '\r' || true)
fi

if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -n "$DLQ_JOB_ID" ]; then
  COMPLETION_START=$(date +%s)
  # Fix: capture the exit code explicitly. With `set -e`, a bare failing
  # command substitution (`VAR=$(cmd)`) aborts the script before the old
  # `COMPLETION_RC=$?` line could run, making the FAIL branch unreachable.
  COMPLETION_RC=0
  COMPLETION_OUTPUT=$(wait_for_completion "$DLQ_JOB_ID" "$COMPLETED_SUBJECT_ACTUAL" "$AGENT_RUNS_STREAM" 90) || COMPLETION_RC=$?
  COMPLETION_END=$(date +%s)
  COMPLETION_WAIT_S=$((COMPLETION_END-COMPLETION_START))
  if [ "$COMPLETION_RC" -eq 0 ]; then
    DLQ_STATUS="completed"
    COMPLETED_SUBJECT_ACTUAL=$(echo "$COMPLETION_OUTPUT" | awk -F= '/completed_subject=/{print $2}' | tail -1)
    REPLAYED_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replayed=/{print $2}' | tail -1)
    REPLAY_COUNT_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replay_count=/{print $2}' | tail -1)
    DLQ_NOTES="forced fail + replay, job_id=${DLQ_JOB_ID}, subject=${COMPLETED_SUBJECT_ACTUAL}, replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: completed job_id=${DLQ_JOB_ID} completed_subject=${COMPLETED_SUBJECT_ACTUAL} replayed=${REPLAYED_VALUE} replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
  else
    DLQ_STATUS="FAIL"
    DLQ_NOTES="replay done, completion timeout, job_id=${DLQ_JOB_ID}"
    echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting completion"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
    append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "1→0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
else
  if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
    echo "D[DLQ]: FAIL job_id=<none> reason=no replayed_job_id in log"
    DLQ_STATUS="FAIL"
    DLQ_NOTES="no replayed_job_id found"
    append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "see log" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
fi

END_D=$(date -u +"%Y-%m-%d %H:%M:%S")
append_report_row "D DLQ Replay" "$START_D" "$END_D" "n/a" "see log" "n/a" "${DLQ_STATUS}" "${DLQ_NOTES}"
echo "✅ Chaos suite complete. Logs: $LOG_DIR"
echo "✅ Report updated: $REPORT"