snapshot: NODE1 production state 2026-02-09

Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.

Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles

Excluded from snapshot: venv/, .env, data/, backups, .tgz archives

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Apple
2026-02-09 08:46:46 -08:00
parent 134c044c21
commit ef3473db21
9473 changed files with 408933 additions and 2769877 deletions

scripts/chaos/run_chaos_suite.sh — new executable file, 390 lines.

@@ -0,0 +1,390 @@
#!/bin/bash
# Chaos test suite for the dagi staging stack.
#
# Runs four destructive tests (kill worker, kill router, stop postgres,
# DLQ replay) against a docker-compose staging deployment and appends the
# results to docs/CHAOS_TEST_REPORT.md.
#
# Required env to run at all:
#   ENV=staging|dev  ALLOW_DESTRUCTIVE_TESTS=true  CONFIRM_CHAOS=YES
# Optional tuning: MESSAGES, DUPLICATES, BURST_TIME, AGENT_ID,
#   CHAOS_FORCE_DLQ, NATS_USER, NATS_PASS, COMPOSE_PROJECT_NAME.
set -euo pipefail

# Safety guard: refuse to run on prod by default.
: "${ENV:=}"
: "${ALLOW_DESTRUCTIVE_TESTS:=false}"
: "${CONFIRM_CHAOS:=}"
: "${CHAOS_FORCE_DLQ:=false}"
if [ "$ENV" != "staging" ] && [ "$ENV" != "dev" ]; then
  echo "ERROR: ENV must be staging/dev. Current ENV='$ENV'. Refusing to run." >&2
  exit 1
fi
if [ "$ALLOW_DESTRUCTIVE_TESTS" != "true" ] || [ "$CONFIRM_CHAOS" != "YES" ]; then
  echo "ERROR: Set ALLOW_DESTRUCTIVE_TESTS=true and CONFIRM_CHAOS=YES to proceed." >&2
  exit 1
fi

ROOT="/opt/microdao-daarion"
COMPOSE_PROJECT_NAME=${COMPOSE_PROJECT_NAME:-dagi-staging}
LOG_DIR="$ROOT/logs/chaos"
# Kept as a single string ("-f <path>"); call sites expand it unquoted on purpose.
COMPOSE_FILES="-f $ROOT/docker-compose.staging.yml"
STAGING_NETWORK="${COMPOSE_PROJECT_NAME}_dagi-staging-network"
REPORT="$ROOT/docs/CHAOS_TEST_REPORT.md"
TS=$(date -u +"%Y%m%dT%H%M%SZ")
# Create output locations up front so later `tee`/appends never fail on a
# missing directory (previously LOG_DIR was assumed to exist).
mkdir -p "$LOG_DIR" "$(dirname "$REPORT")"

# Chaos parameters (env-overridable).
MESSAGES=${MESSAGES:-50}
DUPLICATES=${DUPLICATES:-10}
BURST_TIME=${BURST_TIME:-5}
AGENT_ID=${AGENT_ID:-helion}
DLQ_SUBJECT=${DLQ_SUBJECT:-agent.run.failed.dlq}
DLQ_STREAM=${DLQ_STREAM:-AUDIT}
AGENT_RUNS_STREAM=${AGENT_RUNS_STREAM:-AGENT_RUNS}
COMPLETED_SUBJECT=${COMPLETED_SUBJECT:-agent.run.completed.${AGENT_ID}}

# NATS/Redis endpoints for the throwaway runner container.
NATS_HOST="dagi-staging-nats"
RUNNER_IMAGE="python:3.11-slim"
RUNNER_CMD_PREFIX="pip -q install nats-py httpx redis"

# Resolve container IPs (DNS inside the runner can be flaky).
# NOTE(review): if a container sits on several networks this template
# concatenates the IPs — assumes one network per container; confirm.
NATS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-nats 2>/dev/null || true)
ROUTER_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-router 2>/dev/null || true)
REDIS_IP=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' dagi-staging-redis 2>/dev/null || true)
# Fall back to container names when inspect produced nothing.
: "${NATS_IP:=dagi-staging-nats}"
: "${ROUTER_IP:=dagi-staging-router}"
: "${REDIS_IP:=dagi-staging-redis}"

# SECURITY: the default credentials are a staging placeholder baked into the
# script; override via NATS_USER/NATS_PASS env rather than editing the file.
NATS_USER=${NATS_USER:-router}
NATS_PASS=${NATS_PASS:-router-secret-change-me}
export NATS_URL="nats://${NATS_USER}:${NATS_PASS}@${NATS_IP}:4222"
export NATS_MONITOR_URL="http://${NATS_IP}:8222"
export ROUTER_URL="http://${ROUTER_IP}:9102"
export REDIS_URL="redis://${REDIS_IP}:6379"
# Run the burst-load generator inside a throwaway container on the staging
# network and tee its output to a per-test log file.
# Arguments: $1 - label used in the log filename.
# Globals read: LOG_DIR, TS, MESSAGES, DUPLICATES, BURST_TIME,
#   STAGING_NETWORK, RUNNER_IMAGE, RUNNER_CMD_PREFIX, NATS_URL,
#   NATS_MONITOR_URL, ROUTER_URL, REDIS_URL, ROOT.
run_burst() {
  local label="$1"
  local logfile="$LOG_DIR/${TS}_${label}.log"
  echo "[${label}] Starting burst: messages=${MESSAGES}, duplicates=${DUPLICATES}, burst_time=${BURST_TIME}s"
  # Under pipefail a failing burst propagates; callers run this in a
  # background subshell and `wait … || true`.
  docker run --rm --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e NATS_MONITOR_URL="$NATS_MONITOR_URL" \
    -e ROUTER_URL="$ROUTER_URL" \
    -e REDIS_URL="$REDIS_URL" \
    -v "$ROOT/scripts:/scripts" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/load/burst_100.py --messages ${MESSAGES} --duplicates ${DUPLICATES} --burst-time ${BURST_TIME}" | tee "$logfile"
}
# Print the number of pending messages on a DLQ subject by creating (and
# then deleting) a short-lived JetStream consumer inside a runner container.
# Arguments: $1 - subject filter, $2 - stream name.
# Output: a single integer on stdout — callers compare it against "0", so
#   pip's install output is redirected away from stdout to keep it clean.
dlq_pending() {
  local subject="$1"
  local stream="$2"
  local durable="dlq_check_$(date +%s%N)"  # unique throwaway consumer name
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e DLQ_SUBJECT="$subject" \
    -e DLQ_STREAM="$stream" \
    -e DLQ_DURABLE="$durable" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX >/dev/null && cat > /tmp/dlq_check.py && python /tmp/dlq_check.py" << 'PY_DLQ'
import asyncio, os
from nats.aio.client import Client as NATS
from nats.js.api import ConsumerConfig, AckPolicy, DeliverPolicy

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    stream = os.environ["DLQ_STREAM"]
    subject = os.environ["DLQ_SUBJECT"]
    durable = os.environ["DLQ_DURABLE"]
    # Throwaway durable consumer, created only to read num_pending.
    cfg = ConsumerConfig(
        durable_name=durable,
        filter_subject=subject,
        ack_policy=AckPolicy.EXPLICIT,
        deliver_policy=DeliverPolicy.ALL,
    )
    try:
        await js.add_consumer(stream, cfg)
        info = await js.consumer_info(stream, durable)
        print(info.num_pending)
    finally:
        try:
            await js.delete_consumer(stream, durable)
        except Exception:
            pass  # best-effort cleanup of the throwaway consumer
        await nc.close()

asyncio.run(main())
PY_DLQ
}
# Publish a job to agent.run.requested with force_fail=True so the worker
# fails it and (per pipeline design) routes it to the DLQ.
# Arguments: $1 - job id embedded in the payload (task_id/job_id/trace_id).
publish_forced_fail() {
  local job_id="$1"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e AGENT_ID="$AGENT_ID" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/force_fail.py && python /tmp/force_fail.py" << 'PY_FAIL'
import asyncio, os, json
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    payload = {
        "task_id": job_id,
        "job_id": job_id,
        "workflow_type": "test",
        "agent_id": os.environ.get("AGENT_ID", "helion"),
        "trace_id": f"trace-{job_id}",
        "user_id": "tg:test_user",
        "test_mode": True,
        "force_fail": True,
        "payload": {"prompt": "DLQ forced fail", "test": True},
    }
    await js.publish("agent.run.requested", json.dumps(payload).encode())
    await nc.close()

asyncio.run(main())
print("published", os.environ["JOB_ID"])
PY_FAIL
}
# Block until a completion event for the given job id appears on a stream
# subject, or the timeout elapses.
# Arguments: $1 job id, $2 completed subject, $3 stream, $4 timeout secs (60).
# Output: key=value lines (completed_job_id, completed_subject, replayed,
#   replay_count) on stdout. Returns non-zero on timeout.
wait_for_completion() {
  local job_id="$1"
  local subject="$2"
  local stream="$3"
  local timeout="${4:-60}"
  docker run --rm -i --network "$STAGING_NETWORK" \
    -e NATS_URL="$NATS_URL" \
    -e JOB_ID="$job_id" \
    -e COMPLETED_SUBJECT="$subject" \
    -e COMPLETED_STREAM="$stream" \
    -e COMPLETION_TIMEOUT="$timeout" \
    "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && cat > /tmp/wait_completion.py && python /tmp/wait_completion.py" << 'PY_DONE'
import asyncio, os, json, time
from nats.aio.client import Client as NATS

async def main():
    nc = NATS()
    await nc.connect(servers=[os.environ["NATS_URL"]])
    js = nc.jetstream()
    job_id = os.environ["JOB_ID"]
    subject = os.environ["COMPLETED_SUBJECT"]
    stream = os.environ["COMPLETED_STREAM"]
    timeout = int(os.environ.get("COMPLETION_TIMEOUT", "60"))
    deadline = time.time() + timeout
    # Ephemeral pull consumer over the completed-events subject.
    sub = await js.pull_subscribe(subject, durable=None, stream=stream)
    found = False
    replayed = None
    replay_count = None
    completed_subject = subject
    while time.time() < deadline:
        try:
            batch = await sub.fetch(1, timeout=1)
        except Exception:
            continue  # fetch timeout — keep polling until the deadline
        if batch:
            msg = batch[0]
            data = json.loads(msg.data.decode())
            # Fix: ack EVERY fetched message, matching or not. Leaving
            # non-matching messages unacked caused them to be redelivered
            # to this consumer after the ack wait, which could starve the
            # real completion event within the deadline.
            await msg.ack()
            if data.get("job_id") == job_id:
                found = True
                completed_subject = msg.subject
                if msg.headers:
                    replayed = msg.headers.get("replayed")
                    replay_count = msg.headers.get("replay_count")
        if found:
            break
    await nc.close()
    if not found:
        raise SystemExit(1)
    print(f"completed_job_id={job_id}")
    print(f"completed_subject={completed_subject}")
    print(f"replayed={replayed if replayed is not None else 'n/a'}")
    print(f"replay_count={replay_count if replay_count is not None else 'n/a'}")

asyncio.run(main())
PY_DONE
}
# Initialise the markdown report with its table header.
# Does nothing when the report file already exists (append-safe, and safe
# under `set -e` because the existence check uses a plain `if`).
append_report_header() {
  if [ -f "$REPORT" ]; then
    return 0
  fi
  {
    printf '%s\n' '# Chaos Test Report'
    printf '%s\n' '| Test | Start/End (UTC) | Max Lag | DLQ Peak | p95 Latency | Unique Success | Notes |'
    printf '%s\n' '|---|---|---|---|---|---|---|'
  } > "$REPORT"
}
# Append one result row to the markdown report table.
# Arguments: $1 test name, $2 start ts, $3 end ts, $4 max lag, $5 dlq peak,
#   $6 p95 latency, $7 success, $8 free-form notes.
append_report_row() {
  local name="$1" begin="$2" finish="$3"
  local lag="$4" dlq_peak="$5" latency="$6" ok="$7" remarks="$8"
  printf '| %s | %s %s | %s | %s | %s | %s | %s |\n' \
    "$name" "$begin" "$finish" "$lag" "$dlq_peak" "$latency" "$ok" "$remarks" >> "$REPORT"
}
# Parse a burst log and emit "max_lag|dlq|p95|success" on stdout.
# Each field comes from the first matching line; a missing marker yields an
# empty field (callers default empties to "n/a").
extract_summary() {
  local src="$1"
  local lag dlq latency ok
  lag=$(awk '/Max during test/ { print $NF; exit }' "$src")
  dlq=$(awk '/DLQ depth/ { print $NF; exit }' "$src")
  latency=$(awk '/p95:/ { print $2; exit }' "$src")
  ok=$(awk '/Success rate/ { print $NF; exit }' "$src" | tr -d ')%')
  printf '%s|%s|%s|%s\n' "$lag" "$dlq" "$latency" "$ok"
}
append_report_header

# Test A — Kill Worker: restart the CrewAI worker mid-burst and verify the
# burst still completes; results are summarised into the report.
START_A=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "A_kill_worker") &
PID=$!
sleep 5   # let the burst get in flight before injecting the fault
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-crewai-worker"; then
  # COMPOSE_FILES intentionally unquoted: it holds "-f <path>" as two words.
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES restart crewai-worker
fi
wait "$PID" || true   # burst may fail while the worker is down; report captures it
END_A=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_A=$(extract_summary "$LOG_DIR/${TS}_A_kill_worker.log")
IFS='|' read -r MAXLAG_A DLQ_A P95_A SUCCESS_A <<< "$SUM_A"
append_report_row "A Kill Worker" "$START_A" "$END_A" "${MAXLAG_A:-n/a}" "${DLQ_A:-n/a}" "${P95_A:-n/a}" "${SUCCESS_A:-n/a}%" "restart crewai-worker"
# Test B — Kill Router: restart the router mid-burst and verify recovery.
START_B=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "B_kill_router") &
PID=$!
sleep 5   # let the burst get in flight before injecting the fault
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-router"; then
  # COMPOSE_FILES intentionally unquoted: it holds "-f <path>" as two words.
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES restart router
fi
wait "$PID" || true   # burst may fail while the router restarts; report captures it
END_B=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_B=$(extract_summary "$LOG_DIR/${TS}_B_kill_router.log")
IFS='|' read -r MAXLAG_B DLQ_B P95_B SUCCESS_B <<< "$SUM_B"
append_report_row "B Kill Router" "$START_B" "$END_B" "${MAXLAG_B:-n/a}" "${DLQ_B:-n/a}" "${P95_B:-n/a}" "${SUCCESS_B:-n/a}%" "restart router"
# Test C — Block Postgres: stop the database for 60s mid-burst, then restart
# it and verify the pipeline drains.
START_C=$(date -u +"%Y-%m-%d %H:%M:%S")
(run_burst "C_block_postgres") &
PID=$!
sleep 5   # let the burst get in flight before injecting the fault
if docker ps --format "{{.Names}}" | grep -q "dagi-staging-postgres"; then
  # NOTE(review): the container is named "dagi-staging-postgres" but the
  # compose service addressed here is "dagi-postgres" — verify the service
  # name matches docker-compose.staging.yml.
  # COMPOSE_FILES intentionally unquoted: it holds "-f <path>" as two words.
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES stop dagi-postgres
  sleep 60
  COMPOSE_PROJECT_NAME="$COMPOSE_PROJECT_NAME" docker compose -p "$COMPOSE_PROJECT_NAME" $COMPOSE_FILES start dagi-postgres
fi
wait "$PID" || true   # burst may fail while postgres is down; report captures it
END_C=$(date -u +"%Y-%m-%d %H:%M:%S")
SUM_C=$(extract_summary "$LOG_DIR/${TS}_C_block_postgres.log")
IFS='|' read -r MAXLAG_C DLQ_C P95_C SUCCESS_C <<< "$SUM_C"
append_report_row "C Block Postgres" "$START_C" "$END_C" "${MAXLAG_C:-n/a}" "${DLQ_C:-n/a}" "${P95_C:-n/a}" "${SUCCESS_C:-n/a}%" "stop/start postgres 60s"
# Test D — DLQ Replay: optionally force a failing job into the DLQ
# (CHAOS_FORCE_DLQ=true), replay the DLQ (dry-run first, then for real),
# and confirm the replayed job reaches a completed event.
START_D=$(date -u +"%Y-%m-%d %H:%M:%S")
DLQ_LOG="$LOG_DIR/${TS}_D_dlq_replay.log"
DLQ_NOTES="dlq_replay.py"
DLQ_STATUS="n/a"
DLQ_JOB_ID=""
DLQ_WAIT_S="0"
REPLAY_WAIT_S="0"
COMPLETION_WAIT_S="0"
COMPLETED_SUBJECT_ACTUAL="$COMPLETED_SUBJECT"
REPLAY_COUNT_VALUE="n/a"
REPLAYED_VALUE="n/a"

if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
  echo "D[DLQ]: forcing job_id=<pending> subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
  DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
  if [ "$DLQ_COUNT" = "0" ]; then
    # DLQ empty: publish a guaranteed failure and wait for it to land.
    DLQ_JOB_ID="dlq-test-$(date +%s)"
    echo "D[DLQ]: forcing job_id=${DLQ_JOB_ID} subject=${DLQ_SUBJECT} stream=${DLQ_STREAM}"
    publish_forced_fail "$DLQ_JOB_ID"
    DLQ_WAIT_START=$(date +%s)
    READY=false
    # Poll for up to ~60s (30 x 2s) for the forced failure to hit the DLQ.
    for _ in $(seq 1 30); do
      DLQ_COUNT=$(dlq_pending "$DLQ_SUBJECT" "$DLQ_STREAM" 2>/dev/null || echo "0")
      if [ "$DLQ_COUNT" != "0" ]; then
        READY=true
        break
      fi
      sleep 2
    done
    DLQ_WAIT_END=$(date +%s)
    DLQ_WAIT_S=$((DLQ_WAIT_END-DLQ_WAIT_START))
    if [ "$READY" != "true" ]; then
      echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting dlq"
      DLQ_NOTES="forced-fail timeout"
      DLQ_STATUS="FAIL"
      append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
      exit 1
    fi
  else
    echo "D[DLQ]: pending=${DLQ_COUNT}, running replay (dry-run->real)"
  fi
fi

# Replay: dry-run first for visibility, then the real pass. Both are logged.
REPLAY_START=$(date +%s)
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM} --dry-run" | tee "$DLQ_LOG"
docker run --rm --network "$STAGING_NETWORK" \
  -e NATS_URL="$NATS_URL" \
  -v "$ROOT/scripts:/scripts" \
  "$RUNNER_IMAGE" bash -lc "$RUNNER_CMD_PREFIX && python /scripts/dlq_replay.py --subject ${DLQ_SUBJECT} --stream ${DLQ_STREAM}" | tee -a "$DLQ_LOG"
REPLAY_END=$(date +%s)
REPLAY_WAIT_S=$((REPLAY_END-REPLAY_START))

if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -z "$DLQ_JOB_ID" ]; then
  # Fix: `|| true` — with `set -euo pipefail`, grep finding no marker made
  # the pipeline fail and killed the whole script here; an empty result is
  # handled by the "no replayed_job_id" branch below instead.
  DLQ_JOB_ID=$(grep -m1 "replayed_job_id=" "$DLQ_LOG" | awk -F= '{print $2}' | tr -d '\r' || true)
fi

if [ "$CHAOS_FORCE_DLQ" = "true" ] && [ -n "$DLQ_JOB_ID" ]; then
  COMPLETION_START=$(date +%s)
  # Fix: capture the exit code explicitly. With `set -e`, a bare failing
  # command substitution (`VAR=$(cmd)`) aborts the script before the old
  # `COMPLETION_RC=$?` line could run, making the FAIL branch unreachable.
  COMPLETION_RC=0
  COMPLETION_OUTPUT=$(wait_for_completion "$DLQ_JOB_ID" "$COMPLETED_SUBJECT_ACTUAL" "$AGENT_RUNS_STREAM" 90) || COMPLETION_RC=$?
  COMPLETION_END=$(date +%s)
  COMPLETION_WAIT_S=$((COMPLETION_END-COMPLETION_START))
  if [ "$COMPLETION_RC" -eq 0 ]; then
    DLQ_STATUS="completed"
    COMPLETED_SUBJECT_ACTUAL=$(echo "$COMPLETION_OUTPUT" | awk -F= '/completed_subject=/{print $2}' | tail -1)
    REPLAYED_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replayed=/{print $2}' | tail -1)
    REPLAY_COUNT_VALUE=$(echo "$COMPLETION_OUTPUT" | awk -F= '/replay_count=/{print $2}' | tail -1)
    DLQ_NOTES="forced fail + replay, job_id=${DLQ_JOB_ID}, subject=${COMPLETED_SUBJECT_ACTUAL}, replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: completed job_id=${DLQ_JOB_ID} completed_subject=${COMPLETED_SUBJECT_ACTUAL} replayed=${REPLAYED_VALUE} replay_count=${REPLAY_COUNT_VALUE}"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
  else
    DLQ_STATUS="FAIL"
    DLQ_NOTES="replay done, completion timeout, job_id=${DLQ_JOB_ID}"
    echo "D[DLQ]: FAIL job_id=${DLQ_JOB_ID} reason=timeout waiting completion"
    echo "D[DLQ]: timings dlq_wait_s=${DLQ_WAIT_S} replay_wait_s=${REPLAY_WAIT_S} completion_wait_s=${COMPLETION_WAIT_S}"
    append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "1→0" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
else
  if [ "$CHAOS_FORCE_DLQ" = "true" ]; then
    echo "D[DLQ]: FAIL job_id=<none> reason=no replayed_job_id in log"
    DLQ_STATUS="FAIL"
    DLQ_NOTES="no replayed_job_id found"
    append_report_row "D DLQ Replay" "$START_D" "$(date -u +"%Y-%m-%d %H:%M:%S")" "n/a" "see log" "n/a" "$DLQ_STATUS" "$DLQ_NOTES"
    exit 1
  fi
fi

END_D=$(date -u +"%Y-%m-%d %H:%M:%S")
append_report_row "D DLQ Replay" "$START_D" "$END_D" "n/a" "see log" "n/a" "${DLQ_STATUS}" "${DLQ_NOTES}"
echo "✅ Chaos suite complete. Logs: $LOG_DIR"
echo "✅ Report updated: $REPORT"