Files
microdao-daarion/docker-compose.matrix-bridge-node1.yml
Apple 82d5ff2a4f feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)
Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
2026-03-05 07:51:37 -08:00

135 lines
6.6 KiB
YAML

# Matrix Bridge DAGI — NODA1 service definition.
# Introduced in Phase M2.1 (multi-room + mixed routing); the environment below
# now also carries settings tagged M3 through M11.
# Include into the main NODA1 stack or run standalone:
# docker compose -f docker-compose.node1.yml -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi
# NOTE(review): the Compose Specification treats the top-level `version` key as
# informational only, and recent Docker Compose releases warn that it is
# obsolete — confirm whether it is still needed for the tooling used on NODA1.
version: "3.9"
services:
  # Matrix bridge service for node NODA1. Listens on port 7030 (published on
  # the host loopback interface only) and joins the pre-existing `dagi-network`
  # to reach the router and the Sofiia console by container name.
  matrix-bridge-dagi:
    build:
      context: ./services/matrix-bridge-dagi
      args:
        # Build metadata; falls back to dev/local when not supplied.
        BUILD_SHA: "${BUILD_SHA:-dev}"
        BUILD_TIME: "${BUILD_TIME:-local}"
    container_name: matrix-bridge-dagi-node1
    ports:
      - "127.0.0.1:7030:7030"  # internal only — not exposed publicly
    environment:
      - PORT=7030
      - ENV=prod
      - NODE_ID=NODA1
      - BUILD_SHA=${BUILD_SHA:-dev}
      - BUILD_TIME=${BUILD_TIME:-local}
      # ── Matrix homeserver ────────────────────────────────────────────────
      # Required: set in .env on NODA1 before first launch
      # (each of these defaults to an empty string when unset)
      - MATRIX_HOMESERVER_URL=${MATRIX_HOMESERVER_URL:-}
      - MATRIX_ACCESS_TOKEN=${MATRIX_ACCESS_TOKEN:-}
      - MATRIX_USER_ID=${MATRIX_USER_ID:-}
      # ── Room → Agent mapping (M1: single room for Sofiia) ────────────────
      # Create the room manually, then paste the room_id here
      - SOFIIA_ROOM_ID=${SOFIIA_ROOM_ID:-}
      # ── DAGI backend — Router for /v1/agents/{id}/infer ─────────────────
      # Router internal port 8000 on dagi-network (ext port 9102 on host)
      - DAGI_GATEWAY_URL=http://dagi-router-node1:8000
      - DEFAULT_NODE_ID=NODA1
      # ── Sofiia Console (audit write) ─────────────────────────────────────
      - SOFIIA_CONSOLE_URL=http://dagi-sofiia-console-node1:8002
      - SOFIIA_INTERNAL_TOKEN=${SOFIIA_INTERNAL_TOKEN:-}
      # ── H2: Backpressure queue ───────────────────────────────────────────
      - QUEUE_MAX_EVENTS=100
      - WORKER_CONCURRENCY=2
      - QUEUE_DRAIN_TIMEOUT_S=5
      # ── Policy ───────────────────────────────────────────────────────────
      # M2.0+: multiple agents separated by comma
      - BRIDGE_ALLOWED_AGENTS=${BRIDGE_ALLOWED_AGENTS:-sofiia}
      # M2.0: "sofiia:!room1:server,helion:!room2:server" (1 room → 1 agent)
      - BRIDGE_ROOM_MAP=${BRIDGE_ROOM_MAP:-}
      - RATE_LIMIT_ROOM_RPM=20
      - RATE_LIMIT_SENDER_RPM=10
      # ── M2.1: Mixed rooms (1 room → N agents) ───────────────────────────
      # Format: "!roomX:server=sofiia,helion;!roomY:server=druid"
      - BRIDGE_MIXED_ROOM_MAP=${BRIDGE_MIXED_ROOM_MAP:-}
      # Override default agent per mixed room (optional):
      # "!roomX:server=helion;!roomY:server=druid"
      - BRIDGE_MIXED_DEFAULTS=${BRIDGE_MIXED_DEFAULTS:-}
      # ── M3.0: Operator control channel ──────────────────────────────────
      # Comma-separated Matrix user IDs allowed to issue !commands
      - BRIDGE_OPERATOR_ALLOWLIST=${BRIDGE_OPERATOR_ALLOWLIST:-}
      # Comma-separated room IDs designated as ops control channels
      - BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-}
      # "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts)
      - CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore}
      # ── M3.1: Runbook runner token ───────────────────────────────────────
      # X-Control-Token for POST /api/runbooks/internal/runs (sofiia-console)
      - SOFIIA_CONTROL_TOKEN=${SOFIIA_CONTROL_TOKEN:-}
      # M3.4: Control channel safety — rate limiting + cooldown
      - CONTROL_ROOM_RPM=${CONTROL_ROOM_RPM:-60}
      - CONTROL_OPERATOR_RPM=${CONTROL_OPERATOR_RPM:-30}
      - CONTROL_RUN_NEXT_RPM=${CONTROL_RUN_NEXT_RPM:-20}
      - CONTROL_COOLDOWN_S=${CONTROL_COOLDOWN_S:-2.0}
      # M2.3: Persistent event deduplication
      # NOTE(review): this file declares no `volumes:` entry for the service,
      # so data written under BRIDGE_DATA_DIR is only as durable as the
      # container's own filesystem unless the image or another compose file
      # mounts it — confirm it survives container re-creation.
      - PERSISTENT_DEDUPE=${PERSISTENT_DEDUPE:-1}
      - BRIDGE_DATA_DIR=${BRIDGE_DATA_DIR:-/app/data}
      - PROCESSED_EVENTS_TTL_H=${PROCESSED_EVENTS_TTL_H:-48}
      - PROCESSED_EVENTS_PRUNE_BATCH=${PROCESSED_EVENTS_PRUNE_BATCH:-5000}
      - PROCESSED_EVENTS_PRUNE_INTERVAL_S=${PROCESSED_EVENTS_PRUNE_INTERVAL_S:-3600}
      # M4.0: agent discovery
      - DISCOVERY_RPM=${DISCOVERY_RPM:-20}
      # M5.0: node-aware routing
      - BRIDGE_ALLOWED_NODES=${BRIDGE_ALLOWED_NODES:-NODA1}
      - BRIDGE_DEFAULT_NODE=${BRIDGE_DEFAULT_NODE:-NODA1}
      - BRIDGE_ROOM_NODE_MAP=${BRIDGE_ROOM_NODE_MAP:-}
      # M8.0: Node health + soft-failover thresholds
      - NODE_FAIL_CONSEC=${NODE_FAIL_CONSEC:-3}
      - NODE_LAT_EWMA_S=${NODE_LAT_EWMA_S:-12.0}
      - NODE_EWMA_ALPHA=${NODE_EWMA_ALPHA:-0.3}
      # M8.1: Sticky failover TTL (0 = disabled)
      - FAILOVER_STICKY_TTL_S=${FAILOVER_STICKY_TTL_S:-300}
      # M8.2: HA state persistence
      - HA_HEALTH_SNAPSHOT_INTERVAL_S=${HA_HEALTH_SNAPSHOT_INTERVAL_S:-60}
      - HA_HEALTH_MAX_AGE_S=${HA_HEALTH_MAX_AGE_S:-600}
      # M9.0: Two-step confirmation TTL for dangerous commands (0 = disabled)
      - CONFIRM_TTL_S=${CONFIRM_TTL_S:-120}
      # M10: policy export retention + policy history size
      # (presumably days to keep exports / max history entries — confirm
      # exact semantics against the bridge's config loader)
      - POLICY_EXPORT_RETENTION_DAYS=${POLICY_EXPORT_RETENTION_DAYS:-30}
      - POLICY_HISTORY_LIMIT=${POLICY_HISTORY_LIMIT:-100}
      # M11 soak: NEVER set to true in production
      # (this service runs with ENV=prod; the value is still overridable
      # from the host environment / .env, so keep it unset there)
      - DEBUG_INJECT_ENABLED=${DEBUG_INJECT_ENABLED:-false}
      # ── M2.2: Mixed room guard rails ────────────────────────────────────
      # Fail-fast if any room defines more agents than this
      - MAX_AGENTS_PER_MIXED_ROOM=${MAX_AGENTS_PER_MIXED_ROOM:-5}
      # Reject slash commands longer than this (anti-garbage / injection guard)
      - MAX_SLASH_LEN=${MAX_SLASH_LEN:-32}
      # What to do when unknown /slash is used: "ignore" (silent) | "reply_error" (inform user)
      - UNKNOWN_AGENT_BEHAVIOR=${UNKNOWN_AGENT_BEHAVIOR:-ignore}
      # Max concurrent Router invocations per (room, agent) pair; 0 = unlimited
      - MIXED_CONCURRENCY_CAP=${MIXED_CONCURRENCY_CAP:-1}
    # Health probe: fetches /health on the service port from inside the
    # container. urlopen raises on connection failure or an HTTP error status,
    # which makes python3 exit non-zero and marks the check as failed.
    healthcheck:
      test:
        - "CMD"
        - "python3"
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:7030/health', timeout=5)"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s
    networks:
      - dagi-network
    restart: unless-stopped
# Top-level network declarations.
networks:
  # `external: true` means Compose will not create this network; it must
  # already exist (e.g. created by the main NODA1 stack) before `up` is run.
  dagi-network:
    external: true