Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
135 lines
6.6 KiB
YAML
135 lines
6.6 KiB
YAML
# Matrix Bridge DAGI — Phase M2.1 (multi-room + mixed routing)
#
# Include into the main NODA1 stack or run standalone:
#
#   docker compose -f docker-compose.node1.yml -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi

# Kept for older Compose tooling; newer Compose releases treat the
# top-level `version` key as informational only.
version: "3.9"

services:
  # Matrix bridge for DAGI on node NODA1.
  # Built from ./services/matrix-bridge-dagi, published on the host loopback
  # interface only (port 7030), and attached to the external `dagi-network`.
  matrix-bridge-dagi:
    build:
      context: ./services/matrix-bridge-dagi
      args:
        BUILD_SHA: "${BUILD_SHA:-dev}"
        BUILD_TIME: "${BUILD_TIME:-local}"
    container_name: matrix-bridge-dagi-node1
    ports:
      - "127.0.0.1:7030:7030"  # internal only — not exposed publicly
    environment:
      - PORT=7030
      - ENV=prod
      - NODE_ID=NODA1
      - BUILD_SHA=${BUILD_SHA:-dev}
      - BUILD_TIME=${BUILD_TIME:-local}

      # ── Matrix homeserver ────────────────────────────────────────────────
      # Required: set in .env on NODA1 before first launch
      # (the empty `:-` defaults below do not supply usable values; an unset
      # variable is passed to the container as an empty string)
      - MATRIX_HOMESERVER_URL=${MATRIX_HOMESERVER_URL:-}
      - MATRIX_ACCESS_TOKEN=${MATRIX_ACCESS_TOKEN:-}
      - MATRIX_USER_ID=${MATRIX_USER_ID:-}

      # ── Room → Agent mapping (M1: single room for Sofiia) ────────────────
      # Create the room manually, then paste the room_id here
      - SOFIIA_ROOM_ID=${SOFIIA_ROOM_ID:-}

      # ── DAGI backend — Router for /v1/agents/{id}/infer ─────────────────
      # Router internal port 8000 on dagi-network (ext port 9102 on host)
      - DAGI_GATEWAY_URL=http://dagi-router-node1:8000
      - DEFAULT_NODE_ID=NODA1

      # ── Sofiia Console (audit write) ─────────────────────────────────────
      - SOFIIA_CONSOLE_URL=http://dagi-sofiia-console-node1:8002
      - SOFIIA_INTERNAL_TOKEN=${SOFIIA_INTERNAL_TOKEN:-}

      # ── H2: Backpressure queue ───────────────────────────────────────────
      - QUEUE_MAX_EVENTS=100
      - WORKER_CONCURRENCY=2
      - QUEUE_DRAIN_TIMEOUT_S=5

      # ── Policy ───────────────────────────────────────────────────────────
      # M2.0+: multiple agents separated by comma
      - BRIDGE_ALLOWED_AGENTS=${BRIDGE_ALLOWED_AGENTS:-sofiia}
      # M2.0: "sofiia:!room1:server,helion:!room2:server" (1 room → 1 agent)
      - BRIDGE_ROOM_MAP=${BRIDGE_ROOM_MAP:-}
      - RATE_LIMIT_ROOM_RPM=20
      - RATE_LIMIT_SENDER_RPM=10

      # ── M2.1: Mixed rooms (1 room → N agents) ───────────────────────────
      # Format: "!roomX:server=sofiia,helion;!roomY:server=druid"
      - BRIDGE_MIXED_ROOM_MAP=${BRIDGE_MIXED_ROOM_MAP:-}
      # Override default agent per mixed room (optional):
      # "!roomX:server=helion;!roomY:server=druid"
      - BRIDGE_MIXED_DEFAULTS=${BRIDGE_MIXED_DEFAULTS:-}

      # ── M3.0: Operator control channel ──────────────────────────────────
      # Comma-separated Matrix user IDs allowed to issue !commands
      - BRIDGE_OPERATOR_ALLOWLIST=${BRIDGE_OPERATOR_ALLOWLIST:-}
      # Comma-separated room IDs designated as ops control channels
      - BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-}
      # "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts)
      - CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore}
      # ── M3.1: Runbook runner token ───────────────────────────────────────
      # X-Control-Token for POST /api/runbooks/internal/runs (sofiia-console)
      - SOFIIA_CONTROL_TOKEN=${SOFIIA_CONTROL_TOKEN:-}
      # M3.4: Control channel safety — rate limiting + cooldown
      - CONTROL_ROOM_RPM=${CONTROL_ROOM_RPM:-60}
      - CONTROL_OPERATOR_RPM=${CONTROL_OPERATOR_RPM:-30}
      - CONTROL_RUN_NEXT_RPM=${CONTROL_RUN_NEXT_RPM:-20}
      - CONTROL_COOLDOWN_S=${CONTROL_COOLDOWN_S:-2.0}
      # M2.3: Persistent event deduplication
      # NOTE(review): this file mounts no volume at BRIDGE_DATA_DIR, so data
      # written there lives in the container's writable layer and is lost when
      # the container is recreated — confirm whether a volume is supplied by
      # another compose file or override.
      - PERSISTENT_DEDUPE=${PERSISTENT_DEDUPE:-1}
      - BRIDGE_DATA_DIR=${BRIDGE_DATA_DIR:-/app/data}
      - PROCESSED_EVENTS_TTL_H=${PROCESSED_EVENTS_TTL_H:-48}
      - PROCESSED_EVENTS_PRUNE_BATCH=${PROCESSED_EVENTS_PRUNE_BATCH:-5000}
      - PROCESSED_EVENTS_PRUNE_INTERVAL_S=${PROCESSED_EVENTS_PRUNE_INTERVAL_S:-3600}
      # M4.0: agent discovery
      - DISCOVERY_RPM=${DISCOVERY_RPM:-20}
      # M5.0: node-aware routing
      - BRIDGE_ALLOWED_NODES=${BRIDGE_ALLOWED_NODES:-NODA1}
      - BRIDGE_DEFAULT_NODE=${BRIDGE_DEFAULT_NODE:-NODA1}
      - BRIDGE_ROOM_NODE_MAP=${BRIDGE_ROOM_NODE_MAP:-}
      # M8.0: Node health + soft-failover thresholds
      - NODE_FAIL_CONSEC=${NODE_FAIL_CONSEC:-3}
      - NODE_LAT_EWMA_S=${NODE_LAT_EWMA_S:-12.0}
      - NODE_EWMA_ALPHA=${NODE_EWMA_ALPHA:-0.3}
      # M8.1: Sticky failover TTL (0 = disabled)
      - FAILOVER_STICKY_TTL_S=${FAILOVER_STICKY_TTL_S:-300}
      # M8.2: HA state persistence
      - HA_HEALTH_SNAPSHOT_INTERVAL_S=${HA_HEALTH_SNAPSHOT_INTERVAL_S:-60}
      - HA_HEALTH_MAX_AGE_S=${HA_HEALTH_MAX_AGE_S:-600}
      # M9.0: Two-step confirmation TTL for dangerous commands (0 = disabled)
      - CONFIRM_TTL_S=${CONFIRM_TTL_S:-120}
      # Policy export retention / history limits
      # (presumably the M10 backup + policy-history settings — TODO confirm)
      - POLICY_EXPORT_RETENTION_DAYS=${POLICY_EXPORT_RETENTION_DAYS:-30}
      - POLICY_HISTORY_LIMIT=${POLICY_HISTORY_LIMIT:-100}
      # M11 soak: NEVER set to true in production
      - DEBUG_INJECT_ENABLED=${DEBUG_INJECT_ENABLED:-false}

      # ── M2.2: Mixed room guard rails ────────────────────────────────────
      # Fail-fast if any room defines more agents than this
      - MAX_AGENTS_PER_MIXED_ROOM=${MAX_AGENTS_PER_MIXED_ROOM:-5}
      # Reject slash commands longer than this (anti-garbage / injection guard)
      - MAX_SLASH_LEN=${MAX_SLASH_LEN:-32}
      # What to do when unknown /slash is used: "ignore" (silent) | "reply_error" (inform user)
      - UNKNOWN_AGENT_BEHAVIOR=${UNKNOWN_AGENT_BEHAVIOR:-ignore}
      # Max concurrent Router invocations per (room, agent) pair; 0 = unlimited
      - MIXED_CONCURRENCY_CAP=${MIXED_CONCURRENCY_CAP:-1}

    # Liveness probe: exec-form command (no shell) that requests /health on
    # the service port from inside the container; urlopen raises on a
    # connection failure or HTTP error status, which makes the probe exit
    # non-zero.
    healthcheck:
      test:
        - "CMD"
        - "python3"
        - "-c"
        - "import urllib.request; urllib.request.urlopen('http://localhost:7030/health', timeout=5)"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s

    networks:
      - dagi-network

    restart: unless-stopped

# `dagi-network` is declared external: Compose does not create it and expects
# it to exist already (e.g. created by the main NODA1 stack).
networks:
  dagi-network:
    external: true