feat(matrix-bridge-dagi): M4–M11 + soak infrastructure (debug inject endpoint)

Includes all milestones M4 through M11:
- M4: agent discovery (!agents / !status)
- M5: node-aware routing + per-node observability
- M6: dynamic policy store (node/agent overrides, import/export)
- M7: Prometheus alerts + Grafana dashboard + metrics contract
- M8: node health tracker + soft failover + sticky cache + HA persistence
- M9: two-step confirm + diff preview for dangerous commands
- M10: auto-backup, restore, retention, policy history + change detail
- M11: soak scenarios (CI tests) + live soak script

Soak infrastructure (this commit):
- POST /v1/debug/inject_event (disabled by default; enabled only when DEBUG_INJECT_ENABLED is set to 1/true/yes)
- _preflight_inject() and _check_wal() in soak script
- --db-path arg for WAL delta reporting
- Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands

Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 07:51:37 -08:00
parent fe6e3d30ae
commit 82d5ff2a4f
21 changed files with 9123 additions and 93 deletions

View File

@@ -1,5 +1,5 @@
"""
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0)
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0 + M3.1)
"""
import os
from dataclasses import dataclass, field
@@ -54,6 +54,54 @@ class BridgeConfig:
# "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt)
control_unauthorized_behavior: str
# M3.1: Runbook runner — sofiia-console control token
sofiia_control_token: str # X-Control-Token for /api/runbooks/internal/runs
# M3.4: Control channel safety — rate limiting + cooldown
control_room_rpm: int # Max commands per room per minute (0 = unlimited)
control_operator_rpm: int # Max commands per operator per minute
control_run_next_rpm: int # Max !runbook next calls per run_id per minute
control_cooldown_s: float # Anti-double-click debounce per (operator, verb, subcmd)
# M2.3: Persistent event deduplication
persistent_dedupe: bool # Enable SQLite-backed dedupe across restarts
bridge_data_dir: str # Directory for SQLite DB and other bridge data
processed_events_ttl_h: int # TTL for processed events (hours)
processed_events_prune_batch: int # Max rows to prune per prune run
processed_events_prune_interval_s: int # Prune interval in seconds (0 = disable periodic)
# M4.0: agent discovery
discovery_rpm: int # Max !agents replies per room per minute (0 = unlimited)
# M5.0: node-aware routing
bridge_allowed_nodes: str # Comma-separated: "NODA1,NODA2"
bridge_default_node: str # Default node when none specified
bridge_room_node_map: str # Optional: "!roomA:server=NODA2;!roomB:server=NODA1"
# M8.0: node health + soft-failover thresholds
node_fail_consecutive: int # Consecutive failures before node marked "down"
node_lat_ewma_s: float # EWMA latency threshold (seconds) → "degraded"
node_ewma_alpha: float # EWMA smoothing factor (0..1)
# M8.1: sticky failover cache
failover_sticky_ttl_s: float # Seconds to hold sticky node preference after failover (0 = disabled)
# M8.2: HA state persistence
ha_health_snapshot_interval_s: int # Seconds between node health writes to DB (0 = disabled)
ha_health_max_age_s: int # Max age of health snapshot to load on startup (seconds)
# M9.0: Two-step confirmation TTL
confirm_ttl_s: float # Seconds a pending !confirm nonce is valid (0 = disabled)
# M10.0: Policy export retention
policy_export_retention_days: int # Days to keep policy exports (0 = keep forever)
# M10.2: Policy change history
policy_history_limit: int # Max rows in policy_changes table (0 = unlimited)
# M11 soak: synthetic event injection (NEVER enable in production)
debug_inject_enabled: bool # POST /v1/debug/inject_event (default: False)
# Service identity
node_id: str
build_sha: str
@@ -99,6 +147,35 @@ def load_config() -> BridgeConfig:
bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""),
bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""),
control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"),
sofiia_control_token=_optional("SOFIIA_CONTROL_TOKEN", ""),
control_room_rpm=max(0, int(_optional("CONTROL_ROOM_RPM", "60"))),
control_operator_rpm=max(0, int(_optional("CONTROL_OPERATOR_RPM", "30"))),
control_run_next_rpm=max(0, int(_optional("CONTROL_RUN_NEXT_RPM", "20"))),
control_cooldown_s=max(0.0, float(_optional("CONTROL_COOLDOWN_S", "2.0"))),
persistent_dedupe=_optional("PERSISTENT_DEDUPE", "1").strip() not in ("0", "false", ""),
bridge_data_dir=_optional("BRIDGE_DATA_DIR", "/app/data"),
processed_events_ttl_h=max(1, int(_optional("PROCESSED_EVENTS_TTL_H", "48"))),
processed_events_prune_batch=max(1, int(_optional("PROCESSED_EVENTS_PRUNE_BATCH", "5000"))),
processed_events_prune_interval_s=max(0, int(_optional("PROCESSED_EVENTS_PRUNE_INTERVAL_S", "3600"))),
discovery_rpm=max(0, int(_optional("DISCOVERY_RPM", "20"))),
bridge_allowed_nodes=_optional("BRIDGE_ALLOWED_NODES", "NODA1"),
bridge_default_node=_optional("BRIDGE_DEFAULT_NODE", "NODA1"),
bridge_room_node_map=_optional("BRIDGE_ROOM_NODE_MAP", ""),
# M8.0: node health thresholds
node_fail_consecutive=max(1, int(_optional("NODE_FAIL_CONSEC", "3"))),
node_lat_ewma_s=max(0.5, float(_optional("NODE_LAT_EWMA_S", "12.0"))),
node_ewma_alpha=min(1.0, max(0.01, float(_optional("NODE_EWMA_ALPHA", "0.3")))),
# M8.1: sticky failover TTL (0 = disabled)
failover_sticky_ttl_s=max(0.0, float(_optional("FAILOVER_STICKY_TTL_S", "300.0"))),
# M8.2: HA state persistence
ha_health_snapshot_interval_s=max(0, int(_optional("HA_HEALTH_SNAPSHOT_INTERVAL_S", "60"))),
ha_health_max_age_s=max(0, int(_optional("HA_HEALTH_MAX_AGE_S", "600"))),
# M9.0: Two-step confirmation TTL (0 = disabled)
confirm_ttl_s=max(0.0, float(_optional("CONFIRM_TTL_S", "120.0"))),
policy_export_retention_days=max(0, int(_optional("POLICY_EXPORT_RETENTION_DAYS", "30"))),
policy_history_limit=max(0, int(_optional("POLICY_HISTORY_LIMIT", "100"))),
debug_inject_enabled=_optional("DEBUG_INJECT_ENABLED", "false").lower()
in ("1", "true", "yes"),
node_id=_optional("NODE_ID", "NODA1"),
build_sha=_optional("BUILD_SHA", "dev"),
build_time=_optional("BUILD_TIME", "local"),