From 82d5ff2a4fb8a3d2986d9a6d6e77c36282a70760 Mon Sep 17 00:00:00 2001 From: Apple Date: Thu, 5 Mar 2026 07:51:37 -0800 Subject: [PATCH] =?UTF-8?q?feat(matrix-bridge-dagi):=20M4=E2=80=93M11=20+?= =?UTF-8?q?=20soak=20infrastructure=20(debug=20inject=20endpoint)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Includes all milestones M4 through M11: - M4: agent discovery (!agents / !status) - M5: node-aware routing + per-node observability - M6: dynamic policy store (node/agent overrides, import/export) - M7: Prometheus alerts + Grafana dashboard + metrics contract - M8: node health tracker + soft failover + sticky cache + HA persistence - M9: two-step confirm + diff preview for dangerous commands - M10: auto-backup, restore, retention, policy history + change detail - M11: soak scenarios (CI tests) + live soak script Soak infrastructure (this commit): - POST /v1/debug/inject_event (guarded by DEBUG_INJECT_ENABLED=false) - _preflight_inject() and _check_wal() in soak script - --db-path arg for WAL delta reporting - Runbook sections 2a/2b/2c: Step 0 and Step 1 exact commands Made-with: Cursor --- docker-compose.matrix-bridge-node1.yml | 35 + .../dashboards/matrix-bridge-dagi.json | 986 ++++++ .../alerts/matrix-bridge-dagi.rules.yml | 158 + ops/runbook-matrix-bridge-soak.md | 401 +++ ops/scripts/matrix_bridge_soak.py | 476 +++ services/matrix-bridge-dagi/app/config.py | 79 +- .../matrix-bridge-dagi/app/confirm_store.py | 167 + services/matrix-bridge-dagi/app/control.py | 946 +++++- .../matrix-bridge-dagi/app/control_limiter.py | 138 + .../matrix-bridge-dagi/app/control_runner.py | 296 ++ services/matrix-bridge-dagi/app/discovery.py | 210 ++ .../matrix-bridge-dagi/app/event_store.py | 213 ++ services/matrix-bridge-dagi/app/ingress.py | 2801 ++++++++++++++++- services/matrix-bridge-dagi/app/main.py | 466 ++- .../app/metrics_contract.py | 224 ++ .../matrix-bridge-dagi/app/mixed_routing.py | 22 + 
.../matrix-bridge-dagi/app/node_health.py | 262 ++ .../matrix-bridge-dagi/app/node_policy.py | 179 ++ .../matrix-bridge-dagi/app/policy_store.py | 1007 ++++++ .../matrix-bridge-dagi/app/sticky_cache.py | 149 + services/matrix-bridge-dagi/requirements.txt | 1 + 21 files changed, 9123 insertions(+), 93 deletions(-) create mode 100644 ops/grafana/dashboards/matrix-bridge-dagi.json create mode 100644 ops/prometheus/alerts/matrix-bridge-dagi.rules.yml create mode 100644 ops/runbook-matrix-bridge-soak.md create mode 100644 ops/scripts/matrix_bridge_soak.py create mode 100644 services/matrix-bridge-dagi/app/confirm_store.py create mode 100644 services/matrix-bridge-dagi/app/control_limiter.py create mode 100644 services/matrix-bridge-dagi/app/control_runner.py create mode 100644 services/matrix-bridge-dagi/app/discovery.py create mode 100644 services/matrix-bridge-dagi/app/event_store.py create mode 100644 services/matrix-bridge-dagi/app/metrics_contract.py create mode 100644 services/matrix-bridge-dagi/app/node_health.py create mode 100644 services/matrix-bridge-dagi/app/node_policy.py create mode 100644 services/matrix-bridge-dagi/app/policy_store.py create mode 100644 services/matrix-bridge-dagi/app/sticky_cache.py diff --git a/docker-compose.matrix-bridge-node1.yml b/docker-compose.matrix-bridge-node1.yml index 4cd401f3..d4eb13f2 100644 --- a/docker-compose.matrix-bridge-node1.yml +++ b/docker-compose.matrix-bridge-node1.yml @@ -67,6 +67,41 @@ services: - BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-} # "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts) - CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore} + # ── M3.1: Runbook runner token ─────────────────────────────────────── + # X-Control-Token for POST /api/runbooks/internal/runs (sofiia-console) + - SOFIIA_CONTROL_TOKEN=${SOFIIA_CONTROL_TOKEN:-} + # M3.4: Control channel safety — rate limiting + cooldown + - CONTROL_ROOM_RPM=${CONTROL_ROOM_RPM:-60} + - 
CONTROL_OPERATOR_RPM=${CONTROL_OPERATOR_RPM:-30} + - CONTROL_RUN_NEXT_RPM=${CONTROL_RUN_NEXT_RPM:-20} + - CONTROL_COOLDOWN_S=${CONTROL_COOLDOWN_S:-2.0} + # M2.3: Persistent event deduplication + - PERSISTENT_DEDUPE=${PERSISTENT_DEDUPE:-1} + - BRIDGE_DATA_DIR=${BRIDGE_DATA_DIR:-/app/data} + - PROCESSED_EVENTS_TTL_H=${PROCESSED_EVENTS_TTL_H:-48} + - PROCESSED_EVENTS_PRUNE_BATCH=${PROCESSED_EVENTS_PRUNE_BATCH:-5000} + - PROCESSED_EVENTS_PRUNE_INTERVAL_S=${PROCESSED_EVENTS_PRUNE_INTERVAL_S:-3600} + # M4.0: agent discovery + - DISCOVERY_RPM=${DISCOVERY_RPM:-20} + # M5.0: node-aware routing + - BRIDGE_ALLOWED_NODES=${BRIDGE_ALLOWED_NODES:-NODA1} + - BRIDGE_DEFAULT_NODE=${BRIDGE_DEFAULT_NODE:-NODA1} + - BRIDGE_ROOM_NODE_MAP=${BRIDGE_ROOM_NODE_MAP:-} + # M8.0: Node health + soft-failover thresholds + - NODE_FAIL_CONSEC=${NODE_FAIL_CONSEC:-3} + - NODE_LAT_EWMA_S=${NODE_LAT_EWMA_S:-12.0} + - NODE_EWMA_ALPHA=${NODE_EWMA_ALPHA:-0.3} + # M8.1: Sticky failover TTL (0 = disabled) + - FAILOVER_STICKY_TTL_S=${FAILOVER_STICKY_TTL_S:-300} + # M8.2: HA state persistence + - HA_HEALTH_SNAPSHOT_INTERVAL_S=${HA_HEALTH_SNAPSHOT_INTERVAL_S:-60} + - HA_HEALTH_MAX_AGE_S=${HA_HEALTH_MAX_AGE_S:-600} + # M9.0: Two-step confirmation TTL for dangerous commands (0 = disabled) + - CONFIRM_TTL_S=${CONFIRM_TTL_S:-120} + - POLICY_EXPORT_RETENTION_DAYS=${POLICY_EXPORT_RETENTION_DAYS:-30} + - POLICY_HISTORY_LIMIT=${POLICY_HISTORY_LIMIT:-100} + # M11 soak: NEVER set to true in production + - DEBUG_INJECT_ENABLED=${DEBUG_INJECT_ENABLED:-false} # ── M2.2: Mixed room guard rails ──────────────────────────────────── # Fail-fast if any room defines more agents than this diff --git a/ops/grafana/dashboards/matrix-bridge-dagi.json b/ops/grafana/dashboards/matrix-bridge-dagi.json new file mode 100644 index 00000000..d406ff72 --- /dev/null +++ b/ops/grafana/dashboards/matrix-bridge-dagi.json @@ -0,0 +1,986 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": 
"datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "doc", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "Runbook", + "tooltip": "matrix-bridge-dagi-ops.md", + "type": "link", + "url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md" + } + ], + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Bridge Up", + "gridPos": { + "x": 0, + "y": 0, + "w": 4, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum(matrix_bridge_up)", + "legendFormat": "up (all nodes)", + "refId": "A", + "instant": true + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "background", + "graphMode": "none", + "textMode": "auto", + "orientation": "auto" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "type": "value", + 
"options": { + "0": { + "text": "DOWN", + "color": "red" + }, + "1": { + "text": "UP", + "color": "green" + } + } + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "color": { + "mode": "thresholds" + } + }, + "overrides": [] + } + }, + { + "id": 2, + "type": "stat", + "title": "Queue Size", + "gridPos": { + "x": 4, + "y": 0, + "w": 4, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "matrix_bridge_queue_size", + "legendFormat": "queue", + "refId": "A", + "instant": true + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "background", + "graphMode": "area", + "textMode": "auto" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "color": { + "mode": "thresholds" + }, + "unit": "short" + }, + "overrides": [] + } + }, + { + "id": 3, + "type": "stat", + "title": "Active Rate-Limiter Rooms", + "gridPos": { + "x": 8, + "y": 0, + "w": 4, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "matrix_bridge_rate_limiter_active_rooms", + "legendFormat": "rooms", + "refId": "A", + "instant": true + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "value", + "graphMode": "none" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 4, + "type": "stat", + "title": "Active Room-Agent Locks", + "gridPos": { + "x": 12, + "y": 0, + "w": 4, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": 
"matrix_bridge_active_room_agent_locks", + "legendFormat": "locks", + "refId": "A", + "instant": true + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "value", + "graphMode": "none" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 5, + "type": "stat", + "title": "Drops (5m)", + "gridPos": { + "x": 16, + "y": 0, + "w": 4, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))", + "legendFormat": "dropped", + "refId": "A", + "instant": true + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "background", + "graphMode": "none" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "color": { + "mode": "thresholds" + }, + "unit": "short" + }, + "overrides": [] + } + }, + { + "id": 6, + "type": "stat", + "title": "Errors (5m)", + "gridPos": { + "x": 20, + "y": 0, + "w": 4, + "h": 4 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))", + "legendFormat": "errors", + "refId": "A", + "instant": true + } + ], + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + }, + "colorMode": "background", + "graphMode": "none" + }, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "color": { + "mode": "thresholds" + }, + "unit": "short" + }, + "overrides": [] + } + }, + { + "id": 10, + "type": "timeseries", + "title": "Traffic: Received & Replied 
(rate/5m)", + "gridPos": { + "x": 0, + "y": 4, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum(rate(matrix_bridge_messages_received_total[5m]))", + "legendFormat": "received", + "refId": "A" + }, + { + "expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))", + "legendFormat": "replied ok", + "refId": "B" + }, + { + "expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))", + "legendFormat": "replied error", + "refId": "C" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "drawStyle": "line", + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "replied error" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "red" + } + } + ] + } + ] + } + }, + { + "id": 11, + "type": "timeseries", + "title": "Errors / Drops / Rate-Limited (rate/5m)", + "gridPos": { + "x": 12, + "y": 4, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))", + "legendFormat": "gw_error: {{ error_type }}", + "refId": "A" + }, + { + "expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))", + "legendFormat": "queue_dropped", + "refId": "B" + }, + { + "expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))", + "legendFormat": "rate_limited", + "refId": "C" + }, + { + "expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))", + "legendFormat": "route_rejected: {{ reason }}", + "refId": "D" + } + ], + "options": { + "tooltip": { + "mode": "multi", + 
"sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2, + "fillOpacity": 15, + "drawStyle": "line", + "stacking": { + "mode": "none" + }, + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 20, + "type": "timeseries", + "title": "Invoke Latency P50 / P95 by Node", + "gridPos": { + "x": 0, + "y": 12, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))", + "legendFormat": "p50 {{ node_id }}", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))", + "legendFormat": "p95 {{ node_id }}", + "refId": "B" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max", + "last" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2, + "fillOpacity": 5, + "drawStyle": "line", + "spanNulls": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 20 + } + ] + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 21, + "type": "timeseries", + "title": "Queue Wait P50 / P95", + "gridPos": { + "x": 12, + "y": 12, + "w": 12, + "h": 8 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))", + "legendFormat": "wait p50 {{ agent_id }}", + 
"refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))", + "legendFormat": "wait p95 {{ agent_id }}", + "refId": "B" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { + "lineWidth": 2, + "fillOpacity": 5, + "drawStyle": "line", + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 30, + "type": "timeseries", + "title": "Node Routing: Routed & Rejected by Node (rate/5m)", + "gridPos": { + "x": 0, + "y": 20, + "w": 12, + "h": 7 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))", + "legendFormat": "routed {{ node_id }}", + "refId": "A" + }, + { + "expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))", + "legendFormat": "rejected {{ node_id }}", + "refId": "B" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "drawStyle": "line", + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 31, + "type": "timeseries", + "title": "Persistent Dedupe Hits / Inserts (rate/10m)", + "gridPos": { + "x": 12, + "y": 20, + "w": 12, + "h": 7 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))", + "legendFormat": "dedupe_hits", + "refId": "A" + }, + { + "expr": 
"rate(matrix_bridge_dedupe_persistent_inserts_total[10m])", + "legendFormat": "dedupe_inserts", + "refId": "B" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "drawStyle": "line", + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 40, + "type": "timeseries", + "title": "Control Commands (rate/5m)", + "gridPos": { + "x": 0, + "y": 27, + "w": 12, + "h": 7 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))", + "legendFormat": "cmd {{ verb }}", + "refId": "A" + }, + { + "expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))", + "legendFormat": "ctrl_ratelimited {{ scope }}", + "refId": "B" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "drawStyle": "line", + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 41, + "type": "timeseries", + "title": "Traffic by Agent (received rate/5m)", + "gridPos": { + "x": 12, + "y": 27, + "w": 24, + "h": 7 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))", + "legendFormat": "{{ agent_id }}", + "refId": "A" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + 
"calcs": [ + "mean", + "max", + "last" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "drawStyle": "line", + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + }, + { + "id": 42, + "type": "timeseries", + "title": "Routing Reasons by Agent (rate/5m)", + "description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown", + "gridPos": { + "x": 0, + "y": 34, + "w": 24, + "h": 7 + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "targets": [ + { + "expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))", + "legendFormat": "{{ agent_id }} / {{ reason }}", + "refId": "A" + } + ], + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": [ + "mean", + "max" + ] + } + }, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { + "lineWidth": 2, + "fillOpacity": 10, + "drawStyle": "line", + "spanNulls": false + }, + "color": { + "mode": "palette-classic" + } + }, + "overrides": [] + } + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": [ + "matrix-bridge", + "dagi", + "daarion" + ], + "templating": { + "list": [ + { + "current": {}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "UTC", + "title": "Matrix Bridge DAGI", + "uid": "matrix-bridge-dagi-v1", + "version": 1 +} \ No newline at end of file diff --git a/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml b/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml new file mode 100644 index 00000000..cd7c9d4b --- /dev/null +++ 
b/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml @@ -0,0 +1,158 @@ +--- +# Prometheus alert rules — Matrix Bridge DAGI +# Phase M7.1 (metrics contract hardening) +# +# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py +# Runbook: docs/runbook/matrix-bridge-dagi-ops.md +# +# Usage: +# promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml +# docker run --rm -v $PWD:/w --entrypoint promtool prom/prometheus:latest \ +# check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml + +groups: + - name: matrix_bridge_dagi + interval: 30s + rules: + + # ── A1: Bridge process down ───────────────────────────────────────────── + # metric: matrix_bridge_up{node_id} (Gauge, M7.1: labeled per node) + - alert: BridgeDown + expr: sum(matrix_bridge_up) == 0 + for: 1m + labels: + severity: critical + team: platform + service: matrix-bridge-dagi + annotations: + summary: "Matrix Bridge DAGI is down" + description: > + `matrix_bridge_up` == 0 across all nodes — bridge process has not + started or has crashed. No messages are being processed. + runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown" + + # ── A2: Matrix sync errors spike ──────────────────────────────────────── + # metric: matrix_bridge_gateway_errors_total{error_type} (Counter) + - alert: MatrixSyncErrors + expr: > + increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3 + for: 2m + labels: + severity: warning + team: platform + service: matrix-bridge-dagi + annotations: + summary: "Matrix sync errors elevated" + description: > + More than 3 Matrix `/sync` errors (error_type=sync_error) in the last + 5 minutes. May indicate Matrix homeserver problems or network issues.
+ runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors" + + # ── A3: Gateway (Router) invoke errors spike ───────────────────────────── + # metric: matrix_bridge_messages_replied_total{status} (Counter) + - alert: GatewayInvokeErrors + expr: > + increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5 + for: 2m + labels: + severity: warning + team: platform + service: matrix-bridge-dagi + annotations: + summary: "Router invoke errors elevated (node={{ $labels.node_id }})" + description: > + More than 5 agent invocation errors (status=error) in the last 5 minutes. + Check Router/DeepSeek connectivity and logs. + runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors" + + # ── A4: Queue drops ───────────────────────────────────────────────────── + # metric: matrix_bridge_queue_dropped_total{room_id, agent_id} (Counter) + - alert: QueueDropsHigh + expr: > + rate(matrix_bridge_queue_dropped_total[5m]) > 0 + for: 1m + labels: + severity: warning + team: platform + service: matrix-bridge-dagi + annotations: + summary: "Bridge queue is dropping messages" + description: > + `matrix_bridge_queue_dropped_total` is increasing — work queue is full + and incoming messages are being dropped. Increase + `BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`. 
+ runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops" + + # ── A5: User-level rate limiting spike ────────────────────────────────── + # metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type} (Counter) + - alert: RateLimitedSpike + expr: > + rate(matrix_bridge_rate_limited_total[5m]) > 2 + for: 3m + labels: + severity: warning + team: platform + service: matrix-bridge-dagi + annotations: + summary: "User rate limiting spike" + description: > + More than 2 messages/second are being rate-limited over 3 minutes. + May indicate a flood attack, misbehaving client, or limits too low. + runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike" + + # ── A6: Control channel rate limiting spike ────────────────────────────── + # metric: matrix_bridge_control_rate_limited_total{scope} (Counter) + - alert: ControlRateLimitedSpike + expr: > + rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5 + for: 3m + labels: + severity: warning + team: platform + service: matrix-bridge-dagi + annotations: + summary: "Control channel rate limiting elevated" + description: > + More than 0.5 control commands/second rejected by rate limiter over + 3 minutes. May indicate operator tooling issues or abuse attempt. 
+ runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike" + + # ── A7: Persistent dedupe hit storm (resend loop) ──────────────────────── + # metric: matrix_bridge_dedupe_persistent_hits_total{room_id} (Counter) + - alert: DedupeHitStorm + expr: > + rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5 + for: 5m + labels: + severity: warning + team: platform + service: matrix-bridge-dagi + annotations: + summary: "Persistent deduplication hit rate elevated" + description: > + High rate of persistent dedupe hits — may indicate a Matrix resend + storm or a client repeatedly retrying the same event_id. + runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm" + + # ── A8: Invoke latency P95 high (per node) ─────────────────────────────── + # metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id} (Histogram) + - alert: InvokeLatencyP95High + expr: > + histogram_quantile( + 0.95, + sum by (node_id, le) ( + rate(matrix_bridge_invoke_duration_seconds_bucket[5m]) + ) + ) > 15 + for: 5m + labels: + severity: warning + team: platform + service: matrix-bridge-dagi + annotations: + summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})" + description: > + 95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds + 15 seconds over the last 5 minutes. Check Router load, DeepSeek API, + Ollama/Swapper queue. 
+ runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high" diff --git a/ops/runbook-matrix-bridge-soak.md b/ops/runbook-matrix-bridge-soak.md new file mode 100644 index 00000000..ecf8ffa2 --- /dev/null +++ b/ops/runbook-matrix-bridge-soak.md @@ -0,0 +1,401 @@ +# matrix-bridge-dagi — Soak & Failure Rehearsal Runbook (M11) + +**Phase:** M11 +**Applies to:** `matrix-bridge-dagi` service on NODA1 +**When to run:** Before any production traffic increase, after major code changes, or on a recurring monthly basis. + +--- + +## 1. Goals + +| Goal | Measurable pass criterion | +|------|--------------------------| +| Latency under load | p95 invoke < 5 000 ms | +| Queue stability | drop rate < 1% | +| Failover correctness | failover fires on NODA1 outage; NODA2 serves all remaining messages | +| Sticky anti-flap | sticky set after first failover; no re-tries to degraded node | +| Restart recovery | sticky + health snapshot reloads within 10 s of restart | +| Policy operations safe under load | `!policy history` / `!policy change` work while messages in-flight | + +--- + +## 2. Prerequisites + +```bash +# On NODA1 or local machine with network access to bridge +pip install httpx + +# Verify bridge is up +curl -s http://localhost:9400/health | jq '.ok' +# Expected: true + +# Verify /metrics endpoint +curl -s http://localhost:9400/metrics | grep matrix_bridge_up +# Expected: matrix_bridge_up{...} 1 +``` + +--- + +## 2a. Enabling the Soak Inject Endpoint + +The soak script uses `POST /v1/debug/inject_event` which is **disabled by default**. 
+Enable it only on staging/NODA1 soak runs: + +```bash +# On NODA1 — edit docker-compose override or pass env inline: +# Option 1: temporary inline restart +DEBUG_INJECT_ENABLED=true docker-compose \ + -f docker-compose.matrix-bridge-node1.yml \ + up -d --no-deps matrix-bridge-dagi + +# Option 2: .env file override +echo "DEBUG_INJECT_ENABLED=true" >> .env.soak +docker-compose --env-file .env.soak \ + -f docker-compose.matrix-bridge-node1.yml \ + up -d --no-deps matrix-bridge-dagi + +# Verify it's enabled (should return 200, not 403) +curl -s -X POST http://localhost:9400/v1/debug/inject_event \ + -H 'Content-Type: application/json' \ + -d '{"room_id":"!test:test","event":{}}' | jq . +# Expected: {"ok":false,"error":"no mapping for room_id=..."} ← 200, not 403 + +# IMPORTANT: disable after soak +docker-compose -f docker-compose.matrix-bridge-node1.yml up -d --no-deps matrix-bridge-dagi +# (DEBUG_INJECT_ENABLED defaults to false) +``` + +--- + +## 2b. Step 0 (WORKERS=2 / QUEUE=100) — Record True Baseline + +**Goal:** snapshot the "before any tuning" numbers to have a comparison point. + +```bash +# 0. Confirm current config (should be defaults) +curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}' +# Expected: {"workers": 2, "queue_max": 100} + +# 1. DB path for WAL check (adjust to your BRIDGE_DATA_DIR) +DB=/opt/microdao-daarion/data/matrix_bridge.db + +# 2. WAL size before (manual check) +ls -lh ${DB}-wal 2>/dev/null || echo "(no WAL file yet — first run)" +sqlite3 $DB "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || echo "(no sqlite3)" + +# 3. Run Step 0 soak +python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 100 \ + --concurrency 4 \ + --agent sofiia \ + --room-id "!your-room-id:your-server" \ + --max-p95-ms 5000 \ + --max-drop-rate 0.001 \ + --db-path $DB \ + --report-file /tmp/soak_step0_baseline.json + +# 4. Record result in "Baseline numbers" table (section 10) below. 
+jq '.summary, .latency, .metrics_delta, .wal' /tmp/soak_step0_baseline.json +``` + +**v1 Go/No-Go thresholds for Step 0:** + +| Metric | Green ✅ | Yellow ⚠️ | Red ❌ | +|--------|---------|-----------|-------| +| `p95_invoke_ms` | < 3000 | 3000–5000 | > 5000 | +| `drop_rate` | 0.00% (mandatory) | — | > 0.1% | +| `error_rate` | < 1% | 1–3% | > 3% | +| `failovers` | 0 | — | ≥ 1 without cause | +| WAL delta | < 2 MB | 2–10 MB | > 10 MB | + +**If Step 0 is Green → proceed to Step 1 tuning.** +**If Step 0 is Yellow/Red → investigate before touching WORKER_CONCURRENCY.** + +--- + +## 2c. Step 1 (WORKERS=4 / QUEUE=200) — Tune-1 + +**Goal:** verify that doubling workers gives headroom without Router saturation. + +```bash +# 1. Apply tuning +WORKER_CONCURRENCY=4 QUEUE_MAX_EVENTS=200 docker-compose \ + -f docker-compose.matrix-bridge-node1.yml \ + --env-file .env.soak \ + up -d --no-deps matrix-bridge-dagi + +sleep 3 +curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}' +# Expected: {"workers": 4, "queue_max": 200} + +# 2. Run Step 1 soak (higher concurrency to stress the new headroom) +python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 100 \ + --concurrency 8 \ + --agent sofiia \ + --room-id "!your-room-id:your-server" \ + --max-p95-ms 3000 \ + --max-drop-rate 0.001 \ + --db-path $DB \ + --report-file /tmp/soak_step1_tune1.json + +# 3. 
Compare Step 0 vs Step 1 +python3 - <<'EOF' +import json +s0 = json.load(open('/tmp/soak_step0_baseline.json')) +s1 = json.load(open('/tmp/soak_step1_tune1.json')) +for k in ('p50', 'p95', 'p99'): + print(f"{k}: {s0['latency_ms'][k]}ms → {s1['latency_ms'][k]}ms") +print(f"drops: {s0['metrics_delta']['queue_drops']} → {s1['metrics_delta']['queue_drops']}") +print(f"WAL: {s0['wal'].get('delta_mb')} → {s1['wal'].get('delta_mb')} MB delta") +EOF +``` + +**Decision:** +- Step 1 Green → **freeze, tag v1.0, ship to production.** +- p95 within 5% of Step 0 → Router is bottleneck (not workers); don't go to Step 2. +- Queue drops > 0 at WORKERS=4 → try Step 2 (WORKERS=8, QUEUE=300). + +--- + +## 3. Scenario A — Baseline load (100 messages, concurrency 4) + +**Goal:** establish latency baseline, verify no drops under normal load. + +```bash +python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 100 \ + --concurrency 4 \ + --max-p95-ms 3000 \ + --report-file /tmp/soak_baseline.json +``` + +**Expected output:** +``` +matrix-bridge-dagi Soak Report ✅ PASSED + Messages: 100 concurrency=4 + Latency: p50=<500ms p95=<3000ms + Queue drops: 0 (rate 0.000%) + Failovers: 0 +``` + +**If FAILED:** +- `p95 too high` → check router `/health`, DeepSeek API latency, `docker stats` +- `drop_rate > 0` → check `QUEUE_MAX_EVENTS` env var (increase if needed), inspect bridge logs + +--- + +## 4. Scenario B — Queue saturation test + +**Goal:** confirm drop metric fires cleanly and bridge doesn't crash. 
+ +```bash +# Reduce queue via env override, then flood: +QUEUE_MAX_EVENTS=5 docker-compose -f docker-compose.matrix-bridge-node1.yml \ + up -d matrix-bridge-dagi + +# Wait for restart +sleep 5 + +python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 30 \ + --concurrency 10 \ + --max-drop-rate 0.99 \ + --report-file /tmp/soak_queue_sat.json + +# Restore normal queue size +docker-compose -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi +``` + +**Expected:** `queue_drops > 0`, bridge still running after the test. + +**Verify in Prometheus/Grafana:** +```promql +rate(matrix_bridge_queue_dropped_total[1m]) +``` +Should spike and then return to 0. + +--- + +## 5. Scenario C — Node failover rehearsal + +**Goal:** simulate NODA1 router becoming unavailable, verify NODA2 takes over. + +```bash +# Step 1: stop the router on NODA1 temporarily +docker pause dagi-router-node1 + +# Step 2: run soak against bridge (bridge will failover to NODA2) +python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 20 \ + --concurrency 2 \ + --max-p95-ms 10000 \ + --report-file /tmp/soak_failover.json + +# Step 3: restore router +docker unpause dagi-router-node1 +``` + +**Expected:** +``` + Failovers: 1..20 (at least 1) + Sticky sets: 1+ + Errors: 0 (fallback to NODA2 serves all messages) +``` + +**Check sticky in control room:** +``` +!nodes +``` +Should show `NODA2` sticky with remaining TTL. + +**Check health tracker:** +``` +!status +``` +Should show `NODA1 state=degraded|down`. + +--- + +## 6. Scenario D — Restart recovery + +**Goal:** after restart, sticky and health state reload within one polling cycle. 
+ +```bash +# After Scenario C: sticky is set to NODA2 +# Restart the bridge +docker restart dagi-matrix-bridge-node1 + +# Wait for startup (up to 30s) +sleep 15 + +# Verify sticky reloaded +curl -s http://localhost:9400/health | jq '.ha_state' +# Expected: {"sticky_loaded": N, ...} + +# Verify routing still uses NODA2 sticky +python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 10 \ + --concurrency 2 \ + --report-file /tmp/soak_restart.json +``` + +**Expected:** p95 similar to post-failover run, `Failovers: 0` (sticky already applied). + +--- + +## 7. Scenario E — Rate limit burst + +**Goal:** verify rate limiting fires and bridge doesn't silently drop below-limit messages. + +```bash +# Set RPM very low for test, then flood from same sender +# This is best done in control room by observing !status rate_limited count +# rather than the soak script (which uses different senders per message). + +# In Matrix control room: +# Send 30+ messages from the same user account in quick succession in a mixed room. +# Then: +!status +# Check: rate_limited_total increased, no queue drops. +``` + +--- + +## 8. Scenario F — Policy operations under load + +**Goal:** `!policy history`, `!policy change`, and `!policy export` work while messages are in-flight. + +```bash +# Run a background soak +python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 200 \ + --concurrency 2 \ + --report-file /tmp/soak_concurrent_policy.json & + +# While soak is running, in Matrix control room: +!policy history limit=5 +!policy export +!status +``` + +**Expected:** all three commands respond immediately (< 2s), soak completes without extra drops. + +--- + +## 9. 
Prometheus / Grafana during soak + +Key queries for the Grafana dashboard: + +```promql +# Throughput (messages/s) +rate(matrix_bridge_routed_total[30s]) + +# Error rate +rate(matrix_bridge_errors_total[30s]) + +# p95 invoke latency per node +histogram_quantile(0.95, rate(matrix_bridge_invoke_duration_seconds_bucket[1m])) + +# Queue drops rate +rate(matrix_bridge_queue_dropped_total[1m]) + +# Failovers +rate(matrix_bridge_failover_total[5m]) +``` + +Use the `matrix-bridge-dagi` Grafana dashboard at: +`ops/grafana/dashboards/matrix-bridge-dagi.json` + +--- + +## 10. Baseline numbers (reference) + +| Metric | Cold start | Warm (sticky set) | +|--------|-----------|-------------------| +| p50 latency | ~200ms | ~150ms | +| p95 latency | ~2 000ms | ~1 500ms | +| Queue drops | 0 (queue=100) | 0 | +| Failover fires | 1 per degradation | 0 after sticky | +| Policy ops response | < 500ms | < 500ms | + +*Update this table after each soak run with actual measured values.* + +--- + +## 11. CI soak (mocked, no network) + +For CI pipelines, use the mocked soak scenarios: + +```bash +python3 -m pytest tests/test_matrix_bridge_m11_soak_scenarios.py -v +``` + +Covers (all deterministic, no network): +- **S1** Queue saturation → drop counter +- **S2** Failover under load → on_failover callback, health tracker +- **S3** Sticky routing under burst → sticky set, burst routed to NODA2 +- **S4** Multi-room isolation → separate rooms don't interfere +- **S5** Rate-limit burst → RL callback wired, no panic +- **S6** HA restart recovery → sticky + health snapshot persisted and reloaded +- **Perf baseline** 100-msg + 50-msg failover burst < 5s wall clock + +--- + +## 12. 
Known failure modes & mitigations + +| Symptom | Likely cause | Mitigation | +|---------|-------------|------------| +| `p95 > 5000ms` | Router/LLM slow | Increase `ROUTER_TIMEOUT_S`, check DeepSeek API | +| `drop_rate > 1%` | Queue too small | Increase `QUEUE_MAX_EVENTS` | +| `failovers > 0` but errors > 0 | Both nodes degraded | Check NODA1 + NODA2 health; scale router | +| Bridge crash during soak | Memory leak / bug | `docker logs` → file GitHub issue | +| Sticky not set after failover | `FAILOVER_STICKY_TTL_S=0` | Set to 300+ | +| Restart doesn't load sticky | `HA_HEALTH_MAX_AGE_S` too small | Increase or set to 3600 | diff --git a/ops/scripts/matrix_bridge_soak.py b/ops/scripts/matrix_bridge_soak.py new file mode 100644 index 00000000..ed774705 --- /dev/null +++ b/ops/scripts/matrix_bridge_soak.py @@ -0,0 +1,476 @@ +#!/usr/bin/env python3 +""" +matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi + +Usage: + python3 ops/scripts/matrix_bridge_soak.py \ + --url http://localhost:9400 \ + --messages 100 \ + --concurrency 4 \ + --report-file /tmp/soak_report.json + +Requires: httpx (pip install httpx) + +What it does: + 1. Sends --messages synthetic events to the bridge debug endpoint + (POST /v1/debug/inject_event; requires DEBUG_INJECT_ENABLED=true). + 2. Measures client-side latency (p50, p95, p99, max) across all requests. + 3. After the run, fetches /metrics and extracts key counters: + - matrix_bridge_queue_dropped_total + - matrix_bridge_rate_limited_total + - matrix_bridge_failover_total + - matrix_bridge_sticky_node_total + - matrix_bridge_invoke_duration_seconds (p95 from histogram) + 4. Prints a human-readable report and optionally writes JSON. + +Exit codes: + 0 = all pass criteria met + 1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate) +""" +import argparse +import asyncio +import json +import sys +import time +from typing import Any, Dict, List, Optional + +try: + import httpx +except ImportError: + print("ERROR: httpx not installed. 
Run: pip install httpx", file=sys.stderr) + sys.exit(2) + +# ── Pass/fail defaults ───────────────────────────────────────────────────────── +_DEFAULT_MAX_P95_MS = 5000 # 5 s p95 per invoke (generous for cold start) +_DEFAULT_MAX_DROP_RATE = 0.01 # 1% queue drops allowed + + +# ── Metrics parsing ──────────────────────────────────────────────────────────── +def _parse_counter(text: str, name: str) -> float: + """Extract the last reported value of a Prometheus counter by name.""" + for line in text.splitlines(): + if line.startswith(name + " ") or line.startswith(name + "{"): + parts = line.rsplit(None, 1) + try: + return float(parts[-1]) + except (ValueError, IndexError): + pass + return 0.0 + + +def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]: + """ + Approximate histogram_quantile from _bucket lines. + Returns estimated value at given quantile or None if data missing. + """ + buckets: List[tuple] = [] + total_count = 0.0 + for line in text.splitlines(): + if f"{name}_bucket" in line and 'le="' in line: + try: + le_part = line.split('le="')[1].split('"')[0] + le = float(le_part) if le_part != "+Inf" else float("inf") + val = float(line.rsplit(None, 1)[-1]) + buckets.append((le, val)) + except (ValueError, IndexError): + pass + elif (f"{name}_count " in line or (name + "_count{") in line): + try: + total_count = float(line.rsplit(None, 1)[-1]) + except (ValueError, IndexError): + pass + + if not buckets or total_count == 0: + return None + + buckets.sort() + target = quantile * total_count + prev_le, prev_count = 0.0, 0.0 + for le, count in buckets: + if count >= target: + if le == float("inf"): + return prev_le + # Linear interpolation + if count == prev_count: + return le + fraction = (target - prev_count) / (count - prev_count) + return prev_le + fraction * (le - prev_le) + prev_le, prev_count = le, count + return prev_le + + +# ── Soak runner ──────────────────────────────────────────────────────────────── +async def 
_preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str: + """ + Verify the inject endpoint is reachable and enabled. + Returns "" on success, error message on failure. + """ + try: + resp = await client.post( + f"{url.rstrip('/')}/v1/debug/inject_event", + json={"room_id": room_id, "event": {"event_id": "!preflight", "sender": "@soak:test", + "content": {"msgtype": "m.text", "body": "ping"}}}, + timeout=5.0, + ) + if resp.status_code == 403: + return ( + "❌ DEBUG_INJECT_ENABLED=false on bridge. " + "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n" + " NEVER enable in production!" + ) + if resp.status_code >= 500: + return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}" + data = resp.json() + if not data.get("ok") and "no mapping" in data.get("error", ""): + return ( + f"❌ No room mapping for room_id={room_id!r}. " + "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry." + ) + return "" + except httpx.ConnectError: + return f"❌ Cannot connect to bridge at {url}. Is it running?" + except Exception as exc: # noqa: BLE001 + return f"❌ Preflight failed: {exc}" + + +async def _check_wal(db_path: str) -> Dict[str, Any]: + """ + Run WAL size + checkpoint check on the bridge policy DB. + Returns dict with wal_bytes, wal_mb, checkpoint_result. + Requires sqlite3 CLI on PATH; gracefully skips if unavailable. 
+ """ + import os, shutil, subprocess + result: Dict[str, Any] = {"db_path": db_path, "ok": False} + + wal_path = db_path + "-wal" + try: + wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0 + result["wal_bytes"] = wal_bytes + result["wal_mb"] = round(wal_bytes / 1_048_576, 2) + except OSError: + result["wal_bytes"] = -1 + result["wal_mb"] = -1 + + if shutil.which("sqlite3"): + try: + cp = subprocess.run( + ["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"], + capture_output=True, text=True, timeout=5, + ) + # Output: busy|log|checkpointed (3 ints) + parts = cp.stdout.strip().split("|") + if len(parts) == 3: + result["wal_checkpoint"] = { + "busy": int(parts[0]), "log": int(parts[1]), "checkpointed": int(parts[2]), + } + result["ok"] = True + except Exception: # noqa: BLE001 + result["ok"] = False + else: + result["sqlite3_missing"] = True + + return result + + +async def _send_one( + client: httpx.AsyncClient, + url: str, + agent_id: str, + message: str, + room_id: str, + sender: str, +) -> tuple: + """ + POST a synthetic Matrix-style event to the bridge debug endpoint. + Returns (latency_ms: float, status_code: int, error: str|None). 
+ """ + payload = { + "room_id": room_id, + "event": { + "event_id": f"!soak-{int(time.monotonic() * 1e6)}", + "sender": sender, + "type": "m.room.message", + "content": {"msgtype": "m.text", "body": message}, + }, + } + t0 = time.monotonic() + try: + resp = await client.post( + f"{url.rstrip('/')}/v1/debug/inject_event", + json=payload, + timeout=30.0, + ) + latency_ms = (time.monotonic() - t0) * 1000 + if resp.status_code >= 500: + return latency_ms, resp.status_code, f"HTTP {resp.status_code}" + return latency_ms, resp.status_code, None + except httpx.TimeoutException: + latency_ms = (time.monotonic() - t0) * 1000 + return latency_ms, 0, "timeout" + except Exception as exc: # noqa: BLE001 + latency_ms = (time.monotonic() - t0) * 1000 + return latency_ms, 0, str(exc) + + +async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]: + try: + resp = await client.get(f"{url.rstrip('/')}/health", timeout=10.0) + return resp.json() if resp.status_code == 200 else {} + except Exception: # noqa: BLE001 + return {} + + +async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str: + try: + resp = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0) + return resp.text if resp.status_code == 200 else "" + except Exception: # noqa: BLE001 + return "" + + +def _percentile(values: List[float], p: float) -> float: + if not values: + return 0.0 + sv = sorted(values) + idx = int(len(sv) * p / 100) + return sv[min(idx, len(sv) - 1)] + + +async def run_soak( + url: str, + n_messages: int, + concurrency: int, + agent_id: str, + room_id: str, + sender: str, + max_p95_ms: float, + max_drop_rate: float, + db_path: str = "", +) -> Dict[str, Any]: + results: List[tuple] = [] + semaphore = asyncio.Semaphore(concurrency) + + async with httpx.AsyncClient() as client: + # Pre-check: inject endpoint + health + preflight_err = await _preflight_inject(client, url, room_id) + if preflight_err: + print(preflight_err, file=sys.stderr) + return {"ok": False, 
"error": preflight_err, "passed": False, "failures": [preflight_err]} + + # WAL check before soak + wal_before: Dict[str, Any] = {} + if db_path: + wal_before = await _check_wal(db_path) + print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB") + + # Pre-check: health + health_before = await _fetch_health(client, url) + metrics_before = await _fetch_metrics(client, url) + + drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total") + rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total") + fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total") + + print(f"[soak] Bridge health before: {health_before.get('ok', '?')}") + print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...") + + t_start = time.monotonic() + + async def worker(i: int): + async with semaphore: + msg = f"soak-msg-{i:04d}" + lat, status, err = await _send_one( + client, url, agent_id, msg, room_id, sender + ) + results.append((lat, status, err)) + if (i + 1) % max(1, n_messages // 10) == 0: + print(f" [{i+1}/{n_messages}] last={lat:.0f}ms status={status}") + + await asyncio.gather(*[worker(i) for i in range(n_messages)]) + + elapsed_s = time.monotonic() - t_start + metrics_after = await _fetch_metrics(client, url) + health_after = await _fetch_health(client, url) + + # WAL check after soak + wal_after: Dict[str, Any] = {} + if db_path: + wal_after = await _check_wal(db_path) + print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB " + f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)") + + latencies = [r[0] for r in results] + errors = [r for r in results if r[2] is not None] + successes = len(results) - len(errors) + error_rate = len(errors) / len(results) if results else 0.0 + + drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total") + rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total") + fo_after = 
_parse_counter(metrics_after, "matrix_bridge_failover_total") + sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total") + + delta_drops = drops_after - drops_before + delta_rl = rl_after - rl_before + delta_fo = fo_after - fo_before + + p50 = _percentile(latencies, 50) + p95 = _percentile(latencies, 95) + p99 = _percentile(latencies, 99) + p_max = max(latencies) if latencies else 0.0 + + # Histogram quantile from Prometheus + hist_p95 = _parse_histogram_quantile( + metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95 + ) + hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None + + drop_rate = delta_drops / len(results) if results else 0.0 + + report = { + "wal": { + "before_mb": wal_before.get("wal_mb"), + "after_mb": wal_after.get("wal_mb"), + "delta_mb": round( + (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3 + ) if wal_before and wal_after else None, + "checkpoint_after": wal_after.get("wal_checkpoint"), + "threshold_mb": 10, + }, + "summary": { + "total_messages": n_messages, + "concurrency": concurrency, + "elapsed_s": round(elapsed_s, 2), + "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0, + "successes": successes, + "errors": len(errors), + "error_rate": round(error_rate, 4), + }, + "latency_ms": { + "p50": round(p50, 1), + "p95": round(p95, 1), + "p99": round(p99, 1), + "max": round(p_max, 1), + }, + "metrics_delta": { + "queue_drops": int(delta_drops), + "rate_limited": int(delta_rl), + "failovers": int(delta_fo), + "sticky_sets": int(sticky_after), + "drop_rate": round(drop_rate, 4), + }, + "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms else None, + "health_before": health_before.get("ok"), + "health_after": health_after.get("ok"), + "pass_criteria": { + "max_p95_ms": max_p95_ms, + "max_drop_rate": max_drop_rate, + }, + } + + # Pass/fail evaluation + failures = [] + if p95 > max_p95_ms: + failures.append(f"p95={p95:.0f}ms exceeds threshold 
{max_p95_ms:.0f}ms") + if drop_rate > max_drop_rate: + failures.append( + f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}" + ) + wal_delta = report["wal"]["delta_mb"] + if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]: + failures.append( + f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) " + "— possible SQLite write pressure (Bottleneck #2)" + ) + + report["passed"] = len(failures) == 0 + report["failures"] = failures + return report + + +def _print_report(r: Dict[str, Any]) -> None: + s = r["summary"] + l = r["latency_ms"] + m = r["metrics_delta"] + passed = "✅ PASSED" if r["passed"] else "❌ FAILED" + + w = r.get("wal", {}) + print() + print("=" * 60) + print(f" matrix-bridge-dagi Soak Report {passed}") + print("=" * 60) + print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}") + print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)") + print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})") + print() + print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms " + f"p99={l['p99']}ms max={l['max']}ms") + if r["prometheus_invoke_p95_ms"] is not None: + print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms") + print() + print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})") + print(f" Rate-limited: {m['rate_limited']}") + print(f" Failovers: {m['failovers']}") + print(f" Sticky sets: {m['sticky_sets']}") + if w.get("before_mb") is not None: + wal_delta_str = ( + f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else "" + ) + wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else "" + print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}") + print() + if r["failures"]: + for f in r["failures"]: + print(f" ❌ {f}") + else: + print(" All pass criteria met.") + print("=" * 60) + + +def main() -> int: + parser = 
argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)") + parser.add_argument("--url", default="http://localhost:9400", + help="Bridge base URL (default: http://localhost:9400)") + parser.add_argument("--messages", type=int, default=100, + help="Total messages to send (default: 100)") + parser.add_argument("--concurrency", type=int, default=4, + help="Concurrent requests (default: 4)") + parser.add_argument("--agent-id", default="sofiia", + help="Agent id for synthetic events (default: sofiia)") + parser.add_argument("--room-id", default="!soak-room:home.invalid", + help="Room id for synthetic events") + parser.add_argument("--sender", default="@soak-user:home.invalid", + help="Sender for synthetic events") + parser.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS, + help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})") + parser.add_argument("--max-drop-rate",type=float, default=_DEFAULT_MAX_DROP_RATE, + help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})") + parser.add_argument("--report-file", default="", + help="Optional path to write JSON report") + parser.add_argument("--db-path", default="", + help="Path to policy_store.db for WAL check " + "(e.g. 
/opt/microdao-daarion/data/matrix_bridge.db)") + args = parser.parse_args() + + report = asyncio.run(run_soak( + url=args.url, + n_messages=args.messages, + concurrency=args.concurrency, + agent_id=args.agent_id, + room_id=args.room_id, + sender=args.sender, + max_p95_ms=args.max_p95_ms, + max_drop_rate=args.max_drop_rate, + db_path=args.db_path, + )) + _print_report(report) + + if args.report_file: + with open(args.report_file, "w", encoding="utf-8") as fh: + json.dump(report, fh, indent=2) + print(f"\n Report saved: {args.report_file}") + + return 0 if report["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/services/matrix-bridge-dagi/app/config.py b/services/matrix-bridge-dagi/app/config.py index c418a72f..699fe783 100644 --- a/services/matrix-bridge-dagi/app/config.py +++ b/services/matrix-bridge-dagi/app/config.py @@ -1,5 +1,5 @@ """ -matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0) +matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0 + M3.1) """ import os from dataclasses import dataclass, field @@ -54,6 +54,54 @@ class BridgeConfig: # "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt) control_unauthorized_behavior: str + # M3.1: Runbook runner — sofiia-console control token + sofiia_control_token: str # X-Control-Token for /api/runbooks/internal/runs + + # M3.4: Control channel safety — rate limiting + cooldown + control_room_rpm: int # Max commands per room per minute (0 = unlimited) + control_operator_rpm: int # Max commands per operator per minute + control_run_next_rpm: int # Max !runbook next calls per run_id per minute + control_cooldown_s: float # Anti-double-click debounce per (operator, verb, subcmd) + + # M2.3: Persistent event deduplication + persistent_dedupe: bool # Enable SQLite-backed dedupe across restarts + bridge_data_dir: str # Directory for SQLite DB and other bridge data + processed_events_ttl_h: int # TTL for processed events (hours) + 
processed_events_prune_batch: int # Max rows to prune per prune run + processed_events_prune_interval_s: int # Prune interval in seconds (0 = disable periodic) + + # M4.0: agent discovery + discovery_rpm: int # Max !agents replies per room per minute (0 = unlimited) + + # M5.0: node-aware routing + bridge_allowed_nodes: str # Comma-separated: "NODA1,NODA2" + bridge_default_node: str # Default node when none specified + bridge_room_node_map: str # Optional: "!roomA:server=NODA2;!roomB:server=NODA1" + + # M8.0: node health + soft-failover thresholds + node_fail_consecutive: int # Consecutive failures before node marked "down" + node_lat_ewma_s: float # EWMA latency threshold (seconds) → "degraded" + node_ewma_alpha: float # EWMA smoothing factor (0..1) + + # M8.1: sticky failover cache + failover_sticky_ttl_s: float # Seconds to hold sticky node preference after failover (0 = disabled) + + # M8.2: HA state persistence + ha_health_snapshot_interval_s: int # Seconds between node health writes to DB (0 = disabled) + ha_health_max_age_s: int # Max age of health snapshot to load on startup (seconds) + + # M9.0: Two-step confirmation TTL + confirm_ttl_s: float # Seconds a pending !confirm nonce is valid (0 = disabled) + + # M10.0: Policy export retention + policy_export_retention_days: int # Days to keep policy exports (0 = keep forever) + + # M10.2: Policy change history + policy_history_limit: int # Max rows in policy_changes table (0 = unlimited) + + # M11 soak: synthetic event injection (NEVER enable in production) + debug_inject_enabled: bool # POST /v1/debug/inject_event (default: False) + # Service identity node_id: str build_sha: str @@ -99,6 +147,35 @@ def load_config() -> BridgeConfig: bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""), bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""), control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"), + sofiia_control_token=_optional("SOFIIA_CONTROL_TOKEN", ""), + 
control_room_rpm=max(0, int(_optional("CONTROL_ROOM_RPM", "60"))), + control_operator_rpm=max(0, int(_optional("CONTROL_OPERATOR_RPM", "30"))), + control_run_next_rpm=max(0, int(_optional("CONTROL_RUN_NEXT_RPM", "20"))), + control_cooldown_s=max(0.0, float(_optional("CONTROL_COOLDOWN_S", "2.0"))), + persistent_dedupe=_optional("PERSISTENT_DEDUPE", "1").strip() not in ("0", "false", ""), + bridge_data_dir=_optional("BRIDGE_DATA_DIR", "/app/data"), + processed_events_ttl_h=max(1, int(_optional("PROCESSED_EVENTS_TTL_H", "48"))), + processed_events_prune_batch=max(1, int(_optional("PROCESSED_EVENTS_PRUNE_BATCH", "5000"))), + processed_events_prune_interval_s=max(0, int(_optional("PROCESSED_EVENTS_PRUNE_INTERVAL_S", "3600"))), + discovery_rpm=max(0, int(_optional("DISCOVERY_RPM", "20"))), + bridge_allowed_nodes=_optional("BRIDGE_ALLOWED_NODES", "NODA1"), + bridge_default_node=_optional("BRIDGE_DEFAULT_NODE", "NODA1"), + bridge_room_node_map=_optional("BRIDGE_ROOM_NODE_MAP", ""), + # M8.0: node health thresholds + node_fail_consecutive=max(1, int(_optional("NODE_FAIL_CONSEC", "3"))), + node_lat_ewma_s=max(0.5, float(_optional("NODE_LAT_EWMA_S", "12.0"))), + node_ewma_alpha=min(1.0, max(0.01, float(_optional("NODE_EWMA_ALPHA", "0.3")))), + # M8.1: sticky failover TTL (0 = disabled) + failover_sticky_ttl_s=max(0.0, float(_optional("FAILOVER_STICKY_TTL_S", "300.0"))), + # M8.2: HA state persistence + ha_health_snapshot_interval_s=max(0, int(_optional("HA_HEALTH_SNAPSHOT_INTERVAL_S", "60"))), + ha_health_max_age_s=max(0, int(_optional("HA_HEALTH_MAX_AGE_S", "600"))), + # M9.0: Two-step confirmation TTL (0 = disabled) + confirm_ttl_s=max(0.0, float(_optional("CONFIRM_TTL_S", "120.0"))), + policy_export_retention_days=max(0, int(_optional("POLICY_EXPORT_RETENTION_DAYS", "30"))), + policy_history_limit=max(0, int(_optional("POLICY_HISTORY_LIMIT", "100"))), + debug_inject_enabled=_optional("DEBUG_INJECT_ENABLED", "false").lower() + in ("1", "true", "yes"), 
node_id=_optional("NODE_ID", "NODA1"), build_sha=_optional("BUILD_SHA", "dev"), build_time=_optional("BUILD_TIME", "local"), diff --git a/services/matrix-bridge-dagi/app/confirm_store.py b/services/matrix-bridge-dagi/app/confirm_store.py new file mode 100644 index 00000000..217e39d7 --- /dev/null +++ b/services/matrix-bridge-dagi/app/confirm_store.py @@ -0,0 +1,167 @@ +""" +confirm_store — M9.0: Two-step confirmation for dangerous control commands. + +Flow: + 1. Operator issues a dangerous command (e.g. !node set, !policy import mode=replace). + 2. Bridge calls ConfirmStore.add(..., callback=) → returns a nonce. + 3. Bridge replies: "Type !confirm within Ns to apply." + 4. Operator sends !confirm . + 5. Bridge calls ConfirmStore.pop(nonce, sender_hash) → returns PendingConfirmation. + 6. Bridge executes callback() → (reply_text, diff_summary). + 7. Audit trail: matrix.control.intent / matrix.control.confirmed / matrix.control.applied. + +Safety: + - One pending entry per sender (new request replaces old). + - Nonce is sender-bound: wrong sender_hash → pop returns None. + - TTL enforced via monotonic time; expired entries not returned. + - Nonce: 6 uppercase alphanumeric (NONCE_LEN chars from NONCE_CHARS). +""" +from __future__ import annotations + +import secrets +import string +import threading +import time +from dataclasses import dataclass, field +from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple + +NONCE_LEN = 6 +NONCE_CHARS = string.ascii_uppercase + string.digits + +_DEFAULT_TTL_S = 120.0 + + +def make_nonce() -> str: + """Generate a cryptographically random 6-char uppercase alphanumeric nonce.""" + return "".join(secrets.choice(NONCE_CHARS) for _ in range(NONCE_LEN)) + + +@dataclass +class PendingConfirmation: + """A pending two-step confirmation waiting for !confirm .""" + nonce: str + sender_hash: str + verb: str # e.g. 
"node.set", "room.agents set", "policy.import" + normalized_args: str # human-readable args for audit + action_summary: str # "!node set room=!x:s node=NODA2" + room_id: str # Matrix room_id where the intent was issued + callback: Callable[[], Awaitable[Tuple[str, str]]] # async () → (reply_text, diff_summary) + expires_at: float # time.monotonic() deadline + + +class ConfirmStore: + """ + In-memory, thread-safe store for pending two-step confirmation entries. + + One pending entry per sender at a time. If the same sender issues a new + dangerous command before confirming the previous one, the old entry is + replaced (new nonce issued). + """ + + def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None: + self.ttl_s = ttl_s + self._lock = threading.RLock() + self._by_nonce: Dict[str, PendingConfirmation] = {} + self._by_sender: Dict[str, str] = {} # sender_hash → nonce + + # ── Public API ──────────────────────────────────────────────────────────── + + def add( + self, + sender_hash: str, + verb: str, + normalized_args: str, + action_summary: str, + room_id: str, + callback: Callable[[], Awaitable[Tuple[str, str]]], + ) -> str: + """ + Create a pending confirmation entry. Returns the nonce string. + + If the sender already has a pending entry it is replaced (old nonce + becomes invalid immediately). 
+ """ + nonce = make_nonce() + expires_at = time.monotonic() + self.ttl_s + entry = PendingConfirmation( + nonce=nonce, + sender_hash=sender_hash, + verb=verb, + normalized_args=normalized_args, + action_summary=action_summary, + room_id=room_id, + callback=callback, + expires_at=expires_at, + ) + with self._lock: + # Evict any previous pending entry for this sender + old_nonce = self._by_sender.get(sender_hash) + if old_nonce: + self._by_nonce.pop(old_nonce, None) + self._by_nonce[nonce] = entry + self._by_sender[sender_hash] = nonce + return nonce + + def pop(self, nonce: str, sender_hash: str) -> Optional[PendingConfirmation]: + """ + Retrieve and atomically remove a pending confirmation. + + Returns None if: + - nonce does not exist, + - sender_hash does not match the entry owner, + - or the entry has expired. + """ + nonce = nonce.upper() + with self._lock: + entry = self._by_nonce.get(nonce) + if entry is None: + return None + if entry.sender_hash != sender_hash: + # Wrong sender — deny without disclosing any detail + return None + if time.monotonic() > entry.expires_at: + # Expired — clean up and deny + self._by_nonce.pop(nonce, None) + self._by_sender.pop(entry.sender_hash, None) + return None + # Valid confirmation — consume the entry + self._by_nonce.pop(nonce) + self._by_sender.pop(sender_hash, None) + return entry + + def pending_nonce(self, sender_hash: str) -> Optional[str]: + """ + Return the current pending nonce for a sender (non-destructive peek). + Returns None if no entry or the entry has expired. 
+ """ + with self._lock: + nonce = self._by_sender.get(sender_hash) + if nonce is None: + return None + entry = self._by_nonce.get(nonce) + if entry is None or time.monotonic() > entry.expires_at: + # Lazy eviction + self._by_nonce.pop(nonce, None) + self._by_sender.pop(sender_hash, None) + return None + return nonce + + def pending_count(self) -> int: + """Number of non-expired pending entries (for /health, metrics).""" + now = time.monotonic() + with self._lock: + return sum(1 for e in self._by_nonce.values() if now <= e.expires_at) + + def cleanup(self) -> int: + """Eagerly remove all expired entries. Returns count removed.""" + now = time.monotonic() + removed = 0 + with self._lock: + expired_nonces = [ + n for n, e in self._by_nonce.items() if now > e.expires_at + ] + for n in expired_nonces: + entry = self._by_nonce.pop(n) + self._by_sender.pop(entry.sender_hash, None) + removed += 1 + return removed diff --git a/services/matrix-bridge-dagi/app/control.py b/services/matrix-bridge-dagi/app/control.py index 3495843b..d2f8a169 100644 --- a/services/matrix-bridge-dagi/app/control.py +++ b/services/matrix-bridge-dagi/app/control.py @@ -23,18 +23,124 @@ Audit events emitted: import logging import re from dataclasses import dataclass, field -from typing import Dict, FrozenSet, List, Optional, Tuple +from typing import Any, Dict, FrozenSet, List, Optional, Tuple logger = logging.getLogger(__name__) # ── Constants ───────────────────────────────────────────────────────────────── -# Supported control verbs (M3.1+ will implement them fully) +# Supported control verbs VERB_RUNBOOK = "runbook" VERB_STATUS = "status" +VERB_NODES = "nodes" # M5.1: node policy overview +VERB_NODE = "node" # M6.0: dynamic room-node override commands +VERB_ROOM = "room" # M6.1: dynamic mixed room agent overrides +VERB_POLICY = "policy" # M6.2: policy snapshot export/import +VERB_CONFIRM = "confirm" # M9.0: two-step confirmation for dangerous commands VERB_HELP = "help" -KNOWN_VERBS: 
FrozenSet[str] = frozenset({VERB_RUNBOOK, VERB_STATUS, VERB_HELP}) +KNOWN_VERBS: FrozenSet[str] = frozenset({ + VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE, + VERB_ROOM, VERB_POLICY, VERB_CONFIRM, VERB_HELP, +}) + +# ── M9.0: Dangerous command detection ───────────────────────────────────────── + +def is_dangerous_cmd(cmd: "ControlCommand") -> bool: + """ + Return True if the command requires two-step confirmation before applying. + + Dangerous verbs: + !node set room=... node=... — changes room routing + !room agents set room=... agents=... — replaces all agents for a room + !policy import ... — overwrites policy DB (both modes) + """ + v = cmd.verb + sub = (cmd.subcommand or "").strip().lower() + if v == VERB_NODE and sub == "set": + return True + if v == VERB_ROOM and sub == "agents" and cmd.args and cmd.args[0].lower() == "set": + return True + if v == VERB_POLICY and sub == "import": + return True + # M10.0: prune_exports is dangerous only when dry_run=0 (actual deletion) + if v == VERB_POLICY and sub == "prune_exports": + dry_raw = cmd.kwargs.get("dry_run", "1").strip() + is_dry = dry_raw not in ("0", "false", "no") + return not is_dry + # M10.1: restore is always dangerous (no dry_run option) + if v == VERB_POLICY and sub == "restore": + return True + return False + + +def build_normalized_args(cmd: "ControlCommand") -> str: + """ + Build a human-readable normalized representation of the command args. + Used in audit events and confirmation prompts. 
+ """ + parts: list[str] = [] + # For !room agents set, skip the "set" positional from args display + skip_first_arg = cmd.verb == VERB_ROOM and cmd.subcommand == "agents" + for i, a in enumerate(cmd.args): + if skip_first_arg and i == 0: + continue + parts.append(a) + for k, v in sorted(cmd.kwargs.items()): + parts.append(f"{k}={v}") + return " ".join(parts) + + +def confirm_intent_reply(action_summary: str, nonce: str, ttl_s: int) -> str: + """Reply when a dangerous command is held pending confirmation (M9.0).""" + return ( + f"⚠️ **Confirm required**\n" + f"Action: `{action_summary}`\n" + f"Type `!confirm {nonce}` within {ttl_s}s to apply.\n" + f"_(Only you can confirm this action.)_" + ) + + +def confirm_success_reply(action_result: str) -> str: + """Reply when a confirmation is accepted and the action applied (M9.0).""" + return f"✅ Confirmed and applied.\n{action_result}" + + +def confirm_expired_reply() -> str: + """Reply when the nonce is invalid, expired, or from a different sender (M9.0).""" + return ( + "❌ Invalid or expired confirmation code. " + "The action was **not** applied.\n" + "Re-issue the original command to get a new code." 
+ ) + +# M6.1: !room subcommand + actions +ROOM_SUBCMD_AGENTS = "agents" +ROOM_ACTION_SET = "set" +ROOM_ACTION_ADD = "add" +ROOM_ACTION_REMOVE = "remove" +ROOM_ACTION_GET = "get" +ROOM_ACTION_LIST = "list" +ROOM_ACTION_UNSET = "unset" # remove full override +_VALID_ROOM_ACTIONS = frozenset({ + ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE, + ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET, +}) + +# M6.0: !node subcommands +NODE_SUBCMD_SET = "set" +NODE_SUBCMD_UNSET = "unset" +NODE_SUBCMD_GET = "get" +NODE_SUBCMD_LIST = "list" +_VALID_NODE_SUBCMDS = frozenset({NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST}) + +# Runbook subcommands (M3.x) +SUBCOMMAND_START = "start" # M3.1 — implemented +SUBCOMMAND_NEXT = "next" # M3.2 — implemented +SUBCOMMAND_COMPLETE = "complete" # M3.2 — implemented +SUBCOMMAND_EVIDENCE = "evidence" # M3.3 — implemented +SUBCOMMAND_STATUS = "status" # M3.3 — implemented +SUBCOMMAND_POST_REVIEW = "post_review" # M3.3 — implemented # Max command line length to guard against garbage injection _MAX_CMD_LEN = 512 @@ -225,10 +331,814 @@ def check_authorization( # ── Reply helpers ───────────────────────────────────────────────────────────── def not_implemented_reply(cmd: ControlCommand) -> str: - """Reply for known commands not yet implemented (M3.0 stub).""" + """Reply for known commands not yet implemented.""" return ( f"✅ Command acknowledged: `{cmd.raw}`\n" - f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending (M3.1+)." + f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending." 
+ ) + + +def next_usage_reply() -> str: + """Reply when !runbook next is called without a run_id.""" + return ( + "⚠️ Usage: `!runbook next `\n" + "Example: `!runbook next abc-123`" + ) + + +def complete_usage_reply() -> str: + """Reply when !runbook complete is missing required args.""" + return ( + "⚠️ Usage: `!runbook complete step= status=ok|warn|fail [notes=...]`\n" + "Example: `!runbook complete abc-123 step=3 status=ok notes=done`\n" + "Notes with spaces: join without quotes — `notes=done_and_verified`." + ) + + +def start_usage_reply() -> str: + """Reply when !runbook start is called with missing/invalid runbook_path.""" + return ( + "⚠️ Usage: `!runbook start [node=NODA1]`\n" + "Example: `!runbook start runbooks/rehearsal-v1-checklist.md node=NODA1`\n" + "runbook_path must be a relative path without `..`." + ) + + +def runbook_started_reply(run_id: str, steps_total: int, status: str) -> str: + """Success reply after sofiia-console creates a runbook run.""" + return ( + f"✅ runbook started: `run_id={run_id}` steps={steps_total} status={status}\n" + f"Next: `!runbook next {run_id}`" + ) + + +def runbook_start_error_reply(reason: str) -> str: + """Error reply when sofiia-console returns a non-2xx or connection error.""" + return f"❌ failed to start runbook: {reason}" + + +# ── M3.2 reply helpers ──────────────────────────────────────────────────────── + +# Max chars of instructions_md to include in Matrix message before truncating +_INSTRUCTIONS_EXCERPT_MAX = 1500 + + +def next_manual_reply( + run_id: str, + step_index: int, + steps_total: Optional[int], + title: str, + instructions_md: str, +) -> str: + """Reply for a manual step returned by !runbook next.""" + step_label = f"Step {step_index + 1}" + if steps_total: + step_label += f"/{steps_total}" + + excerpt = instructions_md.strip() + truncated = False + if len(excerpt) > _INSTRUCTIONS_EXCERPT_MAX: + excerpt = excerpt[:_INSTRUCTIONS_EXCERPT_MAX].rsplit("\n", 1)[0] + truncated = True + + parts = [ + f"🧭 
{step_label}: **{title}**", + "", + excerpt, + ] + if truncated: + parts.append("_...(truncated — open in console for full instructions)_") + parts += [ + "", + f"Complete: `!runbook complete {run_id} step={step_index} status=ok`", + ] + return "\n".join(parts) + + +def next_auto_reply( + run_id: str, + step_index: int, + action_type: str, + step_status: str, + duration_ms: Optional[int], + completed: bool, +) -> str: + """Reply for an auto step (http_check/script) completed by !runbook next.""" + emoji = {"ok": "✅", "warn": "⚠️", "fail": "❌"}.get(step_status, "ℹ️") + dur = f" duration={duration_ms}ms" if duration_ms is not None else "" + header = f"{emoji} step {step_index + 1} ({action_type}) {step_status}{dur}" + + if completed: + return ( + f"{header}\n" + "🎉 All steps completed!\n" + f"Get evidence: `!runbook evidence {run_id}`" + ) + return f"{header}\nNext: `!runbook next {run_id}`" + + +def next_error_reply(run_id: str, reason: str) -> str: + """Error reply when !runbook next fails.""" + return f"❌ failed to advance runbook: {reason}" + + +def complete_ok_reply(run_id: str, step_index: int, status: str, run_completed: bool) -> str: + """Success reply after !runbook complete.""" + emoji = {"ok": "✅", "warn": "⚠️", "fail": "❌", "skipped": "⏭️"}.get(status, "✅") + line1 = f"{emoji} recorded step {step_index + 1}: {status}" + if run_completed: + return f"{line1}\n🎉 All steps completed!\nGet evidence: `!runbook evidence {run_id}`" + return f"{line1}\nNext: `!runbook next {run_id}`" + + +def complete_error_reply(run_id: str, reason: str) -> str: + """Error reply when !runbook complete fails.""" + return f"❌ failed to complete step: {reason}" + + +# ── M3.3 reply helpers ──────────────────────────────────────────────────────── + +def status_usage_reply() -> str: + return ( + "⚠️ Usage: `!runbook status `\n" + "Example: `!runbook status abc-123`" + ) + + +def evidence_usage_reply() -> str: + return ( + "⚠️ Usage: `!runbook evidence `\n" + "Example: `!runbook 
evidence abc-123`" + ) + + +def post_review_usage_reply() -> str: + return ( + "⚠️ Usage: `!runbook post_review `\n" + "Example: `!runbook post_review abc-123`" + ) + + +def status_reply(run: dict) -> str: + """Format !runbook status reply from a get_run response.""" + run_id = run.get("run_id", "?") + status = run.get("status", "?") + current = run.get("current_step", 0) + steps_total = run.get("steps_total") or len(run.get("steps", [])) + runbook_path = run.get("runbook_path", "?") + node_id = run.get("node_id", "?") + evidence_path = run.get("evidence_path") + + # Count warn/fail steps + steps = run.get("steps", []) + warn_count = sum(1 for s in steps if s.get("status") == "warn") + fail_count = sum(1 for s in steps if s.get("status") == "fail") + + status_emoji = { + "running": "🔄", "completed": "✅", "aborted": "🛑", "paused": "⏸️", + }.get(status, "ℹ️") + + step_label = f"{current}/{steps_total}" if steps_total else str(current) + lines = [ + f"{status_emoji} `run_id={run_id}` status={status} step={step_label}", + f"runbook: `{runbook_path}` node: {node_id}", + ] + if warn_count or fail_count: + lines.append(f"warn={warn_count} fail={fail_count}") + if evidence_path: + lines.append(f"evidence: `{evidence_path}`") + + if status == "completed" and not evidence_path: + lines.append(f"Get evidence: `!runbook evidence {run_id}`") + elif status == "completed" and evidence_path: + lines.append(f"Post-review: `!runbook post_review {run_id}`") + + return "\n".join(lines) + + +def status_error_reply(run_id: str, reason: str) -> str: + return f"❌ failed to get status: {reason}" + + +def evidence_reply(result: dict) -> str: + """Success reply after !runbook evidence.""" + path = result.get("evidence_path", "?") + size = result.get("bytes", 0) + run_id = result.get("run_id", "") + ts = result.get("created_at", "") + lines = [f"📄 evidence created: `{path}` (bytes={size})"] + if ts: + lines.append(f"created_at: {ts}") + if run_id: + lines.append(f"Next: `!runbook post_review 
{run_id}`") + return "\n".join(lines) + + +def evidence_error_reply(run_id: str, reason: str) -> str: + return f"❌ failed to generate evidence: {reason}" + + +def post_review_reply(result: dict) -> str: + """Success reply after !runbook post_review.""" + path = result.get("path", "?") + size = result.get("bytes", 0) + ts = result.get("created_at", "") + lines = [f"🧾 post-review created: `{path}` (bytes={size})"] + if ts: + lines.append(f"created_at: {ts}") + return "\n".join(lines) + + +def post_review_error_reply(run_id: str, reason: str) -> str: + return f"❌ failed to generate post-review: {reason}" + + +# ── M3.4 safety helpers ─────────────────────────────────────────────────────── + +#: Maximum length of notes/free-text operator input accepted before truncation. +MAX_NOTES_LEN: int = 500 + +#: Control characters (U+0000–U+001F minus tab/newline) that must be stripped. +_CTRL_CHARS = "".join(chr(i) for i in range(32) if i not in (9, 10, 13)) + + +def sanitize_notes(notes: str) -> str: + """ + Strip control characters and truncate notes to MAX_NOTES_LEN. + + Safe to call with any string; returns empty string for falsy input. + """ + if not notes: + return "" + cleaned = notes.translate(str.maketrans("", "", _CTRL_CHARS)) + if len(cleaned) > MAX_NOTES_LEN: + cleaned = cleaned[:MAX_NOTES_LEN] + "…" + return cleaned + + +def rate_limited_reply(scope: str, retry_after_s: float) -> str: + """Reply when a control command is rejected by rate limiter or cooldown.""" + secs = f"{retry_after_s:.0f}s" if retry_after_s >= 1 else "a moment" + return f"⏳ rate limited ({scope}), retry after {secs}" + + +def status_not_available_reply() -> str: + return "⚠️ Bridge status not available (service initialising or config missing)." + + +# M5.1: !nodes reply +_MAX_ROOM_OVERRIDES_SHOWN = 10 + + +def nodes_reply( + policy_info: dict, + node_stats: Optional[dict] = None, + sticky_info: Optional[dict] = None, +) -> str: + """ + Compact reply for `!nodes` in control room. 
+ + policy_info: from NodePolicy.as_info_dict() + node_stats: optional dict {node_id: {"routed": N, "rejected": M, "health": ..., ...}} + sticky_info: optional dict from StickyNodeCache (M8.1) + """ + default = policy_info.get("default_node", "?") + allowed = sorted(policy_info.get("allowed_nodes") or []) + overrides = policy_info.get("room_overrides", {}) or {} + + allowed_str = ", ".join(f"`{n}`" for n in allowed) + lines = [ + "🌐 **Node policy**", + f"Default: `{default}` Allowed: {allowed_str}", + ] + + if isinstance(overrides, dict) and overrides: + lines.append(f"\n**Room overrides** ({len(overrides)}):") + items = list(overrides.items())[:_MAX_ROOM_OVERRIDES_SHOWN] + for room_id, node in items: + lines.append(f" `{room_id}` → `{node}`") + if len(overrides) > _MAX_ROOM_OVERRIDES_SHOWN: + lines.append(f" _(+{len(overrides) - _MAX_ROOM_OVERRIDES_SHOWN} more)_") + elif isinstance(overrides, int): + # as_info_dict returns room_overrides as int count, not dict + if overrides: + lines.append(f"\nRoom overrides: {overrides}") + else: + lines.append("\nNo room overrides configured.") + else: + lines.append("\nNo room overrides configured.") + + if node_stats: + lines.append("\n**Per-node stats** (since last restart):") + for node_id in sorted(node_stats): + ns = node_stats[node_id] + routed = ns.get("routed", 0) + rejected = ns.get("rejected", 0) + health = ns.get("health", "") + ewma = ns.get("ewma_latency_s") + consec = ns.get("consecutive_failures", 0) + stat_parts = [f"routed={routed}", f"rejected={rejected}"] + if health: + stat_parts.append(f"health={health}") + if ewma is not None: + stat_parts.append(f"ewma={ewma:.2f}s") + if consec: + stat_parts.append(f"consec_fail={consec}") + lines.append(f" `{node_id}`: " + " ".join(stat_parts)) + + # M8.1: sticky cache section + if sticky_info is not None: + active = sticky_info.get("active_keys", 0) + ttl = sticky_info.get("ttl_s", 0) + if active: + lines.append(f"\n**Sticky routing** (anti-flap): {active} active 
ttl={ttl:.0f}s") + for entry in sticky_info.get("entries", []): + rem = entry.get("remaining_s", 0) + lines.append( + f" `{entry['key']}` → `{entry['node']}` ({rem:.0f}s left)" + ) + if sticky_info.get("truncated"): + lines.append(f" _(+{sticky_info['truncated']} more)_") + else: + lines.append(f"\nSticky routing: none active ttl={ttl:.0f}s") + + return "\n".join(lines) + + +# ── M6.0: !node subcommand parser + reply helpers ────────────────────────────── + +import re as _re + +_ROOM_KWARG_RE = _re.compile(r"\broom=(\S+)", _re.IGNORECASE) +_NODE_VAL_RE = _re.compile(r"\bnode=(\w+)", _re.IGNORECASE) +_ROOM_ID_RE = _re.compile(r"^![a-zA-Z0-9._\-]+:[a-zA-Z0-9._\-]+$") + + +def parse_node_cmd(args_text: str) -> Tuple[str, Optional[str], Optional[str]]: + """ + Parse `!node [room=...] [node=...]` arguments. + + Returns (subcmd, room_id_or_None, node_id_or_None). + subcmd is lower-cased; node_id is upper-cased. + """ + parts = args_text.strip().split(None, 1) + if not parts: + return ("", None, None) + subcmd = parts[0].lower() + rest = parts[1] if len(parts) > 1 else "" + + room_m = _ROOM_KWARG_RE.search(rest) + node_m = _NODE_VAL_RE.search(rest) + + room_id = room_m.group(1) if room_m else None + node_id = node_m.group(1).upper() if node_m else None + return (subcmd, room_id, node_id) + + +def node_cmd_validate_room(room_id: str) -> bool: + """Return True if room_id matches basic Matrix room ID format.""" + return bool(_ROOM_ID_RE.match(room_id)) if room_id else False + + +def node_cmd_reply_set(room_id: str, node_id: str) -> str: + return f"✅ Override set: `{room_id}` → `{node_id}`" + + +def node_cmd_reply_unset_ok(room_id: str) -> str: + return f"✅ Override removed for `{room_id}`" + + +def node_cmd_reply_unset_not_found(room_id: str) -> str: + return f"ℹ️ No override was set for `{room_id}`" + + +def node_cmd_reply_get( + room_id: str, + node_id: Optional[str], + env_node: Optional[str], + default_node: str, +) -> str: + lines = [f"📌 **Node info for** `{room_id}`"] 
+ if node_id: + lines.append(f"Dynamic override: `{node_id}` _(set by operator)_") + else: + lines.append("Dynamic override: _none_") + if env_node: + lines.append(f"Env map: `{env_node}`") + lines.append(f"Default: `{default_node}`") + effective = node_id or env_node or default_node + lines.append(f"\nEffective node: **`{effective}`**") + return "\n".join(lines) + + +def node_cmd_reply_list( + overrides: List[Tuple[str, str, int]], + total: int, +) -> str: + import datetime + lines = [f"📋 **Dynamic node overrides** ({total} total)"] + if not overrides: + lines.append("_None set._") + else: + for room_id, node_id, updated_at in overrides: + ts = datetime.datetime.utcfromtimestamp(updated_at).strftime("%Y-%m-%d %H:%M") + lines.append(f" `{room_id}` → `{node_id}` _(at {ts} UTC)_") + if total > len(overrides): + lines.append(f" _(+{total - len(overrides)} more)_") + return "\n".join(lines) + + +def node_cmd_reply_error(msg: str) -> str: + return ( + f"❌ {msg}\n\n" + "Usage:\n" + " `!node set room=!room:server node=NODA2`\n" + " `!node unset room=!room:server`\n" + " `!node get room=!room:server`\n" + " `!node list`" + ) + + +# ── M6.1: !room agents reply helpers ────────────────────────────────────────── + +_AGENTS_KWARG_RE = _re.compile(r"\bagents=(\S+)", _re.IGNORECASE) +_AGENT_KWARG_RE = _re.compile(r"\bagent=(\w+)", _re.IGNORECASE) +_DEFAULT_KWARG_RE = _re.compile(r"\bdefault=(\w+)", _re.IGNORECASE) + + +def parse_room_agents_cmd( + subcommand: str, + args: tuple, + kwargs: Dict[str, str], +) -> Tuple[str, Optional[str], Optional[List[str]], Optional[str], Optional[str]]: + """ + Parse !room agents [room=...] [agents=...] [agent=...] [default=...] args. + + Returns (action, room_id, agents_or_None, single_agent_or_None, default_agent_or_None). 
+ action: the ROOM_ACTION_* constant (from args[0] or subcommand) + room_id: from kwargs["room"] + agents: from kwargs["agents"] as a list (for set command) + single_agent: from kwargs["agent"] (for add/remove) + default_agent: from kwargs["default"] + """ + # action is args[0] when subcommand == "agents" + action = (args[0].lower() if args else "").strip() or subcommand.lower() + room_id = kwargs.get("room") + + # agents= may be comma-separated + raw_agents = kwargs.get("agents", "") + agents: Optional[List[str]] = ( + [a.strip().lower() for a in raw_agents.split(",") if a.strip()] + if raw_agents else None + ) + + single_agent = kwargs.get("agent", "").strip().lower() or None + default_agent = kwargs.get("default", "").strip().lower() or None + return action, room_id, agents, single_agent, default_agent + + +def room_agents_reply_set(room_id: str, agents: List[str], default_agent: str) -> str: + agents_str = ", ".join(f"`{a}`" for a in sorted(agents)) + return ( + f"✅ Agent override set for `{room_id}`\n" + f"Agents: {agents_str}\n" + f"Default: `{default_agent}`" + ) + + +def room_agents_reply_add(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str: + agents_str = ", ".join(f"`{a}`" for a in sorted(agents)) + return ( + f"✅ Agent `{agent}` added to `{room_id}`\n" + f"Current agents: {agents_str}" + + (f"\nDefault: `{default_agent}`" if default_agent else "") + ) + + +def room_agents_reply_remove(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str: + if agents: + agents_str = ", ".join(f"`{a}`" for a in sorted(agents)) + return ( + f"✅ Agent `{agent}` removed from `{room_id}`\n" + f"Remaining: {agents_str}" + + (f"\nDefault: `{default_agent}`" if default_agent else "") + ) + return f"✅ Agent `{agent}` removed — no agents left, override cleared for `{room_id}`" + + +def room_agents_reply_unset_ok(room_id: str) -> str: + return f"✅ Agent override cleared for `{room_id}` (using env/default config)" + + +def 
room_agents_reply_unset_not_found(room_id: str) -> str: + return f"ℹ️ No agent override was set for `{room_id}`" + + +def room_agents_reply_get( + room_id: str, + override_agents: Optional[List[str]], + override_default: Optional[str], + env_agents: Optional[List[str]], + env_default: Optional[str], +) -> str: + lines = [f"📌 **Agent policy for** `{room_id}`"] + if override_agents: + agents_str = ", ".join(f"`{a}`" for a in sorted(override_agents)) + lines.append(f"Dynamic override: {agents_str} default=`{override_default or '?'}`") + else: + lines.append("Dynamic override: _none_") + if env_agents: + env_str = ", ".join(f"`{a}`" for a in sorted(env_agents)) + lines.append(f"Env config: {env_str} default=`{env_default or '?'}`") + else: + lines.append("Env config: _not configured_") + effective_agents = override_agents or env_agents or [] + effective_default = override_default or env_default or "?" + lines.append(f"\nEffective agents: **{', '.join(f'`{a}`' for a in sorted(effective_agents))}** default=**`{effective_default}`**") + return "\n".join(lines) + + +def room_agents_reply_list( + overrides: List[Tuple[str, List[str], Optional[str], int]], + total: int, +) -> str: + import datetime + lines = [f"📋 **Dynamic agent overrides** ({total} total)"] + if not overrides: + lines.append("_None set._") + else: + for room_id, agents, default_agent, updated_at in overrides: + ts = datetime.datetime.utcfromtimestamp(updated_at).strftime("%Y-%m-%d %H:%M") + agents_str = ", ".join(agents) + lines.append(f" `{room_id}`: [{agents_str}] default=`{default_agent or '?'}` _(at {ts} UTC)_") + if total > len(overrides): + lines.append(f" _(+{total - len(overrides)} more)_") + return "\n".join(lines) + + +def room_agents_reply_error(msg: str) -> str: + return ( + f"❌ {msg}\n\n" + "Usage:\n" + " `!room agents set room=!X agents=sofiia,helion [default=sofiia]`\n" + " `!room agents add room=!X agent=druid`\n" + " `!room agents remove room=!X agent=helion`\n" + " `!room agents get 
room=!X`\n" + " `!room agents unset room=!X`\n" + " `!room agents list`" + ) + + +# ── M6.2: !policy export/import reply helpers + path validator ──────────────── + +import os as _os +import json as _json + + +POLICY_EXPORTS_SUBDIR = "policy_exports" + + +def validate_export_path(exports_dir: str, filename: str) -> Optional[str]: + """ + Validate and resolve an export filename to an absolute path. + + Security: only allow simple filenames (no slashes, no `..`). + Returns the absolute safe path, or None if invalid. + """ + if not filename: + return None + # Reject anything with directory separators or traversal sequences + if "/" in filename or "\\" in filename or ".." in filename: + return None + # Only allow safe characters: alphanumeric, dash, underscore, dot + if not _re.match(r"^[a-zA-Z0-9._\-]+$", filename): + return None + full_path = _os.path.join(exports_dir, filename) + try: + resolved = _os.path.realpath(full_path) + exports_resolved = _os.path.realpath(exports_dir) + if not resolved.startswith(exports_resolved + _os.sep): + return None + except Exception: # noqa: BLE001 + return None + return full_path + + +def policy_export_reply(path: str, node_count: int, agent_count: int) -> str: + filename = _os.path.basename(path) + return ( + f"✅ **Policy exported**\n" + f"File: `{filename}`\n" + f"Node overrides: {node_count} Agent overrides: {agent_count}" + ) + + +def policy_import_dry_run_reply(stats: dict, mode: str) -> str: + return ( + f"🔍 **Import dry-run** (mode=`{mode}`, no changes applied)\n" + f"Node overrides: +{stats.get('node_added',0)} ~{stats.get('node_updated',0)} -{stats.get('node_deleted',0)}\n" + f"Agent overrides: +{stats.get('agent_added',0)} ~{stats.get('agent_updated',0)} -{stats.get('agent_deleted',0)}\n" + f"_Use `dry_run=0` to apply._" + ) + + +def format_import_diff(diff: Any) -> str: + """ + Format an ImportDiff as a human-readable Markdown string (M9.1). + `diff` is an ImportDiff instance from policy_store. 
+ """ + lines: List[str] = [] + + # Node overrides row + node_parts: List[str] = [] + if diff.node_added: node_parts.append(f"+{diff.node_added} added") + if diff.node_updated: node_parts.append(f"~{diff.node_updated} updated") + if diff.node_deleted: node_parts.append(f"-{diff.node_deleted} deleted ⚠️") + lines.append("**Node overrides:** " + (", ".join(node_parts) if node_parts else "no changes")) + + # Agent overrides row + agent_parts: List[str] = [] + if diff.agent_added: agent_parts.append(f"+{diff.agent_added} added") + if diff.agent_updated: agent_parts.append(f"~{diff.agent_updated} updated") + if diff.agent_deleted: agent_parts.append(f"-{diff.agent_deleted} deleted ⚠️") + lines.append("**Agent overrides:** " + (", ".join(agent_parts) if agent_parts else "no changes")) + + # Sample affected rooms + if getattr(diff, "sample_keys", None): + keys_str = ", ".join(f"`{k}`" for k in diff.sample_keys) + more = diff.total_changes() - len(diff.sample_keys) + suffix = f" _(+{more} more)_" if more > 0 else "" + lines.append(f"**Affected rooms:** {keys_str}{suffix}") + + # Replace danger banner + if getattr(diff, "is_replace", False): + lines.append("⚠️ **REPLACE mode** — existing overrides NOT in the file will be **deleted**.") + + return "\n".join(lines) + + +def policy_import_intent_reply( + diff: Any, + action_summary: str, + nonce: str, + ttl_s: int, +) -> str: + """Reply for !policy import intent with diff preview (M9.1).""" + lines = [ + "⚠️ **Confirm required**", + f"Action: `{action_summary}`", + "", + "**Preview:**", + format_import_diff(diff), + "", + ] + if diff.total_changes() == 0: + lines.append("_(No policy changes would be made.)_") + lines.append("") + lines += [ + f"Type `!confirm {nonce}` within {ttl_s}s to apply.", + "_(Only you can confirm. 
If the file changes, this confirm will be rejected.)_", + ] + return "\n".join(lines) + + +def policy_import_reply(stats: dict, mode: str) -> str: + return ( + f"✅ **Policy imported** (mode=`{mode}`)\n" + f"Node overrides: +{stats.get('node_added',0)} ~{stats.get('node_updated',0)} -{stats.get('node_deleted',0)}\n" + f"Agent overrides: +{stats.get('agent_added',0)} ~{stats.get('agent_updated',0)} -{stats.get('agent_deleted',0)}" + ) + + +def policy_restore_intent_reply( + diff: Any, + action_summary: str, + nonce: str, + ttl_s: int, +) -> str: + """Reply for !policy restore intent — rollback preview + confirm prompt (M10.1).""" + diff_text = format_import_diff(diff) + return ( + f"🔄 **Policy restore (rollback) preview**\n" + f"{diff_text}\n\n" + f"⚠️ **Rollback action:** `{action_summary}`\n\n" + f"Type `!confirm {nonce}` to apply restore (expires in {ttl_s}s)" + ) + + +def policy_restore_applied_reply( + stats: Any, + mode: str, + autobackup_basename: str = "", +) -> str: + """Reply after !policy restore is confirmed and applied (M10.1).""" + n_a = stats.get("node_added", 0) if isinstance(stats, dict) else 0 + n_u = stats.get("node_updated", 0) if isinstance(stats, dict) else 0 + n_d = stats.get("node_deleted", 0) if isinstance(stats, dict) else 0 + a_a = stats.get("agent_added", 0) if isinstance(stats, dict) else 0 + a_u = stats.get("agent_updated", 0) if isinstance(stats, dict) else 0 + a_d = stats.get("agent_deleted", 0) if isinstance(stats, dict) else 0 + backup_line = ( + f"\n\n💾 Pre-restore backup saved: `{autobackup_basename}`" + if autobackup_basename else "" + ) + return ( + f"✅ **Policy restored** (mode={mode})\n" + f"Node overrides: +{n_a} ~{n_u} -{n_d}\n" + f"Agent overrides: +{a_a} ~{a_u} -{a_d}" + f"{backup_line}" + ) + + +def policy_history_reply(changes: List[Any]) -> str: + """ + Format policy_changes records for !policy history reply (M10.2). + + Each line: #{n}. 
[id:NN] [YYYY-MM-DD HH:MM] verb/mode +Xn ~Yn -Zn `file` op:`hash8` [⚠️] + Use !policy change id=NN to see full details. + """ + if not changes: + return "📋 **Policy change history**\nNo policy changes recorded yet." + lines = ["📋 **Policy change history** (most recent first)\n"] + for i, c in enumerate(changes, 1): + destr_flag = " ⚠️" if c.is_destructive else "" + fname = c.source_file[:40] + "…" if len(c.source_file) > 40 else c.source_file + line = ( + f"{i}. [id:{c.id}] [{c.when_str()}] `{c.verb}/{c.mode}`" + f" {c.changes_short()}{destr_flag}" + f" `{fname}`" + f" op:`{c.sender_hash[:8]}`" + ) + lines.append(line) + lines.append("\nUse `!policy change id=` for full details of a specific change.") + return "\n".join(lines) + + +def policy_change_detail_reply(change: Any) -> str: + """ + Format full details of a single PolicyChange for !policy change id= (M10.3). + """ + destr_str = "⚠️ Yes" if change.is_destructive else "No" + fname = change.source_file[:60] + "…" if len(change.source_file) > 60 else change.source_file + lines = [ + f"🔍 **Policy change #{change.id}**\n", + f"**Verb:** `{change.verb}`", + f"**Mode:** `{change.mode}`", + f"**Applied:** {change.when_str()} UTC", + f"**Operator:** op:`{change.sender_hash[:8]}`", + f"**File:** `{fname}`", + f"**Destructive:** {destr_str}", + "", + "**Changes:**", + f" Nodes: +{change.node_added} added ~{change.node_updated} updated -{change.node_deleted} deleted", + f" Agents: +{change.agent_added} added ~{change.agent_updated} updated -{change.agent_deleted} deleted", + "", + "**Summary:**", + f" {change.diff_summary}", + ] + return "\n".join(lines) + + +def policy_prune_preview_reply(result: Any, retention_days: int) -> str: + """Reply for !policy prune_exports dry_run=1 — preview of what would be pruned (M10.0).""" + if result.count == 0: + return ( + f"🗑️ **Policy exports prune preview** (retention={retention_days}d)\n" + "No files older than the retention window found. Nothing to prune." 
+ ) + samples = result.sample_filenames(5) + sample_str = "\n".join(f" - `{f}`" for f in samples) + more = result.count - len(samples) + more_str = f"\n _(+{more} more)_" if more > 0 else "" + size_kb = result.total_bytes // 1024 + return ( + f"🗑️ **Policy exports prune preview** (retention={retention_days}d)\n" + f"Would delete **{result.count}** file(s) (~{size_kb} KB):\n" + f"{sample_str}{more_str}\n\n" + f"To actually prune: `!policy prune_exports dry_run=0`" + ) + + +def policy_prune_applied_reply(result: Any, retention_days: int) -> str: + """Reply after !policy prune_exports dry_run=0 is confirmed and applied (M10.0).""" + if result.count == 0: + return ( + f"🗑️ **Policy exports pruned** (retention={retention_days}d)\n" + "No files matched the retention window." + ) + size_kb = result.total_bytes // 1024 + return ( + f"✅ **Policy exports pruned** (retention={retention_days}d)\n" + f"Deleted **{result.count}** file(s) (~{size_kb} KB freed)." + ) + + +def policy_cmd_error(msg: str) -> str: + return ( + f"❌ {msg}\n\n" + "Usage:\n" + " `!policy export`\n" + " `!policy import path=policy-YYYYMMDD-HHMMSS.json [mode=merge|replace] [dry_run=0]`" ) @@ -252,12 +1162,26 @@ def help_reply() -> str: """Brief help text.""" return ( "**DAGI Bridge — Control Commands**\n\n" - "`!runbook start [node=NODA1]` — Start a runbook run\n" - "`!runbook next ` — Advance to next step\n" - "`!runbook complete step= status=ok` — Mark step complete\n" - "`!runbook evidence ` — Get evidence artifact path\n" - "`!runbook status ` — Show current run state\n" - "`!status` — Bridge health summary\n" + "`!runbook start [node=NODA1]` — Start a runbook run ✅\n" + "`!runbook next ` — Advance to next step ✅\n" + "`!runbook complete step= status=ok [notes=...]` — Mark step complete ✅\n" + "`!runbook status ` — Show run status ✅\n" + "`!runbook evidence ` — Generate release evidence ✅\n" + "`!runbook post_review ` — Generate post-release review ✅\n" + "`!status` — Bridge health summary ✅\n" + 
"`!nodes` — Node policy overview ✅\n" + "`!node set room=!room:server node=NODA2` — Set room-node override ✅\n" + "`!node unset room=!room:server` — Remove room-node override ✅\n" + "`!node get room=!room:server` — Show current override ✅\n" + "`!node list` — List dynamic overrides (top 10) ✅\n" + "`!room agents set room=!X agents=sofiia,helion [default=sofiia]` — Set agent list ✅\n" + "`!room agents add room=!X agent=druid` — Add agent to room ✅\n" + "`!room agents remove room=!X agent=helion` — Remove agent from room ✅\n" + "`!room agents get room=!X` — Show current agent policy ✅\n" + "`!room agents list` — List all rooms with agent overrides ✅\n" + "`!room agents unset room=!X` — Remove all agent overrides for room ✅\n" + "`!policy export` — Export policy snapshot to file ✅\n" + "`!policy import path= [mode=merge|replace] [dry_run=0]` — Import policy snapshot ✅\n" "`!help` — This message\n\n" "_Only authorised operators can issue control commands._" ) diff --git a/services/matrix-bridge-dagi/app/control_limiter.py b/services/matrix-bridge-dagi/app/control_limiter.py new file mode 100644 index 00000000..303ff0c8 --- /dev/null +++ b/services/matrix-bridge-dagi/app/control_limiter.py @@ -0,0 +1,138 @@ +""" +control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel. + +Protection layers: + 1. Per-room sliding window — CONTROL_ROOM_RPM (default 60) + 2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30) + 3. Per-run sliding window — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next) + 4. Per-operator cooldown — CONTROL_COOLDOWN_S (default 2s, anti-double-click) + +All state is in-memory (lost on restart), which is intentional — limits reset with the bridge. + +Thread safety: not needed (asyncio single-threaded event loop). 
+""" +from __future__ import annotations + +import time +from collections import defaultdict, deque +from typing import Dict, Tuple + + +# Sentinel value for "unlimited" (rpm == 0 → skip check) +_UNLIMITED = 0 + + +class ControlRateLimiter: + """ + Sliding-window rate limiter + cooldown for the Matrix control channel. + + All rpm values are requests-per-minute over a 60-second rolling window. + cooldown_s is a per-{operator, verb, subcommand} debounce window (anti-double-click). + """ + + def __init__( + self, + room_rpm: int = 60, + operator_rpm: int = 30, + run_next_rpm: int = 20, + cooldown_s: float = 2.0, + ) -> None: + self.room_rpm = room_rpm + self.operator_rpm = operator_rpm + self.run_next_rpm = run_next_rpm + self.cooldown_s = cooldown_s + + # Sliding-window storage: key → deque[float] (monotonic timestamps) + self._room_windows: Dict[str, deque] = defaultdict(deque) + self._op_windows: Dict[str, deque] = defaultdict(deque) + self._run_windows: Dict[str, deque] = defaultdict(deque) + + # Cooldown: (sender_hash, verb, subcommand) → last accepted timestamp + self._cooldown_times: Dict[str, float] = {} + + # ── Sliding window helpers ───────────────────────────────────────────────── + + @staticmethod + def _check_window( + windows: Dict[str, deque], + key: str, + rpm: int, + ) -> Tuple[bool, float]: + """ + Sliding-window check over a 60-second window. + + Returns (allowed, retry_after_seconds). + If rpm == 0, always allowed. 
+ """ + if rpm == _UNLIMITED: + return True, 0.0 + + now = time.monotonic() + window = windows[key] + cutoff = now - 60.0 + + # Evict expired entries + while window and window[0] < cutoff: + window.popleft() + + if len(window) >= rpm: + # Time until oldest entry expires + retry_after = max(0.0, 60.0 - (now - window[0])) + return False, retry_after + + window.append(now) + return True, 0.0 + + # ── Public check methods ─────────────────────────────────────────────────── + + def check_room(self, room_id: str) -> Tuple[bool, float]: + """Per-room rate limit check. Returns (allowed, retry_after_s).""" + return self._check_window(self._room_windows, room_id, self.room_rpm) + + def check_operator(self, sender_hash: str) -> Tuple[bool, float]: + """Per-operator rate limit check. Returns (allowed, retry_after_s).""" + return self._check_window(self._op_windows, sender_hash, self.operator_rpm) + + def check_run_next(self, run_id: str) -> Tuple[bool, float]: + """ + Per-run rate limit for !runbook next — prevents rapid-fire advancement. + Returns (allowed, retry_after_s). + """ + return self._check_window(self._run_windows, run_id, self.run_next_rpm) + + def check_cooldown( + self, + sender_hash: str, + verb: str, + subcommand: str, + ) -> Tuple[bool, float]: + """ + Anti-double-click cooldown per (operator, verb, subcommand). + + Returns (allowed, wait_s). On first call → records timestamp and allows. + On subsequent calls within cooldown_s → blocks and returns remaining wait. 
+ """ + if self.cooldown_s <= 0: + return True, 0.0 + + key = f"{sender_hash}:{verb}:{subcommand}" + now = time.monotonic() + last = self._cooldown_times.get(key) + + if last is not None: + elapsed = now - last + if elapsed < self.cooldown_s: + return False, self.cooldown_s - elapsed + + self._cooldown_times[key] = now + return True, 0.0 + + # ── Summary ─────────────────────────────────────────────────────────────── + + def as_health_dict(self) -> dict: + return { + "room_rpm": self.room_rpm, + "operator_rpm": self.operator_rpm, + "run_next_rpm": self.run_next_rpm, + "cooldown_s": self.cooldown_s, + } diff --git a/services/matrix-bridge-dagi/app/control_runner.py b/services/matrix-bridge-dagi/app/control_runner.py new file mode 100644 index 00000000..22dc78a6 --- /dev/null +++ b/services/matrix-bridge-dagi/app/control_runner.py @@ -0,0 +1,296 @@ +""" +control_runner — M3.1 + M3.2 + M3.3 + +Thin async HTTP client that calls the sofiia-console internal runbook API +on behalf of the Matrix bridge control channel. + +All functions are stateless; callers supply the pre-built AsyncClient. +""" +from __future__ import annotations + +import logging +from typing import Optional + +import httpx + +logger = logging.getLogger(__name__) + +# Runbook path guards (fail-fast in the bridge, before calling the console) +_MAX_PATH_LEN = 256 +_FORBIDDEN_SEGMENTS = {"..", "~"} + + +class RunnerError(Exception): + """Raised when the sofiia-console returns an error or call fails.""" + + +def validate_runbook_path(path: str) -> Optional[str]: + """ + Return None if valid, or an error string describing the problem. + Checks: non-empty, max length, no traversal segments, no absolute paths. 
+ """ + path = path.strip() + if not path: + return "runbook_path is required" + if len(path) > _MAX_PATH_LEN: + return f"runbook_path too long (max {_MAX_PATH_LEN} chars)" + if path.startswith("/"): + return "absolute paths are not allowed" + parts = path.replace("\\", "/").split("/") + for part in parts: + if part in _FORBIDDEN_SEGMENTS: + return f"forbidden path segment: {part!r}" + return None + + +async def start_runbook_run( + http_client: httpx.AsyncClient, + console_url: str, + control_token: str, + runbook_path: str, + operator_id: str, + node_id: str = "NODA1", + timeout: float = 15.0, +) -> dict: + """ + POST /api/runbooks/internal/runs → {run_id, status, current_step, steps_total} + + Raises RunnerError on HTTP error or non-2xx response. + """ + url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs" + payload: dict = { + "runbook_path": runbook_path, + "operator_id": operator_id, + "node_id": node_id, + } + + try: + resp = await http_client.post( + url, + json=payload, + headers={"X-Control-Token": control_token}, + timeout=timeout, + ) + except httpx.RequestError as exc: + raise RunnerError(f"connection error: {exc}") from exc + + if resp.status_code != 200: + detail = _extract_error_detail(resp) + raise RunnerError(f"HTTP {resp.status_code}: {detail}") + + try: + return resp.json() + except Exception as exc: + raise RunnerError(f"invalid JSON response: {exc}") from exc + + +def _extract_error_detail(resp: httpx.Response) -> str: + """Extract a short error detail from an httpx response (safe: never raises).""" + try: + body = resp.json() + if isinstance(body, dict) and body.get("detail"): + return str(body["detail"])[:200] + except Exception: + pass + try: + return (resp.text or "")[:200] + except Exception: + return "" + + +async def get_runbook_run( + http_client: httpx.AsyncClient, + console_url: str, + control_token: str, + run_id: str, + timeout: float = 10.0, +) -> dict: + """ + GET /api/runbooks/internal/runs/{run_id} → full run with 
steps. + """ + url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}" + + try: + resp = await http_client.get( + url, + headers={"X-Control-Token": control_token}, + timeout=timeout, + ) + except httpx.RequestError as exc: + raise RunnerError(f"connection error: {exc}") from exc + + if resp.status_code == 404: + raise RunnerError(f"run {run_id!r} not found") + if resp.status_code != 200: + raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}") + + try: + return resp.json() + except Exception as exc: + raise RunnerError(f"invalid JSON response: {exc}") from exc + + +# ── M3.2 ────────────────────────────────────────────────────────────────────── + +async def next_runbook_step( + http_client: httpx.AsyncClient, + console_url: str, + control_token: str, + run_id: str, + operator_id: str = "", + timeout: float = 30.0, +) -> dict: + """ + POST /api/runbooks/internal/runs/{run_id}/next + + Returns either: + {type:"manual", step_index, title, section, instructions_md, steps_total?} + {type:"http_check"|"script", step_index, title, result, step_status, next_step, completed} + + Raises RunnerError on HTTP error, 404 (run not found / not active). 
+ """ + url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/next" + payload = {"operator_id": operator_id} if operator_id else {} + + try: + resp = await http_client.post( + url, + json=payload, + headers={"X-Control-Token": control_token}, + timeout=timeout, + ) + except httpx.RequestError as exc: + raise RunnerError(f"connection error: {exc}") from exc + + if resp.status_code == 404: + detail = _extract_error_detail(resp) + raise RunnerError(f"run not found or not active: {detail}") + if resp.status_code != 200: + raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}") + + try: + return resp.json() + except Exception as exc: + raise RunnerError(f"invalid JSON response: {exc}") from exc + + +async def complete_runbook_step( + http_client: httpx.AsyncClient, + console_url: str, + control_token: str, + run_id: str, + step_index: int, + status: str, + notes: str = "", + operator_id: str = "", + timeout: float = 15.0, +) -> dict: + """ + POST /api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete + + Returns: {ok, run_id, step_index, status, next_step, steps_total, run_completed} + + Raises RunnerError on HTTP error or 404 (run/step not found or wrong current step). 
+ """ + url = ( + f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}" + f"/steps/{step_index}/complete" + ) + payload: dict = {"status": status} + if notes: + payload["notes"] = notes + if operator_id: + payload["operator_id"] = operator_id + + try: + resp = await http_client.post( + url, + json=payload, + headers={"X-Control-Token": control_token}, + timeout=timeout, + ) + except httpx.RequestError as exc: + raise RunnerError(f"connection error: {exc}") from exc + + if resp.status_code == 404: + detail = _extract_error_detail(resp) + raise RunnerError(f"step not found or not current: {detail}") + if resp.status_code != 200: + raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}") + + try: + return resp.json() + except Exception as exc: + raise RunnerError(f"invalid JSON response: {exc}") from exc + + +# ── M3.3 ────────────────────────────────────────────────────────────────────── + +async def generate_evidence( + http_client: httpx.AsyncClient, + console_url: str, + control_token: str, + run_id: str, + timeout: float = 30.0, +) -> dict: + """ + POST /api/runbooks/internal/runs/{run_id}/evidence + + Returns: {evidence_path, bytes, created_at, run_id} + + Raises RunnerError on HTTP error or 404 (run not found). 
+ """ + url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/evidence" + try: + resp = await http_client.post( + url, + headers={"X-Control-Token": control_token}, + timeout=timeout, + ) + except httpx.RequestError as exc: + raise RunnerError(f"connection error: {exc}") from exc + + if resp.status_code == 404: + raise RunnerError(f"run {run_id!r} not found") + if resp.status_code != 200: + raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}") + + try: + return resp.json() + except Exception as exc: + raise RunnerError(f"invalid JSON response: {exc}") from exc + + +async def generate_post_review( + http_client: httpx.AsyncClient, + console_url: str, + control_token: str, + run_id: str, + timeout: float = 30.0, +) -> dict: + """ + POST /api/runbooks/internal/runs/{run_id}/post_review + + Returns: {path, bytes, created_at, run_id} + + Raises RunnerError on HTTP error or 404. + """ + url = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/post_review" + try: + resp = await http_client.post( + url, + headers={"X-Control-Token": control_token}, + timeout=timeout, + ) + except httpx.RequestError as exc: + raise RunnerError(f"connection error: {exc}") from exc + + if resp.status_code == 404: + raise RunnerError(f"run {run_id!r} not found") + if resp.status_code != 200: + raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}") + + try: + return resp.json() + except Exception as exc: + raise RunnerError(f"invalid JSON response: {exc}") from exc diff --git a/services/matrix-bridge-dagi/app/discovery.py b/services/matrix-bridge-dagi/app/discovery.py new file mode 100644 index 00000000..14d4ecab --- /dev/null +++ b/services/matrix-bridge-dagi/app/discovery.py @@ -0,0 +1,210 @@ +""" +discovery — M4.0: Agent discovery helpers for Matrix user rooms. + +Provides formatted replies for `!agents` and `!agents status` commands. 
+These commands are available to all room members (no auth required) and +are processed BEFORE routing to the LLM agent. + +Supports: + - Mixed rooms: list all agents, default, usage examples + - Direct rooms: show single agent mapping + - Unknown rooms: "no mapping" notice +""" +from __future__ import annotations + +import datetime +from typing import Optional + +from .mixed_routing import MixedRoomConfig +from .room_mapping import RoomMappingConfig # noqa: F401 — used in type hints + + +def _fmt_ts(ts: int) -> str: + """Format a Unix timestamp as compact UTC string.""" + try: + return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + except Exception: # noqa: BLE001 + return str(ts) + +# Discovery command prefix +DISCOVERY_CMD = "!agents" + +# Reply length cap (Matrix message, not truncated — kept short by design) +_MAX_REPLY_LEN = 3500 + + +def is_discovery_message(text: str) -> bool: + """Return True if the message is a !agents discovery command.""" + lower = text.strip().lower() + return lower == DISCOVERY_CMD or lower.startswith(DISCOVERY_CMD + " ") + + +def agents_reply( + room_id: str, + room_map: RoomMappingConfig, + mixed_room_config: Optional[MixedRoomConfig], +) -> str: + """ + Build a discovery reply for the given room. + + Mixed room → list agents, default, usage examples. + Direct room → single agent mapping. + Unknown → 'no mapping' notice. 
+ """ + # Mixed room check first + if mixed_room_config and mixed_room_config.is_mixed(room_id): + room = mixed_room_config.rooms.get(room_id) + if room is not None: + return _mixed_room_reply(room_id, room) + + # Direct room check + agent_id = room_map.agent_for_room(room_id) + if agent_id is not None: + return _direct_room_reply(agent_id) + + return _unknown_room_reply() + + +def _mixed_room_reply(room_id: str, room) -> str: + """Format reply for a mixed room.""" + agents = room.agents + default = room.default_agent or (agents[0] if agents else "?") + agent_list = ", ".join(f"**{a}**" for a in agents) + + lines = [ + f"🤖 **Agents available in this room:** {agent_list}", + f"⭐ **Default:** {default}", + "", + "**How to address an agent:**", + ] + for agent in agents[:5]: # show max 5 examples + lines.append(f" • `/{agent} ` — slash command") + lines.append(f" • `@{agent} ` — mention") + lines.append(f" • `{agent}: ` — colon prefix") + lines.extend([ + "", + f"_Messages without prefix go to **{default}** by default._", + ]) + return "\n".join(lines) + + +def _direct_room_reply(agent_id: str) -> str: + """Format reply for a directly-mapped room (1 agent).""" + return ( + f"🤖 This room is mapped to agent: **{agent_id}**\n\n" + f"All messages are forwarded to **{agent_id}** automatically.\n" + f"No prefix needed — just write your message." + ) + + +def _unknown_room_reply() -> str: + """Format reply when room has no mapping.""" + return ( + "⚠️ This room has no agent mapping.\n\n" + "Contact an operator to configure an agent for this room." + ) + + +# ── Bridge status reply (M4.1) ──────────────────────────────────────────────── + +def bridge_status_reply(snapshot: dict) -> str: + """ + Format a concise bridge health snapshot for `!status` in control room. 
+ + snapshot keys (all optional with defaults): + node_id, queue_size, queue_max, worker_count, + room_count, mixed_room_count, operators_count, + control_safety (dict), persistent_dedupe (dict), + dedupe_hits, dedupe_inserts + """ + node_id = snapshot.get("node_id", "?") + q_size = snapshot.get("queue_size", "?") + q_max = snapshot.get("queue_max", "?") + workers = snapshot.get("worker_count", "?") + rooms = snapshot.get("room_count", 0) + mixed = snapshot.get("mixed_room_count", 0) + ops = snapshot.get("operators_count", 0) + + safety = snapshot.get("control_safety") or {} + dedupe = snapshot.get("persistent_dedupe") or {} + + node_policy = snapshot.get("node_policy") or {} + default_node = node_policy.get("default_node", node_id) + allowed_nodes = node_policy.get("allowed_nodes") or [] + room_overrides = node_policy.get("room_overrides", 0) + + lines = [ + f"📡 **Bridge status** — node: `{node_id}`", + "", + f"**Queue:** {q_size}/{q_max} workers: {workers}", + f"**Rooms:** {rooms} direct {mixed} mixed ops: {ops} operators", + "", + ] + + # M5.0: node policy + if allowed_nodes: + allowed_str = ", ".join(f"`{n}`" for n in sorted(allowed_nodes)) + lines.append( + f"**Node policy:** default=`{default_node}` allowed={allowed_str} room_overrides={room_overrides}" + ) + + # Control safety + if safety: + enabled = "✅" if safety.get("enabled") else "⬜" + lines.append( + f"**Control safety {enabled}:** " + f"room={safety.get('room_rpm', '?')}rpm " + f"op={safety.get('operator_rpm', '?')}rpm " + f"cooldown={safety.get('cooldown_s', '?')}s" + ) + + # Persistent dedupe + if dedupe: + ok_emoji = "✅" if dedupe.get("ok") else "❌" + pruned = dedupe.get("pruned_rows_last", 0) + ttl = dedupe.get("ttl_h", "?") + lines.append( + f"**Dedupe {ok_emoji}:** ttl={ttl}h pruned_last={pruned} " + f"db=`{dedupe.get('db_path') or 'n/a'}`" + ) + + # M6.0/M6.1: policy store status + ps = snapshot.get("policy_store") or {} + if ps: + ps_ok = "✅" if ps.get("ok") else "❌" + ps_node_count = 
ps.get("overrides_count", 0) + ps_agent_count = ps.get("agent_overrides_count", snapshot.get("policy_agent_overrides_count", 0)) + ps_path = ps.get("policy_store_path") or ps.get("path") or "n/a" + lines.append( + f"**Policy store {ps_ok}:** node_overrides={ps_node_count} " + f"agent_overrides={ps_agent_count} db=`{ps_path}`" + ) + + # M6.2: last export/import timestamps + DB mtime + _last_export = snapshot.get("policy_last_export_at") + _last_import = snapshot.get("policy_last_import_at") + _db_mtime = snapshot.get("policy_db_mtime") + _snap_parts: list = [] + if _last_export: + _snap_parts.append(f"last_export=`{_fmt_ts(_last_export)}`") + if _last_import: + _snap_parts.append(f"last_import=`{_fmt_ts(_last_import)}`") + if _db_mtime: + _snap_parts.append(f"db_mtime=`{_fmt_ts(_db_mtime)}`") + if _snap_parts: + lines.append("**Policy snapshots:** " + " ".join(_snap_parts)) + + # M5.1: per-node routed/rejected breakdown + node_stats = snapshot.get("nodes") or {} + if node_stats: + lines.append("\n**Per-node stats:**") + for nid in sorted(node_stats): + ns = node_stats[nid] + lines.append( + f" `{nid}`: routed={ns.get('routed', 0)} rejected={ns.get('rejected', 0)}" + ) + + reply = "\n".join(lines) + if len(reply) > _MAX_REPLY_LEN: + reply = reply[:_MAX_REPLY_LEN - 3] + "…" + return reply diff --git a/services/matrix-bridge-dagi/app/event_store.py b/services/matrix-bridge-dagi/app/event_store.py new file mode 100644 index 00000000..a461f66c --- /dev/null +++ b/services/matrix-bridge-dagi/app/event_store.py @@ -0,0 +1,213 @@ +""" +event_store — M2.3: Persistent event deduplication via SQLite. + +Stores processed Matrix event_ids so that bridge restarts do not reprocess +events still returned by /sync (within TTL window). + +Schema: + processed_events (room_id, event_id, ts, sender_hash) + PK: (room_id, event_id) + IDX: idx_processed_events_ts (ts) + +Design notes: + - Uses aiosqlite for non-blocking async access from the ingress event loop. 
+ - Prune is best-effort: failures are logged but do NOT abort processing. + - If the DB is unavailable (init error, corruption), EventStore degrades to + a no-op: is_processed() returns False, mark_processed() is a no-op. + The in-memory LRU dedupe (H1) continues to protect within a single run. + - WAL mode is enabled for better concurrent read performance. +""" +from __future__ import annotations + +import logging +import time +from pathlib import Path +from typing import Optional, Tuple + +try: + import aiosqlite + _AIOSQLITE_OK = True +except ImportError: # pragma: no cover + aiosqlite = None # type: ignore + _AIOSQLITE_OK = False + +logger = logging.getLogger(__name__) + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS processed_events ( + room_id TEXT NOT NULL, + event_id TEXT NOT NULL, + ts INTEGER NOT NULL, + sender_hash TEXT, + PRIMARY KEY (room_id, event_id) +); +CREATE INDEX IF NOT EXISTS idx_processed_events_ts ON processed_events (ts); +""" + + +class EventStore: + """ + Async SQLite-backed deduplication store for Matrix event_ids. + + Usage: + store = EventStore("/app/data/matrix_bridge.db", ttl_h=48) + await store.open() + ... + hit = await store.is_processed(room_id, event_id) + if not hit: + await store.mark_processed(room_id, event_id, sender_hash) + ... + pruned = await store.prune(batch=5000) + await store.close() + """ + + def __init__( + self, + db_path: str, + ttl_h: int = 48, + prune_batch: int = 5000, + ) -> None: + self.db_path = db_path + self.ttl_h = ttl_h + self.prune_batch = prune_batch + self._db: Optional["aiosqlite.Connection"] = None + self._ok: bool = False + self._last_prune_at: Optional[float] = None + self._pruned_rows_last: int = 0 + + # ── Lifecycle ───────────────────────────────────────────────────────────── + + async def open(self) -> bool: + """ + Open the SQLite connection and apply schema. + + Returns True on success; False on failure (degraded mode). 
+ """ + if not _AIOSQLITE_OK: + logger.warning("aiosqlite not available — persistent dedupe disabled") + return False + try: + Path(self.db_path).parent.mkdir(parents=True, exist_ok=True) + self._db = await aiosqlite.connect(self.db_path) + # WAL mode: better concurrent read, non-blocking writes + await self._db.execute("PRAGMA journal_mode=WAL") + await self._db.execute("PRAGMA synchronous=NORMAL") + await self._db.executescript(_SCHEMA) + await self._db.commit() + self._ok = True + logger.info("EventStore opened: %s (ttl_h=%d)", self.db_path, self.ttl_h) + return True + except Exception as exc: + logger.error("EventStore.open failed — degraded: %s", exc) + self._ok = False + return False + + async def close(self) -> None: + """Close the SQLite connection gracefully.""" + if self._db is not None: + try: + await self._db.close() + except Exception as exc: # pragma: no cover + logger.warning("EventStore.close error: %s", exc) + self._db = None + self._ok = False + + # ── Core operations ─────────────────────────────────────────────────────── + + async def is_processed(self, room_id: str, event_id: str) -> bool: + """ + Return True if (room_id, event_id) has already been processed. + + Safe to call even when degraded (returns False → no false deduplication). + """ + if not self._ok or self._db is None: + return False + try: + async with self._db.execute( + "SELECT 1 FROM processed_events WHERE room_id=? AND event_id=? LIMIT 1", + (room_id, event_id), + ) as cursor: + row = await cursor.fetchone() + return row is not None + except Exception as exc: + logger.warning("EventStore.is_processed error (degraded): %s", exc) + return False + + async def mark_processed( + self, + room_id: str, + event_id: str, + sender_hash: str = "", + ) -> bool: + """ + Insert (room_id, event_id) as processed. + + Returns True on success, False if already exists or on error. + Uses INSERT OR IGNORE to avoid duplicates without raising. 
+ """ + if not self._ok or self._db is None: + return False + ts = int(time.time()) + try: + await self._db.execute( + "INSERT OR IGNORE INTO processed_events (room_id, event_id, ts, sender_hash) " + "VALUES (?, ?, ?, ?)", + (room_id, event_id, ts, sender_hash or None), + ) + await self._db.commit() + return True + except Exception as exc: + logger.warning("EventStore.mark_processed error (degraded): %s", exc) + return False + + # ── Prune ───────────────────────────────────────────────────────────────── + + async def prune(self, batch: Optional[int] = None) -> int: + """ + Delete events older than ttl_h. + + Returns the number of rows deleted (0 on error or degraded). + Uses LIMIT batch to avoid long locks on large tables. + """ + if not self._ok or self._db is None: + return 0 + + cutoff = int(time.time()) - self.ttl_h * 3600 + effective_batch = batch or self.prune_batch + deleted = 0 + + try: + # SQLite DELETE with LIMIT requires compiling with SQLITE_ENABLE_UPDATE_DELETE_LIMIT, + # which may not be available. Use a subquery approach instead. + await self._db.execute( + "DELETE FROM processed_events " + "WHERE rowid IN (" + " SELECT rowid FROM processed_events WHERE ts < ? LIMIT ?" 
+ ")", + (cutoff, effective_batch), + ) + await self._db.commit() + # Estimate rows deleted from changes() + async with self._db.execute("SELECT changes()") as cursor: + row = await cursor.fetchone() + deleted = row[0] if row else 0 + self._last_prune_at = time.time() + self._pruned_rows_last = deleted + if deleted: + logger.info("EventStore pruned %d rows (cutoff=%d)", deleted, cutoff) + except Exception as exc: + logger.warning("EventStore.prune error: %s", exc) + + return deleted + + # ── Health / introspection ───────────────────────────────────────────────── + + def as_health_dict(self) -> dict: + return { + "enabled": self._ok, + "db_path": self.db_path, + "ttl_h": self.ttl_h, + "ok": self._ok, + "last_prune_at": self._last_prune_at, + "pruned_rows_last": self._pruned_rows_last, + } diff --git a/services/matrix-bridge-dagi/app/ingress.py b/services/matrix-bridge-dagi/app/ingress.py index 8659c49b..238d5bb0 100644 --- a/services/matrix-bridge-dagi/app/ingress.py +++ b/services/matrix-bridge-dagi/app/ingress.py @@ -1,5 +1,5 @@ """ -Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 (control channel) +Matrix Ingress + Egress Loop — Phase M1.4 + H1 + H2 + H3 + M2.1 + M2.2 + M3.0 + M3.1 + M3.3 + SessionScopeV2 Architecture (H2): Reader task → asyncio.Queue(maxsize) → N Worker tasks @@ -26,7 +26,10 @@ Queue entry: _QueueEntry(event, room_id, agent_id, enqueue_time, routing_reason, """ import asyncio +import hashlib +import json as _json import logging +import os as _os import time from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional @@ -37,11 +40,57 @@ from .control import ( ControlConfig, ControlCommand, check_authorization, parse_command, is_control_message, not_implemented_reply, unknown_command_reply, unauthorized_reply, help_reply, - VERB_HELP, + start_usage_reply, runbook_started_reply, runbook_start_error_reply, + next_usage_reply, next_manual_reply, next_auto_reply, next_error_reply, + 
complete_usage_reply, complete_ok_reply, complete_error_reply, + status_usage_reply, status_reply, status_error_reply, + evidence_usage_reply, evidence_reply, evidence_error_reply, + post_review_usage_reply, post_review_reply, post_review_error_reply, + rate_limited_reply, sanitize_notes, MAX_NOTES_LEN, + status_not_available_reply, nodes_reply, + VERB_HELP, VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE, VERB_CONFIRM, + is_dangerous_cmd, build_normalized_args, + confirm_intent_reply, confirm_success_reply, confirm_expired_reply, + NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST, + parse_node_cmd, node_cmd_validate_room, + node_cmd_reply_set, node_cmd_reply_unset_ok, node_cmd_reply_unset_not_found, + node_cmd_reply_get, node_cmd_reply_list, node_cmd_reply_error, + VERB_ROOM, ROOM_SUBCMD_AGENTS, + ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE, + ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET, + parse_room_agents_cmd, + room_agents_reply_set, room_agents_reply_add, room_agents_reply_remove, + room_agents_reply_unset_ok, room_agents_reply_unset_not_found, + room_agents_reply_get, room_agents_reply_list, room_agents_reply_error, + VERB_POLICY, POLICY_EXPORTS_SUBDIR, validate_export_path, + policy_import_intent_reply, format_import_diff as _format_import_diff, + policy_export_reply, policy_import_dry_run_reply, policy_import_reply, + policy_cmd_error, + policy_prune_preview_reply, policy_prune_applied_reply, + policy_restore_intent_reply, policy_restore_applied_reply, + policy_history_reply, policy_change_detail_reply, + SUBCOMMAND_START, SUBCOMMAND_NEXT, SUBCOMMAND_COMPLETE, + SUBCOMMAND_STATUS, SUBCOMMAND_EVIDENCE, SUBCOMMAND_POST_REVIEW, ) +from .control_limiter import ControlRateLimiter +from .discovery import agents_reply, bridge_status_reply, is_discovery_message +from .event_store import EventStore +from .node_policy import ( + NodePolicy, NodeResolution, + NODE_SOURCE_DEFAULT, NODE_SOURCE_EXPLICIT, NODE_SOURCE_ROOM_MAP, + 
extract_node_kwarg, node_rejected_reply, +) +from .node_health import ( + NodeHealthTracker, NodeHealthConfig, + NODE_STATE_HEALTHY, NODE_STATE_DEGRADED, NODE_STATE_DOWN, + FAILOVER_REASON_TIMEOUT, FAILOVER_REASON_HTTP_5XX, FAILOVER_REASON_NETWORK, +) +from .sticky_cache import StickyNodeCache, make_sticky_key +from .confirm_store import ConfirmStore +from . import control_runner as _ctrl_runner from .matrix_client import MatrixClient from .mixed_routing import ( - MixedRoomConfig, route_message, reply_prefix, + MixedRoomConfig, MixedRoom, route_message, reply_prefix, build_override_config, REASON_REJECTED_UNKNOWN_AGENT, REASON_REJECTED_SLASH_TOO_LONG, REASON_REJECTED_NO_MAPPING, ) from .rate_limit import InMemoryRateLimiter @@ -71,6 +120,35 @@ class _QueueEntry: is_mixed: bool = False # True for mixed-room entries (reply tagging, session isolation) +# ── Session Scope v2 ─────────────────────────────────────────────────────────── +# Invariants: +# 1. Control room messages never reach the Router (no session_key needed there). +# 2. Matrix and Telegram never share a key namespace (prefix "matrix:"). +# 3. Mixed rooms: each (room_id, agent_id) pair has its own key — no cross-agent leakage. +# 4. Logs/metrics receive sender_hash (sha256[:16]), never raw Matrix user_id. + +SCOPE_ROOM_AGENT = "room_agent" # default: shared room context per agent +SCOPE_OPS_RUNBOOK = "ops_runbook" # future: control/ops room invocations +SCOPE_DM_USER = "dm_agent_user" # future: per-user DM isolation + + +def _sender_hash(sender: str) -> str: + """PII-safe 16-hex hash of a Matrix user_id (e.g. 
@alice:server → 'a3f9...').""" + return hashlib.sha256(sender.encode()).hexdigest()[:16] + + +def _build_session_key(room_id: str, agent_id: str, scope: str = SCOPE_ROOM_AGENT) -> str: + """ + Canonical session key v2: matrix:{scope}:{room_key}:{agent_id} + + Examples: + matrix:room_agent:roomXserver_yourdomain:sofiia + matrix:ops_runbook:opsroomXserver:sofiia + """ + room_key = room_id.replace("!", "").replace(":", "_") + return f"matrix:{scope}:{room_key}:{agent_id}" + + # ── Router invoke ────────────────────────────────────────────────────────────── async def _invoke_router( @@ -80,6 +158,9 @@ async def _invoke_router( node_id: str, prompt: str, session_id: str, + sender_hash: str = "", + scope: str = SCOPE_ROOM_AGENT, + node_source: str = NODE_SOURCE_DEFAULT, ) -> str: """POST /v1/agents/{agent_id}/infer → response text. Raises httpx.HTTPError on failure.""" url = f"{router_url.rstrip('/')}/v1/agents/{agent_id}/infer" @@ -87,7 +168,14 @@ async def _invoke_router( "prompt": prompt, "session_id": session_id, "user_id": "matrix_bridge", - "metadata": {"transport": "matrix", "node_id": node_id}, + "metadata": { + "transport": "matrix", + "node_id": node_id, + "node_source": node_source, # M5.0: how node was resolved + "session_key": session_id, # explicit for Router/Memory to index on + "sender_hash": sender_hash, # PII-safe sender fingerprint + "scope": scope, + }, } resp = await http_client.post(url, json=payload, timeout=_ROUTER_TIMEOUT_S) resp.raise_for_status() @@ -102,6 +190,20 @@ async def _invoke_router( return (text if isinstance(text, str) else str(text)).strip() +# ── M6.2: File helpers (run in thread) ──────────────────────────────────────── + +def _write_json_file(path: str, data: Any) -> None: + """Synchronously write data as JSON to path (UTF-8, 2-space indent).""" + with open(path, "w", encoding="utf-8") as fh: + _json.dump(data, fh, ensure_ascii=False, indent=2) + + +def _read_json_file(path: str) -> Any: + """Synchronously read and parse a JSON 
file.""" + with open(path, encoding="utf-8") as fh: + return _json.load(fh) + + # ── Audit write ──────────────────────────────────────────────────────────────── async def _write_audit( @@ -187,6 +289,16 @@ class MatrixIngressLoop: # M3.0: control channel control_config: Optional[ControlConfig] = None, control_unauthorized_behavior: str = "ignore", # "ignore" | "reply_error" + # M3.1: runbook runner integration + sofiia_control_token: str = "", + # M3.4: control channel safety + control_limiter: Optional["ControlRateLimiter"] = None, + # M2.3: persistent event deduplication + event_store: Optional["EventStore"] = None, + # M4.0: agent discovery + discovery_rpm: int = 20, + # M5.0: node-aware routing + node_policy: Optional["NodePolicy"] = None, # Callbacks on_message_received: Optional[Callable[[str, str], None]] = None, on_message_replied: Optional[Callable[[str, str, str], None]] = None, @@ -194,12 +306,39 @@ class MatrixIngressLoop: on_rate_limited: Optional[Callable[[str, str, str], None]] = None, on_queue_dropped: Optional[Callable[[str, str], None]] = None, on_queue_size: Optional[Callable[[int], None]] = None, - on_invoke_latency: Optional[Callable[[str, float], None]] = None, + on_invoke_latency: Optional[Callable[..., None]] = None, # (agent_id, duration_s, node_id="") on_send_latency: Optional[Callable[[str, float], None]] = None, on_queue_wait: Optional[Callable[[str, float], None]] = None, on_routed: Optional[Callable[[str, str], None]] = None, on_route_rejected: Optional[Callable[[str, str], None]] = None, on_control_command: Optional[Callable[[str, str, str], None]] = None, + on_control_rate_limited: Optional[Callable[[str], None]] = None, + on_dedupe_persistent_hit: Optional[Callable[[str, str], None]] = None, + on_dedupe_persistent_insert: Optional[Callable[[], None]] = None, + # M5.0: node routing callbacks + on_node_selected: Optional[Callable[[str, str, str], None]] = None, # (agent_id, node_id, source) + on_node_rejected: 
Optional[Callable[[str], None]] = None, # (rejected_node) + # M5.1: per-node stats for !status reply + node_stats_getter: Optional[Callable[[], Dict[str, Any]]] = None, + # M6.0: persistent policy store for dynamic room-node overrides + policy_store: Optional[Any] = None, # app.policy_store.PolicyStore + # M6.2: data directory for policy exports/imports + bridge_data_dir: Optional[str] = None, + # M8.0: node health tracker for soft-failover + node_health_tracker: Optional[NodeHealthTracker] = None, + on_failover: Optional[Callable[[str, str, str], None]] = None, # (from_node, to_node, reason) + # M8.1: sticky failover cache (anti-flap) + sticky_cache: Optional[StickyNodeCache] = None, + on_sticky_set: Optional[Callable[[str, str], None]] = None, # (node_id, scope) + # M8.2: HA state persistence config + ha_health_snapshot_interval_s: int = 60, + ha_health_max_age_s: int = 600, + # M9.0: Two-step confirmation store for dangerous commands + confirm_store: Optional[ConfirmStore] = None, + # M10.0: Auto-backup retention policy (days; 0 = keep forever) + policy_export_retention_days: int = 30, + # M10.2: max rows to keep in policy_changes history (0 = unlimited) + policy_history_limit: int = 100, ) -> None: self._hs_url = matrix_homeserver_url self._token = matrix_access_token @@ -226,12 +365,54 @@ class MatrixIngressLoop: self._mixed_room_config = mixed_room_config self._control_config = control_config self._control_unauthorized_behavior = control_unauthorized_behavior + self._control_token = sofiia_control_token self._unknown_agent_behavior = unknown_agent_behavior self._max_slash_len = max_slash_len self._mixed_concurrency_cap = mixed_concurrency_cap self._on_routed = on_routed self._on_route_rejected = on_route_rejected self._on_control_command = on_control_command + self._on_control_rate_limited = on_control_rate_limited + # M3.4: control channel safety + self._control_limiter = control_limiter + # M2.3: persistent event deduplication + self._event_store: 
Optional[EventStore] = event_store + self._on_dedupe_persistent_hit = on_dedupe_persistent_hit + self._on_dedupe_persistent_insert = on_dedupe_persistent_insert + # M5.0: node routing callbacks + self._on_node_selected = on_node_selected + self._on_node_rejected = on_node_rejected + # M5.1: per-node stats getter for !status reply + self._node_stats_getter = node_stats_getter + # M6.0: persistent policy store + self._policy_store = policy_store + # M6.2: policy exports directory + self._bridge_data_dir: Optional[str] = bridge_data_dir + self._policy_last_export_at: Optional[int] = None + self._policy_last_import_at: Optional[int] = None + # M10.0: auto-backup retention + self._policy_export_retention_days: int = policy_export_retention_days + # M10.2: history table row limit + self._policy_history_limit: int = policy_history_limit + # M8.0: node health + soft-failover + self._node_health_tracker: Optional[NodeHealthTracker] = node_health_tracker + self._on_failover = on_failover + # M8.1: sticky failover cache + self._sticky_cache: Optional[StickyNodeCache] = sticky_cache + self._on_sticky_set = on_sticky_set + # M9.0: two-step confirmation store + self._confirm_store: Optional[ConfirmStore] = confirm_store + # M8.2: HA state persistence + self._ha_health_snapshot_interval_s: int = ha_health_snapshot_interval_s + self._ha_health_max_age_s: int = ha_health_max_age_s + self._ha_sticky_loaded: int = 0 # count of sticky entries loaded on startup + self._ha_health_loaded: bool = False # whether health state was loaded on startup + # M4.0: agent discovery — simple per-room sliding window (reuses InMemoryRateLimiter logic) + self._discovery_rpm = discovery_rpm + # M5.0: node-aware routing policy + self._node_policy: Optional[NodePolicy] = node_policy + from collections import defaultdict, deque + self._discovery_windows: dict = defaultdict(deque) # Lazily populated semaphores keyed by "{room_id}:{agent_id}" self._concurrency_locks: Dict[str, asyncio.Semaphore] = {} 
self._next_batch: Optional[str] = None @@ -249,6 +430,69 @@ class MatrixIngressLoop: def worker_count(self) -> int: return self._worker_count + def get_status(self) -> Dict[str, Any]: + """Return a simple bridge status dict for health/ops queries.""" + status: Dict[str, Any] = { + "queue_size": self._queue.qsize() if self._queue else 0, + "queue_max": self._queue_max, + "worker_count": self._worker_count, + } + if self._node_policy is not None: + status["node_policy"] = self._node_policy.as_info_dict() + # M5.1: per-node routed/rejected counters + if self._node_stats_getter is not None: + status["nodes"] = self._node_stats_getter() + # M6.0: policy store info + if self._policy_store is not None: + try: + status["policy_store_ok"] = self._policy_store.is_open + status["policy_store_path"] = self._policy_store.db_path + status["policy_overrides_count"] = self._policy_store.count_overrides() + status["policy_agent_overrides_count"] = self._policy_store.count_agent_overrides() # M6.1 + except Exception as exc: # noqa: BLE001 + status["policy_store_ok"] = False + status["policy_store_error"] = str(exc) + # M6.2: policy snapshot timestamps + if self._policy_last_export_at is not None: + status["policy_last_export_at"] = self._policy_last_export_at + if self._policy_last_import_at is not None: + status["policy_last_import_at"] = self._policy_last_import_at + # M6.2: policy DB mtime (best-effort) + if self._policy_store is not None: + try: + db_path = self._policy_store.db_path + if db_path and _os.path.exists(db_path): + status["policy_db_mtime"] = int(_os.path.getmtime(db_path)) + except Exception: # noqa: BLE001 + pass + # M10.2: policy change history count + if self._policy_store is not None and self._policy_store.is_open: + try: + status["policy_changes_count"] = self._policy_store.get_policy_changes_count() + except Exception: # noqa: BLE001 + pass + # M8.0: node health tracker state + if self._node_health_tracker is not None: + allowed = ( + 
self._node_policy.allowed_nodes + if self._node_policy is not None + else None + ) + status["node_health"] = self._node_health_tracker.all_info(allowed) + # M8.1: sticky failover cache info + if self._sticky_cache is not None: + status["sticky_active_keys"] = self._sticky_cache.active_count() + status["sticky_ttl_s"] = self._sticky_cache.ttl_s + # M9.0: pending confirmations + if self._confirm_store is not None: + status["confirm_pending"] = self._confirm_store.pending_count() + status["confirm_ttl_s"] = self._confirm_store.ttl_s + # M8.2: HA persistence info + status["ha_sticky_loaded"] = self._ha_sticky_loaded + status["ha_health_loaded"] = self._ha_health_loaded + status["ha_health_snapshot_interval_s"] = self._ha_health_snapshot_interval_s + return status + @property def active_lock_count(self) -> int: """Number of room-agent pairs currently holding a concurrency lock.""" @@ -307,6 +551,9 @@ class MatrixIngressLoop: ) async with httpx.AsyncClient() as http_client: + # M8.2: Load persisted HA state before processing any messages + await self._load_ha_state() + # Start workers worker_tasks = [ asyncio.create_task( @@ -316,6 +563,18 @@ class MatrixIngressLoop: for i in range(self._worker_count) ] + # M8.2: Start periodic node health snapshot task + _health_snapshot_task = None + if ( + self._ha_health_snapshot_interval_s > 0 + and self._policy_store is not None + and self._node_health_tracker is not None + ): + _health_snapshot_task = asyncio.create_task( + self._node_health_snapshot_loop(), + name="ha_health_snapshot", + ) + # Run reader until stop_event await self._reader(client, queue, http_client, stop_event) @@ -337,6 +596,9 @@ class MatrixIngressLoop: # Cancel workers for task in worker_tasks: task.cancel() + # M8.2: Cancel health snapshot task if running + if _health_snapshot_task is not None and not _health_snapshot_task.done(): + _health_snapshot_task.cancel() results = await asyncio.gather(*worker_tasks, return_exceptions=True) cancelled = sum(1 for r in 
results if isinstance(r, asyncio.CancelledError)) logger.info("Workers stopped (%d cancelled)", cancelled) @@ -392,6 +654,11 @@ class MatrixIngressLoop: continue messages = client.extract_room_messages(sync_resp, mapping.room_id) for event in messages: + text = event.get("content", {}).get("body", "").strip() + # M4.0: agent discovery before routing + if is_discovery_message(text): + await self._handle_discovery(client, http_client, event, mapping.room_id) + continue await self._try_enqueue(client, queue, http_client, event, mapping) # Mixed rooms: 1 room → N agents, routing per message (M2.1) @@ -399,6 +666,11 @@ class MatrixIngressLoop: for room_id in self._mixed_room_config.rooms: messages = client.extract_room_messages(sync_resp, room_id) for event in messages: + text = event.get("content", {}).get("body", "").strip() + # M4.0: agent discovery before routing + if is_discovery_message(text): + await self._handle_discovery(client, http_client, event, room_id) + continue await self._try_enqueue_mixed(client, queue, http_client, event, room_id) async def _try_enqueue( @@ -441,6 +713,28 @@ class MatrixIngressLoop: # Dedupe — mark before enqueue (prevents double-enqueue on retry) client.mark_seen(event_id) + # M2.3: Persistent dedupe (cross-restart protection) + if self._event_store is not None: + try: + already = await self._event_store.is_processed(room_id, event_id) + except Exception as exc: + logger.warning("EventStore.is_processed error (degraded): %s", exc) + already = False + + if already: + logger.debug("Persistent dedupe hit: event=%s room=%s", event_id, room_id) + if self._on_dedupe_persistent_hit: + self._on_dedupe_persistent_hit(room_id, agent_id) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.dedupe.persistent_hit", + agent_id=agent_id, node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={"sender": sender}, + ) + return + # H2: Enqueue or drop entry = _QueueEntry( event=event, 
@@ -448,8 +742,10 @@ class MatrixIngressLoop: agent_id=agent_id, enqueue_time=time.monotonic(), ) + enqueued = False try: queue.put_nowait(entry) + enqueued = True qsize = queue.qsize() logger.debug("Enqueued event=%s qsize=%d", event_id, qsize) if self._on_queue_size: @@ -470,6 +766,16 @@ class MatrixIngressLoop: data={"queue_max": self._queue_max, "sender": sender}, ) + # M2.3: Mark as processed ONLY after successful enqueue + if enqueued and self._event_store is not None: + sender_hash = _sender_hash(sender) + try: + await self._event_store.mark_processed(room_id, event_id, sender_hash) + if self._on_dedupe_persistent_insert: + self._on_dedupe_persistent_insert() + except Exception as exc: + logger.warning("EventStore.mark_processed error (degraded): %s", exc) + async def _try_enqueue_mixed( self, client: MatrixClient, @@ -487,9 +793,27 @@ class MatrixIngressLoop: if not text: return + # M6.1: look up dynamic agent override for this room + _routing_config = self._mixed_room_config + if self._policy_store is not None and self._policy_store.is_open: + try: + _agent_ov = await asyncio.to_thread( + self._policy_store.get_agent_override, room_id + ) + if _agent_ov is not None: + _ov_agents, _ov_default = _agent_ov + _effective_default = _ov_default or (_ov_agents[0] if _ov_agents else None) + if _ov_agents and _effective_default: + _routing_config = build_override_config( + self._mixed_room_config, room_id, + _ov_agents, _effective_default, + ) + except Exception as _exc: # noqa: BLE001 + logger.warning("PolicyStore get_agent_override failed: %s", _exc) + # Route message to determine target agent agent_id, routing_reason, effective_text = route_message( - text, room_id, self._mixed_room_config, self._room_map.allowed_agents, + text, room_id, _routing_config, self._room_map.allowed_agents, max_slash_len=self._max_slash_len, ) @@ -552,6 +876,28 @@ class MatrixIngressLoop: client.mark_seen(event_id) + # M2.3: Persistent dedupe (cross-restart protection, mixed rooms) + 
if self._event_store is not None: + try: + already = await self._event_store.is_processed(room_id, event_id) + except Exception as exc: + logger.warning("EventStore.is_processed error mixed (degraded): %s", exc) + already = False + + if already: + logger.debug("Persistent dedupe hit (mixed): event=%s room=%s agent=%s", event_id, room_id, agent_id) + if self._on_dedupe_persistent_hit: + self._on_dedupe_persistent_hit(room_id, agent_id) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.dedupe.persistent_hit", + agent_id=agent_id, node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={"sender": sender, "routing_reason": routing_reason}, + ) + return + # Store effective_text (stripped of routing token) in a patched event copy effective_event = dict(event) effective_event["content"] = dict(event.get("content", {})) @@ -565,8 +911,10 @@ class MatrixIngressLoop: routing_reason=routing_reason, is_mixed=True, ) + enqueued_mixed = False try: queue.put_nowait(entry) + enqueued_mixed = True qsize = queue.qsize() logger.debug( "Enqueued (mixed): event=%s agent=%s reason=%s qsize=%d", @@ -590,6 +938,1531 @@ class MatrixIngressLoop: data={"queue_max": self._queue_max, "sender": sender}, ) + # M2.3: Mark as processed ONLY after successful enqueue + if enqueued_mixed and self._event_store is not None: + sender_hash = _sender_hash(sender) + try: + await self._event_store.mark_processed(room_id, event_id, sender_hash) + if self._on_dedupe_persistent_insert: + self._on_dedupe_persistent_insert() + except Exception as exc: + logger.warning("EventStore.mark_processed error mixed (degraded): %s", exc) + + # ── M4.0: Agent discovery ────────────────────────────────────────────────── + + def _check_discovery_rate(self, room_id: str) -> bool: + """Sliding-window per-room rate check for discovery replies. 
Returns True if allowed.""" + if self._discovery_rpm <= 0: + return True + now = time.monotonic() + window = self._discovery_windows[room_id] + cutoff = now - 60.0 + while window and window[0] < cutoff: + window.popleft() + if len(window) >= self._discovery_rpm: + return False + window.append(now) + return True + + async def _handle_discovery( + self, + client: MatrixClient, + http_client: httpx.AsyncClient, + event: Dict[str, Any], + room_id: str, + ) -> None: + """ + Reply to !agents in any user room (no auth required). + Rate-limited per room. Marks event as seen + persisted (no router enqueue). + """ + event_id = event.get("event_id", "") + sender = event.get("sender", "") + + # Rate limit for discovery replies + if not self._check_discovery_rate(room_id): + logger.debug("Discovery rate limited: room=%s", room_id) + client.mark_seen(event_id) + return + + client.mark_seen(event_id) + + # M6.1: use store-based agent config if available for accurate discovery + _disc_config = self._mixed_room_config + if self._policy_store is not None and self._policy_store.is_open: + try: + _disc_ov = await asyncio.to_thread( + self._policy_store.get_agent_override, room_id + ) + if _disc_ov is not None: + _d_agents, _d_default = _disc_ov + _d_eff_default = _d_default or (_d_agents[0] if _d_agents else None) + if _d_agents and _d_eff_default and self._mixed_room_config is not None: + _disc_config = build_override_config( + self._mixed_room_config, room_id, _d_agents, _d_eff_default, + ) + except Exception: # noqa: BLE001 + pass + + reply = agents_reply(room_id, self._room_map, _disc_config) + txn_id = MatrixClient.make_txn_id(room_id, event_id + "_discovery") + try: + await client.send_text(room_id, reply, txn_id) + except Exception as exc: + logger.warning("Could not send discovery reply: %s", exc) + + # Persist dedupe so restart doesn't re-deliver this discovery + if self._event_store is not None: + sender_hash = _sender_hash(sender) + await 
self._event_store.mark_processed(room_id, event_id, sender_hash) + + # ── M6.1: Dynamic mixed room agent overrides via !room agents command ───── + + async def _handle_room_cmd( + self, + http_client: httpx.AsyncClient, + sender: str, + ctrl_room_id: str, + event_id: str, + cmd_subcommand: str, + cmd_args: tuple, + cmd_kwargs: Dict[str, str], + ) -> str: + """Handle `!room agents ` from authorized operator.""" + if self._policy_store is None or not self._policy_store.is_open: + return "⚠️ Policy store not available." + + if cmd_subcommand != ROOM_SUBCMD_AGENTS: + return room_agents_reply_error( + f"Unknown subcommand: `{cmd_subcommand or '?'}`. Use `!room agents `." + ) + + action, room_id, agents_list, single_agent, default_agent = parse_room_agents_cmd( + cmd_subcommand, cmd_args, cmd_kwargs, + ) + + if action not in (ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE, + ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET): + return room_agents_reply_error(f"Unknown action: `{action or '?'}`") + + # Validate allowed agents from global policy + allowed_all = self._room_map.allowed_agents # global allowed agents set + + # ── list ────────────────────────────────────────────────────────────── + if action == ROOM_ACTION_LIST: + try: + rows = await asyncio.to_thread(self._policy_store.list_agent_overrides, 10) + total = await asyncio.to_thread(self._policy_store.count_agent_overrides) + except Exception as exc: + logger.warning("PolicyStore list_agent_overrides error: %s", exc) + return "⚠️ Could not read policy store." 
+ await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.room.agents.list", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "total": total}, + ) + return room_agents_reply_list(rows, total) + + # ── subcommands that require room_id ────────────────────────────────── + if not room_id: + return room_agents_reply_error("Missing `room=` argument.") + if not node_cmd_validate_room(room_id): + return room_agents_reply_error( + f"Invalid room ID format: `{room_id}`\nExpected: `!localpart:server`" + ) + + # ── get ─────────────────────────────────────────────────────────────── + if action == ROOM_ACTION_GET: + try: + ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id) + except Exception as exc: + logger.warning("PolicyStore get_agent_override error: %s", exc) + return "⚠️ Could not read policy store." + ov_agents, ov_default = (ov if ov else (None, None)) + env_room = ( + self._mixed_room_config.rooms.get(room_id) if self._mixed_room_config else None + ) + env_agents = list(env_room.agents) if env_room else None + env_default = env_room.default_agent if env_room else None + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.room.agents.get", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "queried_room": room_id}, + ) + return room_agents_reply_get(room_id, ov_agents, ov_default, env_agents, env_default) + + # ── unset ───────────────────────────────────────────────────────────── + if action == ROOM_ACTION_UNSET: + try: + deleted = await asyncio.to_thread(self._policy_store.delete_agent_override, room_id) + except Exception as exc: + logger.warning("PolicyStore delete_agent_override error: %s", exc) + return "⚠️ Could not write to policy store." 
+ await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.room.agents.unset", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted}, + ) + return room_agents_reply_unset_ok(room_id) if deleted else room_agents_reply_unset_not_found(room_id) + + # ── add ─────────────────────────────────────────────────────────────── + if action == ROOM_ACTION_ADD: + if not single_agent: + return room_agents_reply_error("Missing `agent=` argument for `add`.") + if single_agent not in allowed_all: + allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all)) + return room_agents_reply_error(f"Agent `{single_agent}` not in allowed agents: {allowed_str}") + try: + new_agents, new_default = await asyncio.to_thread( + self._policy_store.add_agent_to_room, room_id, single_agent, sender + ) + except Exception as exc: + logger.warning("PolicyStore add_agent_to_room error: %s", exc) + return "⚠️ Could not write to policy store." + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.room.agents.add", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent}, + ) + return room_agents_reply_add(room_id, single_agent, new_agents, new_default) + + # ── remove ──────────────────────────────────────────────────────────── + if action == ROOM_ACTION_REMOVE: + if not single_agent: + return room_agents_reply_error("Missing `agent=` argument for `remove`.") + try: + removed, err = await asyncio.to_thread( + self._policy_store.remove_agent_from_room, room_id, single_agent, sender + ) + except Exception as exc: + logger.warning("PolicyStore remove_agent_from_room error: %s", exc) + return "⚠️ Could not write to policy store." 
+ if not removed: + return room_agents_reply_error(err or "Could not remove agent.") + # Get updated state + try: + ov = await asyncio.to_thread(self._policy_store.get_agent_override, room_id) + except Exception: # noqa: BLE001 + ov = None + remaining = ov[0] if ov else [] + new_default_r = ov[1] if ov else None + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.room.agents.remove", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "target_room": room_id, "agent": single_agent}, + ) + return room_agents_reply_remove(room_id, single_agent, remaining, new_default_r) + + # ── set ─────────────────────────────────────────────────────────────── + if action == ROOM_ACTION_SET: + if not agents_list: + return room_agents_reply_error("Missing `agents=` argument for `set`.") + invalid = [a for a in agents_list if a not in allowed_all] + if invalid: + allowed_str = ", ".join(f"`{a}`" for a in sorted(allowed_all)) + return room_agents_reply_error( + f"Unknown agents: {', '.join(f'`{a}`' for a in invalid)}\nAllowed: {allowed_str}" + ) + effective_default = default_agent if default_agent else agents_list[0] + if effective_default not in agents_list: + return room_agents_reply_error( + f"Default agent `{effective_default}` not in provided agents list." + ) + try: + await asyncio.to_thread( + self._policy_store.set_agent_override, + room_id, agents_list, effective_default, sender, + ) + except Exception as exc: + logger.warning("PolicyStore set_agent_override error: %s", exc) + return "⚠️ Could not write to policy store." 
+ await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.room.agents.set", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={ + "sender": sender, "target_room": room_id, + "agents": agents_list, "default": effective_default, + }, + ) + return room_agents_reply_set(room_id, agents_list, effective_default) + + return room_agents_reply_error("Unhandled action.") + + # ── M6.2: Policy snapshot export / import ───────────────────────────────── + + async def _handle_policy_cmd( + self, + http_client: httpx.AsyncClient, + sender: str, + ctrl_room_id: str, + event_id: str, + subcommand: Optional[str], + cmd_kwargs: Dict[str, str], + ) -> str: + """ + Handle `!policy export`, `!policy import`, and `!policy prune_exports`. + Requires policy_store and bridge_data_dir to be configured. + """ + if self._policy_store is None or not self._policy_store.is_open: + return policy_cmd_error("Policy store is not available.") + + if not self._bridge_data_dir: + return policy_cmd_error("BRIDGE_DATA_DIR is not configured.") + + exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) + + # ── export ──────────────────────────────────────────────────────────── + if subcommand == "export": + try: + snapshot = await asyncio.to_thread(self._policy_store.export_all) + node_count = len(snapshot.get("room_node_overrides", [])) + agent_count = len(snapshot.get("room_agent_overrides", [])) + + import datetime as _dt + ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%d-%H%M%S") + filename = f"policy-{ts}.json" + + await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True) + export_path = _os.path.join(exports_dir, filename) + await asyncio.to_thread( + _write_json_file, export_path, snapshot + ) + + self._policy_last_export_at = int(time.time()) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.policy.export", + 
agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={ + "sender": sender, "file": filename, + "node_overrides": node_count, "agent_overrides": agent_count, + }, + ) + return policy_export_reply(export_path, node_count, agent_count) + + except Exception as exc: # noqa: BLE001 + logger.exception("_handle_policy_cmd export error: %s", exc) + return policy_cmd_error(f"Export failed: {exc}") + + # ── import ──────────────────────────────────────────────────────────── + if subcommand == "import": + filename = cmd_kwargs.get("path", "").strip() + if not filename: + return policy_cmd_error("Missing `path=` argument.") + + safe_path = validate_export_path(exports_dir, filename) + if safe_path is None: + return policy_cmd_error( + f"Invalid path `{filename}`. Only simple filenames within the exports " + f"directory are allowed." + ) + + mode_raw = cmd_kwargs.get("mode", "merge").strip().lower() + if mode_raw not in ("merge", "replace"): + return policy_cmd_error("mode must be `merge` or `replace`.") + + dry_raw = cmd_kwargs.get("dry_run", "1").strip() + dry_run = dry_raw not in ("0", "false", "no") + + try: + raw_text = await asyncio.to_thread(_read_json_file, safe_path) + except FileNotFoundError: + return policy_cmd_error(f"File not found: `{filename}`") + except Exception as exc: # noqa: BLE001 + return policy_cmd_error(f"Cannot read file: {exc}") + + try: + data = raw_text if isinstance(raw_text, dict) else {} + stats = await asyncio.to_thread( + self._policy_store.import_snapshot, + data, mode_raw, dry_run, sender, + ) + except ValueError as ve: + return policy_cmd_error(str(ve)) + except Exception as exc: # noqa: BLE001 + logger.exception("_handle_policy_cmd import error: %s", exc) + return policy_cmd_error(f"Import failed: {exc}") + + if not dry_run: + self._policy_last_import_at = int(time.time()) + # M10.2: record in policy change history + _is_destr = ( + stats.get("node_deleted", 0) + stats.get("agent_deleted", 
0) + ) > 0 + _ds = ( + f"node: +{stats['node_added']} ~{stats['node_updated']} " + f"-{stats['node_deleted']}; " + f"agent: +{stats['agent_added']} ~{stats['agent_updated']} " + f"-{stats['agent_deleted']}" + ) + try: + await asyncio.to_thread( + self._policy_store.record_policy_change, + "policy.import", mode_raw, filename, + _sender_hash(sender), _ds, _is_destr, + stats.get("node_added", 0), stats.get("node_updated", 0), + stats.get("node_deleted", 0), + stats.get("agent_added", 0), stats.get("agent_updated", 0), + stats.get("agent_deleted", 0), + self._policy_history_limit, + ) + except Exception as _exc: # noqa: BLE001 + logger.warning("Failed to record import history (non-fatal): %s", _exc) + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.policy.import", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={ + "sender": sender, "file": filename, + "mode": mode_raw, "dry_run": dry_run, "stats": stats, + }, + ) + return ( + policy_import_dry_run_reply(stats, mode_raw) + if dry_run + else policy_import_reply(stats, mode_raw) + ) + + # ── restore (M10.1) ─────────────────────────────────────────────────── + if subcommand == "restore": + filename = cmd_kwargs.get("path", "").strip() + if not filename: + return policy_cmd_error("Missing `path=` argument.") + + safe_path = validate_export_path(exports_dir, filename) + if safe_path is None: + return policy_cmd_error( + f"Invalid path `{filename}`. Only simple filenames within the exports " + "directory are allowed." 
+ ) + + mode_raw = cmd_kwargs.get("mode", "replace").strip().lower() + if mode_raw not in ("merge", "replace"): + return policy_cmd_error("mode must be `merge` or `replace`.") + + try: + raw_data = await asyncio.to_thread(_read_json_file, safe_path) + except FileNotFoundError: + return policy_cmd_error(f"File not found: `{filename}`") + except Exception as exc: # noqa: BLE001 + return policy_cmd_error(f"Cannot read file: {exc}") + + data = raw_data if isinstance(raw_data, dict) else {} + try: + stats = await asyncio.to_thread( + self._policy_store.import_snapshot, + data, mode_raw, False, sender, + ) + self._policy_last_import_at = int(time.time()) + except ValueError as ve: + return policy_cmd_error(str(ve)) + except Exception as exc: # noqa: BLE001 + logger.exception("_handle_policy_cmd restore error: %s", exc) + return policy_cmd_error(f"Restore failed: {exc}") + + # M10.2: record in policy change history + _is_destr = ( + stats.get("node_deleted", 0) + stats.get("agent_deleted", 0) + ) > 0 + _rds = ( + f"restore/{mode_raw}: " + f"node: +{stats['node_added']} ~{stats['node_updated']} " + f"-{stats['node_deleted']}; " + f"agent: +{stats['agent_added']} ~{stats['agent_updated']} " + f"-{stats['agent_deleted']}" + ) + try: + await asyncio.to_thread( + self._policy_store.record_policy_change, + "policy.restore", mode_raw, filename, + _sender_hash(sender), _rds, _is_destr, + stats.get("node_added", 0), stats.get("node_updated", 0), + stats.get("node_deleted", 0), + stats.get("agent_added", 0), stats.get("agent_updated", 0), + stats.get("agent_deleted", 0), + self._policy_history_limit, + ) + except Exception as _exc: # noqa: BLE001 + logger.warning("Failed to record restore history (non-fatal): %s", _exc) + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.policy.restore", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, "file": 
filename, + "mode": mode_raw, "stats": stats, + }, + ) + return policy_restore_applied_reply(stats, mode_raw) + + # ── prune_exports ───────────────────────────────────────────────────── + if subcommand == "prune_exports": + dry_raw = cmd_kwargs.get("dry_run", "1").strip() + dry_run = dry_raw not in ("0", "false", "no") + try: + retention_days = int( + cmd_kwargs.get( + "retention_days", str(self._policy_export_retention_days) + ) + ) + except (ValueError, TypeError): + return policy_cmd_error("`retention_days` must be a positive integer.") + + try: + await asyncio.to_thread(_os.makedirs, exports_dir, exist_ok=True) + result = await asyncio.to_thread( + self._policy_store.prune_exports, + exports_dir, retention_days, dry_run, + ) + except Exception as exc: # noqa: BLE001 + logger.exception("_handle_policy_cmd prune error: %s", exc) + return policy_cmd_error(f"Prune failed: {exc}") + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.policy.prune_exports", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, + "dry_run": dry_run, + "retention_days": retention_days, + "files_to_delete": result.count, + "bytes_to_free": result.total_bytes, + }, + ) + return ( + policy_prune_preview_reply(result, retention_days) + if dry_run + else policy_prune_applied_reply(result, retention_days) + ) + + # ── history (M10.2) ─────────────────────────────────────────────────── + if subcommand == "history": + try: + limit_raw = int(cmd_kwargs.get("limit", "10")) + except (ValueError, TypeError): + return policy_cmd_error("`limit` must be a positive integer.") + safe_limit = max(1, min(limit_raw, 20)) + try: + changes = await asyncio.to_thread( + self._policy_store.list_policy_changes, safe_limit, + ) + except Exception as exc: # noqa: BLE001 + logger.exception("_handle_policy_cmd history error: %s", exc) + return policy_cmd_error(f"History fetch failed: 
{exc}") + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.policy.history", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "limit": safe_limit, "count": len(changes)}, + ) + return policy_history_reply(changes) + + # ── change (M10.3) ──────────────────────────────────────────────────── + if subcommand == "change": + try: + change_id = int(cmd_kwargs.get("id", "0")) + except (ValueError, TypeError): + return policy_cmd_error("`id` must be a positive integer (DB change id).") + if change_id <= 0: + return policy_cmd_error("Missing or invalid `id=` argument. " + "Use `!policy history` to get change ids.") + try: + change = await asyncio.to_thread( + self._policy_store.get_policy_change_by_id, change_id, + ) + except Exception as exc: # noqa: BLE001 + logger.exception("_handle_policy_cmd change detail error: %s", exc) + return policy_cmd_error(f"DB error: {exc}") + if change is None: + return policy_cmd_error( + f"Change id={change_id} not found. " + "Use `!policy history` to see available ids." 
async def _handle_nodes(
    self,
    http_client: httpx.AsyncClient,
    sender: str,
    room_id: str,
    event_id: str,
) -> str:
    """Build the reply for `!nodes` issued in a control room.

    Gathers the node-policy description, the per-node stats (augmented with
    health-tracker fields when a tracker is configured) and a summary of the
    sticky cache, writes a ``matrix.control.nodes`` audit event through
    ``_write_audit`` and renders everything with ``nodes_reply``.

    Any exception raised along the way is logged as a warning and replaced
    by a generic "not available" message, so this handler never raises.
    """
    try:
        policy = self._node_policy
        policy_info = {} if policy is None else policy.as_info_dict()

        stats_getter = self._node_stats_getter
        node_stats = {} if stats_getter is None else stats_getter()

        # M8.0: fold health-tracker state into the per-node stats
        tracker = self._node_health_tracker
        if tracker is not None:
            allowed = None if policy is None else policy.allowed_nodes
            for nid, info in tracker.all_info(allowed).items():
                per_node = node_stats.setdefault(nid, {})
                per_node["health"] = info.get("state", NODE_STATE_HEALTHY)
                per_node["ewma_latency_s"] = info.get("ewma_latency_s")
                per_node["consecutive_failures"] = info.get("consecutive_failures", 0)

        # M8.1: summarise the sticky cache (only the first 5 entries are listed)
        sticky_info = None
        cache = self._sticky_cache
        if cache is not None:
            active = cache.active_entries()
            total_active = len(active)
            sticky_info = {
                "active_keys": total_active,
                "ttl_s": cache.ttl_s,
                "entries": [
                    {"key": key, "node": node, "remaining_s": round(left_s, 0)}
                    for key, node, left_s in active[:5]
                ],
            }
            if total_active > 5:
                # Tell the operator how many entries were left out.
                sticky_info["truncated"] = total_active - 5

        await _write_audit(
            http_client, self._console_url, self._internal_token,
            event="matrix.control.nodes",
            agent_id="control", node_id=self._node_id,
            room_id=room_id, event_id=event_id,
            status="ok", data={"sender": sender},
        )
        return nodes_reply(policy_info, node_stats, sticky_info=sticky_info)
    except Exception as exc:
        logger.warning("_handle_nodes error: %s", exc)
        return "⚠️ Node info not available."
health (non-fatal): %s", exc) + + async def _sticky_persist_set(self, key: str, node_id: str) -> None: + """Persist a sticky entry to DB after setting it in-memory (M8.2).""" + if self._policy_store is None or not self._policy_store.is_open: + return + if self._sticky_cache is None: + return + expires_at_unix = int(time.time()) + int(self._sticky_cache.ttl_s) + try: + await asyncio.to_thread( + self._policy_store.upsert_sticky, key, node_id, expires_at_unix + ) + except Exception as exc: # noqa: BLE001 + logger.warning("HA: failed to persist sticky key=%s (non-fatal): %s", key, exc) + + async def _sticky_persist_delete(self, key: str) -> None: + """Remove a sticky entry from DB after deleting it in-memory (M8.2).""" + if self._policy_store is None or not self._policy_store.is_open: + return + try: + await asyncio.to_thread(self._policy_store.delete_sticky, key) + except Exception as exc: # noqa: BLE001 + logger.warning("HA: failed to delete sticky key=%s (non-fatal): %s", key, exc) + + async def _node_health_snapshot_loop(self) -> None: + """ + Background task: periodically write node health state to DB (M8.2). + Runs until policy_store becomes unavailable or interval is 0. 
async def _handle_policy_import_intent(
    self,
    http_client: "httpx.AsyncClient",
    cmd: "ControlCommand",
    sender: str,
    room_id: str,
    event_id: str,
    action_summary: str,
    normalized: str,
) -> str:
    """
    M9.1: Intent step for !policy import with diff preview and hash binding.

    Reads the export file named by ``path=``, computes a diff preview against
    the policy store, registers a confirmation callback bound to a hash of
    (filename, mode, file content), writes a ``matrix.control.intent`` audit
    event and returns the preview reply containing the nonce.

    Nothing is applied here.  The stored callback re-reads the file, rejects
    the confirm if the hash no longer matches, attempts an auto-backup
    (failure is logged, not fatal) and then imports the *previewed* data.

    Args:
        http_client: client handed to ``_write_audit``.
        cmd: parsed control command; ``path`` and ``mode`` are read from
            ``cmd.kwargs`` (``mode`` defaults to ``merge``).
        sender: sender identifier; hashed with ``_sender_hash`` for the
            confirm store and audit payload.
        room_id: room the confirmation is registered for.
        event_id: event id recorded in the audit event.
        action_summary: human-readable summary stored with the confirmation.
        normalized: normalized argument string stored with the confirmation.

    Returns:
        The preview reply on success, or a ``policy_cmd_error`` reply when
        validation, reading the file or computing the diff fails.
    """
    assert self._confirm_store is not None

    # ── Validate args ──────────────────────────────────────────────────────
    if self._policy_store is None or not self._policy_store.is_open:
        return policy_cmd_error("Policy store not available.")
    if not self._bridge_data_dir:
        return policy_cmd_error("BRIDGE_DATA_DIR not configured.")

    filename = cmd.kwargs.get("path", "").strip()
    if not filename:
        return policy_cmd_error("Missing `path=` argument.")

    exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
    safe_path = validate_export_path(exports_dir, filename)
    if safe_path is None:
        return policy_cmd_error(
            f"Invalid path `{filename}`. Only simple filenames within the exports "
            "directory are allowed."
        )

    mode_raw = cmd.kwargs.get("mode", "merge").strip().lower()
    if mode_raw not in ("merge", "replace"):
        return policy_cmd_error("mode must be `merge` or `replace`.")

    # ── Read file + compute diff preview ───────────────────────────────────
    try:
        raw_data = await asyncio.to_thread(_read_json_file, safe_path)
    except FileNotFoundError:
        return policy_cmd_error(f"File not found: `{filename}`")
    except Exception as exc:  # noqa: BLE001
        return policy_cmd_error(f"Cannot read file: {exc}")

    if not isinstance(raw_data, dict):
        return policy_cmd_error("Invalid JSON format (expected object).")

    try:
        diff = await asyncio.to_thread(
            self._policy_store.compute_import_diff, raw_data, mode_raw,
        )
    except ValueError as ve:
        return policy_cmd_error(str(ve))
    except Exception as exc:  # noqa: BLE001
        return policy_cmd_error(f"Preview failed: {exc}")

    # ── Compute snapshot hash for confirm binding ─────────────────────────
    def _binding_hash(payload: dict) -> str:
        """Return the 32-hex-char hash binding filename, mode and *payload*.

        Used for both the preview and the apply-time re-check so the two
        computations cannot drift apart.
        """
        canonical = _json.dumps(
            payload, sort_keys=True, ensure_ascii=True
        ).encode("utf-8")
        prefix = (filename + ":" + mode_raw + ":").encode("utf-8")
        return hashlib.sha256(prefix + canonical).hexdigest()[:32]

    snapshot_hash = _binding_hash(raw_data)

    sender_hash = _sender_hash(sender)
    _captured_hash = snapshot_hash
    _captured_data = raw_data
    _captured_mode = mode_raw
    _captured_path = safe_path
    _captured_fname = filename
    _captured_sender = sender

    # ── Build hash-bound callback ──────────────────────────────────────────
    # Late-capture of nonce for autobackup filename (set after add() below)
    _nonce_holder: list = []

    async def _callback() -> "tuple[str, str]":
        """Apply the previewed import; return ``(reply_text, diff_summary)``.

        ``diff_summary`` is empty whenever the import was not applied.
        """
        # Re-read file and verify hash to detect tampering (anti-TOCTOU)
        try:
            fresh_data = await asyncio.to_thread(_read_json_file, _captured_path)
        except Exception as exc:  # noqa: BLE001
            return (
                f"❌ Cannot re-read file `{_captured_fname}` at apply time: {exc}\n"
                "Re-issue the command.",
                "",
            )

        # A non-object payload hashes as {} and therefore never matches a
        # preview, which is only ever taken from a dict.
        fresh_hash = _binding_hash(
            fresh_data if isinstance(fresh_data, dict) else {}
        )

        if fresh_hash != _captured_hash:
            logger.warning(
                "Policy import confirm rejected: file changed since preview "
                "(sender=%s file=%s)", _captured_sender, _captured_fname,
            )
            return (
                f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n"
                "Re-issue `!policy import ...` to get a new preview.",
                "",
            )

        # M10.0: Auto-backup current policy before applying changes
        _autobackup_basename = ""
        _autobackup_hash = ""
        if self._policy_store is not None and self._bridge_data_dir:
            _exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR)
            _nonce_suffix = _nonce_holder[0] if _nonce_holder else "BACKUP"
            _sender_hash8 = _sender_hash(_captured_sender)[:8]
            try:
                # Filesystem work is kept off the event loop, as the
                # prune_exports path already does for makedirs.
                await asyncio.to_thread(_os.makedirs, _exports_dir, exist_ok=True)
                _ab_path, _autobackup_hash = await asyncio.to_thread(
                    self._policy_store.write_autobackup,
                    _exports_dir, _sender_hash8, _nonce_suffix,
                )
                _autobackup_basename = _os.path.basename(_ab_path)
                logger.info(
                    "Policy auto-backup written: %s hash=%s",
                    _autobackup_basename, _autobackup_hash,
                )
            except Exception as exc:  # noqa: BLE001
                logger.warning("Policy auto-backup failed (non-fatal): %s", exc)

        # Apply the import using the captured (previewed) data
        try:
            stats = await asyncio.to_thread(
                self._policy_store.import_snapshot,
                _captured_data, _captured_mode, False, _captured_sender,
            )
            self._policy_last_import_at = int(time.time())
            diff_summary = (
                f"node: +{stats['node_added']} ~{stats['node_updated']} "
                f"-{stats['node_deleted']}; "
                f"agent: +{stats['agent_added']} ~{stats['agent_updated']} "
                f"-{stats['agent_deleted']}"
            )
            if _autobackup_basename:
                diff_summary += f"; autobackup={_autobackup_basename}"
            # M10.2: record in policy change history
            _is_destr = (
                stats.get("node_deleted", 0) + stats.get("agent_deleted", 0)
            ) > 0
            try:
                await asyncio.to_thread(
                    self._policy_store.record_policy_change,
                    "policy.import", _captured_mode, _captured_fname,
                    _sender_hash(_captured_sender), diff_summary, _is_destr,
                    stats.get("node_added", 0), stats.get("node_updated", 0),
                    stats.get("node_deleted", 0),
                    stats.get("agent_added", 0), stats.get("agent_updated", 0),
                    stats.get("agent_deleted", 0),
                    self._policy_history_limit,
                )
            except Exception as _exc:  # noqa: BLE001
                # History is informational; the import itself already succeeded.
                logger.warning("Failed to record policy change history: %s", _exc)
            reply = policy_import_reply(stats, _captured_mode)
            if _autobackup_basename:
                reply += (
                    f"\n\n💾 Auto-backup saved: `{_autobackup_basename}` "
                    f"(hash `{_autobackup_hash}`)"
                )
            return reply, diff_summary
        except Exception as exc:  # noqa: BLE001
            logger.exception(
                "Policy import apply failed: sender=%s file=%s", _captured_sender, _captured_fname,
            )
            return f"❌ Import failed: {exc}", ""

    # ── Store pending confirmation ─────────────────────────────────────────
    nonce = self._confirm_store.add(
        sender_hash=sender_hash,
        verb="policy.import",
        normalized_args=normalized,
        action_summary=action_summary,
        room_id=room_id,
        callback=_callback,
    )
    # M10.0: make nonce available inside _callback for backup filename
    _nonce_holder.append(nonce)

    await _write_audit(
        http_client, self._console_url, self._internal_token,
        event="matrix.control.intent",
        agent_id="control", node_id=self._node_id,
        room_id=room_id, event_id=event_id,
        status="ok",
        data={
            "sender_hash": sender_hash,
            "verb": "policy.import",
            "normalized": normalized,
            "nonce": nonce,
            "expires_in_s": int(self._confirm_store.ttl_s),
            "snapshot_hash_prefix": snapshot_hash[:8],
            "diff": {
                "node_added": diff.node_added,
                "node_updated": diff.node_updated,
                "node_deleted": diff.node_deleted,
                "agent_added": diff.agent_added,
                "agent_updated": diff.agent_updated,
                "agent_deleted": diff.agent_deleted,
                "sample_keys": diff.sample_keys,
            },
        },
    )

    logger.info(
        "Confirm policy import intent: sender=%s mode=%s file=%s hash=%s nonce=%s",
        sender, mode_raw, filename, snapshot_hash[:8], nonce,
    )
    return policy_import_intent_reply(
        diff=diff,
        action_summary=action_summary,
        nonce=nonce,
        ttl_s=int(self._confirm_store.ttl_s),
    )
+ """ + assert self._confirm_store is not None + assert self._policy_store is not None + assert self._bridge_data_dir is not None + + filename = cmd.kwargs.get("path", "").strip() + if not filename: + return policy_cmd_error("Missing `path=` argument.") + + exports_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) + safe_path = validate_export_path(exports_dir, filename) + if safe_path is None: + return policy_cmd_error( + f"Invalid path `{filename}`. Only simple filenames within the exports " + "directory are allowed." + ) + + mode_raw = cmd.kwargs.get("mode", "replace").strip().lower() + if mode_raw not in ("merge", "replace"): + return policy_cmd_error("mode must be `merge` or `replace`.") + + # ── Read file + compute diff preview ────────────────────────────────── + try: + raw_data = await asyncio.to_thread(_read_json_file, safe_path) + except FileNotFoundError: + return policy_cmd_error(f"File not found: `{filename}`") + except Exception as exc: # noqa: BLE001 + return policy_cmd_error(f"Cannot read file: {exc}") + + if not isinstance(raw_data, dict): + return policy_cmd_error("Invalid JSON format (expected object).") + + try: + diff = await asyncio.to_thread( + self._policy_store.compute_import_diff, raw_data, mode_raw, + ) + except ValueError as ve: + return policy_cmd_error(str(ve)) + except Exception as exc: # noqa: BLE001 + return policy_cmd_error(f"Preview failed: {exc}") + + # ── Compute snapshot hash for confirm binding (anti-TOCTOU) ────────── + _content_bytes = _json.dumps( + raw_data, sort_keys=True, ensure_ascii=True + ).encode("utf-8") + snapshot_hash = hashlib.sha256( + (filename + ":" + mode_raw + ":restore:").encode("utf-8") + _content_bytes + ).hexdigest()[:32] + + sender_hash = _sender_hash(sender) + _captured_hash = snapshot_hash + _captured_data = raw_data + _captured_mode = mode_raw + _captured_path = safe_path + _captured_fname = filename + _captured_sender = sender + + # Late-capture of nonce for autobackup filename + 
_nonce_holder: list = [] + + # ── Build hash-bound callback ───────────────────────────────────────── + async def _callback(): + # Re-read + verify hash (anti-TOCTOU) + try: + fresh_data = await asyncio.to_thread(_read_json_file, _captured_path) + except Exception as exc: # noqa: BLE001 + return ( + f"❌ Cannot re-read `{_captured_fname}` at apply time: {exc}\n" + "Re-issue the command.", + "", + ) + + fresh_bytes = _json.dumps( + fresh_data if isinstance(fresh_data, dict) else {}, + sort_keys=True, ensure_ascii=True, + ).encode("utf-8") + fresh_hash = hashlib.sha256( + (_captured_fname + ":" + _captured_mode + ":restore:").encode("utf-8") + + fresh_bytes + ).hexdigest()[:32] + + if fresh_hash != _captured_hash: + logger.warning( + "Policy restore confirm rejected: file changed since preview " + "(sender=%s file=%s)", _captured_sender, _captured_fname, + ) + return ( + f"❌ File `{_captured_fname}` changed after preview — confirm rejected.\n" + "Re-issue `!policy restore ...` to get a new preview.", + "", + ) + + # Auto-backup current state before overwriting + _autobackup_basename = "" + _autobackup_hash = "" + if self._policy_store is not None and self._bridge_data_dir: + _exp_dir = _os.path.join(self._bridge_data_dir, POLICY_EXPORTS_SUBDIR) + _nonce_suffix = _nonce_holder[0] if _nonce_holder else "RESTORE" + _sender_hash8 = _sender_hash(_captured_sender)[:8] + try: + _os.makedirs(_exp_dir, exist_ok=True) + _ab_path, _autobackup_hash = await asyncio.to_thread( + self._policy_store.write_autobackup, + _exp_dir, _sender_hash8, _nonce_suffix, + ) + _autobackup_basename = _os.path.basename(_ab_path) + logger.info( + "Pre-restore backup written: %s hash=%s", + _autobackup_basename, _autobackup_hash, + ) + except Exception as exc: # noqa: BLE001 + logger.warning("Pre-restore backup failed (non-fatal): %s", exc) + + # Apply restore + try: + stats = await asyncio.to_thread( + self._policy_store.import_snapshot, + _captured_data, _captured_mode, False, _captured_sender, + ) 
+ self._policy_last_import_at = int(time.time()) + diff_summary = ( + f"restore/{_captured_mode}: " + f"node: +{stats['node_added']} ~{stats['node_updated']} " + f"-{stats['node_deleted']}; " + f"agent: +{stats['agent_added']} ~{stats['agent_updated']} " + f"-{stats['agent_deleted']}" + ) + if _autobackup_basename: + diff_summary += f"; autobackup={_autobackup_basename}" + # M10.2: record in policy change history + _is_destr = ( + stats.get("node_deleted", 0) + stats.get("agent_deleted", 0) + ) > 0 + try: + await asyncio.to_thread( + self._policy_store.record_policy_change, + "policy.restore", _captured_mode, _captured_fname, + _sender_hash(_captured_sender), diff_summary, _is_destr, + stats.get("node_added", 0), stats.get("node_updated", 0), + stats.get("node_deleted", 0), + stats.get("agent_added", 0), stats.get("agent_updated", 0), + stats.get("agent_deleted", 0), + self._policy_history_limit, + ) + except Exception as _exc: # noqa: BLE001 + logger.warning("Failed to record restore history: %s", _exc) + reply = policy_restore_applied_reply( + stats, _captured_mode, _autobackup_basename + ) + return reply, diff_summary + except Exception as exc: # noqa: BLE001 + logger.exception( + "Policy restore apply failed: sender=%s file=%s", + _captured_sender, _captured_fname, + ) + return f"❌ Restore failed: {exc}", "" + + # ── Store pending confirmation ──────────────────────────────────────── + nonce = self._confirm_store.add( + sender_hash=sender_hash, + verb="policy.restore", + normalized_args=normalized, + action_summary=action_summary, + room_id=room_id, + callback=_callback, + ) + _nonce_holder.append(nonce) + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.policy.restore", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender_hash": sender_hash, + "verb": "policy.restore", + "normalized": normalized, + "nonce": nonce, + "expires_in_s": 
int(self._confirm_store.ttl_s), + "mode": mode_raw, + "snapshot_hash_prefix": snapshot_hash[:8], + "diff": { + "node_added": diff.node_added, + "node_updated": diff.node_updated, + "node_deleted": diff.node_deleted, + "agent_added": diff.agent_added, + "agent_updated": diff.agent_updated, + "agent_deleted": diff.agent_deleted, + "sample_keys": diff.sample_keys, + }, + }, + ) + + logger.info( + "Confirm policy restore intent: sender=%s mode=%s file=%s hash=%s nonce=%s", + sender, mode_raw, filename, snapshot_hash[:8], nonce, + ) + return policy_restore_intent_reply( + diff=diff, + action_summary=action_summary, + nonce=nonce, + ttl_s=int(self._confirm_store.ttl_s), + ) + + async def _handle_dangerous_intent( + self, + http_client: "httpx.AsyncClient", + cmd: "ControlCommand", + sender: str, + room_id: str, + event_id: str, + ) -> str: + """ + First leg of the two-step confirm flow (M9.0). + + Does NOT apply the command. Stores a pending confirmation with a + callback that will execute the original handler, and returns a reply + containing the nonce that the operator must send via !confirm . + """ + assert self._confirm_store is not None + + sender_hash = _sender_hash(sender) + normalized = build_normalized_args(cmd) + action_summary = ( + f"!{cmd.verb} {cmd.subcommand or ''} {normalized}".strip() + ) + + # M9.1: policy import gets a richer preview with diff + hash binding + if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "import": + return await self._handle_policy_import_intent( + http_client, cmd, sender, room_id, event_id, + action_summary=action_summary, normalized=normalized, + ) + + # M10.1: policy restore — rollback with diff preview + hash binding + if cmd.verb == VERB_POLICY and (cmd.subcommand or "").lower() == "restore": + if self._policy_store is None or not self._bridge_data_dir: + return policy_cmd_error( + "Policy store or data directory not configured." 
+ ) + return await self._handle_policy_restore_intent( + http_client, cmd, sender, room_id, event_id, + action_summary=action_summary, normalized=normalized, + ) + + # Build the callback: calls the actual handler when confirmed. + # We capture all args by closure so the callback is self-contained. + _verb = cmd.verb + _subcmd = cmd.subcommand + _args = cmd.args + _kw = dict(cmd.kwargs) + + async def _callback(): + if _verb == VERB_NODE: + # Reconstruct args_text (same as _try_control does) + _parts = [] + if _subcmd: + _parts.append(_subcmd) + _parts.extend(_args) + _parts.extend(f"{k}={v}" for k, v in _kw.items()) + reply = await self._handle_node_cmd( + http_client, sender, room_id, event_id, " ".join(_parts), + ) + elif _verb == VERB_ROOM: + reply = await self._handle_room_cmd( + http_client, sender, room_id, event_id, + _subcmd, tuple(_args), _kw, + ) + elif _verb == VERB_POLICY: + reply = await self._handle_policy_cmd( + http_client, sender, room_id, event_id, _subcmd, _kw, + ) + else: + reply = f"❌ Unknown dangerous verb: {_verb}" + return reply, action_summary + + nonce = self._confirm_store.add( + sender_hash=sender_hash, + verb=f"{cmd.verb}.{cmd.subcommand or ''}", + normalized_args=normalized, + action_summary=action_summary, + room_id=room_id, + callback=_callback, + ) + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.intent", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender_hash": sender_hash, + "verb": cmd.verb, + "subcommand": cmd.subcommand or "", + "normalized": normalized, + "nonce": nonce, + "expires_in_s": int(self._confirm_store.ttl_s), + }, + ) + + logger.info( + "Confirm intent: sender=%s verb=%s/%s nonce=%s ttl=%.0fs", + sender, cmd.verb, cmd.subcommand, nonce, self._confirm_store.ttl_s, + ) + return confirm_intent_reply(action_summary, nonce, int(self._confirm_store.ttl_s)) + + async def _handle_confirm_cmd( + self, + 
http_client: "httpx.AsyncClient", + cmd: "ControlCommand", + sender: str, + room_id: str, + event_id: str, + ) -> str: + """ + Second leg of the two-step confirm flow (M9.0). + + Validates the nonce and sender, executes the stored callback, and + emits confirmed + applied audit events. + """ + if self._confirm_store is None: + return "❌ Confirmation store not active." + + # Nonce may come as subcommand (token right after !confirm) + nonce = (cmd.subcommand or "").strip().upper() + if not nonce and cmd.args: + nonce = cmd.args[0].strip().upper() + if not nonce: + return "❌ Usage: `!confirm ` — provide the confirmation code." + + sender_hash = _sender_hash(sender) + entry = self._confirm_store.pop(nonce, sender_hash) + + if entry is None: + logger.info( + "Confirm rejected: sender=%s nonce=%s (invalid/expired/wrong-sender)", + sender, nonce, + ) + return confirm_expired_reply() + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.confirmed", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender_hash": sender_hash, + "nonce": nonce, + "verb": entry.verb, + "action_summary": entry.action_summary, + }, + ) + + logger.info( + "Confirm accepted: sender=%s nonce=%s verb=%s", + sender, nonce, entry.verb, + ) + + try: + reply_text, diff_summary = await entry.callback() + except Exception as exc: + logger.exception( + "Confirm callback failed: sender=%s nonce=%s verb=%s", + sender, nonce, entry.verb, + ) + return f"❌ Apply failed: {exc}" + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.applied", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender_hash": sender_hash, + "verb": entry.verb, + "normalized": entry.normalized_args, + "diff_summary": diff_summary, + }, + ) + + return confirm_success_reply(reply_text) + + # ── M6.0: Dynamic room-node 
overrides via !node command ─────────────────── + + async def _handle_node_cmd( + self, + http_client: httpx.AsyncClient, + sender: str, + ctrl_room_id: str, + event_id: str, + args_text: str, + ) -> str: + """Handle `!node ` from an authorized operator.""" + if self._policy_store is None or not self._policy_store.is_open: + return "⚠️ Policy store not available." + + subcmd, room_id, node_id = parse_node_cmd(args_text) + + if subcmd not in (NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST): + return node_cmd_reply_error( + f"Unknown subcommand: `{subcmd or '?'}`" + ) + + # ── list ────────────────────────────────────────────────────────────── + if subcmd == NODE_SUBCMD_LIST: + try: + rows = await asyncio.to_thread(self._policy_store.list_overrides, 10) + total = await asyncio.to_thread(self._policy_store.count_overrides) + except Exception as exc: + logger.warning("PolicyStore list_overrides error: %s", exc) + return "⚠️ Could not read policy store." + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.node.list", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "total": total}, + ) + return node_cmd_reply_list(rows, total) + + # ── subcommands that require room_id ────────────────────────────────── + if not room_id: + return node_cmd_reply_error("Missing `room=` argument.") + if not node_cmd_validate_room(room_id): + return node_cmd_reply_error( + f"Invalid room ID format: `{room_id}`\n" + "Expected: `!localpart:server`" + ) + + # ── get ─────────────────────────────────────────────────────────────── + if subcmd == NODE_SUBCMD_GET: + try: + override = await asyncio.to_thread(self._policy_store.get_override, room_id) + except Exception as exc: + logger.warning("PolicyStore get_override error: %s", exc) + return "⚠️ Could not read policy store." 
+ # env map lookup for context + env_node: Optional[str] = None + if self._node_policy is not None: + env_node = self._node_policy.room_node_map.get(room_id) + default = self._node_policy.default_node if self._node_policy else self._node_id + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.node.get", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "queried_room": room_id}, + ) + return node_cmd_reply_get(room_id, override, env_node, default) + + # ── unset ───────────────────────────────────────────────────────────── + if subcmd == NODE_SUBCMD_UNSET: + try: + deleted = await asyncio.to_thread(self._policy_store.delete_override, room_id) + except Exception as exc: + logger.warning("PolicyStore delete_override error: %s", exc) + return "⚠️ Could not write to policy store." + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.node.unset", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "target_room": room_id, "was_set": deleted}, + ) + return node_cmd_reply_unset_ok(room_id) if deleted else node_cmd_reply_unset_not_found(room_id) + + # ── set ─────────────────────────────────────────────────────────────── + if subcmd == NODE_SUBCMD_SET: + if not node_id: + return node_cmd_reply_error("Missing `node=` argument for `set`.") + allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset([self._node_id]) + if node_id not in allowed: + allowed_list = ", ".join(f"`{n}`" for n in sorted(allowed)) + return node_cmd_reply_error( + f"Node `{node_id}` is not in allowed list: {allowed_list}" + ) + try: + await asyncio.to_thread(self._policy_store.set_override, room_id, node_id, sender) + except Exception as exc: + logger.warning("PolicyStore set_override error: %s", exc) + return "⚠️ Could not write to policy 
store." + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.node.set", + agent_id="control", node_id=self._node_id, + room_id=ctrl_room_id, event_id=event_id, + status="ok", data={"sender": sender, "target_room": room_id, "set_node": node_id}, + ) + return node_cmd_reply_set(room_id, node_id) + + return node_cmd_reply_error("Unhandled subcommand.") + + # ── M4.1: Bridge status for operators ───────────────────────────────────── + + async def _handle_bridge_status( + self, + http_client: httpx.AsyncClient, + sender: str, + room_id: str, + event_id: str, + ) -> str: + """Build and return a bridge health snapshot for `!status` in control room.""" + try: + snapshot: Dict[str, Any] = { + "node_id": self._node_id, + "worker_count": self._worker_count, + "room_count": len(self._room_map.mappings), + "mixed_room_count": ( + len(self._mixed_room_config.rooms) + if self._mixed_room_config + else 0 + ), + "operators_count": ( + len(self._control_config.operator_allowlist) + if self._control_config and self._control_config.operator_allowlist + else 0 + ), + } + # Queue info (exposed by MatrixIngressLoop via get_status) + status = self.get_status() + snapshot["queue_size"] = status.get("queue_size", "?") + snapshot["queue_max"] = status.get("queue_max", "?") + # Control safety + if self._control_limiter is not None: + snapshot["control_safety"] = self._control_limiter.as_health_dict() + # Persistent dedupe + if self._event_store is not None: + snapshot["persistent_dedupe"] = self._event_store.as_health_dict() + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.bridge_status", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", data={"sender": sender}, + ) + return bridge_status_reply(snapshot) + except Exception as exc: + logger.warning("_handle_bridge_status error: %s", exc) + return status_not_available_reply() + # ── Control 
command handler ──────────────────────────────────────────────── async def _try_control( @@ -639,12 +2512,82 @@ class MatrixIngressLoop: logger.warning("Could not send unauthorized reply: %s", exc) return + # M3.4: Rate limiting + cooldown (after auth, before parse/dispatch) + if self._control_limiter is not None: + sender_hash_ctrl = _sender_hash(sender) + + allowed_room, retry_room = self._control_limiter.check_room(room_id) + if not allowed_room: + scope = "room" + logger.info("Control rate limited: scope=%s room=%s sender=%s", scope, room_id, sender) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.rate_limited", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="rate_limited_room", + data={"sender": sender, "scope": scope, "retry_after_s": retry_room}, + ) + if self._on_control_rate_limited: + self._on_control_rate_limited(scope) + txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl") + await client.send_text(room_id, rate_limited_reply(scope, retry_room), txn_id) + return + + allowed_op, retry_op = self._control_limiter.check_operator(sender_hash_ctrl) + if not allowed_op: + scope = "operator" + logger.info("Control rate limited: scope=%s sender=%s", scope, sender) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.rate_limited", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="rate_limited_operator", + data={"sender": sender, "scope": scope, "retry_after_s": retry_op}, + ) + if self._on_control_rate_limited: + self._on_control_rate_limited(scope) + txn_id = MatrixClient.make_txn_id(room_id, event_id + "_rl") + await client.send_text(room_id, rate_limited_reply(scope, retry_op), txn_id) + return + # Parse command cmd = parse_command(text) if cmd is None: logger.warning("Control message from %s could not be parsed: %r", sender, 
text[:60]) return + # M3.4: Cooldown check (anti-double-click, per operator+verb+subcommand) + if self._control_limiter is not None: + sender_hash_ctrl = _sender_hash(sender) + allowed_cd, wait_cd = self._control_limiter.check_cooldown( + sender_hash_ctrl, cmd.verb, cmd.subcommand or "", + ) + if not allowed_cd: + scope = "cooldown" + logger.info( + "Control cooldown: sender=%s verb=%s sub=%s wait=%.1fs", + sender, cmd.verb, cmd.subcommand, wait_cd, + ) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.rate_limited", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="cooldown", + data={ + "sender": sender, "scope": scope, + "verb": cmd.verb, "subcommand": cmd.subcommand, + "wait_s": wait_cd, + }, + ) + if self._on_control_rate_limited: + self._on_control_rate_limited(scope) + txn_id = MatrixClient.make_txn_id(room_id, event_id + "_cd") + await client.send_text(room_id, rate_limited_reply(scope, wait_cd), txn_id) + return + # Metric callback if self._on_control_command: self._on_control_command(sender, cmd.verb, cmd.subcommand) @@ -671,10 +2614,91 @@ class MatrixIngressLoop: sender, cmd.verb, cmd.subcommand, cmd.args, ) - # Build reply + # Dispatch command txn_id = MatrixClient.make_txn_id(room_id, event_id + "_ctrl") - if cmd.verb == VERB_HELP: + + # M9.0: Dangerous commands → two-step confirmation (intent leg) + if ( + self._confirm_store is not None + and is_dangerous_cmd(cmd) + and cmd.verb != VERB_CONFIRM + ): + reply_text = await self._handle_dangerous_intent( + http_client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_CONFIRM: + # M9.0: !confirm (second leg) + reply_text = await self._handle_confirm_cmd( + http_client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_HELP: reply_text = help_reply() + elif cmd.verb == VERB_CONFIRM: + # Fallback if confirm_store is None (disabled) — inform the operator + reply_text = "❌ 
Confirmation system is disabled." + elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_START: + reply_text = await self._handle_runbook_start( + http_client, client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_NEXT: + # M3.2: advance to next step + reply_text = await self._handle_runbook_next( + http_client, client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_COMPLETE: + # M3.2: mark manual step complete + reply_text = await self._handle_runbook_complete( + http_client, client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_STATUS: + # M3.3: show run status + reply_text = await self._handle_runbook_status( + http_client, client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_EVIDENCE: + # M3.3: generate release evidence + reply_text = await self._handle_runbook_evidence( + http_client, client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_RUNBOOK and cmd.subcommand == SUBCOMMAND_POST_REVIEW: + # M3.3: generate post-release review + reply_text = await self._handle_runbook_post_review( + http_client, client, cmd, sender, room_id, event_id, + ) + elif cmd.verb == VERB_STATUS: + # M4.1: bridge health snapshot for operators + reply_text = await self._handle_bridge_status( + http_client, sender, room_id, event_id, + ) + elif cmd.verb == VERB_NODES: + # M5.1: node policy overview for operators + reply_text = await self._handle_nodes( + http_client, sender, room_id, event_id, + ) + elif cmd.verb == VERB_NODE: + # M6.0: dynamic room-node override commands + # Reconstruct args_text from parsed command parts + _node_args_parts = [] + if cmd.subcommand: + _node_args_parts.append(cmd.subcommand) + _node_args_parts.extend(cmd.args) + _node_args_parts.extend(f"{k}={v}" for k, v in cmd.kwargs.items()) + _node_args_text = " 
".join(_node_args_parts) + reply_text = await self._handle_node_cmd( + http_client, sender, room_id, event_id, _node_args_text, + ) + elif cmd.verb == VERB_ROOM: + # M6.1: dynamic mixed room agent overrides + reply_text = await self._handle_room_cmd( + http_client, sender, room_id, event_id, + cmd.subcommand, cmd.args, cmd.kwargs, + ) + elif cmd.verb == VERB_POLICY: + # M6.2: policy snapshot export/import + reply_text = await self._handle_policy_cmd( + http_client, sender, room_id, event_id, + cmd.subcommand, cmd.kwargs, + ) elif not cmd.is_known: reply_text = unknown_command_reply(cmd) await _write_audit( @@ -686,7 +2710,6 @@ class MatrixIngressLoop: data={"verb": cmd.verb, "sender": sender}, ) else: - # M3.1+ will implement actual runbook/status commands reply_text = not_implemented_reply(cmd) try: @@ -694,6 +2717,439 @@ class MatrixIngressLoop: except Exception as exc: logger.error("Could not send control reply: %s", exc) + async def _handle_runbook_start( + self, + http_client: httpx.AsyncClient, + client: "MatrixClient", + cmd: ControlCommand, + sender: str, + room_id: str, + event_id: str, + ) -> str: + """ + M3.1: Execute !runbook start [node=NODA1]. + + Calls sofiia-console POST /api/runbooks/internal/runs. + Returns reply text (success or failure) for delivery to the control room. + Audits matrix.control.runbook.start regardless of outcome. 
+ """ + # Extract positional runbook_path + runbook_path = cmd.args[0].strip() if cmd.args else "" + node_id = cmd.kwargs.get("node", "NODA1").strip() + + # Validate path before calling the console + path_error = _ctrl_runner.validate_runbook_path(runbook_path) + if path_error: + logger.warning( + "!runbook start invalid path: sender=%s path=%r error=%s", + sender, runbook_path, path_error, + ) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.start", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="invalid_path", + data={"sender": sender, "runbook_path": runbook_path, "error": path_error}, + ) + return start_usage_reply() + + # Call sofiia-console internal API + run_id: Optional[str] = None + http_status: Optional[int] = None + try: + result = await _ctrl_runner.start_runbook_run( + http_client=http_client, + console_url=self._console_url, + control_token=self._control_token, + runbook_path=runbook_path, + operator_id=sender, + node_id=node_id, + ) + run_id = result.get("run_id", "") + steps_total = result.get("steps_total", 0) + status = result.get("status", "running") + http_status = 200 + + logger.info( + "Runbook started: run_id=%s path=%s node=%s steps=%d by sender=%s", + run_id, runbook_path, node_id, steps_total, sender, + ) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.start", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, + "runbook_path": runbook_path, + "node_id": node_id, + "run_id": run_id, + "steps_total": steps_total, + "http_status": http_status, + }, + ) + return runbook_started_reply(run_id, steps_total, status) + + except _ctrl_runner.RunnerError as exc: + reason = str(exc) + logger.error( + "!runbook start failed: sender=%s path=%r node=%s error=%s", + sender, runbook_path, 
node_id, reason, + ) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.start", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="runner_error", + data={ + "sender": sender, + "runbook_path": runbook_path, + "node_id": node_id, + "error": reason, + "http_status": http_status, + }, + ) + return runbook_start_error_reply(reason) + + async def _handle_runbook_next( + self, + http_client: httpx.AsyncClient, + client: "MatrixClient", + cmd: ControlCommand, + sender: str, + room_id: str, + event_id: str, + ) -> str: + """ + M3.2: Execute !runbook next . + + Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/next. + Returns reply text for the control room. + Audits matrix.control.runbook.next. + """ + run_id = cmd.args[0].strip() if cmd.args else "" + if not run_id: + return next_usage_reply() + + # M3.4: per-run rate limit for !runbook next + if self._control_limiter is not None: + allowed_run, retry_run = self._control_limiter.check_run_next(run_id) + if not allowed_run: + scope = "run" + if self._on_control_rate_limited: + self._on_control_rate_limited(scope) + return rate_limited_reply(scope, retry_run) + + http_status: Optional[int] = None + try: + result = await _ctrl_runner.next_runbook_step( + http_client=http_client, + console_url=self._console_url, + control_token=self._control_token, + run_id=run_id, + operator_id=sender, + ) + http_status = 200 + step_type = result.get("type", "unknown") + step_index = result.get("step_index", 0) + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.next", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, + "run_id": run_id, + "step_index": step_index, + "step_type": step_type, + "http_status": http_status, + }, + ) + + if step_type == "manual": + return 
next_manual_reply( + run_id=run_id, + step_index=step_index, + steps_total=result.get("steps_total"), + title=result.get("title", ""), + instructions_md=result.get("instructions_md", ""), + ) + else: + # http_check / script + result_dict = result.get("result") or {} + duration_ms = int(result_dict.get("duration_ms", 0)) if isinstance(result_dict, dict) else 0 + return next_auto_reply( + run_id=run_id, + step_index=step_index, + action_type=step_type, + step_status=result.get("step_status", "ok"), + duration_ms=duration_ms or None, + completed=bool(result.get("completed", False)), + ) + + except _ctrl_runner.RunnerError as exc: + reason = str(exc) + logger.error( + "!runbook next failed: sender=%s run_id=%r error=%s", + sender, run_id, reason, + ) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.next", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="runner_error", + data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, + ) + return next_error_reply(run_id, reason) + + async def _handle_runbook_complete( + self, + http_client: httpx.AsyncClient, + client: "MatrixClient", + cmd: ControlCommand, + sender: str, + room_id: str, + event_id: str, + ) -> str: + """ + M3.2: Execute !runbook complete step= status=ok|warn|fail [notes=...] + + Calls sofiia-console POST /api/runbooks/internal/runs/{run_id}/steps/{n}/complete. + Audits matrix.control.runbook.complete. 
+ """ + run_id = cmd.args[0].strip() if cmd.args else "" + if not run_id: + return complete_usage_reply() + + # step kwarg required + step_raw = cmd.kwargs.get("step", "").strip() + if not step_raw or not step_raw.isdigit(): + return complete_usage_reply() + step_index = int(step_raw) + + # status kwarg required + status = cmd.kwargs.get("status", "").strip().lower() + if status not in ("ok", "warn", "fail", "skipped"): + return complete_usage_reply() + + # notes: kwarg or remaining positional args (joined with space) + notes = cmd.kwargs.get("notes", "").strip() + if not notes and len(cmd.args) > 1: + notes = " ".join(cmd.args[1:]) + notes = sanitize_notes(notes) # M3.4: strip control chars + truncate to MAX_NOTES_LEN + + http_status: Optional[int] = None + try: + result = await _ctrl_runner.complete_runbook_step( + http_client=http_client, + console_url=self._console_url, + control_token=self._control_token, + run_id=run_id, + step_index=step_index, + status=status, + notes=notes, + operator_id=sender, + ) + http_status = 200 + run_completed = bool(result.get("run_completed", False)) + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.complete", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, + "run_id": run_id, + "step_index": step_index, + "status": status, + "run_completed": run_completed, + "http_status": http_status, + }, + ) + return complete_ok_reply(run_id, step_index, status, run_completed) + + except _ctrl_runner.RunnerError as exc: + reason = str(exc) + logger.error( + "!runbook complete failed: sender=%s run_id=%r step=%d error=%s", + sender, run_id, step_index, reason, + ) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.complete", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", 
error_code="runner_error", + data={ + "sender": sender, + "run_id": run_id, + "step_index": step_index, + "error": reason, + "http_status": http_status, + }, + ) + return complete_error_reply(run_id, reason) + + async def _handle_runbook_status( + self, + http_client: httpx.AsyncClient, + client: "MatrixClient", + cmd: ControlCommand, + sender: str, + room_id: str, + event_id: str, + ) -> str: + """M3.3: !runbook status — GET run info + format status.""" + run_id = cmd.args[0].strip() if cmd.args else "" + if not run_id: + return status_usage_reply() + + http_status: Optional[int] = None + try: + result = await _ctrl_runner.get_runbook_run( + http_client=http_client, + console_url=self._console_url, + control_token=self._control_token, + run_id=run_id, + ) + http_status = 200 + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.status", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, "run_id": run_id, + "run_status": result.get("status"), + "http_status": http_status, + }, + ) + return status_reply(result) + + except _ctrl_runner.RunnerError as exc: + reason = str(exc) + logger.error("!runbook status failed: sender=%s run_id=%r error=%s", sender, run_id, reason) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.status", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="runner_error", + data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, + ) + return status_error_reply(run_id, reason) + + async def _handle_runbook_evidence( + self, + http_client: httpx.AsyncClient, + client: "MatrixClient", + cmd: ControlCommand, + sender: str, + room_id: str, + event_id: str, + ) -> str: + """M3.3: !runbook evidence — generate release evidence.""" + run_id = cmd.args[0].strip() if cmd.args 
else "" + if not run_id: + return evidence_usage_reply() + + http_status: Optional[int] = None + try: + result = await _ctrl_runner.generate_evidence( + http_client=http_client, + console_url=self._console_url, + control_token=self._control_token, + run_id=run_id, + ) + http_status = 200 + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.evidence", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, "run_id": run_id, + "evidence_path": result.get("evidence_path"), + "bytes": result.get("bytes"), + "http_status": http_status, + }, + ) + return evidence_reply(result) + + except _ctrl_runner.RunnerError as exc: + reason = str(exc) + logger.error("!runbook evidence failed: sender=%s run_id=%r error=%s", sender, run_id, reason) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.evidence", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="runner_error", + data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, + ) + return evidence_error_reply(run_id, reason) + + async def _handle_runbook_post_review( + self, + http_client: httpx.AsyncClient, + client: "MatrixClient", + cmd: ControlCommand, + sender: str, + room_id: str, + event_id: str, + ) -> str: + """M3.3: !runbook post_review — generate post-release review.""" + run_id = cmd.args[0].strip() if cmd.args else "" + if not run_id: + return post_review_usage_reply() + + http_status: Optional[int] = None + try: + result = await _ctrl_runner.generate_post_review( + http_client=http_client, + console_url=self._console_url, + control_token=self._control_token, + run_id=run_id, + ) + http_status = 200 + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.post_review", + 
agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={ + "sender": sender, "run_id": run_id, + "path": result.get("path"), + "bytes": result.get("bytes"), + "http_status": http_status, + }, + ) + return post_review_reply(result) + + except _ctrl_runner.RunnerError as exc: + reason = str(exc) + logger.error("!runbook post_review failed: sender=%s run_id=%r error=%s", sender, run_id, reason) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.control.runbook.post_review", + agent_id="control", node_id=self._node_id, + room_id=room_id, event_id=event_id, + status="error", error_code="runner_error", + data={"sender": sender, "run_id": run_id, "error": reason, "http_status": http_status}, + ) + return post_review_error_reply(run_id, reason) + # ── Worker ───────────────────────────────────────────────────────────────── async def _worker( @@ -760,12 +3216,76 @@ class MatrixIngressLoop: }, ) - # M2.1: session isolation per (room, agent) for mixed rooms - room_key = room_id.replace("!", "").replace(":", "_") - if is_mixed: - session_id = f"matrix:{room_key}:{agent_id}" - else: - session_id = f"matrix:{room_key}" + # Session Scope v2: canonical key + PII-safe sender hash. + # Scope is always room_agent for user-initiated messages. + # Control room messages never reach _process_entry (handled by _try_control). 
+ scope = SCOPE_ROOM_AGENT + session_id = _build_session_key(room_id, agent_id, scope=scope) + sender_hash = _sender_hash(sender) + + logger.debug( + "Session scope v2: session_key=%s scope=%s sender_hash=%s", + session_id, scope, sender_hash, + ) + + # M5.0: Node-aware routing + # Extract node=X kwarg from body (mixed rooms only, to avoid breaking direct rooms) + explicit_node: Optional[str] = None + effective_text = text + if is_mixed and self._node_policy is not None: + explicit_node, effective_text = extract_node_kwarg(text) + + # M6.0: look up dynamic policy store override for this room + store_override: Optional[str] = None + if self._policy_store is not None and self._policy_store.is_open: + try: + store_override = await asyncio.to_thread( + self._policy_store.get_override, room_id + ) + except Exception as exc: # noqa: BLE001 + logger.warning("PolicyStore get_override failed: %s", exc) + + node_res = ( + self._node_policy.resolve(room_id, explicit_node, store_override=store_override) + if self._node_policy is not None + else NodeResolution(node_id=self._node_id, source=NODE_SOURCE_DEFAULT) + ) + + if node_res.rejected_node: + logger.info( + "Node kwarg rejected: requested=%s allowed=%s room=%s agent=%s", + node_res.rejected_node, self._node_policy.allowed_nodes if self._node_policy else {}, room_id, agent_id, + ) + if self._on_node_rejected: + self._on_node_rejected(node_res.rejected_node) + txn_rej = MatrixClient.make_txn_id(room_id, event_id + "_node_rej") + allowed = self._node_policy.allowed_nodes if self._node_policy else frozenset() + reply_rej = node_rejected_reply(node_res.rejected_node, allowed) + try: + await client.send_text(room_id, reply_rej, txn_rej) + except Exception as exc: + logger.warning("Could not send node rejection reply: %s", exc) + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.route.node_rejected", + agent_id=agent_id, node_id=self._node_id, + room_id=room_id, event_id=event_id, + 
status="error", error_code="node_rejected", + data={"requested_node": node_res.rejected_node, "resolved_node": node_res.node_id}, + ) + # Continue with fallback node (do not drop the message) + + if self._on_node_selected: + self._on_node_selected(agent_id, node_res.node_id, node_res.source) + + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.route.node_selected", + agent_id=agent_id, node_id=node_res.node_id, + room_id=room_id, event_id=event_id, + status="ok", + data={"node_id": node_res.node_id, "source": node_res.source}, + ) # M2.2: per-room-agent concurrency cap (only for mixed rooms; single-agent rooms unaffected) _lock = self._get_concurrency_lock(room_id, agent_id) if is_mixed and self._mixed_concurrency_cap > 0 else None @@ -774,6 +3294,9 @@ class MatrixIngressLoop: try: await self._invoke_and_send( client, http_client, entry, session_id, wait_s, is_mixed, routing_reason, + sender_hash=sender_hash, scope=scope, + effective_node_id=node_res.node_id, node_source=node_res.source, + effective_text=effective_text, ) finally: if _lock is not None: @@ -788,76 +3311,212 @@ class MatrixIngressLoop: wait_s: float, is_mixed: bool, routing_reason: str, + sender_hash: str = "", + scope: str = SCOPE_ROOM_AGENT, + # M5.0: resolved node + effective_node_id: Optional[str] = None, + node_source: str = NODE_SOURCE_DEFAULT, + effective_text: Optional[str] = None, # text with node=X kwarg stripped ) -> None: """Inner: invoke Router + send reply (separated for concurrency lock wrapping).""" event = entry.event event_id = event.get("event_id", "") - text = event.get("content", {}).get("body", "").strip() + # Use effective_text if provided (node kwarg stripped), otherwise original body + text = effective_text if effective_text is not None else event.get("content", {}).get("body", "").strip() room_id = entry.room_id agent_id = entry.agent_id + node_id = effective_node_id if effective_node_id is not None else self._node_id - # H3: Invoke 
with latency + # H3 + M8.0 + M8.1: Invoke with latency tracking, soft-failover, and sticky routing t0 = time.monotonic() reply_text: Optional[str] = None invoke_ok = False invoke_duration_s = 0.0 + used_node_id = node_id # may change on failover - try: - reply_text = await _invoke_router( - http_client, self._router_url, - agent_id=agent_id, node_id=self._node_id, - prompt=text, session_id=session_id, - ) - invoke_ok = True - invoke_duration_s = time.monotonic() - t0 - if self._on_invoke_latency: - self._on_invoke_latency(agent_id, invoke_duration_s) + # M8.1: check sticky cache (skip primary if sticky is set for this room:agent) + sticky_key = make_sticky_key(room_id, agent_id) + sticky_node: Optional[str] = None + if node_source != NODE_SOURCE_EXPLICIT and self._sticky_cache is not None: + sticky_node = self._sticky_cache.get(sticky_key) + + async def _do_invoke(target_node: str, target_source: str) -> Optional[str]: + """Single invoke attempt; returns reply text or None on failure.""" + nonlocal invoke_duration_s + _t = time.monotonic() + try: + result = await _invoke_router( + http_client, self._router_url, + agent_id=agent_id, node_id=target_node, + prompt=text, session_id=session_id, + sender_hash=sender_hash, scope=scope, + node_source=target_source, + ) + invoke_duration_s = time.monotonic() - _t + if self._node_health_tracker is not None: + self._node_health_tracker.record_ok(target_node, invoke_duration_s) + if self._on_invoke_latency: + self._on_invoke_latency(agent_id, invoke_duration_s, target_node) + logger.info( + "Invoke ok: agent=%s node=%s event=%s reply_len=%d duration=%dms", + agent_id, target_node, event_id, len(result or ""), int(invoke_duration_s * 1000), + ) + return result + except httpx.HTTPStatusError as exc: + invoke_duration_s = time.monotonic() - _t + _reason = FAILOVER_REASON_HTTP_5XX if exc.response.status_code >= 500 else "http_4xx" + logger.error( + "Router HTTP %d agent=%s node=%s event=%s duration=%dms", + 
exc.response.status_code, agent_id, target_node, event_id, + int(invoke_duration_s * 1000), + ) + if self._node_health_tracker and exc.response.status_code >= 500: + self._node_health_tracker.record_error(target_node, _reason) + if self._on_gateway_error: + self._on_gateway_error(f"http_{exc.response.status_code}") + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.error", agent_id=agent_id, node_id=target_node, + room_id=room_id, event_id=event_id, + status="error", error_code=f"router_http_{exc.response.status_code}", + duration_ms=int(invoke_duration_s * 1000), + ) + if exc.response.status_code >= 500: + raise # eligible for failover + return None # 4xx: not a node issue, don't failover + + except (httpx.ConnectError, httpx.TimeoutException) as exc: + invoke_duration_s = time.monotonic() - _t + _reason = ( + FAILOVER_REASON_TIMEOUT + if isinstance(exc, httpx.TimeoutException) + else FAILOVER_REASON_NETWORK + ) + logger.error( + "Router network error agent=%s node=%s event=%s: %s", + agent_id, target_node, event_id, exc, + ) + if self._node_health_tracker: + self._node_health_tracker.record_error(target_node, _reason) + if self._on_gateway_error: + self._on_gateway_error("network_error") + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.error", agent_id=agent_id, node_id=target_node, + room_id=room_id, event_id=event_id, + status="error", error_code="router_network_error", + duration_ms=int(invoke_duration_s * 1000), + ) + raise # eligible for failover + + except Exception as exc: + invoke_duration_s = time.monotonic() - _t + logger.error( + "Unexpected invoke error agent=%s node=%s event=%s: %s", + agent_id, target_node, event_id, exc, + ) + if self._node_health_tracker: + self._node_health_tracker.record_error(target_node, "unexpected") + if self._on_gateway_error: + self._on_gateway_error("unexpected") + await _write_audit( + http_client, self._console_url, 
self._internal_token, + event="matrix.error", agent_id=agent_id, node_id=target_node, + room_id=room_id, event_id=event_id, + status="error", error_code="router_unexpected", + duration_ms=int(invoke_duration_s * 1000), + ) + return None # unexpected errors: no failover (could be code bug) + + if sticky_node is not None: + # M8.1: sticky path — route directly to known-good fallback, skip primary logger.info( - "Invoke ok: agent=%s event=%s reply_len=%d duration=%dms", - agent_id, event_id, len(reply_text or ""), int(invoke_duration_s * 1000), + "Sticky: routing %s→%s (skipping primary=%s) agent=%s event=%s", + sticky_key, sticky_node, node_id, agent_id, event_id, ) + try: + reply_text = await _do_invoke(sticky_node, NODE_SOURCE_DEFAULT) + invoke_ok = reply_text is not None + used_node_id = sticky_node + except Exception: # noqa: BLE001 + # Sticky node also failed — clear sticky and leave reply_text=None + self._sticky_cache.delete(sticky_key) # type: ignore[union-attr] + logger.warning( + "Sticky node %s failed for %s — cleared (agent=%s event=%s)", + sticky_node, sticky_key, agent_id, event_id, + ) + # M8.2: remove from DB as well + await self._sticky_persist_delete(sticky_key) - except httpx.HTTPStatusError as exc: - invoke_duration_s = time.monotonic() - t0 - logger.error( - "Router HTTP %d agent=%s event=%s duration=%dms", - exc.response.status_code, agent_id, event_id, int(invoke_duration_s * 1000), - ) - if self._on_gateway_error: - self._on_gateway_error(f"http_{exc.response.status_code}") - await _write_audit( - http_client, self._console_url, self._internal_token, - event="matrix.error", agent_id=agent_id, node_id=self._node_id, - room_id=room_id, event_id=event_id, - status="error", error_code=f"router_http_{exc.response.status_code}", - duration_ms=int(invoke_duration_s * 1000), - ) + else: + # Normal path: try primary; attempt failover on eligible errors + try: + reply_text = await _do_invoke(node_id, node_source) + invoke_ok = reply_text is not None + 
used_node_id = node_id - except (httpx.ConnectError, httpx.TimeoutException) as exc: - invoke_duration_s = time.monotonic() - t0 - logger.error("Router network error agent=%s event=%s: %s", agent_id, event_id, exc) - if self._on_gateway_error: - self._on_gateway_error("network_error") - await _write_audit( - http_client, self._console_url, self._internal_token, - event="matrix.error", agent_id=agent_id, node_id=self._node_id, - room_id=room_id, event_id=event_id, - status="error", error_code="router_network_error", - duration_ms=int(invoke_duration_s * 1000), - ) + except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError): + # Primary failed with a failover-eligible error. + # Failover only for non-explicit routing (explicit = user chose node). + if node_source == NODE_SOURCE_EXPLICIT: + logger.info( + "Node %s failed for explicit routing — no failover (agent=%s event=%s)", + node_id, agent_id, event_id, + ) + # reply_text stays None; error already audited + else: + # Attempt failover + fallback_node: Optional[str] = None + if self._node_health_tracker is not None and self._node_policy is not None: + fallback_node = self._node_health_tracker.pick_fallback( + node_id, self._node_policy.allowed_nodes + ) + elif self._node_policy is not None: + # No tracker — pick any other allowed node deterministically + others = sorted( + n for n in self._node_policy.allowed_nodes if n != node_id + ) + fallback_node = others[0] if others else None - except Exception as exc: - invoke_duration_s = time.monotonic() - t0 - logger.error("Unexpected invoke error agent=%s event=%s: %s", agent_id, event_id, exc) - if self._on_gateway_error: - self._on_gateway_error("unexpected") - await _write_audit( - http_client, self._console_url, self._internal_token, - event="matrix.error", agent_id=agent_id, node_id=self._node_id, - room_id=room_id, event_id=event_id, - status="error", error_code="router_unexpected", - duration_ms=int(invoke_duration_s * 1000), - ) + if 
fallback_node: + logger.warning( + "Failover: %s → %s agent=%s event=%s", + node_id, fallback_node, agent_id, event_id, + ) + try: + reply_text = await _do_invoke(fallback_node, NODE_SOURCE_DEFAULT) + invoke_ok = reply_text is not None + used_node_id = fallback_node + if invoke_ok: + # Fire failover callback and audit + if self._on_failover: + self._on_failover(node_id, fallback_node, "invoke_error") + await _write_audit( + http_client, self._console_url, self._internal_token, + event="matrix.node.failover", + agent_id=agent_id, node_id=fallback_node, + room_id=room_id, event_id=event_id, + status="ok", data={ + "from_node": node_id, + "to_node": fallback_node, + "original_source": node_source, + }, + ) + # M8.1: set sticky — future messages skip primary + if self._sticky_cache is not None: + self._sticky_cache.set(sticky_key, fallback_node) + scope = "mixed" if is_mixed else "direct" + if self._on_sticky_set: + self._on_sticky_set(fallback_node, scope) + logger.info( + "Sticky set: %s → %s scope=%s ttl=%.0fs", + sticky_key, fallback_node, scope, + self._sticky_cache.ttl_s, + ) + # M8.2: persist sticky to DB + await self._sticky_persist_set(sticky_key, fallback_node) + except Exception: # noqa: BLE001 + pass # errors already audited inside _do_invoke if not invoke_ok or not reply_text: if invoke_ok: @@ -881,7 +3540,7 @@ class MatrixIngressLoop: self._on_message_replied(room_id, agent_id, "ok") await _write_audit( http_client, self._console_url, self._internal_token, - event="matrix.agent.replied", agent_id=agent_id, node_id=self._node_id, + event="matrix.agent.replied", agent_id=agent_id, node_id=used_node_id, room_id=room_id, event_id=event_id, status="ok", duration_ms=int(send_duration_s * 1000), data={ @@ -891,6 +3550,8 @@ class MatrixIngressLoop: "queue_wait_ms": int(wait_s * 1000), "routing_reason": routing_reason, "is_mixed": is_mixed, + "node_source": node_source, + "failover": used_node_id != node_id, # M8.0: failover flag }, ) logger.info( @@ -907,7 
+3568,7 @@ class MatrixIngressLoop: self._on_gateway_error("matrix_send_error") await _write_audit( http_client, self._console_url, self._internal_token, - event="matrix.error", agent_id=agent_id, node_id=self._node_id, + event="matrix.error", agent_id=agent_id, node_id=node_id, room_id=room_id, event_id=event_id, status="error", error_code="matrix_send_failed", duration_ms=int(send_duration_s * 1000), diff --git a/services/matrix-bridge-dagi/app/main.py b/services/matrix-bridge-dagi/app/main.py index a297931e..f91713a7 100644 --- a/services/matrix-bridge-dagi/app/main.py +++ b/services/matrix-bridge-dagi/app/main.py @@ -33,6 +33,9 @@ except ImportError: # pragma: no cover from .config import BridgeConfig, load_config from .control import ControlConfig, parse_control_config +from .control_limiter import ControlRateLimiter +from .event_store import EventStore +from .node_policy import parse_node_policy from .ingress import MatrixIngressLoop from .mixed_routing import MixedRoomConfig, parse_mixed_room_map from .rate_limit import InMemoryRateLimiter @@ -69,7 +72,7 @@ if _PROM_OK: _invoke_latency = Histogram( "matrix_bridge_invoke_duration_seconds", "Latency of DAGI Router infer call", - ["agent_id"], + ["agent_id", "node_id"], # M5.1: per-node latency breakdown buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0], ) _send_latency = Histogram( @@ -80,7 +83,8 @@ if _PROM_OK: ) _bridge_up = Gauge( "matrix_bridge_up", - "1 if bridge started successfully", + "1 if bridge started successfully; 0 on config error", + ["node_id"], # M7.1: per-node label for multi-node deployments ) _rate_limiter_active_rooms = Gauge( "matrix_bridge_rate_limiter_active_rooms", @@ -106,10 +110,11 @@ if _PROM_OK: ["agent_id"], buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0], ) - # M2.2: Mixed room routing metrics - _routed_total = Counter( - "matrix_bridge_routed_total", - "Successful message routing by reason", + # M2.2: Mixed room routing — reason breakdown (slash/mention/name/default/direct) + # 
M7.1: Renamed from matrix_bridge_routed_total to avoid collision with M5.0 counter + _routing_reasons_total = Counter( + "matrix_bridge_routing_reasons_total", + "Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)", ["agent_id", "reason"], ) _route_rejected_total = Counter( @@ -127,20 +132,74 @@ if _PROM_OK: "Total control commands received from authorized operators", ["sender", "verb", "subcommand"], ) + _control_rate_limited_total = Counter( + "matrix_bridge_control_rate_limited_total", + "Total control commands rejected by rate limiter or cooldown", + ["scope"], + ) + _dedupe_persistent_hits_total = Counter( + "matrix_bridge_dedupe_persistent_hits_total", + "Total events dropped by persistent (SQLite) deduplication", + ["room_id"], + ) + _dedupe_persistent_inserts_total = Counter( + "matrix_bridge_dedupe_persistent_inserts_total", + "Total events marked as processed in persistent dedupe store", + ) + # M5.0: node-aware routing — primary routed counter (unique name, no collision) + _routed_total = Counter( + "matrix_bridge_routed_total", + "Total messages successfully routed, by agent, resolved node, and node source", + ["agent_id", "node_id", "source"], + ) + _node_rejected_total = Counter( + "matrix_bridge_node_rejected_total", + "Total messages with rejected (non-allowlisted) node kwarg", + ["node_id"], + ) + # M8.0: soft-failover metrics + _failover_total = Counter( + "matrix_bridge_failover_total", + "Total successful soft-failovers by node transition and reason", + ["from_node", "to_node", "reason"], + ) + _node_health_state = Gauge( + "matrix_bridge_node_health_state", + "Node health state: 1=healthy 0.5=degraded 0=down", + ["node_id"], + ) + # M8.1: sticky routing metrics + _sticky_set_total = Counter( + "matrix_bridge_sticky_node_total", + "Total sticky routing entries set after failover, by preferred node and scope", + ["node_id", "scope"], + ) + _sticky_active = Gauge( + "matrix_bridge_sticky_node_active", + 
"Current count of active sticky routing entries", + [], + ) # ── Startup state ───────────────────────────────────────────────────────────── _START_TIME = time.monotonic() _cfg: Optional[BridgeConfig] = None +# M5.1: in-memory per-node counters (lightweight, for !status reply) +from collections import defaultdict as _defaultdict +_node_stats: Dict[str, Dict[str, int]] = _defaultdict(lambda: {"routed": 0, "rejected": 0}) _config_error: Optional[str] = None _matrix_reachable: Optional[bool] = None _gateway_reachable: Optional[bool] = None _room_map: Optional[RoomMappingConfig] = None _mixed_room_config: Optional[MixedRoomConfig] = None _control_config: Optional[ControlConfig] = None +_event_store: Optional[EventStore] = None _rate_limiter: Optional[InMemoryRateLimiter] = None _ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size _ingress_task: Optional[asyncio.Task] = None _ingress_stop: Optional[asyncio.Event] = None +_sticky_cache: Optional[Any] = None # M8.1: StickyNodeCache instance +_confirm_store: Optional[Any] = None # M9.0: ConfirmStore instance +_dummy_http_client: Optional[Any] = None # M11: soak inject endpoint (debug only) async def _probe_url(url: str, timeout: float = 5.0) -> bool: @@ -230,7 +289,7 @@ async def lifespan(app_: Any): else: logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url) if _PROM_OK: - _bridge_up.set(1) + _bridge_up.labels(node_id=_cfg.node_id or "").set(1) # M7.1: labeled # Start ingress loop (fire-and-forget asyncio task) _has_rooms = (_room_map and _room_map.total_mappings > 0) or ( @@ -263,9 +322,9 @@ async def lifespan(app_: Any): _rate_limiter_active_rooms.set(stats["active_rooms"]) _rate_limiter_active_senders.set(stats["active_senders"]) - def _on_invoke_latency(agent_id: str, duration_s: float) -> None: + def _on_invoke_latency(agent_id: str, duration_s: float, node_id: str = "") -> None: if _PROM_OK: - _invoke_latency.labels(agent_id=agent_id).observe(duration_s) + 
_invoke_latency.labels(agent_id=agent_id, node_id=node_id or "unknown").observe(duration_s) def _on_send_latency(agent_id: str, duration_s: float) -> None: if _PROM_OK: @@ -287,7 +346,7 @@ async def lifespan(app_: Any): # M2.2 callbacks def _on_routed(agent_id: str, reason: str) -> None: if _PROM_OK: - _routed_total.labels(agent_id=agent_id, reason=reason).inc() + _routing_reasons_total.labels(agent_id=agent_id, reason=reason).inc() # M7.1: renamed def _on_route_rejected(room_id: str, reason: str) -> None: if _PROM_OK: @@ -300,6 +359,164 @@ async def lifespan(app_: Any): sender=sender, verb=verb, subcommand=subcommand ).inc() + # M3.4: control safety rate limiter + _control_limiter = ControlRateLimiter( + room_rpm=_cfg.control_room_rpm, + operator_rpm=_cfg.control_operator_rpm, + run_next_rpm=_cfg.control_run_next_rpm, + cooldown_s=_cfg.control_cooldown_s, + ) if _control_config and _control_config.is_enabled else None + + def _on_control_rate_limited(scope: str) -> None: + if _PROM_OK: + _control_rate_limited_total.labels(scope=scope).inc() + + # M2.3: Persistent event deduplication + _prune_task: Optional[asyncio.Task] = None + if _cfg.persistent_dedupe: + import os + db_path = os.path.join(_cfg.bridge_data_dir, "matrix_bridge.db") + _event_store = EventStore( + db_path=db_path, + ttl_h=_cfg.processed_events_ttl_h, + prune_batch=_cfg.processed_events_prune_batch, + ) + store_ok = await _event_store.open() + if store_ok: + logger.info( + "✅ Persistent dedupe: %s (ttl_h=%d)", + db_path, _cfg.processed_events_ttl_h, + ) + # Best-effort prune on startup + pruned = await _event_store.prune() + if pruned: + logger.info("Startup prune removed %d stale events", pruned) + # Periodic prune task + if _cfg.processed_events_prune_interval_s > 0: + async def _prune_loop() -> None: + while True: + await asyncio.sleep(_cfg.processed_events_prune_interval_s) + if _event_store: + await _event_store.prune() + _prune_task = asyncio.create_task(_prune_loop(), 
name="event_store_prune") + else: + logger.warning("⚠️ EventStore init failed — persistent dedupe disabled (degraded)") + _event_store = None + else: + logger.info("Persistent dedupe disabled (PERSISTENT_DEDUPE=0)") + + def _on_dedupe_hit(room_id: str, agent_id: str) -> None: + if _PROM_OK: + _dedupe_persistent_hits_total.labels(room_id=room_id).inc() + + def _on_dedupe_insert() -> None: + if _PROM_OK: + _dedupe_persistent_inserts_total.inc() + + # M5.0: node-aware routing policy + _node_policy = parse_node_policy( + raw_allowed=_cfg.bridge_allowed_nodes, + default_node=_cfg.bridge_default_node, + raw_room_map=_cfg.bridge_room_node_map, + ) + logger.info( + "✅ Node policy: default=%s allowed=%s room_overrides=%d", + _node_policy.default_node, + sorted(_node_policy.allowed_nodes), + len(_node_policy.room_node_map), + ) + + # M6.0: Persistent policy store for dynamic room-node overrides + _policy_store: Optional[Any] = None + try: + from .policy_store import PolicyStore as _PolicyStore + import os + _ps_path = os.path.join(_cfg.bridge_data_dir, "policy_overrides.db") + _policy_store = _PolicyStore(db_path=_ps_path) + _policy_store.open() + logger.info( + "✅ Policy store: %s (%d overrides)", + _ps_path, _policy_store.count_overrides(), + ) + except Exception as _ps_exc: + logger.warning("Policy store init failed (non-fatal): %s", _ps_exc) + _policy_store = None + + def _on_node_selected(agent_id: str, node_id: str, source: str) -> None: + if _PROM_OK: + _routed_total.labels(agent_id=agent_id, node_id=node_id, source=source).inc() + _node_stats[node_id]["routed"] += 1 + + def _on_node_rejected(rejected_node: str) -> None: + if _PROM_OK: + _node_rejected_total.labels(node_id=rejected_node).inc() + _node_stats[rejected_node]["rejected"] += 1 + + # M8.0: Node health tracker + soft-failover + from .node_health import NodeHealthTracker as _NodeHealthTracker, parse_node_health_config as _parse_nhc + _health_cfg = _parse_nhc( + fail_consecutive=_cfg.node_fail_consecutive, + 
lat_ewma_s=_cfg.node_lat_ewma_s, + ewma_alpha=_cfg.node_ewma_alpha, + ) + _node_health_tracker = _NodeHealthTracker(_health_cfg) + logger.info( + "✅ Node health tracker: fail_consecutive=%d lat_ewma_s=%.1f ewma_alpha=%.2f", + _cfg.node_fail_consecutive, _cfg.node_lat_ewma_s, _cfg.node_ewma_alpha, + ) + + def _on_failover(from_node: str, to_node: str, reason: str) -> None: + if _PROM_OK: + _failover_total.labels( + from_node=from_node, to_node=to_node, reason=reason + ).inc() + if _PROM_OK: + _update_health_gauges() + logger.info("⚡ Failover: %s → %s reason=%s", from_node, to_node, reason) + + def _update_health_gauges() -> None: + if not _PROM_OK or _node_health_tracker is None or _node_policy is None: + return + _STATE_MAP = {"healthy": 1.0, "degraded": 0.5, "down": 0.0} + for nid in _node_policy.allowed_nodes: + state = _node_health_tracker.state(nid) + _node_health_state.labels(node_id=nid).set(_STATE_MAP.get(state, 1.0)) + + # M8.1: Sticky failover cache + from .sticky_cache import StickyNodeCache as _StickyNodeCache + global _sticky_cache + if _cfg.failover_sticky_ttl_s > 0: + _sticky_cache = _StickyNodeCache(ttl_s=_cfg.failover_sticky_ttl_s) + logger.info("✅ Sticky failover cache: ttl=%.0fs", _cfg.failover_sticky_ttl_s) + else: + _sticky_cache = None + logger.info("ℹ️ Sticky failover disabled (FAILOVER_STICKY_TTL_S=0)") + + # M9.0: Confirm store + from .confirm_store import ConfirmStore as _ConfirmStore + global _confirm_store + if _cfg.confirm_ttl_s > 0: + _confirm_store = _ConfirmStore(ttl_s=_cfg.confirm_ttl_s) + logger.info("✅ Confirm store: ttl=%.0fs", _cfg.confirm_ttl_s) + else: + _confirm_store = None + logger.info("ℹ️ Confirm store disabled (CONFIRM_TTL_S=0)") + + # M11: debug inject client (only created when inject is enabled) + global _dummy_http_client + if _cfg.debug_inject_enabled and _HTTPX_OK: + _dummy_http_client = _httpx.AsyncClient(timeout=30.0) + logger.warning( + "⚠️ DEBUG_INJECT_ENABLED=true — synthetic event injection active. 
" + "NEVER use in production!" + ) + + def _on_sticky_set(node_id: str, scope: str) -> None: + if _PROM_OK: + _sticky_set_total.labels(node_id=node_id, scope=scope).inc() + if _sticky_cache is not None: + _sticky_active.labels().set(_sticky_cache.active_count()) + ingress = MatrixIngressLoop( matrix_homeserver_url=_cfg.matrix_homeserver_url, matrix_access_token=_cfg.matrix_access_token, @@ -330,7 +547,38 @@ async def lifespan(app_: Any): on_route_rejected=_on_route_rejected, control_config=_control_config, control_unauthorized_behavior=_cfg.control_unauthorized_behavior, + sofiia_control_token=_cfg.sofiia_control_token, + control_limiter=_control_limiter, on_control_command=_on_control_command, + on_control_rate_limited=_on_control_rate_limited, + event_store=_event_store, + on_dedupe_persistent_hit=_on_dedupe_hit, + on_dedupe_persistent_insert=_on_dedupe_insert, + # M4.0: agent discovery + discovery_rpm=_cfg.discovery_rpm, + # M5.0: node-aware routing + node_policy=_node_policy, + on_node_selected=_on_node_selected, + on_node_rejected=_on_node_rejected, + # M5.1: node stats getter for !status + node_stats_getter=lambda: {k: dict(v) for k, v in _node_stats.items()}, + # M6.0: dynamic room-node policy store + policy_store=_policy_store, + # M6.2: data directory for policy exports/imports + bridge_data_dir=_cfg.bridge_data_dir, + # M8.0: node health tracker + failover callback + node_health_tracker=_node_health_tracker, + on_failover=_on_failover, + # M8.1: sticky failover cache + sticky_cache=_sticky_cache, + on_sticky_set=_on_sticky_set, + # M8.2: HA persistence config + ha_health_snapshot_interval_s=_cfg.ha_health_snapshot_interval_s, + ha_health_max_age_s=_cfg.ha_health_max_age_s, + # M9.0: Two-step confirmation store + confirm_store=_confirm_store, + policy_export_retention_days=_cfg.policy_export_retention_days, + policy_history_limit=_cfg.policy_history_limit, ) logger.info( "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs", @@ -349,7 +597,8 @@ 
async def lifespan(app_: Any): _config_error = str(exc) logger.error("❌ Config error: %s", _config_error) if _PROM_OK: - _bridge_up.set(0) + _cfg_node = _cfg.node_id if _cfg else "" + _bridge_up.labels(node_id=_cfg_node or "").set(0) # M7.1: labeled yield # Shutdown: cancel ingress loop if _ingress_stop: @@ -360,6 +609,23 @@ async def lifespan(app_: Any): await asyncio.wait_for(_ingress_task, timeout=5.0) except (asyncio.CancelledError, asyncio.TimeoutError): pass + # Shutdown: cancel prune task + close EventStore + if "_prune_task" in dir() and _prune_task and not _prune_task.done(): # type: ignore[name-defined] + _prune_task.cancel() # type: ignore[name-defined] + if _event_store is not None: + await _event_store.close() + # M6.0: close policy store + if "_policy_store" in dir() and _policy_store is not None: # type: ignore[name-defined] + try: + _policy_store.close() # type: ignore[name-defined] + except Exception: # noqa: BLE001 + pass + # M11: close debug http client if open + if _dummy_http_client is not None: + try: + await _dummy_http_client.aclose() + except Exception: # noqa: BLE001 + pass logger.info("matrix-bridge-dagi shutting down") # ── App ─────────────────────────────────────────────────────────────────────── @@ -435,6 +701,89 @@ async def health() -> Dict[str, Any]: "operators_count": len(_control_config.operator_allowlist) if _control_config else 0, "unauthorized_behavior": _cfg.control_unauthorized_behavior, }, + "control_safety": { + "enabled": _cfg.control_room_rpm > 0 or _cfg.control_operator_rpm > 0, + "room_rpm": _cfg.control_room_rpm, + "operator_rpm": _cfg.control_operator_rpm, + "run_next_rpm": _cfg.control_run_next_rpm, + "cooldown_s": _cfg.control_cooldown_s, + }, + "persistent_dedupe": _event_store.as_health_dict() if _event_store else { + "enabled": False, + "db_path": None, + "ttl_h": _cfg.processed_events_ttl_h, + "ok": False, + "last_prune_at": None, + "pruned_rows_last": 0, + }, + # M6.0: policy store health + "policy_store": 
_health_policy_store_dict(), + # M8.1: sticky failover cache health + "sticky_cache": _health_sticky_dict(), + # M8.2: HA state persistence info + "ha_state": _health_ha_dict(), + # M9.0: confirm store + "confirm_store": _health_confirm_dict(), + } + + +def _health_confirm_dict() -> Dict[str, Any]: + """Return confirm store info for /health endpoint (M9.0).""" + if _confirm_store is None: + return {"enabled": False} + return { + "enabled": True, + "pending": _confirm_store.pending_count(), + "ttl_s": _confirm_store.ttl_s, + } + + +def _health_ha_dict() -> Dict[str, Any]: + """Return HA persistence info for /health endpoint (M8.2).""" + if _ingress_loop is None: + return {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0} + try: + s = _ingress_loop.get_status() + return { + "sticky_loaded": s.get("ha_sticky_loaded", 0), + "health_loaded": s.get("ha_health_loaded", False), + "snapshot_interval_s": s.get("ha_health_snapshot_interval_s", 0), + } + except Exception: # noqa: BLE001 + return {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0} + + +def _health_sticky_dict() -> Dict[str, Any]: + """Return sticky failover cache health for /health endpoint (M8.1).""" + if _sticky_cache is None: + return {"enabled": False, "active_keys": 0, "ttl_s": 0} + return { + "enabled": True, + "active_keys": _sticky_cache.active_count(), + "ttl_s": _sticky_cache.ttl_s, + } + + +def _health_policy_store_dict() -> Dict[str, Any]: + """Return policy store health info for /health endpoint.""" + try: + if _ingress_loop is not None: + s = _ingress_loop.get_status() + return { + "ok": s.get("policy_store_ok", False), + "path": s.get("policy_store_path"), + "overrides_count": s.get("policy_overrides_count", 0), + "agent_overrides_count": s.get("policy_agent_overrides_count", 0), # M6.1 + "last_export_at": s.get("policy_last_export_at"), # M6.2 + "last_import_at": s.get("policy_last_import_at"), # M6.2 + "db_mtime": s.get("policy_db_mtime"), # M6.2 + } + 
except Exception: # noqa: BLE001 + pass + return { + "ok": False, "path": None, + "overrides_count": 0, "agent_overrides_count": 0, + "last_export_at": None, "last_import_at": None, "db_mtime": None, } @@ -464,6 +813,101 @@ async def bridge_mappings() -> Dict[str, Any]: } +# ── Debug / Soak (M11) ──────────────────────────────────────────────────────── +@app.post("/v1/debug/inject_event") +async def debug_inject_event(body: Dict[str, Any]) -> Dict[str, Any]: + """ + Synthetic event injection for soak/load testing. + + Enabled ONLY when DEBUG_INJECT_ENABLED=true (never in production). + + Body: { "room_id": "!room:server", "event": { Matrix event dict } } + The event is enqueued directly into the ingress loop, bypassing Matrix poll. + + Returns: { "ok": bool, "enqueued": bool, "room_id": str, "event_id": str } + """ + if _cfg is None or not _cfg.debug_inject_enabled: + return Response( # type: ignore[return-value] + '{"ok":false,"error":"debug inject disabled"}', + status_code=403, + media_type="application/json", + ) + if _ingress_loop is None: + return {"ok": False, "error": "ingress loop not running"} + + room_id = body.get("room_id", "") + event = body.get("event", {}) + if not room_id or not event: + return {"ok": False, "error": "missing room_id or event"} + + # Ensure event has minimum required fields for ingress processing + if not event.get("event_id"): + import time as _time + event["event_id"] = f"!inject-{int(_time.monotonic() * 1e6)}" + if not event.get("type"): + event["type"] = "m.room.message" + if not event.get("content"): + event["content"] = {"msgtype": "m.text", "body": event.get("body", "soak-ping")} + + # Build a minimal sync_resp that looks like a real Matrix /sync response + # so _enqueue_from_sync can pick it up via extract_room_messages. + # We bypass Matrix polling by directly calling _try_enqueue on the right mapping. 
+ enqueued = False + try: + # Find the matching room mapping (direct rooms only for soak) + mapping = None + if _ingress_loop._room_map is not None: + for m in _ingress_loop._room_map.mappings: + if m.room_id == room_id: + mapping = m + break + + if mapping is None: + return {"ok": False, "error": f"no mapping for room_id={room_id!r}"} + + # Build a minimal stub Matrix client — replies are discarded for soak events + from .matrix_client import MatrixClient + + class _SoakMatrixClient(MatrixClient): # type: ignore[misc] + """No-op Matrix client for synthetic soak events.""" + def __init__(self) -> None: # noqa: D107 + pass # skip real __init__ + + async def mark_seen(self, room_id: str, event_id: str) -> None: # type: ignore[override] + pass + + async def send_text(self, room_id: str, text: str, # type: ignore[override] + txn: Optional[str] = None) -> None: + pass + + _stub_client = _SoakMatrixClient() + + if _dummy_http_client is None: + return {"ok": False, "error": "debug http client not initialised"} + + await _ingress_loop._try_enqueue( + _stub_client, # type: ignore[arg-type] + _ingress_loop._queue, + _dummy_http_client, + event, + mapping, + ) + enqueued = True + except Exception as exc: # noqa: BLE001 + return {"ok": False, "error": str(exc), "enqueued": False} + + return { + "ok": True, + "enqueued": enqueued, + "room_id": room_id, + "event_id": event.get("event_id"), + } + + +async def _noop_send(room_id: str, text: str, txn: Optional[str] = None) -> None: + """Discard replies from injected soak events.""" + + # ── Metrics ─────────────────────────────────────────────────────────────────── @app.get("/metrics") async def metrics(): diff --git a/services/matrix-bridge-dagi/app/metrics_contract.py b/services/matrix-bridge-dagi/app/metrics_contract.py new file mode 100644 index 00000000..ce346def --- /dev/null +++ b/services/matrix-bridge-dagi/app/metrics_contract.py @@ -0,0 +1,224 @@ +""" +Metrics Contract — Matrix Bridge DAGI +Phase M7.1 + +Single source of 
truth for all Prometheus metric names and their label sets. +Used by: + - main.py (registers metrics against this contract) + - tests/test_matrix_bridge_m71_metrics_contract.py (static validation) + - ops/prometheus/alerts/matrix-bridge-dagi.rules.yml (PromQL expressions) + - ops/grafana/dashboards/matrix-bridge-dagi.json (panel queries) + +Format: + METRICS_CONTRACT: Dict[metric_name, MetricSpec] + +MetricSpec fields: + kind : "counter" | "histogram" | "gauge" + labels : list of label names (empty list = no labels) + help : one-line description + phase : originating milestone +""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, List + + +@dataclass(frozen=True) +class MetricSpec: + kind: str # "counter" | "histogram" | "gauge" + labels: List[str] # label names; empty = no labels + help: str + phase: str = "M1" # originating milestone for traceability + + +# ── Contract ────────────────────────────────────────────────────────────────── + +METRICS_CONTRACT: Dict[str, MetricSpec] = { + + # ── Core message traffic ────────────────────────────────────────────────── + "matrix_bridge_messages_received_total": MetricSpec( + kind="counter", + labels=["room_id", "agent_id"], + help="Total Matrix messages received", + phase="M1", + ), + "matrix_bridge_messages_replied_total": MetricSpec( + kind="counter", + labels=["room_id", "agent_id", "status"], + help="Total agent replies sent to Matrix (status=ok|error)", + phase="M1", + ), + "matrix_bridge_rate_limited_total": MetricSpec( + kind="counter", + labels=["room_id", "agent_id", "limit_type"], + help="Messages dropped by rate limiter", + phase="H1", + ), + "matrix_bridge_gateway_errors_total": MetricSpec( + kind="counter", + labels=["error_type"], + help="Bridge errors by stage: sync_error, network_error, http_, matrix_send_error, unexpected", + phase="M1", + ), + + # ── Latency histograms ──────────────────────────────────────────────────── + 
"matrix_bridge_invoke_duration_seconds": MetricSpec( + kind="histogram", + labels=["agent_id", "node_id"], + help="Latency of DAGI Router infer call, per agent and node", + phase="H3", + ), + "matrix_bridge_send_duration_seconds": MetricSpec( + kind="histogram", + labels=["agent_id"], + help="Latency of Matrix send_text call", + phase="H3", + ), + "matrix_bridge_queue_wait_seconds": MetricSpec( + kind="histogram", + labels=["agent_id"], + help="Time between enqueue and worker start processing", + phase="H3", + ), + + # ── Queue ───────────────────────────────────────────────────────────────── + "matrix_bridge_queue_size": MetricSpec( + kind="gauge", + labels=[], + help="Current number of pending items in the work queue", + phase="H2", + ), + "matrix_bridge_queue_dropped_total": MetricSpec( + kind="counter", + labels=["room_id", "agent_id"], + help="Messages dropped because queue was full", + phase="H2", + ), + + # ── Rate limiter gauges ─────────────────────────────────────────────────── + "matrix_bridge_rate_limiter_active_rooms": MetricSpec( + kind="gauge", + labels=[], + help="Rooms with activity in the current rate-limit window", + phase="H1", + ), + "matrix_bridge_rate_limiter_active_senders": MetricSpec( + kind="gauge", + labels=[], + help="Senders with activity in the current rate-limit window", + phase="H1", + ), + + # ── Routing ─────────────────────────────────────────────────────────────── + "matrix_bridge_routing_reasons_total": MetricSpec( + kind="counter", + labels=["agent_id", "reason"], + help="Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)", + phase="M2.2", + ), + "matrix_bridge_route_rejected_total": MetricSpec( + kind="counter", + labels=["room_id", "reason"], + help="Messages rejected during routing (unknown agent, bad slash, no mapping, etc.)", + phase="M2.2", + ), + "matrix_bridge_active_room_agent_locks": MetricSpec( + kind="gauge", + labels=[], + help="Number of room-agent pairs currently holding a 
concurrency lock", + phase="M2.2", + ), + + # ── Control channel ─────────────────────────────────────────────────────── + "matrix_bridge_control_commands_total": MetricSpec( + kind="counter", + labels=["sender", "verb", "subcommand"], + help="Total control commands received from authorized operators", + phase="M3.0", + ), + "matrix_bridge_control_rate_limited_total": MetricSpec( + kind="counter", + labels=["scope"], + help="Total control commands rejected by rate limiter or cooldown", + phase="M3.4", + ), + + # ── Persistent deduplication ───────────────────────────────────────────── + "matrix_bridge_dedupe_persistent_hits_total": MetricSpec( + kind="counter", + labels=["room_id"], + help="Total events dropped by persistent (SQLite) deduplication", + phase="M2.3", + ), + "matrix_bridge_dedupe_persistent_inserts_total": MetricSpec( + kind="counter", + labels=[], + help="Total events marked as processed in persistent dedupe store", + phase="M2.3", + ), + + # ── Node-aware routing (M5.0) ───────────────────────────────────────────── + "matrix_bridge_routed_total": MetricSpec( + kind="counter", + labels=["agent_id", "node_id", "source"], + help="Total messages successfully routed, by agent, resolved node, and node source", + phase="M5.0", + ), + "matrix_bridge_node_rejected_total": MetricSpec( + kind="counter", + labels=["node_id"], + help="Total messages with rejected (non-allowlisted) node kwarg", + phase="M5.0", + ), + + # ── Bridge health (M7.1) ────────────────────────────────────────────────── + "matrix_bridge_up": MetricSpec( + kind="gauge", + labels=["node_id"], + help="1 if bridge started successfully; 0 on config error", + phase="M7.1", + ), + + # ── Soft-failover (M8.0) ───────────────────────────────────────────────── + "matrix_bridge_failover_total": MetricSpec( + kind="counter", + labels=["from_node", "to_node", "reason"], + help="Total successful soft-failovers by node transition and reason", + phase="M8.0", + ), + "matrix_bridge_node_health_state": 
def build_override_config(
    base_config: MixedRoomConfig,
    room_id: str,
    agents: List[str],
    default_agent: str,
) -> MixedRoomConfig:
    """
    M6.1: Derive a MixedRoomConfig where ``room_id`` uses a dynamic override.

    The ``rooms`` mapping of ``base_config`` is shallow-copied, the entry for
    ``room_id`` is added (or replaced) in the copy, and a new MixedRoomConfig
    is built from it.  ``base_config`` itself is never mutated, so the shared
    base configuration stays intact for every other caller.

    Args:
        base_config:   configuration providing all other rooms unchanged.
        room_id:       room whose entry is overridden.
        agents:        agent ids allowed in the overridden room.
        default_agent: agent used when a message names no agent.

    Returns:
        A new MixedRoomConfig containing the override.
    """
    override = MixedRoom(
        room_id=room_id,
        agents=agents,
        default_agent=default_agent,
    )
    # Copy first: the base mapping is shared and must not see the override.
    merged_rooms = dict(base_config.rooms)
    merged_rooms[room_id] = override
    return MixedRoomConfig(rooms=merged_rooms)
@dataclass(frozen=True)
class NodeHealthConfig:
    """
    Immutable thresholds that decide when a node counts as degraded or down.

    fail_consecutive : consecutive failures at which a node becomes "down"
    lat_ewma_s       : EWMA latency (seconds) at which a node becomes "degraded"
    ewma_alpha       : EWMA smoothing factor in (0, 1]; larger reacts faster

    Raises:
        ValueError: if any threshold is outside its valid range.  Checks run
        in the order ewma_alpha, fail_consecutive, lat_ewma_s.
    """
    fail_consecutive: int = 3
    lat_ewma_s: float = 12.0
    ewma_alpha: float = 0.3

    def __post_init__(self) -> None:
        alpha = self.ewma_alpha
        # Written as a negated range test so that NaN is rejected as well.
        alpha_in_range = 0 < alpha <= 1
        if not alpha_in_range:
            raise ValueError(f"ewma_alpha must be in (0, 1], got {alpha}")

        failures = self.fail_consecutive
        if failures < 1:
            raise ValueError(f"fail_consecutive must be ≥ 1, got {failures}")

        latency_limit = self.lat_ewma_s
        if latency_limit <= 0:
            raise ValueError(f"lat_ewma_s must be > 0, got {latency_limit}")
def record_ok(self, node_id: str, latency_s: float) -> None:
    """
    Register one successful invoke of ``node_id``.

    Under the tracker lock this bumps the success counter, clears the
    consecutive-failure streak, stamps ``last_ok_ts`` with
    ``time.monotonic()`` and folds ``latency_s`` into the EWMA latency.
    The first sample seeds the EWMA directly; later samples are blended
    with weight ``ewma_alpha`` from the tracker config.
    """
    with self._lock:
        entry = self._get_or_create(node_id)

        previous = entry.ewma_latency_s
        if previous is None:
            # No history yet: the first observation is the estimate.
            smoothed = latency_s
        else:
            weight = self._cfg.ewma_alpha
            smoothed = weight * latency_s + (1 - weight) * previous

        entry.ewma_latency_s = smoothed
        entry.last_ok_ts = time.monotonic()
        entry.consecutive_failures = 0
        entry.invoke_ok_total += 1
def as_info_dict(self, node_id: str) -> dict:
    """
    Return a JSON-safe status dict for one node.

    Unseen nodes are reported as healthy with zeroed counters.  For tracked
    nodes the EWMA latency is rounded to 3 decimals; it is ``None`` only
    when no latency has been recorded yet.  A genuine estimate of ``0.0``
    is reported as ``0.0`` (a truthiness test would wrongly turn it into
    ``None``).

    Keys: node_id, state, invoke_ok, invoke_err, consecutive_failures,
    ewma_latency_s, last_ok_ts, last_err_ts.
    """
    with self._lock:
        ns = self._nodes.get(node_id)
        if ns is None:
            return {
                "node_id": node_id,
                "state": NODE_STATE_HEALTHY,
                "invoke_ok": 0,
                "invoke_err": 0,
                "consecutive_failures": 0,
                "ewma_latency_s": None,
                "last_ok_ts": None,
                "last_err_ts": None,
            }
        ewma = ns.ewma_latency_s
        return {
            "node_id": node_id,
            "state": self._state_unlocked(node_id),
            "invoke_ok": ns.invoke_ok_total,
            "invoke_err": ns.invoke_err_total,
            "consecutive_failures": ns.consecutive_failures,
            # Explicit None check: 0.0 is a valid latency estimate.
            "ewma_latency_s": round(ewma, 3) if ewma is not None else None,
            "last_ok_ts": ns.last_ok_ts,
            "last_err_ts": ns.last_err_ts,
        }
def _state_unlocked(self, node_id: str) -> str:
    """
    Derive the health state of ``node_id``; the caller must hold the lock.

    Precedence: a failure streak at or above ``fail_consecutive`` means
    "down"; otherwise an EWMA latency at or above ``lat_ewma_s`` means
    "degraded"; everything else, including nodes never seen, is "healthy".
    """
    entry = self._nodes.get(node_id)
    if entry is None:
        # Nodes without any recorded invoke are assumed healthy.
        return NODE_STATE_HEALTHY

    cfg = self._cfg
    if entry.consecutive_failures >= cfg.fail_consecutive:
        return NODE_STATE_DOWN

    latency = entry.ewma_latency_s
    is_slow = latency is not None and latency >= cfg.lat_ewma_s
    return NODE_STATE_DEGRADED if is_slow else NODE_STATE_HEALTHY
@dataclass
class NodePolicy:
    """
    Rules for choosing the target node of a message.

    Attributes:
        allowed_nodes: valid node names (uppercase).
        default_node:  node used when nothing more specific applies.
        room_node_map: optional static per-room mapping (room_id → node_id).
    """
    allowed_nodes: FrozenSet[str]
    default_node: str
    room_node_map: Dict[str, str] = field(default_factory=dict)

    def resolve(
        self,
        room_id: str,
        explicit_node: Optional[str] = None,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """
        Pick the node for a message in ``room_id``.

        Precedence, highest first: ``explicit_node`` (user-supplied kwarg),
        ``store_override`` (dynamic PolicyStore entry), the static
        ``room_node_map`` entry, then ``default_node``.

        An explicit node outside ``allowed_nodes`` is not an error: the
        lower-priority sources decide the node, and the refused name is
        returned (uppercased) in ``rejected_node``.
        """
        if explicit_node is None:
            return self._fallback(room_id, store_override)

        requested = explicit_node.upper()
        if requested in self.allowed_nodes:
            return NodeResolution(node_id=requested, source=NODE_SOURCE_EXPLICIT)

        # Explicit request refused: use the best remaining source and report it.
        best = self._fallback(room_id, store_override)
        return NodeResolution(
            node_id=best.node_id,
            source=best.source,
            rejected_node=requested,
        )

    def _fallback(
        self,
        room_id: str,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """Resolve without an explicit kwarg: store → static room map → default."""
        candidates = []
        if store_override is not None:
            candidates.append((store_override, NODE_SOURCE_STORE))
        if room_id in self.room_node_map:
            candidates.append((self.room_node_map[room_id], NODE_SOURCE_ROOM_MAP))

        # The first candidate that names an allowlisted node wins.
        for raw_node, source in candidates:
            node = raw_node.upper()
            if node in self.allowed_nodes:
                return NodeResolution(node_id=node, source=source)

        return NodeResolution(node_id=self.default_node, source=NODE_SOURCE_DEFAULT)

    def as_info_dict(self) -> dict:
        """Summarise the policy for health/ops snapshots (room ids are not exposed, only their count)."""
        return dict(
            default_node=self.default_node,
            allowed_nodes=sorted(self.allowed_nodes),
            room_overrides=len(self.room_node_map),
        )
+ + raw_allowed: "NODA1,NODA2" + default_node: "NODA1" + raw_room_map: "!roomA:server=NODA2;!roomB:server=NODA1" + """ + default = default_node.strip().upper() or "NODA1" + + allowed: FrozenSet[str] = frozenset( + n.strip().upper() for n in raw_allowed.split(",") if n.strip() + ) + if not allowed: + allowed = frozenset([default]) + elif default not in allowed: + # default must always be reachable + allowed = allowed | frozenset([default]) + + room_map: Dict[str, str] = {} + for entry in raw_room_map.split(";"): + entry = entry.strip() + if not entry or "=" not in entry: + continue + room_id_raw, node_raw = entry.split("=", 1) + room_id = room_id_raw.strip() + node = node_raw.strip().upper() + if room_id and node: + room_map[room_id] = node + + return NodePolicy( + allowed_nodes=allowed, + default_node=default, + room_node_map=room_map, + ) + + +def extract_node_kwarg(text: str) -> Tuple[Optional[str], str]: + """ + Extract 'node=X' kwarg from message text. + + Returns (node_id_or_None, cleaned_text_without_kwarg). + Preserves the rest of the message — no other transformations. + + Example: + "/sofiia node=NODA2 Hello!" 
def node_rejected_reply(requested: str, allowed: FrozenSet[str]) -> str:
    """
    Build the reply sent when a user asks for a node outside the allowlist.

    The message has three lines: the rejected name, the allowed node names
    (sorted, each in backticks, comma-separated) and a fixed usage example.
    """
    quoted = [f"`{name}`" for name in sorted(allowed)]
    lines = [
        f"⚠️ Unknown node: `{requested}`",
        f"Allowed: {', '.join(quoted)}",
        "_Example: `/sofiia node=NODA1 Hello!`_",
    ]
    return "\n".join(lines)
+ +Security: + - operator identity is stored as SHA-256[:16] (no PII verbatim) + - room_id values validated against basic Matrix ID format by callers + - SQLite WAL mode, PRAGMA synchronous=NORMAL for durability+speed +""" +from __future__ import annotations + +import datetime +import glob as _glob +import hashlib +import json as _json +import logging +import os as _os +import sqlite3 +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +POLICY_SNAPSHOT_VERSION = 1 +POLICY_IMPORT_MODE_MERGE = "merge" +POLICY_IMPORT_MODE_REPLACE = "replace" + +logger = logging.getLogger(__name__) + +_DDL = """ +CREATE TABLE IF NOT EXISTS room_node_overrides ( + room_id TEXT PRIMARY KEY, + node_id TEXT NOT NULL, + updated_at INTEGER NOT NULL, + updated_by_hash TEXT NOT NULL +); +""" + +_IDX_TS = """ +CREATE INDEX IF NOT EXISTS idx_rno_updated_at + ON room_node_overrides (updated_at DESC); +""" + +# M6.1: Dynamic mixed room agent overrides +_DDL_AGENT = """ +CREATE TABLE IF NOT EXISTS room_agent_overrides ( + room_id TEXT PRIMARY KEY, + agents_csv TEXT NOT NULL, + default_agent TEXT, + updated_at INTEGER NOT NULL, + updated_by_hash TEXT NOT NULL +); +""" + +_IDX_AGENT_TS = """ +CREATE INDEX IF NOT EXISTS idx_rao_updated_at + ON room_agent_overrides (updated_at DESC); +""" + +# M8.2: HA persistence tables +_DDL_STICKY = """ +CREATE TABLE IF NOT EXISTS sticky_node_cache ( + key TEXT PRIMARY KEY, + node_id TEXT NOT NULL, + expires_at INTEGER NOT NULL, + updated_at INTEGER NOT NULL +); +""" + +_DDL_NODE_HEALTH = """ +CREATE TABLE IF NOT EXISTS node_health_state ( + node_id TEXT PRIMARY KEY, + ewma_latency_s REAL, + consecutive_failures INTEGER NOT NULL DEFAULT 0, + updated_at INTEGER NOT NULL +); +""" + +# M10.2: Policy change history table +_DDL_POLICY_CHANGES = """ +CREATE TABLE IF NOT EXISTS policy_changes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + applied_at INTEGER NOT NULL, + verb TEXT NOT NULL 
@dataclass
class PolicyChange:
    """One recorded policy apply event (import or restore)."""
    id: int
    applied_at: int        # unix timestamp (seconds)
    verb: str              # e.g. "policy.import", "policy.restore"
    mode: str              # "merge" or "replace"
    source_file: str       # snapshot filename (basename only)
    sender_hash: str       # truncated hash of the operator's sender id
    diff_summary: str      # human-readable summary of the change
    is_destructive: bool   # True when the change deleted something
    node_added: int
    node_updated: int
    node_deleted: int
    agent_added: int
    agent_updated: int
    agent_deleted: int

    def when_str(self) -> str:
        """Format ``applied_at`` as a UTC ``YYYY-MM-DD HH:MM`` string."""
        moment = datetime.datetime.fromtimestamp(
            self.applied_at, datetime.timezone.utc
        )
        return moment.strftime("%Y-%m-%d %H:%M")

    def changes_short(self) -> str:
        """
        Render the non-zero counters compactly, e.g. '+2n -1n +1a'.

        '+', '~' and '-' mean added, updated and deleted; the suffix 'n'
        marks node overrides and 'a' agent overrides.  Returns
        'no changes' when every counter is zero.
        """
        counters = (
            ("+", self.node_added, "n"),
            ("~", self.node_updated, "n"),
            ("-", self.node_deleted, "n"),
            ("+", self.agent_added, "a"),
            ("~", self.agent_updated, "a"),
            ("-", self.agent_deleted, "a"),
        )
        rendered = [
            f"{sign}{count}{suffix}"
            for sign, count, suffix in counters
            if count
        ]
        if not rendered:
            return "no changes"
        return " ".join(rendered)
def close(self) -> None:
    """
    Close the SQLite connection, best-effort and idempotent.

    The connection reference is cleared before closing, so the store is
    marked closed even if ``Connection.close()`` fails.  A failure never
    propagates (shutdown must not crash), but it is logged rather than
    silently discarded so a broken close stays diagnosable.
    Calling close() on an already-closed store does nothing.
    """
    conn, self._conn = self._conn, None
    if conn is None:
        return
    try:
        conn.close()
    except Exception:  # noqa: BLE001 — deliberate best-effort on shutdown
        logger.warning(
            "PolicyStore: error while closing %s", self._db_path, exc_info=True
        )
def get_agent_override(
    self, room_id: str
) -> Optional[Tuple[List[str], Optional[str]]]:
    """
    Look up the agent override stored for ``room_id``.

    Returns:
        ``None`` when the room has no override row; otherwise a tuple
        ``(agents, default_agent)``.  ``agents`` is the stored CSV split on
        commas with whitespace stripped and empty items dropped.  An empty
        or NULL ``default_agent`` is returned as ``None``.
    """
    self._require_open()
    cursor = self._conn.execute(  # type: ignore[union-attr]
        "SELECT agents_csv, default_agent FROM room_agent_overrides WHERE room_id = ?",
        (room_id,),
    )
    record = cursor.fetchone()
    if record is None:
        return None

    agents_csv, default_agent = record
    agents = []
    for piece in agents_csv.split(","):
        name = piece.strip()
        if name:
            agents.append(name)
    return agents, (default_agent or None)
def remove_agent_from_room(
    self, room_id: str, agent: str, updated_by: str
) -> Tuple[bool, Optional[str]]:
    """
    Drop ``agent`` from the agent override of ``room_id``.

    Returns:
        ``(True, None)`` when the agent was removed, or ``(False, message)``
        when the room has no override or the agent is not listed in it.

    Removing the last agent deletes the whole override row.  If the removed
    agent was the default, the first remaining agent becomes the default.
    """
    self._require_open()
    current = self.get_agent_override(room_id)
    if not current:
        return False, "No agent override set for this room"

    members, default_agent = current
    if agent not in members:
        return False, f"Agent `{agent}` not in override list"

    remaining = [name for name in members if name != agent]
    if remaining:
        if default_agent == agent:
            # The default is going away: promote the first survivor.
            default_agent = remaining[0]
        self.set_agent_override(room_id, remaining, default_agent, updated_by)
    else:
        # Nothing left to route to: remove the override entirely.
        self.delete_agent_override(room_id)
    return True, None
def load_sticky_entries(self) -> List[Tuple[str, str, int]]:
    """
    Return all non-expired sticky entries as (key, node_id, expires_at_unix).

    Expiry is judged against the current unix time (UTC); rows whose
    ``expires_at`` is not strictly in the future are left out.

    The open-store check goes through ``_require_open()`` like the other
    store methods, instead of an ``assert`` that ``python -O`` strips.
    """
    self._require_open()
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    rows = self._conn.execute(  # type: ignore[union-attr]
        "SELECT key, node_id, expires_at FROM sticky_node_cache WHERE expires_at > ?",
        (now,),
    ).fetchall()
    return [(key, node_id, int(expires_at)) for key, node_id, expires_at in rows]
def load_node_health(self, max_age_s: int = 600) -> Optional[Dict[str, Any]]:
    """
    Load the persisted node health snapshot if every row is fresh enough.

    A row is fresh when ``updated_at >= now - max_age_s`` (unix seconds,
    UTC).  The snapshot is all-or-nothing: one stale row discards it.

    Returns:
        ``None`` when the table is empty or any row is stale; otherwise
        ``{node_id: {"ewma_latency_s", "consecutive_failures", "updated_at"}}``.

    The open-store check goes through ``_require_open()`` like the other
    store methods, instead of an ``assert`` that ``python -O`` strips.
    """
    self._require_open()
    now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
    cutoff = now - max_age_s
    rows = self._conn.execute(  # type: ignore[union-attr]
        """SELECT node_id, ewma_latency_s, consecutive_failures, updated_at
           FROM node_health_state""",
    ).fetchall()
    if not rows:
        return None

    result: Dict[str, Any] = {}
    for node_id, ewma, consec, updated_at in rows:
        updated_at = int(updated_at)
        if updated_at < cutoff:
            logger.debug(
                "HA: node health snapshot for %s is stale (age=%ds > max=%ds) — ignoring",
                node_id, now - updated_at, max_age_s,
            )
            return None  # Any stale node → discard whole snapshot
        result[node_id] = {
            "ewma_latency_s": ewma,
            "consecutive_failures": int(consec),
            "updated_at": updated_at,
        }
    # rows was non-empty and none were stale, so result has entries here.
    return result
+ """ + self._require_open() + cur = self._conn.execute( # type: ignore[union-attr] + """INSERT INTO policy_changes + (applied_at, verb, mode, source_file, sender_hash, + diff_summary, is_destructive, + node_added, node_updated, node_deleted, + agent_added, agent_updated, agent_deleted) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + int(time.time()), verb, mode, source_file, sender_hash, + diff_summary, int(is_destructive), + node_added, node_updated, node_deleted, + agent_added, agent_updated, agent_deleted, + ), + ) + row_id: int = cur.lastrowid # type: ignore[assignment] + + # Prune oldest rows beyond limit + if history_limit > 0: + self._conn.execute( # type: ignore[union-attr] + """DELETE FROM policy_changes + WHERE id NOT IN ( + SELECT id FROM policy_changes ORDER BY id DESC LIMIT ? + )""", + (history_limit,), + ) + + logger.debug( + "Recorded policy change id=%d verb=%s mode=%s file=%s destr=%s", + row_id, verb, mode, source_file, is_destructive, + ) + return row_id + + def list_policy_changes(self, limit: int = 10) -> List[PolicyChange]: + """ + Return the most-recent `limit` policy change records, newest first. + + Hard cap: min(limit, _LIST_HARD_LIMIT). 
+ """ + self._require_open() + safe_limit = min(max(1, limit), _LIST_HARD_LIMIT) + rows = self._conn.execute( # type: ignore[union-attr] + """SELECT id, applied_at, verb, mode, source_file, sender_hash, + diff_summary, is_destructive, + node_added, node_updated, node_deleted, + agent_added, agent_updated, agent_deleted + FROM policy_changes + ORDER BY id DESC LIMIT ?""", + (safe_limit,), + ).fetchall() + return [ + PolicyChange( + id=r[0], applied_at=r[1], verb=r[2], mode=r[3], + source_file=r[4], sender_hash=r[5], diff_summary=r[6], + is_destructive=bool(r[7]), + node_added=r[8], node_updated=r[9], node_deleted=r[10], + agent_added=r[11], agent_updated=r[12], agent_deleted=r[13], + ) + for r in rows + ] + + def get_policy_changes_count(self) -> int: + """Return the total number of recorded policy changes.""" + self._require_open() + row = self._conn.execute( # type: ignore[union-attr] + "SELECT COUNT(*) FROM policy_changes" + ).fetchone() + return row[0] if row else 0 + + def get_policy_change_by_id(self, change_id: int) -> Optional["PolicyChange"]: + """Return a single PolicyChange by its DB auto-increment id, or None.""" + self._require_open() + row = self._conn.execute( # type: ignore[union-attr] + """SELECT id, applied_at, verb, mode, source_file, sender_hash, + diff_summary, is_destructive, + node_added, node_updated, node_deleted, + agent_added, agent_updated, agent_deleted + FROM policy_changes WHERE id = ?""", + (change_id,), + ).fetchone() + if row is None: + return None + return PolicyChange( + id=row[0], applied_at=row[1], verb=row[2], mode=row[3], + source_file=row[4], sender_hash=row[5], diff_summary=row[6], + is_destructive=bool(row[7]), + node_added=row[8], node_updated=row[9], node_deleted=row[10], + agent_added=row[11], agent_updated=row[12], agent_deleted=row[13], + ) + + # ── M10.0: Auto-backup + retention prune ────────────────────────────────── + + def write_autobackup( + self, + exports_dir: str, + sender_hash8: str, + nonce: str, + ) -> 
tuple[str, str]: + """ + Export all current policy to a timestamped autobackup file. + + Filename: policy-autobackup---.json + + Returns (file_path, content_hash_prefix[:8]). + Non-atomic write is acceptable: file is complete before we return. + """ + self._require_open() + ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + filename = f"{_AUTOBACKUP_PREFIX}{ts}-{sender_hash8[:8]}-{nonce}.json" + file_path = _os.path.join(exports_dir, filename) + + snapshot = self.export_all() + content = _json.dumps(snapshot, sort_keys=True, ensure_ascii=True) + with open(file_path, "w", encoding="utf-8") as fh: + fh.write(content) + + content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()[:8] + logger.debug("Auto-backup written: %s hash=%s", filename, content_hash) + return file_path, content_hash + + def prune_exports( + self, + exports_dir: str, + retention_days: int, + dry_run: bool = True, + ) -> PruneResult: + """ + Remove policy export files older than retention_days. + + Only files matching 'policy-*.json' in exports_dir are considered — + never recursing into subdirectories. + + dry_run=True: compute stats without deleting. + dry_run=False: actually delete matching files. + + Returns PruneResult with filenames, total_bytes, oldest_mtime. 
+ """ + if retention_days <= 0: + return PruneResult(files_to_delete=[], total_bytes=0) + + cutoff = time.time() - retention_days * 86400 + pattern = _os.path.join(exports_dir, _EXPORT_GLOB) + to_delete: List[str] = [] + total_bytes = 0 + oldest_mtime: Optional[float] = None + + for fpath in sorted(_glob.glob(pattern)): + # Safety: only process files directly in exports_dir (no subdirs) + if _os.path.dirname(fpath) != _os.path.abspath(exports_dir): + continue + try: + stat = _os.stat(fpath) + except OSError: + continue + if stat.st_mtime < cutoff: + basename = _os.path.basename(fpath) + to_delete.append(basename) + total_bytes += stat.st_size + if oldest_mtime is None or stat.st_mtime < oldest_mtime: + oldest_mtime = stat.st_mtime + + if not dry_run: + for basename in to_delete: + fpath = _os.path.join(exports_dir, basename) + try: + _os.remove(fpath) + logger.info("Pruned policy export: %s", basename) + except OSError as exc: + logger.warning("Could not prune %s: %s", basename, exc) + + return PruneResult( + files_to_delete=to_delete, + total_bytes=total_bytes, + oldest_mtime=oldest_mtime, + ) + + def export_all(self) -> Dict[str, Any]: + """ + Export all overrides as a JSON-serializable snapshot dict. + + Format (version 1): + { + "version": 1, + "created_at": "Z", + "room_node_overrides": [{room_id, node_id, updated_at, updated_by}, ...], + "room_agent_overrides": [{room_id, agents, default_agent, updated_at, updated_by}, ...] 
+ } + """ + self._require_open() + node_rows = self._conn.execute( # type: ignore[union-attr] + "SELECT room_id, node_id, updated_at, updated_by_hash FROM room_node_overrides ORDER BY room_id" + ).fetchall() + agent_rows = self._conn.execute( # type: ignore[union-attr] + """SELECT room_id, agents_csv, default_agent, updated_at, updated_by_hash + FROM room_agent_overrides ORDER BY room_id""" + ).fetchall() + return { + "version": POLICY_SNAPSHOT_VERSION, + "created_at": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"), + "room_node_overrides": [ + {"room_id": r[0], "node_id": r[1], "updated_at": r[2], "updated_by": r[3]} + for r in node_rows + ], + "room_agent_overrides": [ + { + "room_id": r[0], + "agents": [a.strip() for a in r[1].split(",") if a.strip()], + "default_agent": r[2] or None, + "updated_at": r[3], + "updated_by": r[4], + } + for r in agent_rows + ], + } + + def compute_import_diff( + self, + data: Dict[str, Any], + mode: str = POLICY_IMPORT_MODE_MERGE, + ) -> ImportDiff: + """ + Compute what would change if data were imported (dry-run, M9.1). + + Returns an ImportDiff with counts and up to _SAMPLE_KEYS_MAX changed rooms. + Non-destructive — never modifies the database. 
+ """ + if data.get("version") != POLICY_SNAPSHOT_VERSION: + raise ValueError(f"Unsupported snapshot version: {data.get('version')!r}") + + self._require_open() + + existing_nodes: Dict[str, str] = { + r[0]: r[1] + for r in self._conn.execute( # type: ignore[union-attr] + "SELECT room_id, node_id FROM room_node_overrides" + ).fetchall() + } + existing_agents: Dict[str, str] = { + r[0]: r[1] + for r in self._conn.execute( # type: ignore[union-attr] + "SELECT room_id, agents_csv FROM room_agent_overrides" + ).fetchall() + } + + file_nodes: Dict[str, str] = { + e["room_id"]: e["node_id"] + for e in (data.get("room_node_overrides") or []) + if "room_id" in e and "node_id" in e + } + file_agents: Dict[str, Any] = { + e["room_id"]: e + for e in (data.get("room_agent_overrides") or []) + if "room_id" in e and "agents" in e + } + + node_added = sum(1 for r in file_nodes if r not in existing_nodes) + node_updated = sum(1 for r in file_nodes if r in existing_nodes) + agent_added = sum(1 for r in file_agents if r not in existing_agents) + agent_updated = sum(1 for r in file_agents if r in existing_agents) + node_deleted = 0 + agent_deleted = 0 + if mode == POLICY_IMPORT_MODE_REPLACE: + node_deleted = sum(1 for r in existing_nodes if r not in file_nodes) + agent_deleted = sum(1 for r in existing_agents if r not in file_agents) + + # Collect up to _SAMPLE_KEYS_MAX affected rooms (deterministic: sorted) + affected: List[str] = [] + seen: set[str] = set() + for rid in list(file_nodes) + list(file_agents): + if rid not in seen: + affected.append(rid) + seen.add(rid) + if mode == POLICY_IMPORT_MODE_REPLACE: + for rid in list(existing_nodes) + list(existing_agents): + if rid not in seen and (rid not in file_nodes or rid not in file_agents): + affected.append(rid) + seen.add(rid) + sample_keys = sorted(affected)[:_SAMPLE_KEYS_MAX] + + return ImportDiff( + node_added=node_added, + node_updated=node_updated, + node_deleted=node_deleted, + agent_added=agent_added, + 
agent_updated=agent_updated, + agent_deleted=agent_deleted, + sample_keys=sample_keys, + is_replace=(mode == POLICY_IMPORT_MODE_REPLACE), + ) + + def import_snapshot( + self, + data: Dict[str, Any], + mode: str = POLICY_IMPORT_MODE_MERGE, + dry_run: bool = True, + imported_by: str = "import", + ) -> Dict[str, int]: + """ + Import a policy snapshot. + + mode=merge: upsert entries from file; never delete existing entries not in file. + mode=replace: upsert entries from file AND delete entries in DB not present in file. + + dry_run=True: compute stats without modifying DB. + + Returns: + { + "node_added": N, "node_updated": N, "node_deleted": N, + "agent_added": N, "agent_updated": N, "agent_deleted": N, + } + """ + if data.get("version") != POLICY_SNAPSHOT_VERSION: + raise ValueError(f"Unsupported snapshot version: {data.get('version')!r}") + + self._require_open() + + # ── Current DB state ────────────────────────────────────────────────── + existing_nodes: Dict[str, str] = { + r[0]: r[1] + for r in self._conn.execute( # type: ignore[union-attr] + "SELECT room_id, node_id FROM room_node_overrides" + ).fetchall() + } + existing_agents: Dict[str, str] = { + r[0]: r[1] + for r in self._conn.execute( # type: ignore[union-attr] + "SELECT room_id, agents_csv FROM room_agent_overrides" + ).fetchall() + } + + # ── Compute deltas ──────────────────────────────────────────────────── + file_nodes = { + e["room_id"]: e["node_id"] + for e in (data.get("room_node_overrides") or []) + if "room_id" in e and "node_id" in e + } + file_agents = { + e["room_id"]: e + for e in (data.get("room_agent_overrides") or []) + if "room_id" in e and "agents" in e + } + + node_added = sum(1 for r in file_nodes if r not in existing_nodes) + node_updated = sum(1 for r in file_nodes if r in existing_nodes) + agent_added = sum(1 for r in file_agents if r not in existing_agents) + agent_updated = sum(1 for r in file_agents if r in existing_agents) + + node_deleted = 0 + agent_deleted = 0 + if mode == 
POLICY_IMPORT_MODE_REPLACE: + node_deleted = sum(1 for r in existing_nodes if r not in file_nodes) + agent_deleted = sum(1 for r in existing_agents if r not in file_agents) + + stats = { + "node_added": node_added, + "node_updated": node_updated, + "node_deleted": node_deleted, + "agent_added": agent_added, + "agent_updated": agent_updated, + "agent_deleted": agent_deleted, + } + + if dry_run: + return stats + + # ── Apply changes ───────────────────────────────────────────────────── + now = int(time.time()) + by_hash = _hash_sender(imported_by) + + for entry in (data.get("room_node_overrides") or []): + rid = entry.get("room_id") + nid = entry.get("node_id") + if rid and nid: + self._conn.execute( # type: ignore[union-attr] + """ + INSERT INTO room_node_overrides (room_id, node_id, updated_at, updated_by_hash) + VALUES (?, ?, ?, ?) + ON CONFLICT(room_id) DO UPDATE SET + node_id = excluded.node_id, + updated_at = excluded.updated_at, + updated_by_hash = excluded.updated_by_hash + """, + (rid, nid, now, by_hash), + ) + + for entry in (data.get("room_agent_overrides") or []): + rid = entry.get("room_id") + agents = entry.get("agents") or [] + def_agent = entry.get("default_agent") or (agents[0] if agents else None) + if rid and agents: + agents_csv = ",".join(sorted(set(agents))) + self._conn.execute( # type: ignore[union-attr] + """ + INSERT INTO room_agent_overrides + (room_id, agents_csv, default_agent, updated_at, updated_by_hash) + VALUES (?, ?, ?, ?, ?) 
+ ON CONFLICT(room_id) DO UPDATE SET + agents_csv = excluded.agents_csv, + default_agent = excluded.default_agent, + updated_at = excluded.updated_at, + updated_by_hash = excluded.updated_by_hash + """, + (rid, agents_csv, def_agent, now, by_hash), + ) + + if mode == POLICY_IMPORT_MODE_REPLACE: + file_node_rooms = set(file_nodes.keys()) + file_agent_rooms = set(file_agents.keys()) + for room_id in existing_nodes: + if room_id not in file_node_rooms: + self._conn.execute( # type: ignore[union-attr] + "DELETE FROM room_node_overrides WHERE room_id = ?", (room_id,) + ) + for room_id in existing_agents: + if room_id not in file_agent_rooms: + self._conn.execute( # type: ignore[union-attr] + "DELETE FROM room_agent_overrides WHERE room_id = ?", (room_id,) + ) + + return stats + + # ── Internal ─────────────────────────────────────────────────────────────── + + def _require_open(self) -> None: + if self._conn is None: + raise RuntimeError("PolicyStore is not open — call open() first") diff --git a/services/matrix-bridge-dagi/app/sticky_cache.py b/services/matrix-bridge-dagi/app/sticky_cache.py new file mode 100644 index 00000000..36eedbce --- /dev/null +++ b/services/matrix-bridge-dagi/app/sticky_cache.py @@ -0,0 +1,149 @@ +""" +StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover. + +After a successful failover (primary → fallback), the bridge remembers the +fallback node per room:agent pair for `ttl_s` seconds. Subsequent messages +for the same pair skip the primary entirely and go directly to the known-good +fallback, preventing oscillation ("flapping") while the primary recovers. + +Key design +---------- + key = "{room_id}:{agent_id}" + ttl = FAILOVER_STICKY_TTL_S (default 300 s) + +Priority in routing (when source != explicit): + 1. sticky cache (temporary) + 2. store override (desired long-term policy) + 3. env room_node_map + 4. env default + +Sticky expires naturally; recovery is automatic — no operator action needed. 
+If the sticky node also fails, the entry is removed and normal failover logic +takes over again. + +Thread safety +------------- + Uses threading.RLock — safe to call from asyncio callbacks without to_thread. +""" +from __future__ import annotations + +import logging +import threading +import time +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + +_DEFAULT_TTL_S = 300.0 + + +@dataclass +class _StickyEntry: + node_id: str + expires_at: float # time.monotonic() deadline + + +class StickyNodeCache: + """ + In-memory sticky node preference cache. + + Usage: + cache = StickyNodeCache(ttl_s=300) + + # After successful failover: + cache.set("!room:srv:sofiia", "NODA2") + + # Before routing the next message: + node = cache.get("!room:srv:sofiia") # → "NODA2" or None if expired/missing + + # If sticky node also fails: + cache.delete("!room:srv:sofiia") + """ + + def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None: + if ttl_s <= 0: + raise ValueError(f"ttl_s must be > 0, got {ttl_s}") + self._ttl_s = ttl_s + self._cache: Dict[str, _StickyEntry] = {} + self._lock = threading.RLock() + + # ── Public API ──────────────────────────────────────────────────────────── + + def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None: + """Set sticky preference; overwrites existing entry.""" + ttl = ttl_s if ttl_s is not None else self._ttl_s + with self._lock: + self._cache[key] = _StickyEntry( + node_id=node_id, + expires_at=time.monotonic() + ttl, + ) + logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, ttl) + + def get(self, key: str) -> Optional[str]: + """ + Return sticky node_id if entry exists and not expired; else None. + Expired entries are lazily removed on access. 
+ """ + with self._lock: + entry = self._cache.get(key) + if entry is None: + return None + if time.monotonic() >= entry.expires_at: + del self._cache[key] + logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id) + return None + return entry.node_id + + def delete(self, key: str) -> bool: + """Remove an entry. Returns True if it existed.""" + with self._lock: + existed = key in self._cache + self._cache.pop(key, None) + if existed: + logger.debug("StickyCache.delete: key=%s", key) + return existed + + def active_count(self) -> int: + """Count of non-expired entries (best-effort; no eviction).""" + now = time.monotonic() + with self._lock: + return sum(1 for e in self._cache.values() if e.expires_at > now) + + def active_entries(self) -> List[Tuple[str, str, float]]: + """ + Return (key, node_id, ttl_remaining_s) for all non-expired entries. + Useful for ops visibility in !status/!nodes. + """ + now = time.monotonic() + with self._lock: + result = [] + for k, e in self._cache.items(): + remaining = e.expires_at - now + if remaining > 0: + result.append((k, e.node_id, remaining)) + return sorted(result, key=lambda x: x[0]) + + def cleanup(self) -> int: + """ + Remove all expired entries. + Call periodically (e.g. in a background task) to reclaim memory. + Returns count of removed entries. 
+ """ + now = time.monotonic() + with self._lock: + expired_keys = [k for k, e in self._cache.items() if e.expires_at <= now] + for k in expired_keys: + del self._cache[k] + if expired_keys: + logger.debug("StickyCache.cleanup: removed %d expired entries", len(expired_keys)) + return len(expired_keys) + + @property + def ttl_s(self) -> float: + return self._ttl_s + + +def make_sticky_key(room_id: str, agent_id: str) -> str: + """Canonical sticky cache key for a room+agent pair.""" + return f"{room_id}:{agent_id}" diff --git a/services/matrix-bridge-dagi/requirements.txt b/services/matrix-bridge-dagi/requirements.txt index 208a74dc..a4f539e6 100644 --- a/services/matrix-bridge-dagi/requirements.txt +++ b/services/matrix-bridge-dagi/requirements.txt @@ -4,3 +4,4 @@ httpx>=0.25.0 python-dotenv>=1.0.0 prometheus-client>=0.20.0 pyyaml>=6.0 +aiosqlite>=0.19.0