Phase6/7 runtime + Gitea smoke gate setup #1

Merged
daarion-admin merged 214 commits from codex/sync-node1-runtime into main 2026-03-05 10:38:18 -08:00
21 changed files with 9123 additions and 93 deletions
Showing only changes of commit 82d5ff2a4f - Show all commits

View File

@@ -67,6 +67,41 @@ services:
- BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-} - BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-}
# "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts) # "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts)
- CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore} - CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore}
# ── M3.1: Runbook runner token ───────────────────────────────────────
# X-Control-Token for POST /api/runbooks/internal/runs (sofiia-console)
- SOFIIA_CONTROL_TOKEN=${SOFIIA_CONTROL_TOKEN:-}
# M3.4: Control channel safety — rate limiting + cooldown
- CONTROL_ROOM_RPM=${CONTROL_ROOM_RPM:-60}
- CONTROL_OPERATOR_RPM=${CONTROL_OPERATOR_RPM:-30}
- CONTROL_RUN_NEXT_RPM=${CONTROL_RUN_NEXT_RPM:-20}
- CONTROL_COOLDOWN_S=${CONTROL_COOLDOWN_S:-2.0}
# M2.3: Persistent event deduplication
- PERSISTENT_DEDUPE=${PERSISTENT_DEDUPE:-1}
- BRIDGE_DATA_DIR=${BRIDGE_DATA_DIR:-/app/data}
- PROCESSED_EVENTS_TTL_H=${PROCESSED_EVENTS_TTL_H:-48}
- PROCESSED_EVENTS_PRUNE_BATCH=${PROCESSED_EVENTS_PRUNE_BATCH:-5000}
- PROCESSED_EVENTS_PRUNE_INTERVAL_S=${PROCESSED_EVENTS_PRUNE_INTERVAL_S:-3600}
# M4.0: agent discovery
- DISCOVERY_RPM=${DISCOVERY_RPM:-20}
# M5.0: node-aware routing
- BRIDGE_ALLOWED_NODES=${BRIDGE_ALLOWED_NODES:-NODA1}
- BRIDGE_DEFAULT_NODE=${BRIDGE_DEFAULT_NODE:-NODA1}
- BRIDGE_ROOM_NODE_MAP=${BRIDGE_ROOM_NODE_MAP:-}
# M8.0: Node health + soft-failover thresholds
- NODE_FAIL_CONSEC=${NODE_FAIL_CONSEC:-3}
- NODE_LAT_EWMA_S=${NODE_LAT_EWMA_S:-12.0}
- NODE_EWMA_ALPHA=${NODE_EWMA_ALPHA:-0.3}
# M8.1: Sticky failover TTL (0 = disabled)
- FAILOVER_STICKY_TTL_S=${FAILOVER_STICKY_TTL_S:-300}
# M8.2: HA state persistence
- HA_HEALTH_SNAPSHOT_INTERVAL_S=${HA_HEALTH_SNAPSHOT_INTERVAL_S:-60}
- HA_HEALTH_MAX_AGE_S=${HA_HEALTH_MAX_AGE_S:-600}
# M9.0: Two-step confirmation TTL for dangerous commands (0 = disabled)
- CONFIRM_TTL_S=${CONFIRM_TTL_S:-120}
- POLICY_EXPORT_RETENTION_DAYS=${POLICY_EXPORT_RETENTION_DAYS:-30}
- POLICY_HISTORY_LIMIT=${POLICY_HISTORY_LIMIT:-100}
# M11 soak: NEVER set to true in production
- DEBUG_INJECT_ENABLED=${DEBUG_INJECT_ENABLED:-false}
# ── M2.2: Mixed room guard rails ──────────────────────────────────── # ── M2.2: Mixed room guard rails ────────────────────────────────────
# Fail-fast if any room defines more agents than this # Fail-fast if any room defines more agents than this

View File

@@ -0,0 +1,986 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "9.0.0"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
},
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "doc",
"includeVars": false,
"keepTime": false,
"tags": [],
"targetBlank": true,
"title": "Runbook",
"tooltip": "matrix-bridge-dagi-ops.md",
"type": "link",
"url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md"
}
],
"panels": [
{
"id": 1,
"type": "stat",
"title": "Bridge Up",
"gridPos": {
"x": 0,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(matrix_bridge_up)",
"legendFormat": "up (all nodes)",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none",
"textMode": "auto",
"orientation": "auto"
},
"fieldConfig": {
"defaults": {
"mappings": [
{
"type": "value",
"options": {
"0": {
"text": "DOWN",
"color": "red"
},
"1": {
"text": "UP",
"color": "green"
}
}
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "red",
"value": null
},
{
"color": "green",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
}
},
"overrides": []
}
},
{
"id": 2,
"type": "stat",
"title": "Queue Size",
"gridPos": {
"x": 4,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_queue_size",
"legendFormat": "queue",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "area",
"textMode": "auto"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 100
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 3,
"type": "stat",
"title": "Active Rate-Limiter Rooms",
"gridPos": {
"x": 8,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_rate_limiter_active_rooms",
"legendFormat": "rooms",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 4,
"type": "stat",
"title": "Active Room-Agent Locks",
"gridPos": {
"x": 12,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "matrix_bridge_active_room_agent_locks",
"legendFormat": "locks",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "value",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"unit": "short",
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 5,
"type": "stat",
"title": "Drops (5m)",
"gridPos": {
"x": 16,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "dropped",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 1
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 6,
"type": "stat",
"title": "Errors (5m)",
"gridPos": {
"x": 20,
"y": 0,
"w": 4,
"h": 4
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "errors",
"refId": "A",
"instant": true
}
],
"options": {
"reduceOptions": {
"calcs": [
"lastNotNull"
]
},
"colorMode": "background",
"graphMode": "none"
},
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 1
},
{
"color": "red",
"value": 5
}
]
},
"color": {
"mode": "thresholds"
},
"unit": "short"
},
"overrides": []
}
},
{
"id": 10,
"type": "timeseries",
"title": "Traffic: Received & Replied (rate/5m)",
"gridPos": {
"x": 0,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "received",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))",
"legendFormat": "replied ok",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))",
"legendFormat": "replied error",
"refId": "C"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "replied error"
},
"properties": [
{
"id": "color",
"value": {
"mode": "fixed",
"fixedColor": "red"
}
}
]
}
]
}
},
{
"id": 11,
"type": "timeseries",
"title": "Errors / Drops / Rate-Limited (rate/5m)",
"gridPos": {
"x": 12,
"y": 4,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))",
"legendFormat": "gw_error: {{ error_type }}",
"refId": "A"
},
{
"expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))",
"legendFormat": "queue_dropped",
"refId": "B"
},
{
"expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))",
"legendFormat": "rate_limited",
"refId": "C"
},
{
"expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))",
"legendFormat": "route_rejected: {{ reason }}",
"refId": "D"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 15,
"drawStyle": "line",
"stacking": {
"mode": "none"
},
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 20,
"type": "timeseries",
"title": "Invoke Latency P50 / P95 by Node",
"gridPos": {
"x": 0,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p50 {{ node_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
"legendFormat": "p95 {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 10
},
{
"color": "red",
"value": 20
}
]
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 21,
"type": "timeseries",
"title": "Queue Wait P50 / P95",
"gridPos": {
"x": 12,
"y": 12,
"w": 12,
"h": 8
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p50 {{ agent_id }}",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
"legendFormat": "wait p95 {{ agent_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "s",
"custom": {
"lineWidth": 2,
"fillOpacity": 5,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 30,
"type": "timeseries",
"title": "Node Routing: Routed & Rejected by Node (rate/5m)",
"gridPos": {
"x": 0,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))",
"legendFormat": "routed {{ node_id }}",
"refId": "A"
},
{
"expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))",
"legendFormat": "rejected {{ node_id }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 31,
"type": "timeseries",
"title": "Persistent Dedupe Hits / Inserts (rate/10m)",
"gridPos": {
"x": 12,
"y": 20,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))",
"legendFormat": "dedupe_hits",
"refId": "A"
},
{
"expr": "rate(matrix_bridge_dedupe_persistent_inserts_total[10m])",
"legendFormat": "dedupe_inserts",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 40,
"type": "timeseries",
"title": "Control Commands (rate/5m)",
"gridPos": {
"x": 0,
"y": 27,
"w": 12,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))",
"legendFormat": "cmd {{ verb }}",
"refId": "A"
},
{
"expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))",
"legendFormat": "ctrl_ratelimited {{ scope }}",
"refId": "B"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 41,
"type": "timeseries",
"title": "Traffic by Agent (received rate/5m)",
"gridPos": {
"x": 12,
"y": 27,
"w": 24,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))",
"legendFormat": "{{ agent_id }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max",
"last"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
},
{
"id": 42,
"type": "timeseries",
"title": "Routing Reasons by Agent (rate/5m)",
"description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown",
"gridPos": {
"x": 0,
"y": 34,
"w": 24,
"h": 7
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"targets": [
{
"expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))",
"legendFormat": "{{ agent_id }} / {{ reason }}",
"refId": "A"
}
],
"options": {
"tooltip": {
"mode": "multi",
"sort": "desc"
},
"legend": {
"displayMode": "table",
"placement": "bottom",
"calcs": [
"mean",
"max"
]
}
},
"fieldConfig": {
"defaults": {
"unit": "reqps",
"custom": {
"lineWidth": 2,
"fillOpacity": 10,
"drawStyle": "line",
"spanNulls": false
},
"color": {
"mode": "palette-classic"
}
},
"overrides": []
}
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": [
"matrix-bridge",
"dagi",
"daarion"
],
"templating": {
"list": [
{
"current": {},
"hide": 0,
"includeAll": false,
"label": "Datasource",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"regex": "",
"type": "datasource"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "UTC",
"title": "Matrix Bridge DAGI",
"uid": "matrix-bridge-dagi-v1",
"version": 1
}

View File

@@ -0,0 +1,158 @@
---
# Prometheus alert rules — Matrix Bridge DAGI
# Phase M7.1 (metrics contract hardening)
#
# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
#
# Usage:
# promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
# docker run --rm -v $PWD:/w prom/prometheus:latest \
# promtool check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
groups:
- name: matrix_bridge_dagi
interval: 30s
rules:
# ── A1: Bridge process down ─────────────────────────────────────────────
# metric: matrix_bridge_up{node_id} (Gauge, M7.1: labeled per node)
- alert: BridgeDown
expr: sum(matrix_bridge_up) == 0
for: 1m
labels:
severity: critical
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Matrix Bridge DAGI is down"
description: >
`matrix_bridge_up` == 0 across all nodes — bridge process has not
started or has crashed. No messages are being processed.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"
# ── A2: Matrix sync errors spike ────────────────────────────────────────
# metric: matrix_bridge_gateway_errors_total{error_type} (Counter)
- alert: MatrixSyncErrors
expr: >
increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
for: 2m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Matrix sync errors elevated"
description: >
More than 3 Matrix `/sync` errors (error_type=sync_error) in the last
5 minutes. May indicate Matrix homeserver problems or network issues.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"
# ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
# metric: matrix_bridge_messages_replied_total{status} (Counter)
- alert: GatewayInvokeErrors
expr: >
increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
for: 2m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Router invoke errors elevated"
description: >
More than 5 agent invocation errors (status=error) in the last 5 minutes.
Check Router/DeepSeek connectivity and logs.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"
# ── A4: Queue drops ─────────────────────────────────────────────────────
# metric: matrix_bridge_queue_dropped_total{room_id, agent_id} (Counter)
- alert: QueueDropsHigh
expr: >
rate(matrix_bridge_queue_dropped_total[5m]) > 0
for: 1m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Bridge queue is dropping messages"
description: >
`matrix_bridge_queue_dropped_total` is increasing — work queue is full
and incoming messages are being dropped. Increase
`BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"
# ── A5: User-level rate limiting spike ──────────────────────────────────
# metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type} (Counter)
- alert: RateLimitedSpike
expr: >
rate(matrix_bridge_rate_limited_total[5m]) > 2
for: 3m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "User rate limiting spike"
description: >
More than 2 messages/second are being rate-limited over 3 minutes.
May indicate a flood attack, misbehaving client, or limits too low.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"
# ── A6: Control channel rate limiting spike ──────────────────────────────
# metric: matrix_bridge_control_rate_limited_total{scope} (Counter)
- alert: ControlRateLimitedSpike
expr: >
rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
for: 3m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Control channel rate limiting elevated"
description: >
More than 0.5 control commands/second rejected by rate limiter over
3 minutes. May indicate operator tooling issues or abuse attempt.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"
# ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
# metric: matrix_bridge_dedupe_persistent_hits_total{room_id} (Counter)
- alert: DedupeHitStorm
expr: >
rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
for: 5m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Persistent deduplication hit rate elevated"
description: >
High rate of persistent dedupe hits — may indicate a Matrix resend
storm or a client repeatedly retrying the same event_id.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"
# ── A8: Invoke latency P95 high (per node) ───────────────────────────────
# metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id} (Histogram)
- alert: InvokeLatencyP95High
expr: >
histogram_quantile(
0.95,
sum by (node_id, le) (
rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
)
) > 15
for: 5m
labels:
severity: warning
team: platform
service: matrix-bridge-dagi
annotations:
summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
description: >
95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds
15 seconds over the last 5 minutes. Check Router load, DeepSeek API,
Ollama/Swapper queue.
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"

View File

@@ -0,0 +1,401 @@
# matrix-bridge-dagi — Soak & Failure Rehearsal Runbook (M11)
**Phase:** M11
**Applies to:** `matrix-bridge-dagi` service on NODA1
**When to run:** Before any production traffic increase, after major code changes, or on a recurring monthly basis.
---
## 1. Goals
| Goal | Measurable pass criterion |
|------|--------------------------|
| Latency under load | p95 invoke < 5 000 ms |
| Queue stability | drop rate < 1% |
| Failover correctness | failover fires on NODA1 outage; NODA2 serves all remaining messages |
| Sticky anti-flap | sticky set after first failover; no re-tries to degraded node |
| Restart recovery | sticky + health snapshot reloads within 10 s of restart |
| Policy operations safe under load | `!policy history` / `!policy change` work while messages in-flight |
---
## 2. Prerequisites
```bash
# On NODA1 or local machine with network access to bridge
pip install httpx
# Verify bridge is up
curl -s http://localhost:9400/health | jq '.ok'
# Expected: true
# Verify /metrics endpoint
curl -s http://localhost:9400/metrics | grep matrix_bridge_up
# Expected: matrix_bridge_up{...} 1
```
---
## 2a. Enabling the Soak Inject Endpoint
The soak script uses `POST /v1/debug/inject_event` which is **disabled by default**.
Enable it only on staging/NODA1 soak runs:
```bash
# On NODA1 — edit docker-compose override or pass env inline:
# Option 1: temporary inline restart
DEBUG_INJECT_ENABLED=true docker-compose \
-f docker-compose.matrix-bridge-node1.yml \
up -d --no-deps matrix-bridge-dagi
# Option 2: .env file override
echo "DEBUG_INJECT_ENABLED=true" >> .env.soak
docker-compose --env-file .env.soak \
-f docker-compose.matrix-bridge-node1.yml \
up -d --no-deps matrix-bridge-dagi
# Verify it's enabled (should return 200, not 403)
curl -s -X POST http://localhost:9400/v1/debug/inject_event \
-H 'Content-Type: application/json' \
-d '{"room_id":"!test:test","event":{}}' | jq .
# Expected: {"ok":false,"error":"no mapping for room_id=..."} ← 200, not 403
# IMPORTANT: disable after soak
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d --no-deps matrix-bridge-dagi
# (DEBUG_INJECT_ENABLED defaults to false)
```
---
## 2b. Step 0 (WORKERS=2 / QUEUE=100) — Record True Baseline
**Goal:** snapshot the "before any tuning" numbers to have a comparison point.
```bash
# 0. Confirm current config (should be defaults)
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
# Expected: {"workers": 2, "queue_max": 100}
# 1. DB path for WAL check (adjust to your BRIDGE_DATA_DIR)
DB=/opt/microdao-daarion/data/matrix_bridge.db
# 2. WAL size before (manual check)
ls -lh ${DB}-wal 2>/dev/null || echo "(no WAL file yet — first run)"
sqlite3 $DB "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || echo "(no sqlite3)"
# 3. Run Step 0 soak
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--agent sofiia \
--room-id "!your-room-id:your-server" \
--max-p95-ms 5000 \
--max-drop-rate 0.001 \
--db-path $DB \
--report-file /tmp/soak_step0_baseline.json
# 4. Record result in "Baseline numbers" table (section 10) below.
jq '.summary, .latency, .metrics_delta, .wal' /tmp/soak_step0_baseline.json
```
**v1 Go/No-Go thresholds for Step 0:**
| Metric | Green ✅ | Yellow ⚠️ | Red ❌ |
|--------|---------|-----------|-------|
| `p95_invoke_ms` | < 3000 | 3000–5000 | > 5000 |
| `drop_rate` | 0.00% (mandatory) | — | > 0.1% |
| `error_rate` | < 1% | 1–3% | > 3% |
| `failovers` | 0 | — | ≥ 1 without cause |
| WAL delta | < 2 MB | 2–10 MB | > 10 MB |
**If Step 0 is Green → proceed to Step 1 tuning.**
**If Step 0 is Yellow/Red → investigate before touching WORKER_CONCURRENCY.**
---
## 2c. Step 1 (WORKERS=4 / QUEUE=200) — Tune-1
**Goal:** verify that doubling workers gives headroom without Router saturation.
```bash
# 1. Apply tuning
WORKER_CONCURRENCY=4 QUEUE_MAX_EVENTS=200 docker-compose \
-f docker-compose.matrix-bridge-node1.yml \
--env-file .env.soak \
up -d --no-deps matrix-bridge-dagi
sleep 3
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
# Expected: {"workers": 4, "queue_max": 200}
# 2. Run Step 1 soak (higher concurrency to stress the new headroom)
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 8 \
--agent sofiia \
--room-id "!your-room-id:your-server" \
--max-p95-ms 3000 \
--max-drop-rate 0.001 \
--db-path $DB \
--report-file /tmp/soak_step1_tune1.json
# 3. Compare Step 0 vs Step 1
python3 - <<'EOF'
import json
s0 = json.load(open('/tmp/soak_step0_baseline.json'))
s1 = json.load(open('/tmp/soak_step1_tune1.json'))
for k in ('p50', 'p95', 'p99'):
print(f"{k}: {s0['latency'][k]}ms → {s1['latency'][k]}ms")
print(f"drops: {s0['metrics_delta']['queue_drops']} → {s1['metrics_delta']['queue_drops']}")
print(f"WAL: {s0['wal'].get('delta_mb')} → {s1['wal'].get('delta_mb')} MB delta")
EOF
```
**Decision:**
- Step 1 Green → **freeze, tag v1.0, ship to production.**
- p95 within 5% of Step 0 → Router is bottleneck (not workers); don't go to Step 2.
- Queue drops > 0 at WORKERS=4 → try Step 2 (WORKERS=8, QUEUE=300).
---
## 3. Scenario A — Baseline load (100 messages, concurrency 4)
**Goal:** establish latency baseline, verify no drops under normal load.
```bash
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--max-p95-ms 3000 \
--report-file /tmp/soak_baseline.json
```
**Expected output:**
```
matrix-bridge-dagi Soak Report ✅ PASSED
Messages: 100 concurrency=4
Latency: p50=<500ms p95=<3000ms
Queue drops: 0 (rate 0.000%)
Failovers: 0
```
**If FAILED:**
- `p95 too high` → check router `/health`, DeepSeek API latency, `docker stats`
- `drop_rate > 0` → check `QUEUE_MAX_EVENTS` env var (increase if needed), inspect bridge logs
---
## 4. Scenario B — Queue saturation test
**Goal:** confirm drop metric fires cleanly and bridge doesn't crash.
```bash
# Reduce queue via env override, then flood:
QUEUE_MAX_EVENTS=5 docker-compose -f docker-compose.matrix-bridge-node1.yml \
up -d matrix-bridge-dagi
# Wait for restart
sleep 5
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 30 \
--concurrency 10 \
--max-drop-rate 0.99 \
--report-file /tmp/soak_queue_sat.json
# Restore normal queue size
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi
```
**Expected:** `queue_drops > 0`, bridge still running after the test.
**Verify in Prometheus/Grafana:**
```promql
rate(matrix_bridge_queue_dropped_total[1m])
```
Should spike and then return to 0.
---
## 5. Scenario C — Node failover rehearsal
**Goal:** simulate NODA1 router becoming unavailable, verify NODA2 takes over.
```bash
# Step 1: stop the router on NODA1 temporarily
docker pause dagi-router-node1
# Step 2: run soak against bridge (bridge will failover to NODA2)
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 20 \
--concurrency 2 \
--max-p95-ms 10000 \
--report-file /tmp/soak_failover.json
# Step 3: restore router
docker unpause dagi-router-node1
```
**Expected:**
```
Failovers: 1..20 (at least 1)
Sticky sets: 1+
Errors: 0 (fallback to NODA2 serves all messages)
```
**Check sticky in control room:**
```
!nodes
```
Should show `NODA2` sticky with remaining TTL.
**Check health tracker:**
```
!status
```
Should show `NODA1 state=degraded|down`.
---
## 6. Scenario D — Restart recovery
**Goal:** after restart, sticky and health state reload within one polling cycle.
```bash
# After Scenario C: sticky is set to NODA2
# Restart the bridge
docker restart dagi-matrix-bridge-node1
# Wait for startup (up to 30s)
sleep 15
# Verify sticky reloaded
curl -s http://localhost:9400/health | jq '.ha_state'
# Expected: {"sticky_loaded": N, ...}
# Verify routing still uses NODA2 sticky
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 10 \
--concurrency 2 \
--report-file /tmp/soak_restart.json
```
**Expected:** p95 similar to post-failover run, `Failovers: 0` (sticky already applied).
---
## 7. Scenario E — Rate limit burst
**Goal:** verify rate limiting fires and bridge doesn't silently drop below-limit messages.
```bash
# Set RPM very low for test, then flood from same sender
# This is best done in control room by observing !status rate_limited count
# rather than the soak script (which uses different senders per message).
# In Matrix control room:
# Send 30+ messages from the same user account in quick succession in a mixed room.
# Then:
!status
# Check: rate_limited_total increased, no queue drops.
```
---
## 8. Scenario F — Policy operations under load
**Goal:** `!policy history`, `!policy change`, and `!policy export` work while messages are in-flight.
```bash
# Run a background soak
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 200 \
--concurrency 2 \
--report-file /tmp/soak_concurrent_policy.json &
# While soak is running, in Matrix control room:
!policy history limit=5
!policy export
!status
```
**Expected:** all three commands respond immediately (< 2s), soak completes without extra drops.
---
## 9. Prometheus / Grafana during soak
Key queries for the Grafana dashboard:
```promql
# Throughput (messages/s)
rate(matrix_bridge_routed_total[30s])
# Error rate
rate(matrix_bridge_gateway_errors_total[30s])
# p95 invoke latency per node
histogram_quantile(0.95, rate(matrix_bridge_invoke_duration_seconds_bucket[1m]))
# Queue drops rate
rate(matrix_bridge_queue_dropped_total[1m])
# Failovers
rate(matrix_bridge_failover_total[5m])
```
Use the `matrix-bridge-dagi` Grafana dashboard at:
`ops/grafana/dashboards/matrix-bridge-dagi.json`
---
## 10. Baseline numbers (reference)
| Metric | Cold start | Warm (sticky set) |
|--------|-----------|-------------------|
| p50 latency | ~200ms | ~150ms |
| p95 latency | ~2 000ms | ~1 500ms |
| Queue drops | 0 (queue=100) | 0 |
| Failover fires | 1 per degradation | 0 after sticky |
| Policy ops response | < 500ms | < 500ms |
*Update this table after each soak run with actual measured values.*
---
## 11. CI soak (mocked, no network)
For CI pipelines, use the mocked soak scenarios:
```bash
python3 -m pytest tests/test_matrix_bridge_m11_soak_scenarios.py -v
```
Covers (all deterministic, no network):
- **S1** Queue saturation → drop counter
- **S2** Failover under load → on_failover callback, health tracker
- **S3** Sticky routing under burst → sticky set, burst routed to NODA2
- **S4** Multi-room isolation → separate rooms don't interfere
- **S5** Rate-limit burst → RL callback wired, no panic
- **S6** HA restart recovery → sticky + health snapshot persisted and reloaded
- **Perf baseline** 100-msg + 50-msg failover burst < 5s wall clock
---
## 12. Known failure modes & mitigations
| Symptom | Likely cause | Mitigation |
|---------|-------------|------------|
| `p95 > 5000ms` | Router/LLM slow | Increase `ROUTER_TIMEOUT_S`, check DeepSeek API |
| `drop_rate > 1%` | Queue too small | Increase `QUEUE_MAX_EVENTS` |
| `failovers > 0` but errors > 0 | Both nodes degraded | Check NODA1 + NODA2 health; scale router |
| Bridge crash during soak | Memory leak / bug | `docker logs` → file GitHub issue |
| Sticky not set after failover | `FAILOVER_STICKY_TTL_S=0` | Set to 300+ |
| Restart doesn't load sticky | `HA_HEALTH_MAX_AGE_S` too small | Increase or set to 3600 |

View File

@@ -0,0 +1,476 @@
#!/usr/bin/env python3
"""
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
Usage:
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--report-file /tmp/soak_report.json
Requires: httpx (pip install httpx)
What it does:
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
(or directly to the router if --direct-router is set).
2. Measures latency (p50, p95, p99, max) per batch.
3. After the run, fetches /metrics and extracts key counters:
- matrix_bridge_queue_dropped_total
- matrix_bridge_rate_limited_total
- matrix_bridge_failover_total
- matrix_bridge_sticky_node_total
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
4. Prints a human-readable report and optionally writes JSON.
Exit codes:
0 = all pass criteria met
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
"""
import argparse
import asyncio
import json
import sys
import time
from typing import Any, Dict, List, Optional
try:
import httpx
except ImportError:
print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
sys.exit(2)
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
_DEFAULT_MAX_P95_MS = 5000 # 5 s p95 per invoke (generous for cold start)
_DEFAULT_MAX_DROP_RATE = 0.01 # 1% queue drops allowed
# ── Metrics parsing ────────────────────────────────────────────────────────────
def _parse_counter(text: str, name: str) -> float:
"""Extract the last reported value of a Prometheus counter by name."""
for line in text.splitlines():
if line.startswith(name + " ") or line.startswith(name + "{"):
parts = line.rsplit(None, 1)
try:
return float(parts[-1])
except (ValueError, IndexError):
pass
return 0.0
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
    """
    Approximate histogram_quantile from _bucket lines.
    Returns estimated value at given quantile or None if data missing.

    Notes:
      - Prometheus buckets are cumulative; the target rank is
        quantile * total_count and we linearly interpolate within the first
        bucket whose cumulative count reaches it.
      - NOTE(review): matching uses substring tests (`in line`), so a second
        histogram sharing this name as a prefix, or multiple label sets on
        the same histogram, would mix samples from different series —
        confirm exported metric names are unambiguous before relying on it.
    """
    buckets: List[tuple] = []  # (le upper bound, cumulative count)
    total_count = 0.0
    for line in text.splitlines():
        if f"{name}_bucket" in line and 'le="' in line:
            try:
                # Pull the `le="..."` label value, then the trailing sample value.
                le_part = line.split('le="')[1].split('"')[0]
                le = float(le_part) if le_part != "+Inf" else float("inf")
                val = float(line.rsplit(None, 1)[-1])
                buckets.append((le, val))
            except (ValueError, IndexError):
                pass  # tolerate malformed bucket lines
        elif (f"{name}_count " in line or (name + "_count{") in line):
            try:
                total_count = float(line.rsplit(None, 1)[-1])
            except (ValueError, IndexError):
                pass
    if not buckets or total_count == 0:
        return None
    buckets.sort()
    target = quantile * total_count
    prev_le, prev_count = 0.0, 0.0
    for le, count in buckets:
        if count >= target:
            if le == float("inf"):
                # Target falls in the open-ended bucket: the best estimate we
                # can give is the last finite bucket boundary.
                return prev_le
            # Linear interpolation
            if count == prev_count:
                return le
            fraction = (target - prev_count) / (count - prev_count)
            return prev_le + fraction * (le - prev_le)
        prev_le, prev_count = le, count
    return prev_le
# ── Soak runner ────────────────────────────────────────────────────────────────
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the inject endpoint is reachable and enabled before the soak run.

    Posts one throwaway "ping" event to /v1/debug/inject_event and maps the
    response to a human-readable failure string:
      - HTTP 403                         → DEBUG_INJECT_ENABLED is off
      - HTTP 5xx                         → bridge-side failure
      - ok=False with "no mapping" error → room_id not in BRIDGE_ROOM_MAP
    Returns "" on success, error message on failure.
    """
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": {"event_id": "!preflight", "sender": "@soak:test",
                                                "content": {"msgtype": "m.text", "body": "ping"}}},
            timeout=5.0,
        )
        if resp.status_code == 403:
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                " NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        data = resp.json()
        if not data.get("ok") and "no mapping" in data.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        # Any other failure (JSON decode, timeout, TLS, ...) aborts the soak
        # with a generic message rather than crashing the script.
        return f"❌ Preflight failed: {exc}"
async def _check_wal(db_path: str) -> Dict[str, Any]:
"""
Run WAL size + checkpoint check on the bridge policy DB.
Returns dict with wal_bytes, wal_mb, checkpoint_result.
Requires sqlite3 CLI on PATH; gracefully skips if unavailable.
"""
import subprocess, shutil
result: Dict[str, Any] = {"db_path": db_path, "ok": False}
wal_path = db_path + "-wal"
try:
wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
result["wal_bytes"] = wal_bytes
result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
except OSError:
result["wal_bytes"] = -1
result["wal_mb"] = -1
if shutil.which("sqlite3"):
try:
cp = subprocess.run(
["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
capture_output=True, text=True, timeout=5,
)
# Output: busy|log|checkpointed (3 ints)
parts = cp.stdout.strip().split("|")
if len(parts) == 3:
result["wal_checkpoint"] = {
"busy": int(parts[0]), "log": int(parts[1]), "checkpointed": int(parts[2]),
}
result["ok"] = True
except Exception: # noqa: BLE001
result["ok"] = False
else:
result["sqlite3_missing"] = True
return result
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.
    Returns (latency_ms: float, status_code: int, error: str|None).

    Status code 0 means the request never produced an HTTP response
    (timeout or transport error).  NOTE(review): ``agent_id`` is accepted
    but never placed in the payload — presumably routing is decided by
    room mapping on the bridge side; confirm before removing the parameter.
    """
    payload = {
        "room_id": room_id,
        "event": {
            # Monotonic-microsecond suffix keeps event_ids unique per process.
            "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
            "sender": sender,
            "type": "m.room.message",
            "content": {"msgtype": "m.text", "body": message},
        },
    }
    t0 = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=payload,
            timeout=30.0,
        )
        latency_ms = (time.monotonic() - t0) * 1000
        if resp.status_code >= 500:
            return latency_ms, resp.status_code, f"HTTP {resp.status_code}"
        return latency_ms, resp.status_code, None
    except httpx.TimeoutException:
        latency_ms = (time.monotonic() - t0) * 1000
        return latency_ms, 0, "timeout"
    except Exception as exc:  # noqa: BLE001 — any transport error counts as a soak error
        latency_ms = (time.monotonic() - t0) * 1000
        return latency_ms, 0, str(exc)
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """Return the bridge /health JSON body, or {} on any error / non-200."""
    base = url.rstrip("/")
    try:
        response = await client.get(f"{base}/health", timeout=10.0)
        if response.status_code != 200:
            return {}
        return response.json()
    except Exception:  # noqa: BLE001
        return {}
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """Return the raw Prometheus /metrics text, or "" on any error / non-200."""
    base = url.rstrip("/")
    try:
        response = await client.get(f"{base}/metrics", timeout=10.0)
    except Exception:  # noqa: BLE001
        return ""
    return response.text if response.status_code == 200 else ""
def _percentile(values: List[float], p: float) -> float:
if not values:
return 0.0
sv = sorted(values)
idx = int(len(sv) * p / 100)
return sv[min(idx, len(sv) - 1)]
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """
    Drive the full soak run and return the report dict.

    Steps: preflight the inject endpoint, snapshot /metrics and (optionally)
    the SQLite WAL, fire n_messages concurrent synthetic events, snapshot
    again, and evaluate pass criteria (client p95, queue-drop rate, WAL growth).

    Fixes vs. the original:
      - the preflight-failure early return now carries shape-compatible stub
        sections so _print_report() can render it without KeyError;
      - sticky_sets in metrics_delta is now a true delta (a "before" sample
        is taken) instead of the absolute counter value;
      - a Prometheus p95 of exactly 0.0 is no longer mistaken for "missing".
    """
    results: List[tuple] = []
    semaphore = asyncio.Semaphore(concurrency)
    async with httpx.AsyncClient() as client:
        # Pre-check: inject endpoint + health
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {
                "ok": False, "error": preflight_err,
                "passed": False, "failures": [preflight_err],
                # Shape-compatible stubs so _print_report() can render the
                # failure instead of raising KeyError on missing sections.
                "summary": {"total_messages": 0, "concurrency": concurrency,
                            "elapsed_s": 0.0, "throughput_rps": 0,
                            "successes": 0, "errors": 0, "error_rate": 0.0},
                "latency_ms": {"p50": 0.0, "p95": 0.0, "p99": 0.0, "max": 0.0},
                "metrics_delta": {"queue_drops": 0, "rate_limited": 0,
                                  "failovers": 0, "sticky_sets": 0, "drop_rate": 0.0},
                "prometheus_invoke_p95_ms": None,
                "wal": {"before_mb": None, "after_mb": None, "delta_mb": None,
                        "checkpoint_after": None, "threshold_mb": 10},
                "health_before": None, "health_after": None,
                "pass_criteria": {"max_p95_ms": max_p95_ms, "max_drop_rate": max_drop_rate},
            }
        # WAL check before soak
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")
        # Pre-run snapshots: health + counters we will diff afterwards
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)
        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
        sticky_before = _parse_counter(metrics_before, "matrix_bridge_sticky_node_total")
        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")
        t_start = time.monotonic()

        async def worker(i: int):
            # Semaphore caps in-flight requests at `concurrency`.
            async with semaphore:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f" [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")

        await asyncio.gather(*[worker(i) for i in range(n_messages)])
        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)
        # WAL check after soak
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb', 0) - wal_before.get('wal_mb', 0), 2)} MB)")
    # ── Analysis (no network from here on) ────────────────────────────────
    latencies = [r[0] for r in results]
    errors = [r for r in results if r[2] is not None]
    successes = len(results) - len(errors)
    error_rate = len(errors) / len(results) if results else 0.0
    drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
    rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
    fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
    sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")
    delta_drops = drops_after - drops_before
    delta_rl = rl_after - rl_before
    delta_fo = fo_after - fo_before
    delta_sticky = sticky_after - sticky_before
    p50 = _percentile(latencies, 50)
    p95 = _percentile(latencies, 95)
    p99 = _percentile(latencies, 99)
    p_max = max(latencies) if latencies else 0.0
    # Server-side p95 estimated from the Prometheus histogram buckets.
    hist_p95 = _parse_histogram_quantile(
        metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
    )
    hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None
    drop_rate = delta_drops / len(results) if results else 0.0
    report = {
        "wal": {
            "before_mb": wal_before.get("wal_mb"),
            "after_mb": wal_after.get("wal_mb"),
            "delta_mb": round(
                (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
            ) if wal_before and wal_after else None,
            "checkpoint_after": wal_after.get("wal_checkpoint"),
            "threshold_mb": 10,
        },
        "summary": {
            "total_messages": n_messages,
            "concurrency": concurrency,
            "elapsed_s": round(elapsed_s, 2),
            "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
            "successes": successes,
            "errors": len(errors),
            "error_rate": round(error_rate, 4),
        },
        "latency_ms": {
            "p50": round(p50, 1),
            "p95": round(p95, 1),
            "p99": round(p99, 1),
            "max": round(p_max, 1),
        },
        "metrics_delta": {
            "queue_drops": int(delta_drops),
            "rate_limited": int(delta_rl),
            "failovers": int(delta_fo),
            "sticky_sets": int(delta_sticky),
            "drop_rate": round(drop_rate, 4),
        },
        # `is not None` so a legitimate 0.0 histogram p95 is still reported.
        "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms is not None else None,
        "health_before": health_before.get("ok"),
        "health_after": health_after.get("ok"),
        "pass_criteria": {
            "max_p95_ms": max_p95_ms,
            "max_drop_rate": max_drop_rate,
        },
    }
    # Pass/fail evaluation
    failures = []
    if p95 > max_p95_ms:
        failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
    if drop_rate > max_drop_rate:
        failures.append(
            f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
        )
    wal_delta = report["wal"]["delta_mb"]
    if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
        failures.append(
            f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
            "— possible SQLite write pressure (Bottleneck #2)"
        )
    report["passed"] = len(failures) == 0
    report["failures"] = failures
    return report
def _print_report(r: Dict[str, Any]) -> None:
    """
    Render the soak report dict from run_soak() as a human-readable banner.

    Expects the full report shape (summary / latency_ms / metrics_delta /
    failures / passed); the wal section is optional and skipped when its
    before_mb is None.  Pure presentation — no return value, no mutation.
    """
    s = r["summary"]
    l = r["latency_ms"]  # noqa: E741 — short alias kept for the f-strings below
    m = r["metrics_delta"]
    passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
    w = r.get("wal", {})
    print()
    print("=" * 60)
    print(f" matrix-bridge-dagi Soak Report {passed}")
    print("=" * 60)
    print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
    print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
    print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
    print()
    print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
          f"p99={l['p99']}ms max={l['max']}ms")
    if r["prometheus_invoke_p95_ms"] is not None:
        print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
    print()
    print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
    print(f" Rate-limited: {m['rate_limited']}")
    print(f" Failovers: {m['failovers']}")
    print(f" Sticky sets: {m['sticky_sets']}")
    if w.get("before_mb") is not None:
        wal_delta_str = (
            f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
        )
        # Warn when WAL growth exceeded the report's threshold (default 10 MB).
        wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
        print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
    print()
    if r["failures"]:
        for f in r["failures"]:
            print(f"{f}")
    else:
        print(" All pass criteria met.")
    print("=" * 60)
def main() -> int:
    """
    CLI entry point: parse arguments, run the soak, print and optionally
    persist the report.  Returns 0 when all pass criteria were met, 1 on
    threshold failure (httpx-missing exits earlier with code 2).
    """
    parser = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
    parser.add_argument("--url", default="http://localhost:9400",
                        help="Bridge base URL (default: http://localhost:9400)")
    parser.add_argument("--messages", type=int, default=100,
                        help="Total messages to send (default: 100)")
    parser.add_argument("--concurrency", type=int, default=4,
                        help="Concurrent requests (default: 4)")
    parser.add_argument("--agent-id", default="sofiia",
                        help="Agent id for synthetic events (default: sofiia)")
    parser.add_argument("--room-id", default="!soak-room:home.invalid",
                        help="Room id for synthetic events")
    parser.add_argument("--sender", default="@soak-user:home.invalid",
                        help="Sender for synthetic events")
    parser.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
                        help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
    parser.add_argument("--max-drop-rate",type=float, default=_DEFAULT_MAX_DROP_RATE,
                        help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
    parser.add_argument("--report-file", default="",
                        help="Optional path to write JSON report")
    parser.add_argument("--db-path", default="",
                        help="Path to policy_store.db for WAL check "
                             "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
    args = parser.parse_args()
    # The whole soak is one asyncio program; run_soak owns the HTTP client.
    report = asyncio.run(run_soak(
        url=args.url,
        n_messages=args.messages,
        concurrency=args.concurrency,
        agent_id=args.agent_id,
        room_id=args.room_id,
        sender=args.sender,
        max_p95_ms=args.max_p95_ms,
        max_drop_rate=args.max_drop_rate,
        db_path=args.db_path,
    ))
    _print_report(report)
    if args.report_file:
        with open(args.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n Report saved: {args.report_file}")
    # Exit code doubles as the CI pass/fail signal.
    return 0 if report["passed"] else 1
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -1,5 +1,5 @@
""" """
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0) matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0 + M3.1)
""" """
import os import os
from dataclasses import dataclass, field from dataclasses import dataclass, field
@@ -54,6 +54,54 @@ class BridgeConfig:
# "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt) # "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt)
control_unauthorized_behavior: str control_unauthorized_behavior: str
# M3.1: Runbook runner — sofiia-console control token
sofiia_control_token: str # X-Control-Token for /api/runbooks/internal/runs
# M3.4: Control channel safety — rate limiting + cooldown
control_room_rpm: int # Max commands per room per minute (0 = unlimited)
control_operator_rpm: int # Max commands per operator per minute
control_run_next_rpm: int # Max !runbook next calls per run_id per minute
control_cooldown_s: float # Anti-double-click debounce per (operator, verb, subcmd)
# M2.3: Persistent event deduplication
persistent_dedupe: bool # Enable SQLite-backed dedupe across restarts
bridge_data_dir: str # Directory for SQLite DB and other bridge data
processed_events_ttl_h: int # TTL for processed events (hours)
processed_events_prune_batch: int # Max rows to prune per prune run
processed_events_prune_interval_s: int # Prune interval in seconds (0 = disable periodic)
# M4.0: agent discovery
discovery_rpm: int # Max !agents replies per room per minute (0 = unlimited)
# M5.0: node-aware routing
bridge_allowed_nodes: str # Comma-separated: "NODA1,NODA2"
bridge_default_node: str # Default node when none specified
bridge_room_node_map: str # Optional: "!roomA:server=NODA2;!roomB:server=NODA1"
# M8.0: node health + soft-failover thresholds
node_fail_consecutive: int # Consecutive failures before node marked "down"
node_lat_ewma_s: float # EWMA latency threshold (seconds) → "degraded"
node_ewma_alpha: float # EWMA smoothing factor (0..1)
# M8.1: sticky failover cache
failover_sticky_ttl_s: float # Seconds to hold sticky node preference after failover (0 = disabled)
# M8.2: HA state persistence
ha_health_snapshot_interval_s: int # Seconds between node health writes to DB (0 = disabled)
ha_health_max_age_s: int # Max age of health snapshot to load on startup (seconds)
# M9.0: Two-step confirmation TTL
confirm_ttl_s: float # Seconds a pending !confirm nonce is valid (0 = disabled)
# M10.0: Policy export retention
policy_export_retention_days: int # Days to keep policy exports (0 = keep forever)
# M10.2: Policy change history
policy_history_limit: int # Max rows in policy_changes table (0 = unlimited)
# M11 soak: synthetic event injection (NEVER enable in production)
debug_inject_enabled: bool # POST /v1/debug/inject_event (default: False)
# Service identity # Service identity
node_id: str node_id: str
build_sha: str build_sha: str
@@ -99,6 +147,35 @@ def load_config() -> BridgeConfig:
bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""), bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""),
bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""), bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""),
control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"), control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"),
sofiia_control_token=_optional("SOFIIA_CONTROL_TOKEN", ""),
control_room_rpm=max(0, int(_optional("CONTROL_ROOM_RPM", "60"))),
control_operator_rpm=max(0, int(_optional("CONTROL_OPERATOR_RPM", "30"))),
control_run_next_rpm=max(0, int(_optional("CONTROL_RUN_NEXT_RPM", "20"))),
control_cooldown_s=max(0.0, float(_optional("CONTROL_COOLDOWN_S", "2.0"))),
persistent_dedupe=_optional("PERSISTENT_DEDUPE", "1").strip() not in ("0", "false", ""),
bridge_data_dir=_optional("BRIDGE_DATA_DIR", "/app/data"),
processed_events_ttl_h=max(1, int(_optional("PROCESSED_EVENTS_TTL_H", "48"))),
processed_events_prune_batch=max(1, int(_optional("PROCESSED_EVENTS_PRUNE_BATCH", "5000"))),
processed_events_prune_interval_s=max(0, int(_optional("PROCESSED_EVENTS_PRUNE_INTERVAL_S", "3600"))),
discovery_rpm=max(0, int(_optional("DISCOVERY_RPM", "20"))),
bridge_allowed_nodes=_optional("BRIDGE_ALLOWED_NODES", "NODA1"),
bridge_default_node=_optional("BRIDGE_DEFAULT_NODE", "NODA1"),
bridge_room_node_map=_optional("BRIDGE_ROOM_NODE_MAP", ""),
# M8.0: node health thresholds
node_fail_consecutive=max(1, int(_optional("NODE_FAIL_CONSEC", "3"))),
node_lat_ewma_s=max(0.5, float(_optional("NODE_LAT_EWMA_S", "12.0"))),
node_ewma_alpha=min(1.0, max(0.01, float(_optional("NODE_EWMA_ALPHA", "0.3")))),
# M8.1: sticky failover TTL (0 = disabled)
failover_sticky_ttl_s=max(0.0, float(_optional("FAILOVER_STICKY_TTL_S", "300.0"))),
# M8.2: HA state persistence
ha_health_snapshot_interval_s=max(0, int(_optional("HA_HEALTH_SNAPSHOT_INTERVAL_S", "60"))),
ha_health_max_age_s=max(0, int(_optional("HA_HEALTH_MAX_AGE_S", "600"))),
# M9.0: Two-step confirmation TTL (0 = disabled)
confirm_ttl_s=max(0.0, float(_optional("CONFIRM_TTL_S", "120.0"))),
policy_export_retention_days=max(0, int(_optional("POLICY_EXPORT_RETENTION_DAYS", "30"))),
policy_history_limit=max(0, int(_optional("POLICY_HISTORY_LIMIT", "100"))),
debug_inject_enabled=_optional("DEBUG_INJECT_ENABLED", "false").lower()
in ("1", "true", "yes"),
node_id=_optional("NODE_ID", "NODA1"), node_id=_optional("NODE_ID", "NODA1"),
build_sha=_optional("BUILD_SHA", "dev"), build_sha=_optional("BUILD_SHA", "dev"),
build_time=_optional("BUILD_TIME", "local"), build_time=_optional("BUILD_TIME", "local"),

View File

@@ -0,0 +1,167 @@
"""
confirm_store — M9.0: Two-step confirmation for dangerous control commands.
Flow:
1. Operator issues a dangerous command (e.g. !node set, !policy import mode=replace).
2. Bridge calls ConfirmStore.add(..., callback=<coroutine>) → returns a nonce.
3. Bridge replies: "Type !confirm <nonce> within Ns to apply."
4. Operator sends !confirm <nonce>.
5. Bridge calls ConfirmStore.pop(nonce, sender_hash) → returns PendingConfirmation.
6. Bridge executes callback() → (reply_text, diff_summary).
7. Audit trail: matrix.control.intent / matrix.control.confirmed / matrix.control.applied.
Safety:
- One pending entry per sender (new request replaces old).
- Nonce is sender-bound: wrong sender_hash → pop returns None.
- TTL enforced via monotonic time; expired entries not returned.
- Nonce: 6 uppercase alphanumeric (NONCE_LEN chars from NONCE_CHARS).
"""
from __future__ import annotations
import secrets
import string
import threading
import time
from dataclasses import dataclass, field
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
NONCE_LEN = 6
NONCE_CHARS = string.ascii_uppercase + string.digits
_DEFAULT_TTL_S = 120.0
def make_nonce() -> str:
    """Return a cryptographically random nonce: NONCE_LEN chars drawn from NONCE_CHARS."""
    picks = [secrets.choice(NONCE_CHARS) for _ in range(NONCE_LEN)]
    return "".join(picks)
@dataclass
class PendingConfirmation:
    """
    A pending two-step confirmation waiting for !confirm <nonce>.

    Created by ConfirmStore.add() and consumed exactly once by
    ConfirmStore.pop(); the callback is the deferred dangerous action and is
    executed by the bridge only after a matching confirmation arrives.
    """
    nonce: str              # uppercase alphanumeric code the operator must echo back
    sender_hash: str        # binds the nonce to the issuing operator
    verb: str               # e.g. "node.set", "room.agents set", "policy.import"
    normalized_args: str    # human-readable args for audit
    action_summary: str     # "!node set room=!x:s node=NODA2"
    room_id: str            # Matrix room_id where the intent was issued
    callback: Callable[[], Awaitable[Tuple[str, str]]]  # async () → (reply_text, diff_summary)
    expires_at: float       # time.monotonic() deadline (immune to wall-clock jumps)
class ConfirmStore:
    """
    In-memory, thread-safe store for pending two-step confirmation entries.
    One pending entry per sender at a time. If the same sender issues a new
    dangerous command before confirming the previous one, the old entry is
    replaced (new nonce issued).

    Invariant held under self._lock: _by_sender[h] == n exactly when
    _by_nonce[n].sender_hash == h — add() and pop() always update both maps
    together.  All expiry checks use time.monotonic(), so wall-clock changes
    cannot extend or shorten a pending confirmation.
    """
    def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
        self.ttl_s = ttl_s
        # RLock so a thread already holding the lock may re-enter public methods.
        self._lock = threading.RLock()
        self._by_nonce: Dict[str, PendingConfirmation] = {}
        self._by_sender: Dict[str, str] = {}  # sender_hash → nonce
    # ── Public API ────────────────────────────────────────────────────────────
    def add(
        self,
        sender_hash: str,
        verb: str,
        normalized_args: str,
        action_summary: str,
        room_id: str,
        callback: Callable[[], Awaitable[Tuple[str, str]]],
    ) -> str:
        """
        Create a pending confirmation entry. Returns the nonce string.
        If the sender already has a pending entry it is replaced (old nonce
        becomes invalid immediately).
        """
        nonce = make_nonce()
        expires_at = time.monotonic() + self.ttl_s
        entry = PendingConfirmation(
            nonce=nonce,
            sender_hash=sender_hash,
            verb=verb,
            normalized_args=normalized_args,
            action_summary=action_summary,
            room_id=room_id,
            callback=callback,
            expires_at=expires_at,
        )
        with self._lock:
            # Evict any previous pending entry for this sender so at most one
            # dangerous action can ever be armed per operator.
            old_nonce = self._by_sender.get(sender_hash)
            if old_nonce:
                self._by_nonce.pop(old_nonce, None)
            self._by_nonce[nonce] = entry
            self._by_sender[sender_hash] = nonce
        return nonce
    def pop(self, nonce: str, sender_hash: str) -> Optional[PendingConfirmation]:
        """
        Retrieve and atomically remove a pending confirmation.
        Returns None if:
          - nonce does not exist,
          - sender_hash does not match the entry owner,
          - or the entry has expired.

        The nonce is upper-cased first, so operator input is case-insensitive.
        """
        nonce = nonce.upper()
        with self._lock:
            entry = self._by_nonce.get(nonce)
            if entry is None:
                return None
            if entry.sender_hash != sender_hash:
                # Wrong sender — deny without disclosing any detail
                return None
            if time.monotonic() > entry.expires_at:
                # Expired — clean up and deny
                self._by_nonce.pop(nonce, None)
                self._by_sender.pop(entry.sender_hash, None)
                return None
            # Valid confirmation — consume the entry (single use)
            self._by_nonce.pop(nonce)
            self._by_sender.pop(sender_hash, None)
            return entry
    def pending_nonce(self, sender_hash: str) -> Optional[str]:
        """
        Return the current pending nonce for a sender (non-destructive peek).
        Returns None if no entry or the entry has expired.
        """
        with self._lock:
            nonce = self._by_sender.get(sender_hash)
            if nonce is None:
                return None
            entry = self._by_nonce.get(nonce)
            if entry is None or time.monotonic() > entry.expires_at:
                # Lazy eviction keeps the maps tidy without a background task
                self._by_nonce.pop(nonce, None)
                self._by_sender.pop(sender_hash, None)
                return None
            return nonce
    def pending_count(self) -> int:
        """Number of non-expired pending entries (for /health, metrics)."""
        now = time.monotonic()
        with self._lock:
            return sum(1 for e in self._by_nonce.values() if now <= e.expires_at)
    def cleanup(self) -> int:
        """Eagerly remove all expired entries. Returns count removed."""
        now = time.monotonic()
        removed = 0
        with self._lock:
            # Collect first, then pop — never mutate the dict while iterating it.
            expired_nonces = [
                n for n, e in self._by_nonce.items() if now > e.expires_at
            ]
            for n in expired_nonces:
                entry = self._by_nonce.pop(n)
                self._by_sender.pop(entry.sender_hash, None)
                removed += 1
        return removed

View File

@@ -23,18 +23,124 @@ Audit events emitted:
import logging import logging
import re import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Dict, FrozenSet, List, Optional, Tuple from typing import Any, Dict, FrozenSet, List, Optional, Tuple
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# ── Constants ───────────────────────────────────────────────────────────────── # ── Constants ─────────────────────────────────────────────────────────────────
# Supported control verbs (M3.1+ will implement them fully) # Supported control verbs
VERB_RUNBOOK = "runbook" VERB_RUNBOOK = "runbook"
VERB_STATUS = "status" VERB_STATUS = "status"
VERB_NODES = "nodes" # M5.1: node policy overview
VERB_NODE = "node" # M6.0: dynamic room-node override commands
VERB_ROOM = "room" # M6.1: dynamic mixed room agent overrides
VERB_POLICY = "policy" # M6.2: policy snapshot export/import
VERB_CONFIRM = "confirm" # M9.0: two-step confirmation for dangerous commands
VERB_HELP = "help" VERB_HELP = "help"
KNOWN_VERBS: FrozenSet[str] = frozenset({VERB_RUNBOOK, VERB_STATUS, VERB_HELP}) KNOWN_VERBS: FrozenSet[str] = frozenset({
VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE,
VERB_ROOM, VERB_POLICY, VERB_CONFIRM, VERB_HELP,
})
# ── M9.0: Dangerous command detection ─────────────────────────────────────────
def is_dangerous_cmd(cmd: "ControlCommand") -> bool:
    """
    Return True if the command requires two-step confirmation before applying.
    Dangerous verbs:
        !node set room=... node=...          — changes room routing
        !room agents set room=... agents=... — replaces all agents for a room
        !policy import ...                   — overwrites policy DB (both modes)
        !policy prune_exports dry_run=0      — actually deletes export files
        !policy restore ...                  — always dangerous (no dry_run)

    Fix: the dry_run comparison is now case-insensitive — previously
    "dry_run=False" / "dry_run=NO" were treated as a dry run, letting a real
    deletion bypass confirmation.
    """
    v = cmd.verb
    sub = (cmd.subcommand or "").strip().lower()
    if v == VERB_NODE and sub == "set":
        return True
    if v == VERB_ROOM and sub == "agents" and cmd.args and cmd.args[0].lower() == "set":
        return True
    if v == VERB_POLICY and sub == "import":
        return True
    # M10.0: prune_exports is dangerous only when dry_run is falsy (actual
    # deletion). Normalize case so "False"/"NO" are not mistaken for dry runs.
    if v == VERB_POLICY and sub == "prune_exports":
        dry_raw = cmd.kwargs.get("dry_run", "1").strip().lower()
        is_dry = dry_raw not in ("0", "false", "no")
        return not is_dry
    # M10.1: restore is always dangerous (no dry_run option)
    if v == VERB_POLICY and sub == "restore":
        return True
    return False
def build_normalized_args(cmd: "ControlCommand") -> str:
    """
    Render the command's positional args plus its kwargs (sorted by key) as a
    single human-readable string for audit events and confirmation prompts.
    For `!room agents <action> ...` the leading action token in args[0] is
    omitted from the rendering.
    """
    drop_first = cmd.verb == VERB_ROOM and cmd.subcommand == "agents"
    rendered: list[str] = list(cmd.args[1:] if drop_first else cmd.args)
    for key, value in sorted(cmd.kwargs.items()):
        rendered.append(f"{key}={value}")
    return " ".join(rendered)
def confirm_intent_reply(action_summary: str, nonce: str, ttl_s: int) -> str:
    """Prompt shown when a dangerous command is held pending `!confirm` (M9.0)."""
    lines = [
        "⚠️ **Confirm required**",
        f"Action: `{action_summary}`",
        f"Type `!confirm {nonce}` within {ttl_s}s to apply.",
        "_(Only you can confirm this action.)_",
    ]
    return "\n".join(lines)
def confirm_success_reply(action_result: str) -> str:
    """Reply sent after a confirmation is accepted and the action applied (M9.0)."""
    return "✅ Confirmed and applied.\n" + action_result
def confirm_expired_reply() -> str:
    """Reply for an invalid, expired, or foreign confirmation code (M9.0)."""
    head = "❌ Invalid or expired confirmation code. The action was **not** applied."
    return head + "\nRe-issue the original command to get a new code."
# Parser-level command vocabularies: handlers validate user input against the
# frozensets below, so adding a new action means extending both the constant
# and the matching frozenset.
# M6.1: !room subcommand + actions
ROOM_SUBCMD_AGENTS = "agents"
ROOM_ACTION_SET = "set"
ROOM_ACTION_ADD = "add"
ROOM_ACTION_REMOVE = "remove"
ROOM_ACTION_GET = "get"
ROOM_ACTION_LIST = "list"
ROOM_ACTION_UNSET = "unset"  # remove full override
_VALID_ROOM_ACTIONS = frozenset({
    ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
    ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET,
})
# M6.0: !node subcommands
NODE_SUBCMD_SET = "set"
NODE_SUBCMD_UNSET = "unset"
NODE_SUBCMD_GET = "get"
NODE_SUBCMD_LIST = "list"
_VALID_NODE_SUBCMDS = frozenset({NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST})
# Runbook subcommands (M3.x)
SUBCOMMAND_START = "start"  # M3.1 — implemented
SUBCOMMAND_NEXT = "next"  # M3.2 — implemented
SUBCOMMAND_COMPLETE = "complete"  # M3.2 — implemented
SUBCOMMAND_EVIDENCE = "evidence"  # M3.3 — implemented
SUBCOMMAND_STATUS = "status"  # M3.3 — implemented
SUBCOMMAND_POST_REVIEW = "post_review"  # M3.3 — implemented
# Max command line length to guard against garbage injection # Max command line length to guard against garbage injection
_MAX_CMD_LEN = 512 _MAX_CMD_LEN = 512
@@ -225,10 +331,814 @@ def check_authorization(
# ── Reply helpers ───────────────────────────────────────────────────────────── # ── Reply helpers ─────────────────────────────────────────────────────────────
def not_implemented_reply(cmd: ControlCommand) -> str: def not_implemented_reply(cmd: ControlCommand) -> str:
"""Reply for known commands not yet implemented (M3.0 stub).""" """Reply for known commands not yet implemented."""
return ( return (
f"✅ Command acknowledged: `{cmd.raw}`\n" f"✅ Command acknowledged: `{cmd.raw}`\n"
f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending (M3.1+)." f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending."
)
def next_usage_reply() -> str:
    """Usage hint for `!runbook next` when the run_id argument is missing."""
    usage = "⚠️ Usage: `!runbook next <run_id>`"
    example = "Example: `!runbook next abc-123`"
    return f"{usage}\n{example}"
def complete_usage_reply() -> str:
    """Usage hint for `!runbook complete` when required args are missing."""
    lines = [
        "⚠️ Usage: `!runbook complete <run_id> step=<n> status=ok|warn|fail [notes=...]`",
        "Example: `!runbook complete abc-123 step=3 status=ok notes=done`",
        "Notes with spaces: join without quotes — `notes=done_and_verified`.",
    ]
    return "\n".join(lines)
def start_usage_reply() -> str:
    """Usage hint for `!runbook start` when runbook_path is missing or invalid."""
    lines = [
        "⚠️ Usage: `!runbook start <runbook_path> [node=NODA1]`",
        "Example: `!runbook start runbooks/rehearsal-v1-checklist.md node=NODA1`",
        "runbook_path must be a relative path without `..`.",
    ]
    return "\n".join(lines)
def runbook_started_reply(run_id: str, steps_total: int, status: str) -> str:
    """Success reply after sofiia-console creates a runbook run."""
    head = f"✅ runbook started: `run_id={run_id}` steps={steps_total} status={status}"
    hint = f"Next: `!runbook next {run_id}`"
    return f"{head}\n{hint}"
def runbook_start_error_reply(reason: str) -> str:
    """Error reply when sofiia-console returns a non-2xx or connection error."""
    return "❌ failed to start runbook: " + reason
# ── M3.2 reply helpers ────────────────────────────────────────────────────────
# Max chars of instructions_md to include in Matrix message before truncating
_INSTRUCTIONS_EXCERPT_MAX = 1500
def next_manual_reply(
    run_id: str,
    step_index: int,
    steps_total: Optional[int],
    title: str,
    instructions_md: str,
) -> str:
    """
    Reply for a manual step returned by `!runbook next`: step header,
    (possibly truncated) instructions excerpt, and the follow-up command.
    """
    label = f"Step {step_index + 1}"
    if steps_total:
        label = f"{label}/{steps_total}"
    body = instructions_md.strip()
    was_cut = len(body) > _INSTRUCTIONS_EXCERPT_MAX
    if was_cut:
        # Cut at the limit, then back off to the last full line.
        body = body[:_INSTRUCTIONS_EXCERPT_MAX].rsplit("\n", 1)[0]
    lines = [f"🧭 {label}: **{title}**", "", body]
    if was_cut:
        lines.append("_...(truncated — open in console for full instructions)_")
    lines.append("")
    lines.append(f"Complete: `!runbook complete {run_id} step={step_index} status=ok`")
    return "\n".join(lines)
def next_auto_reply(
    run_id: str,
    step_index: int,
    action_type: str,
    step_status: str,
    duration_ms: Optional[int],
    completed: bool,
) -> str:
    """Format the reply for an auto step (http_check/script) run by `!runbook next`."""
    # NOTE(review): the "ok"/"fail" marker strings are empty — they look like
    # stripped emoji characters; confirm against the rendered control-room output.
    marker = {"ok": "", "warn": "⚠️", "fail": ""}.get(step_status, "")
    duration_part = "" if duration_ms is None else f" duration={duration_ms}ms"
    header = f"{marker} step {step_index + 1} ({action_type}) {step_status}{duration_part}"
    if not completed:
        return f"{header}\nNext: `!runbook next {run_id}`"
    return (
        f"{header}\n"
        "🎉 All steps completed!\n"
        f"Get evidence: `!runbook evidence {run_id}`"
    )
def next_error_reply(run_id: str, reason: str) -> str:
    """Error reply when `!runbook next` fails."""
    return f"❌ failed to advance runbook: {reason}"
def complete_ok_reply(run_id: str, step_index: int, status: str, run_completed: bool) -> str:
    """Success reply after `!runbook complete` records a step result."""
    # NOTE(review): the "ok"/"fail" marker strings are empty — possibly stripped emoji.
    marker = {"ok": "", "warn": "⚠️", "fail": "", "skipped": "⏭️"}.get(status, "")
    head = f"{marker} recorded step {step_index + 1}: {status}"
    if run_completed:
        tail = f"🎉 All steps completed!\nGet evidence: `!runbook evidence {run_id}`"
    else:
        tail = f"Next: `!runbook next {run_id}`"
    return head + "\n" + tail
def complete_error_reply(run_id: str, reason: str) -> str:
    """Error reply when `!runbook complete` fails."""
    return f"❌ failed to complete step: {reason}"
# ── M3.3 reply helpers ────────────────────────────────────────────────────────
def status_usage_reply() -> str:
    """Usage hint for `!runbook status`."""
    return "\n".join([
        "⚠️ Usage: `!runbook status <run_id>`",
        "Example: `!runbook status abc-123`",
    ])
def evidence_usage_reply() -> str:
    """Usage hint for `!runbook evidence`."""
    return "\n".join([
        "⚠️ Usage: `!runbook evidence <run_id>`",
        "Example: `!runbook evidence abc-123`",
    ])
def post_review_usage_reply() -> str:
    """Usage hint for `!runbook post_review`."""
    return "\n".join([
        "⚠️ Usage: `!runbook post_review <run_id>`",
        "Example: `!runbook post_review abc-123`",
    ])
def status_reply(run: dict) -> str:
    """Format the `!runbook status` reply from a get_run response dict."""
    steps = run.get("steps", [])
    run_id = run.get("run_id", "?")
    status = run.get("status", "?")
    current = run.get("current_step", 0)
    steps_total = run.get("steps_total") or len(steps)
    evidence_path = run.get("evidence_path")
    # Tally non-ok step outcomes for the optional summary line.
    warn_count = sum(1 for s in steps if s.get("status") == "warn")
    fail_count = sum(1 for s in steps if s.get("status") == "fail")
    # NOTE(review): the "completed" marker is an empty string — possibly a stripped emoji.
    status_emoji = {
        "running": "🔄", "completed": "", "aborted": "🛑", "paused": "⏸️",
    }.get(status, "")
    step_label = f"{current}/{steps_total}" if steps_total else str(current)
    lines = [
        f"{status_emoji} `run_id={run_id}` status={status} step={step_label}",
        f"runbook: `{run.get('runbook_path', '?')}` node: {run.get('node_id', '?')}",
    ]
    if warn_count or fail_count:
        lines.append(f"warn={warn_count} fail={fail_count}")
    if evidence_path:
        lines.append(f"evidence: `{evidence_path}`")
    if status == "completed":
        # Suggest the next action depending on whether evidence already exists.
        if evidence_path:
            lines.append(f"Post-review: `!runbook post_review {run_id}`")
        else:
            lines.append(f"Get evidence: `!runbook evidence {run_id}`")
    return "\n".join(lines)
def status_error_reply(run_id: str, reason: str) -> str:
    """Error reply when `!runbook status` fails."""
    return f"❌ failed to get status: {reason}"
def evidence_reply(result: dict) -> str:
    """Success reply after `!runbook evidence` creates an artifact."""
    lines = [
        f"📄 evidence created: `{result.get('evidence_path', '?')}` (bytes={result.get('bytes', 0)})"
    ]
    created_at = result.get("created_at", "")
    if created_at:
        lines.append(f"created_at: {created_at}")
    run_id = result.get("run_id", "")
    if run_id:
        lines.append(f"Next: `!runbook post_review {run_id}`")
    return "\n".join(lines)
def evidence_error_reply(run_id: str, reason: str) -> str:
    """Error reply when `!runbook evidence` fails."""
    return f"❌ failed to generate evidence: {reason}"
def post_review_reply(result: dict) -> str:
    """Success reply after `!runbook post_review` creates a report."""
    lines = [f"🧾 post-review created: `{result.get('path', '?')}` (bytes={result.get('bytes', 0)})"]
    created_at = result.get("created_at", "")
    if created_at:
        lines.append(f"created_at: {created_at}")
    return "\n".join(lines)
def post_review_error_reply(run_id: str, reason: str) -> str:
    """Error reply when `!runbook post_review` fails."""
    return f"❌ failed to generate post-review: {reason}"
# ── M3.4 safety helpers ───────────────────────────────────────────────────────
#: Maximum length of notes/free-text operator input accepted before truncation.
MAX_NOTES_LEN: int = 500
#: Control characters (U+0000–U+001F minus tab/LF/CR) that must be stripped.
_CTRL_CHARS = "".join(chr(i) for i in range(32) if i not in (9, 10, 13))
def sanitize_notes(notes: str) -> str:
    """
    Strip control characters and truncate notes to MAX_NOTES_LEN.

    Over-long input is cut at MAX_NOTES_LEN and suffixed with "…" so the
    operator can see it was truncated.  Safe to call with any string;
    returns empty string for falsy input.
    """
    if not notes:
        return ""
    # Single C-level pass removes every disallowed control character.
    cleaned = notes.translate(str.maketrans("", "", _CTRL_CHARS))
    if len(cleaned) > MAX_NOTES_LEN:
        # BUGFIX: the truncation marker was a dead `+ ""` (no-op concat —
        # the ellipsis was evidently lost); append it so truncation is visible.
        cleaned = cleaned[:MAX_NOTES_LEN] + "…"
    return cleaned
def rate_limited_reply(scope: str, retry_after_s: float) -> str:
    """Reply when a control command is rejected by the rate limiter or cooldown."""
    if retry_after_s >= 1:
        wait = f"{retry_after_s:.0f}s"
    else:
        wait = "a moment"
    return f"⏳ rate limited ({scope}), retry after {wait}"
def status_not_available_reply() -> str:
    """Fallback reply when the bridge status snapshot cannot be produced."""
    return "⚠️ Bridge status not available (service initialising or config missing)."
# M5.1: !nodes reply
# Cap on how many per-room override lines are rendered in one Matrix message.
_MAX_ROOM_OVERRIDES_SHOWN = 10
def nodes_reply(
    policy_info: dict,
    node_stats: Optional[dict] = None,
    sticky_info: Optional[dict] = None,
) -> str:
    """
    Compact reply for `!nodes` in control room.
    policy_info: from NodePolicy.as_info_dict()
    node_stats: optional dict {node_id: {"routed": N, "rejected": M, "health": ..., ...}}
    sticky_info: optional dict from StickyNodeCache (M8.1)
    Returns a Markdown string joined with newlines; sections are appended
    only when the corresponding input is present.
    """
    default = policy_info.get("default_node", "?")
    allowed = sorted(policy_info.get("allowed_nodes") or [])
    # room_overrides may be a dict (room_id → node) or an int count; both handled below.
    overrides = policy_info.get("room_overrides", {}) or {}
    allowed_str = ", ".join(f"`{n}`" for n in allowed)
    lines = [
        "🌐 **Node policy**",
        f"Default: `{default}` Allowed: {allowed_str}",
    ]
    if isinstance(overrides, dict) and overrides:
        lines.append(f"\n**Room overrides** ({len(overrides)}):")
        # Show at most _MAX_ROOM_OVERRIDES_SHOWN rows, then a "+N more" marker.
        items = list(overrides.items())[:_MAX_ROOM_OVERRIDES_SHOWN]
        for room_id, node in items:
            lines.append(f" `{room_id}` → `{node}`")
        if len(overrides) > _MAX_ROOM_OVERRIDES_SHOWN:
            lines.append(f" _(+{len(overrides) - _MAX_ROOM_OVERRIDES_SHOWN} more)_")
    elif isinstance(overrides, int):
        # as_info_dict returns room_overrides as int count, not dict
        if overrides:
            lines.append(f"\nRoom overrides: {overrides}")
        else:
            lines.append("\nNo room overrides configured.")
    else:
        lines.append("\nNo room overrides configured.")
    if node_stats:
        # Counters are process-local and reset on bridge restart.
        lines.append("\n**Per-node stats** (since last restart):")
        for node_id in sorted(node_stats):
            ns = node_stats[node_id]
            routed = ns.get("routed", 0)
            rejected = ns.get("rejected", 0)
            health = ns.get("health", "")
            ewma = ns.get("ewma_latency_s")
            consec = ns.get("consecutive_failures", 0)
            # Optional fields are appended only when meaningful (non-empty / non-None / non-zero).
            stat_parts = [f"routed={routed}", f"rejected={rejected}"]
            if health:
                stat_parts.append(f"health={health}")
            if ewma is not None:
                stat_parts.append(f"ewma={ewma:.2f}s")
            if consec:
                stat_parts.append(f"consec_fail={consec}")
            lines.append(f" `{node_id}`: " + " ".join(stat_parts))
    # M8.1: sticky cache section
    if sticky_info is not None:
        active = sticky_info.get("active_keys", 0)
        ttl = sticky_info.get("ttl_s", 0)
        if active:
            lines.append(f"\n**Sticky routing** (anti-flap): {active} active ttl={ttl:.0f}s")
            for entry in sticky_info.get("entries", []):
                rem = entry.get("remaining_s", 0)
                # entry["key"]/entry["node"] are required keys; remaining_s defaults to 0.
                lines.append(
                    f" `{entry['key']}` → `{entry['node']}` ({rem:.0f}s left)"
                )
            if sticky_info.get("truncated"):
                lines.append(f" _(+{sticky_info['truncated']} more)_")
        else:
            lines.append(f"\nSticky routing: none active ttl={ttl:.0f}s")
    return "\n".join(lines)
# ── M6.0: !node subcommand parser + reply helpers ──────────────────────────────
import re as _re
_ROOM_KWARG_RE = _re.compile(r"\broom=(\S+)", _re.IGNORECASE)
_NODE_VAL_RE = _re.compile(r"\bnode=(\w+)", _re.IGNORECASE)
_ROOM_ID_RE = _re.compile(r"^![a-zA-Z0-9._\-]+:[a-zA-Z0-9._\-]+$")
def parse_node_cmd(args_text: str) -> Tuple[str, Optional[str], Optional[str]]:
    """
    Parse `!node <subcommand> [room=...] [node=...]` arguments.

    Returns (subcmd, room_id_or_None, node_id_or_None); subcmd is
    lower-cased and node_id is upper-cased.
    """
    tokens = args_text.strip().split(None, 1)
    if not tokens:
        return ("", None, None)
    tail = tokens[1] if len(tokens) > 1 else ""
    room_match = _ROOM_KWARG_RE.search(tail)
    node_match = _NODE_VAL_RE.search(tail)
    return (
        tokens[0].lower(),
        room_match.group(1) if room_match else None,
        node_match.group(1).upper() if node_match else None,
    )
def node_cmd_validate_room(room_id: str) -> bool:
    """Return True if room_id matches the basic Matrix room ID format."""
    if not room_id:
        return False
    return _ROOM_ID_RE.match(room_id) is not None
def node_cmd_reply_set(room_id: str, node_id: str) -> str:
    """Confirmation after a room→node override is stored."""
    return f"✅ Override set: `{room_id}` → `{node_id}`"
def node_cmd_reply_unset_ok(room_id: str) -> str:
    """Confirmation after a room→node override is removed."""
    return f"✅ Override removed for `{room_id}`"
def node_cmd_reply_unset_not_found(room_id: str) -> str:
    """Reply when unset is called for a room with no stored override."""
    # NOTE(review): leading marker appears stripped (string starts with a space).
    return f" No override was set for `{room_id}`"
def node_cmd_reply_get(
    room_id: str,
    node_id: Optional[str],
    env_node: Optional[str],
    default_node: str,
) -> str:
    """Show dynamic/env/default node resolution for one room."""
    if node_id:
        override_line = f"Dynamic override: `{node_id}` _(set by operator)_"
    else:
        override_line = "Dynamic override: _none_"
    lines = [f"📌 **Node info for** `{room_id}`", override_line]
    if env_node:
        lines.append(f"Env map: `{env_node}`")
    lines.append(f"Default: `{default_node}`")
    # Precedence: operator override → env map → default node.
    lines.append(f"\nEffective node: **`{node_id or env_node or default_node}`**")
    return "\n".join(lines)
def node_cmd_reply_list(
    overrides: List[Tuple[str, str, int]],
    total: int,
) -> str:
    """
    Format the `!node list` reply.

    overrides: (room_id, node_id, updated_at_epoch_s) rows, possibly truncated
    total: total number of stored overrides (may exceed len(overrides))
    """
    import datetime
    lines = [f"📋 **Dynamic node overrides** ({total} total)"]
    if not overrides:
        lines.append("_None set._")
    else:
        for room_id, node_id, updated_at in overrides:
            # FIX: datetime.utcfromtimestamp() is deprecated (Python 3.12+);
            # an aware UTC timestamp produces identical strftime output.
            ts = datetime.datetime.fromtimestamp(
                updated_at, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d %H:%M")
            lines.append(f" `{room_id}` → `{node_id}` _(at {ts} UTC)_")
        if total > len(overrides):
            lines.append(f" _(+{total - len(overrides)} more)_")
    return "\n".join(lines)
def node_cmd_reply_error(msg: str) -> str:
    """Prefix an error message with the `!node` usage cheat-sheet."""
    usage = "\n".join([
        "Usage:",
        " `!node set room=!room:server node=NODA2`",
        " `!node unset room=!room:server`",
        " `!node get room=!room:server`",
        " `!node list`",
    ])
    return f"{msg}\n\n{usage}"
# ── M6.1: !room agents reply helpers ──────────────────────────────────────────
_AGENTS_KWARG_RE = _re.compile(r"\bagents=(\S+)", _re.IGNORECASE)
_AGENT_KWARG_RE = _re.compile(r"\bagent=(\w+)", _re.IGNORECASE)
_DEFAULT_KWARG_RE = _re.compile(r"\bdefault=(\w+)", _re.IGNORECASE)
def parse_room_agents_cmd(
    subcommand: str,
    args: tuple,
    kwargs: Dict[str, str],
) -> Tuple[str, Optional[str], Optional[List[str]], Optional[str], Optional[str]]:
    """
    Parse `!room agents <action> [room=...] [agents=...] [agent=...] [default=...]`.

    Returns (action, room_id, agents_or_None, single_agent_or_None,
    default_agent_or_None):
      action: first positional arg (lower-cased), falling back to subcommand
      room_id: kwargs["room"] verbatim
      agents: kwargs["agents"] split on commas and lower-cased (for `set`)
      single_agent: kwargs["agent"] lower-cased (for add/remove)
      default_agent: kwargs["default"] lower-cased
    """
    # When subcommand == "agents", the action is the first positional token.
    action = (args[0].lower() if args else "").strip() or subcommand.lower()
    raw_agents = kwargs.get("agents", "")
    if raw_agents:
        agents: Optional[List[str]] = [
            part.strip().lower() for part in raw_agents.split(",") if part.strip()
        ]
    else:
        agents = None
    return (
        action,
        kwargs.get("room"),
        agents,
        kwargs.get("agent", "").strip().lower() or None,
        kwargs.get("default", "").strip().lower() or None,
    )
def room_agents_reply_set(room_id: str, agents: List[str], default_agent: str) -> str:
    """Confirmation after replacing a room's agent override list."""
    joined = ", ".join(f"`{name}`" for name in sorted(agents))
    return (
        f"✅ Agent override set for `{room_id}`\n"
        f"Agents: {joined}\n"
        f"Default: `{default_agent}`"
    )
def room_agents_reply_add(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str:
    """Confirmation after adding one agent to a room override."""
    joined = ", ".join(f"`{name}`" for name in sorted(agents))
    reply = f"✅ Agent `{agent}` added to `{room_id}`\nCurrent agents: {joined}"
    if default_agent:
        reply += f"\nDefault: `{default_agent}`"
    return reply
def room_agents_reply_remove(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str:
    """Confirmation after removing one agent from a room override."""
    if not agents:
        # Last agent gone — the override record itself is cleared.
        return f"✅ Agent `{agent}` removed — no agents left, override cleared for `{room_id}`"
    joined = ", ".join(f"`{name}`" for name in sorted(agents))
    reply = f"✅ Agent `{agent}` removed from `{room_id}`\nRemaining: {joined}"
    if default_agent:
        reply += f"\nDefault: `{default_agent}`"
    return reply
def room_agents_reply_unset_ok(room_id: str) -> str:
    """Confirmation after clearing a room's agent override."""
    return f"✅ Agent override cleared for `{room_id}` (using env/default config)"
def room_agents_reply_unset_not_found(room_id: str) -> str:
    """Reply when unset is called for a room with no agent override."""
    # NOTE(review): leading marker appears stripped (string starts with a space).
    return f" No agent override was set for `{room_id}`"
def room_agents_reply_get(
    room_id: str,
    override_agents: Optional[List[str]],
    override_default: Optional[str],
    env_agents: Optional[List[str]],
    env_default: Optional[str],
) -> str:
    """Show dynamic-override vs env agent policy and the effective result."""
    lines = [f"📌 **Agent policy for** `{room_id}`"]
    if override_agents:
        formatted = ", ".join(f"`{a}`" for a in sorted(override_agents))
        lines.append(f"Dynamic override: {formatted} default=`{override_default or '?'}`")
    else:
        lines.append("Dynamic override: _none_")
    if env_agents:
        formatted = ", ".join(f"`{a}`" for a in sorted(env_agents))
        lines.append(f"Env config: {formatted} default=`{env_default or '?'}`")
    else:
        lines.append("Env config: _not configured_")
    # Precedence: dynamic override → env config → empty list.
    effective = override_agents or env_agents or []
    effective_str = ", ".join(f"`{a}`" for a in sorted(effective))
    effective_default = override_default or env_default or "?"
    lines.append(f"\nEffective agents: **{effective_str}** default=**`{effective_default}`**")
    return "\n".join(lines)
def room_agents_reply_list(
    overrides: List[Tuple[str, List[str], Optional[str], int]],
    total: int,
) -> str:
    """
    Format the `!room agents list` reply (rooms with dynamic agent overrides).

    overrides: (room_id, agents, default_agent, updated_at_epoch_s) rows
    total: total number of stored overrides (may exceed len(overrides))
    """
    import datetime
    lines = [f"📋 **Dynamic agent overrides** ({total} total)"]
    if not overrides:
        lines.append("_None set._")
    else:
        for room_id, agents, default_agent, updated_at in overrides:
            # FIX: datetime.utcfromtimestamp() is deprecated (Python 3.12+);
            # an aware UTC timestamp produces identical strftime output.
            ts = datetime.datetime.fromtimestamp(
                updated_at, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d %H:%M")
            agents_str = ", ".join(agents)
            lines.append(f" `{room_id}`: [{agents_str}] default=`{default_agent or '?'}` _(at {ts} UTC)_")
        if total > len(overrides):
            lines.append(f" _(+{total - len(overrides)} more)_")
    return "\n".join(lines)
def room_agents_reply_error(msg: str) -> str:
    """Prefix an error message with the `!room agents` usage cheat-sheet."""
    usage_lines = [
        "Usage:",
        " `!room agents set room=!X agents=sofiia,helion [default=sofiia]`",
        " `!room agents add room=!X agent=druid`",
        " `!room agents remove room=!X agent=helion`",
        " `!room agents get room=!X`",
        " `!room agents unset room=!X`",
        " `!room agents list`",
    ]
    return f"{msg}\n\n" + "\n".join(usage_lines)
# ── M6.2: !policy export/import reply helpers + path validator ────────────────
import os as _os
import json as _json
POLICY_EXPORTS_SUBDIR = "policy_exports"
def validate_export_path(exports_dir: str, filename: str) -> Optional[str]:
    """
    Validate and resolve an export filename to an absolute path.

    Security: only simple filenames are accepted (no separators, no `..`,
    charset [a-zA-Z0-9._-]), and the resolved realpath must remain inside
    exports_dir.  Returns the absolute safe path, or None if invalid.
    """
    if not filename:
        return None
    # Defence in depth: explicit separator/traversal check before the charset check.
    for needle in ("/", "\\", ".."):
        if needle in filename:
            return None
    if _re.match(r"^[a-zA-Z0-9._\-]+$", filename) is None:
        return None
    candidate = _os.path.join(exports_dir, filename)
    try:
        # Resolve symlinks on both sides and require strict containment.
        if not _os.path.realpath(candidate).startswith(
            _os.path.realpath(exports_dir) + _os.sep
        ):
            return None
    except Exception:  # noqa: BLE001
        return None
    return candidate
def policy_export_reply(path: str, node_count: int, agent_count: int) -> str:
    """Success reply after `!policy export` writes a snapshot file."""
    filename = _os.path.basename(path)
    # BUGFIX: the reply previously printed the literal "(unknown)" instead of
    # the computed (and otherwise unused) export-file basename.
    return (
        f"✅ **Policy exported**\n"
        f"File: `{filename}`\n"
        f"Node overrides: {node_count} Agent overrides: {agent_count}"
    )
def policy_import_dry_run_reply(stats: dict, mode: str) -> str:
    """Preview reply for `!policy import ... dry_run=1` (no changes applied)."""
    def _delta(prefix: str) -> str:
        # Compact "+added ~updated -deleted" summary for one override kind.
        added = stats.get(f"{prefix}_added", 0)
        updated = stats.get(f"{prefix}_updated", 0)
        deleted = stats.get(f"{prefix}_deleted", 0)
        return f"+{added} ~{updated} -{deleted}"

    return (
        f"🔍 **Import dry-run** (mode=`{mode}`, no changes applied)\n"
        f"Node overrides: {_delta('node')}\n"
        f"Agent overrides: {_delta('agent')}\n"
        f"_Use `dry_run=0` to apply._"
    )
def format_import_diff(diff: Any) -> str:
    """
    Render an ImportDiff (from policy_store) as human-readable Markdown (M9.1).
    """
    def _summary(added: int, updated: int, deleted: int) -> str:
        # Only non-zero counts are mentioned; deletions carry a warning marker.
        parts: List[str] = []
        if added:
            parts.append(f"+{added} added")
        if updated:
            parts.append(f"~{updated} updated")
        if deleted:
            parts.append(f"-{deleted} deleted ⚠️")
        return ", ".join(parts) if parts else "no changes"

    lines = [
        "**Node overrides:** " + _summary(diff.node_added, diff.node_updated, diff.node_deleted),
        "**Agent overrides:** " + _summary(diff.agent_added, diff.agent_updated, diff.agent_deleted),
    ]
    sample_keys = getattr(diff, "sample_keys", None)
    if sample_keys:
        shown = ", ".join(f"`{key}`" for key in sample_keys)
        hidden = diff.total_changes() - len(sample_keys)
        tail = f" _(+{hidden} more)_" if hidden > 0 else ""
        lines.append(f"**Affected rooms:** {shown}{tail}")
    # Replace-mode deletes everything not present in the file — warn loudly.
    if getattr(diff, "is_replace", False):
        lines.append("⚠️ **REPLACE mode** — existing overrides NOT in the file will be **deleted**.")
    return "\n".join(lines)
def policy_import_intent_reply(
    diff: Any,
    action_summary: str,
    nonce: str,
    ttl_s: int,
) -> str:
    """Confirm-prompt reply for `!policy import` with a diff preview (M9.1)."""
    lines = [
        "⚠️ **Confirm required**",
        f"Action: `{action_summary}`",
        "",
        "**Preview:**",
        format_import_diff(diff),
        "",
    ]
    if diff.total_changes() == 0:
        lines.extend(["_(No policy changes would be made.)_", ""])
    lines.append(f"Type `!confirm {nonce}` within {ttl_s}s to apply.")
    lines.append("_(Only you can confirm. If the file changes, this confirm will be rejected.)_")
    return "\n".join(lines)
def policy_import_reply(stats: dict, mode: str) -> str:
    """Reply after `!policy import` has actually been applied."""
    node_line = "Node overrides: +{} ~{} -{}".format(
        stats.get("node_added", 0), stats.get("node_updated", 0), stats.get("node_deleted", 0)
    )
    agent_line = "Agent overrides: +{} ~{} -{}".format(
        stats.get("agent_added", 0), stats.get("agent_updated", 0), stats.get("agent_deleted", 0)
    )
    return f"✅ **Policy imported** (mode=`{mode}`)\n{node_line}\n{agent_line}"
def policy_restore_intent_reply(
    diff: Any,
    action_summary: str,
    nonce: str,
    ttl_s: int,
) -> str:
    """Reply for `!policy restore` intent — rollback preview + confirm prompt (M10.1)."""
    segments = [
        "🔄 **Policy restore (rollback) preview**\n" + format_import_diff(diff),
        f"⚠️ **Rollback action:** `{action_summary}`",
        f"Type `!confirm {nonce}` to apply restore (expires in {ttl_s}s)",
    ]
    return "\n\n".join(segments)
def policy_restore_applied_reply(
    stats: Any,
    mode: str,
    autobackup_basename: str = "",
) -> str:
    """Reply after `!policy restore` is confirmed and applied (M10.1)."""
    # stats may be any object; only dicts carry real counts, otherwise 0s.
    counts = {}
    for field in ("node_added", "node_updated", "node_deleted",
                  "agent_added", "agent_updated", "agent_deleted"):
        counts[field] = stats.get(field, 0) if isinstance(stats, dict) else 0
    backup_line = ""
    if autobackup_basename:
        backup_line = f"\n\n💾 Pre-restore backup saved: `{autobackup_basename}`"
    return (
        f"✅ **Policy restored** (mode={mode})\n"
        f"Node overrides: +{counts['node_added']} ~{counts['node_updated']} -{counts['node_deleted']}\n"
        f"Agent overrides: +{counts['agent_added']} ~{counts['agent_updated']} -{counts['agent_deleted']}"
        f"{backup_line}"
    )
def policy_history_reply(changes: List[Any]) -> str:
    """
    Format policy_changes records for the `!policy history` reply (M10.2).

    One line per change:
      {n}. [id:NN] [when] `verb/mode` <short-counts> [⚠️] `file` op:`hash8`
    followed by a hint to use `!policy change id=NN` for full details.
    """
    if not changes:
        return "📋 **Policy change history**\nNo policy changes recorded yet."
    lines = ["📋 **Policy change history** (most recent first)\n"]
    for i, c in enumerate(changes, 1):
        destr_flag = " ⚠️" if c.is_destructive else ""
        # BUGFIX: the truncation suffix was a dead `+ ""` (no-op concat —
        # the ellipsis was evidently lost); restore it so cut filenames show.
        fname = c.source_file[:40] + "…" if len(c.source_file) > 40 else c.source_file
        line = (
            f"{i}. [id:{c.id}] [{c.when_str()}] `{c.verb}/{c.mode}`"
            f" {c.changes_short()}{destr_flag}"
            f" `{fname}`"
            f" op:`{c.sender_hash[:8]}`"
        )
        lines.append(line)
    lines.append("\nUse `!policy change id=<n>` for full details of a specific change.")
    return "\n".join(lines)
def policy_change_detail_reply(change: Any) -> str:
    """
    Format full details of a single PolicyChange for `!policy change id=<n>` (M10.3).
    """
    destr_str = "⚠️ Yes" if change.is_destructive else "No"
    # BUGFIX: the truncation suffix was a dead `+ ""` (no-op concat —
    # the ellipsis was evidently lost); restore it so cut filenames show.
    fname = change.source_file[:60] + "…" if len(change.source_file) > 60 else change.source_file
    lines = [
        f"🔍 **Policy change #{change.id}**\n",
        f"**Verb:** `{change.verb}`",
        f"**Mode:** `{change.mode}`",
        f"**Applied:** {change.when_str()} UTC",
        f"**Operator:** op:`{change.sender_hash[:8]}`",
        f"**File:** `{fname}`",
        f"**Destructive:** {destr_str}",
        "",
        "**Changes:**",
        f" Nodes: +{change.node_added} added ~{change.node_updated} updated -{change.node_deleted} deleted",
        f" Agents: +{change.agent_added} added ~{change.agent_updated} updated -{change.agent_deleted} deleted",
        "",
        "**Summary:**",
        f" {change.diff_summary}",
    ]
    return "\n".join(lines)
def policy_prune_preview_reply(result: Any, retention_days: int) -> str:
    """Preview reply for `!policy prune_exports dry_run=1` (M10.0)."""
    header = f"🗑️ **Policy exports prune preview** (retention={retention_days}d)\n"
    if result.count == 0:
        return header + "No files older than the retention window found. Nothing to prune."
    shown = result.sample_filenames(5)
    listing = "\n".join(f" - `{name}`" for name in shown)
    hidden = result.count - len(shown)
    extra = f"\n _(+{hidden} more)_" if hidden > 0 else ""
    size_kb = result.total_bytes // 1024
    return (
        header
        + f"Would delete **{result.count}** file(s) (~{size_kb} KB):\n"
        + f"{listing}{extra}\n\n"
        + "To actually prune: `!policy prune_exports dry_run=0`"
    )
def policy_prune_applied_reply(result: Any, retention_days: int) -> str:
    """Reply after `!policy prune_exports dry_run=0` has actually deleted files (M10.0)."""
    if result.count == 0:
        return (
            f"🗑️ **Policy exports pruned** (retention={retention_days}d)\n"
            "No files matched the retention window."
        )
    freed_kb = result.total_bytes // 1024
    return (
        f"✅ **Policy exports pruned** (retention={retention_days}d)\n"
        f"Deleted **{result.count}** file(s) (~{freed_kb} KB freed)."
    )
def policy_cmd_error(msg: str) -> str:
    """Prefix an error message with the `!policy` usage cheat-sheet."""
    return "\n".join([
        f"{msg}\n",
        "Usage:",
        " `!policy export`",
        " `!policy import path=policy-YYYYMMDD-HHMMSS.json [mode=merge|replace] [dry_run=0]`",
    ])
@@ -252,12 +1162,26 @@ def help_reply() -> str:
"""Brief help text.""" """Brief help text."""
return ( return (
"**DAGI Bridge — Control Commands**\n\n" "**DAGI Bridge — Control Commands**\n\n"
"`!runbook start <path> [node=NODA1]` — Start a runbook run\n" "`!runbook start <path> [node=NODA1]` — Start a runbook run\n"
"`!runbook next <run_id>` — Advance to next step\n" "`!runbook next <run_id>` — Advance to next step\n"
"`!runbook complete <run_id> step=<n> status=ok` — Mark step complete\n" "`!runbook complete <run_id> step=<n> status=ok [notes=...]` — Mark step complete\n"
"`!runbook evidence <run_id>` — Get evidence artifact path\n" "`!runbook status <run_id>` — Show run status ✅\n"
"`!runbook status <run_id>` — Show current run state\n" "`!runbook evidence <run_id>` — Generate release evidence ✅\n"
"`!status` — Bridge health summary\n" "`!runbook post_review <run_id>` — Generate post-release review ✅\n"
"`!status` — Bridge health summary ✅\n"
"`!nodes` — Node policy overview ✅\n"
"`!node set room=!room:server node=NODA2` — Set room-node override ✅\n"
"`!node unset room=!room:server` — Remove room-node override ✅\n"
"`!node get room=!room:server` — Show current override ✅\n"
"`!node list` — List dynamic overrides (top 10) ✅\n"
"`!room agents set room=!X agents=sofiia,helion [default=sofiia]` — Set agent list ✅\n"
"`!room agents add room=!X agent=druid` — Add agent to room ✅\n"
"`!room agents remove room=!X agent=helion` — Remove agent from room ✅\n"
"`!room agents get room=!X` — Show current agent policy ✅\n"
"`!room agents list` — List all rooms with agent overrides ✅\n"
"`!room agents unset room=!X` — Remove all agent overrides for room ✅\n"
"`!policy export` — Export policy snapshot to file ✅\n"
"`!policy import path=<file> [mode=merge|replace] [dry_run=0]` — Import policy snapshot ✅\n"
"`!help` — This message\n\n" "`!help` — This message\n\n"
"_Only authorised operators can issue control commands._" "_Only authorised operators can issue control commands._"
) )

View File

@@ -0,0 +1,138 @@
"""
control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel.
Protection layers:
1. Per-room sliding window — CONTROL_ROOM_RPM (default 60)
2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30)
3. Per-run sliding window — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next)
4. Per-operator cooldown — CONTROL_COOLDOWN_S (default 2s, anti-double-click)
All state is in-memory (lost on restart), which is intentional — limits reset with the bridge.
Thread safety: not needed (asyncio single-threaded event loop).
"""
from __future__ import annotations
import time
from collections import defaultdict, deque
from typing import Dict, Tuple
# Sentinel value for "unlimited" (rpm == 0 → skip check)
_UNLIMITED = 0
class ControlRateLimiter:
    """
    Sliding-window rate limiter + cooldown for the Matrix control channel.

    All rpm values are requests-per-minute over a 60-second rolling window;
    cooldown_s is a per-(operator, verb, subcommand) debounce window
    (anti-double-click).  State lives in in-memory dicts keyed by room /
    operator / run id and is intentionally lost on restart.
    """

    def __init__(
        self,
        room_rpm: int = 60,
        operator_rpm: int = 30,
        run_next_rpm: int = 20,
        cooldown_s: float = 2.0,
    ) -> None:
        self.room_rpm = room_rpm
        self.operator_rpm = operator_rpm
        self.run_next_rpm = run_next_rpm
        self.cooldown_s = cooldown_s
        # Sliding-window storage: key → deque[float] (monotonic timestamps).
        self._room_windows: Dict[str, deque] = defaultdict(deque)
        self._op_windows: Dict[str, deque] = defaultdict(deque)
        self._run_windows: Dict[str, deque] = defaultdict(deque)
        # Cooldown: "sender_hash:verb:subcommand" → last accepted timestamp.
        # NOTE: keys are never pruned; acceptable while the operator set is small.
        self._cooldown_times: Dict[str, float] = {}

    # ── Sliding window helpers ─────────────────────────────────────────────────
    @staticmethod
    def _check_window(
        windows: Dict[str, deque],
        key: str,
        rpm: int,
    ) -> Tuple[bool, float]:
        """
        Sliding-window check over a 60-second window.

        Returns (allowed, retry_after_seconds).
        FIX: any non-positive rpm now means "unlimited" (previously only
        exactly 0 did; a negative misconfiguration would have blocked forever
        because len(window) >= rpm is always true).
        """
        if rpm <= 0:
            return True, 0.0
        now = time.monotonic()
        window = windows[key]
        cutoff = now - 60.0
        # Evict entries that have aged out of the rolling window.
        while window and window[0] < cutoff:
            window.popleft()
        if len(window) >= rpm:
            # Blocked: report the time until the oldest entry expires.
            retry_after = max(0.0, 60.0 - (now - window[0]))
            return False, retry_after
        window.append(now)
        return True, 0.0

    # ── Public check methods ───────────────────────────────────────────────────
    def check_room(self, room_id: str) -> Tuple[bool, float]:
        """Per-room rate limit check. Returns (allowed, retry_after_s)."""
        return self._check_window(self._room_windows, room_id, self.room_rpm)

    def check_operator(self, sender_hash: str) -> Tuple[bool, float]:
        """Per-operator rate limit check. Returns (allowed, retry_after_s)."""
        return self._check_window(self._op_windows, sender_hash, self.operator_rpm)

    def check_run_next(self, run_id: str) -> Tuple[bool, float]:
        """
        Per-run rate limit for !runbook next — prevents rapid-fire advancement.
        Returns (allowed, retry_after_s).
        """
        return self._check_window(self._run_windows, run_id, self.run_next_rpm)

    def check_cooldown(
        self,
        sender_hash: str,
        verb: str,
        subcommand: str,
    ) -> Tuple[bool, float]:
        """
        Anti-double-click cooldown per (operator, verb, subcommand).

        Returns (allowed, wait_s).  First call records a timestamp and allows;
        calls within cooldown_s are blocked with the remaining wait time.
        An allowed call refreshes the stored timestamp.
        """
        if self.cooldown_s <= 0:
            return True, 0.0
        key = f"{sender_hash}:{verb}:{subcommand}"
        now = time.monotonic()
        last = self._cooldown_times.get(key)
        if last is not None:
            elapsed = now - last
            if elapsed < self.cooldown_s:
                return False, self.cooldown_s - elapsed
        self._cooldown_times[key] = now
        return True, 0.0

    # ── Summary ───────────────────────────────────────────────────────────────
    def as_health_dict(self) -> dict:
        """Configured limits as a dict for /health-style introspection."""
        return {
            "room_rpm": self.room_rpm,
            "operator_rpm": self.operator_rpm,
            "run_next_rpm": self.run_next_rpm,
            "cooldown_s": self.cooldown_s,
        }

View File

@@ -0,0 +1,296 @@
"""
control_runner — M3.1 + M3.2 + M3.3
Thin async HTTP client that calls the sofiia-console internal runbook API
on behalf of the Matrix bridge control channel.
All functions are stateless; callers supply the pre-built AsyncClient.
"""
from __future__ import annotations
import logging
from typing import Optional
import httpx
logger = logging.getLogger(__name__)
# Runbook path guards (fail-fast in the bridge, before calling the console)
_MAX_PATH_LEN = 256
_FORBIDDEN_SEGMENTS = {"..", "~"}
class RunnerError(Exception):
    """Raised when the sofiia-console returns an error or call fails."""
def validate_runbook_path(path: str) -> Optional[str]:
    """
    Return None if valid, or an error string describing the problem.

    Checks: non-empty, max length, no traversal segments (`..`, `~`),
    no absolute/rooted paths (leading `/` or `\\`).
    """
    path = path.strip()
    if not path:
        return "runbook_path is required"
    if len(path) > _MAX_PATH_LEN:
        return f"runbook_path too long (max {_MAX_PATH_LEN} chars)"
    # HARDENING: also reject Windows-style rooted paths (leading backslash),
    # which previously slipped past the `/`-only absolute check.
    if path.startswith(("/", "\\")):
        return "absolute paths are not allowed"
    # Normalise separators so `..\x` and `../x` are caught alike.
    for segment in path.replace("\\", "/").split("/"):
        if segment in _FORBIDDEN_SEGMENTS:
            return f"forbidden path segment: {segment!r}"
    return None
async def start_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    runbook_path: str,
    operator_id: str,
    node_id: str = "NODA1",
    timeout: float = 15.0,
) -> dict:
    """
    POST /api/runbooks/internal/runs → {run_id, status, current_step, steps_total}

    Raises RunnerError on connection failure, non-200 status, or bad JSON.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs"
    body: dict = {
        "runbook_path": runbook_path,
        "operator_id": operator_id,
        "node_id": node_id,
    }
    try:
        resp = await http_client.post(
            endpoint,
            json=body,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")
    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
def _extract_error_detail(resp: httpx.Response) -> str:
"""Extract a short error detail from an httpx response (safe: never raises)."""
try:
body = resp.json()
if isinstance(body, dict) and body.get("detail"):
return str(body["detail"])[:200]
except Exception:
pass
try:
return (resp.text or "")[:200]
except Exception:
return "<no detail>"
async def get_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 10.0,
) -> dict:
    """
    Fetch a run with its steps: GET /api/runbooks/internal/runs/{run_id}.

    Raises RunnerError on connection failure, 404 (unknown run), any other
    non-200 status, or an unparsable JSON body.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}"
    try:
        response = await http_client.get(
            endpoint,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    status_code = response.status_code
    if status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if status_code != 200:
        raise RunnerError(f"HTTP {status_code}: {_extract_error_detail(response)}")
    try:
        return response.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
# ── M3.2 ──────────────────────────────────────────────────────────────────────
async def next_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    operator_id: str = "",
    timeout: float = 30.0,
) -> dict:
    """
    Advance a run: POST /api/runbooks/internal/runs/{run_id}/next.

    Returns either:
      {type:"manual", step_index, title, section, instructions_md, steps_total?}
      {type:"http_check"|"script", step_index, title, result, step_status, next_step, completed}
    Raises RunnerError on connection failure, 404 (run not found / not active),
    any other non-200 status, or bad JSON.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/next"
    # operator_id is optional on the console side; omit it when empty.
    body: dict = {}
    if operator_id:
        body["operator_id"] = operator_id
    try:
        response = await http_client.post(
            endpoint,
            json=body,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if response.status_code == 404:
        raise RunnerError(
            f"run not found or not active: {_extract_error_detail(response)}"
        )
    if response.status_code != 200:
        raise RunnerError(
            f"HTTP {response.status_code}: {_extract_error_detail(response)}"
        )
    try:
        return response.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
async def complete_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    step_index: int,
    status: str,
    notes: str = "",
    operator_id: str = "",
    timeout: float = 15.0,
) -> dict:
    """
    Mark a step complete: POST /api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete.

    Returns: {ok, run_id, step_index, status, next_step, steps_total, run_completed}
    Raises RunnerError on connection failure, 404 (run/step not found or not
    the current step), any other non-200 status, or bad JSON.
    """
    base = console_url.rstrip("/")
    endpoint = f"{base}/api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete"
    # notes / operator_id are optional; keep the payload minimal.
    body: dict = {"status": status}
    if notes:
        body["notes"] = notes
    if operator_id:
        body["operator_id"] = operator_id
    try:
        response = await http_client.post(
            endpoint,
            json=body,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    if response.status_code == 404:
        raise RunnerError(
            f"step not found or not current: {_extract_error_detail(response)}"
        )
    if response.status_code != 200:
        raise RunnerError(
            f"HTTP {response.status_code}: {_extract_error_detail(response)}"
        )
    try:
        return response.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
# ── M3.3 ──────────────────────────────────────────────────────────────────────
async def generate_evidence(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    Request an evidence bundle: POST /api/runbooks/internal/runs/{run_id}/evidence.

    Returns: {evidence_path, bytes, created_at, run_id}
    Raises RunnerError on connection failure, 404 (unknown run), any other
    non-200 status, or bad JSON.
    """
    base = console_url.rstrip("/")
    endpoint = f"{base}/api/runbooks/internal/runs/{run_id}/evidence"
    try:
        response = await http_client.post(
            endpoint,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    status_code = response.status_code
    if status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if status_code != 200:
        raise RunnerError(f"HTTP {status_code}: {_extract_error_detail(response)}")
    try:
        return response.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
async def generate_post_review(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    Request a post-review document: POST /api/runbooks/internal/runs/{run_id}/post_review.

    Returns: {path, bytes, created_at, run_id}
    Raises RunnerError on connection failure, 404 (unknown run), any other
    non-200 status, or bad JSON.
    """
    base = console_url.rstrip("/")
    endpoint = f"{base}/api/runbooks/internal/runs/{run_id}/post_review"
    try:
        response = await http_client.post(
            endpoint,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc
    status_code = response.status_code
    if status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if status_code != 200:
        raise RunnerError(f"HTTP {status_code}: {_extract_error_detail(response)}")
    try:
        return response.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc

View File

@@ -0,0 +1,210 @@
"""
discovery — M4.0: Agent discovery helpers for Matrix user rooms.
Provides formatted replies for `!agents` and `!agents status` commands.
These commands are available to all room members (no auth required) and
are processed BEFORE routing to the LLM agent.
Supports:
- Mixed rooms: list all agents, default, usage examples
- Direct rooms: show single agent mapping
- Unknown rooms: "no mapping" notice
"""
from __future__ import annotations
import datetime
from typing import Optional
from .mixed_routing import MixedRoomConfig
from .room_mapping import RoomMappingConfig # noqa: F401 — used in type hints
def _fmt_ts(ts: int) -> str:
"""Format a Unix timestamp as compact UTC string."""
try:
return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
except Exception: # noqa: BLE001
return str(ts)
# Discovery command prefix; is_discovery_message() matches it case-insensitively,
# either bare ("!agents") or followed by arguments ("!agents status").
DISCOVERY_CMD = "!agents"
# Reply length cap in characters. bridge_status_reply() truncates replies that
# exceed this; the other discovery replies are kept short by construction.
_MAX_REPLY_LEN = 3500
def is_discovery_message(text: str) -> bool:
    """Return True for `!agents` discovery commands (bare or with arguments)."""
    candidate = text.strip().lower()
    if candidate == DISCOVERY_CMD:
        return True
    return candidate.startswith(DISCOVERY_CMD + " ")
def agents_reply(
    room_id: str,
    room_map: RoomMappingConfig,
    mixed_room_config: Optional[MixedRoomConfig],
) -> str:
    """
    Build the `!agents` discovery reply for a room.

    Resolution order: mixed room (multi-agent listing) → direct mapping
    (single agent) → fallback "no mapping" notice.
    """
    # Mixed rooms take precedence over direct mappings.
    if mixed_room_config and mixed_room_config.is_mixed(room_id):
        mixed_room = mixed_room_config.rooms.get(room_id)
        if mixed_room is not None:
            return _mixed_room_reply(room_id, mixed_room)
    direct_agent = room_map.agent_for_room(room_id)
    if direct_agent is None:
        return _unknown_room_reply()
    return _direct_room_reply(direct_agent)
def _mixed_room_reply(room_id: str, room) -> str:
"""Format reply for a mixed room."""
agents = room.agents
default = room.default_agent or (agents[0] if agents else "?")
agent_list = ", ".join(f"**{a}**" for a in agents)
lines = [
f"🤖 **Agents available in this room:** {agent_list}",
f"⭐ **Default:** {default}",
"",
"**How to address an agent:**",
]
for agent in agents[:5]: # show max 5 examples
lines.append(f" • `/{agent} <message>` — slash command")
lines.append(f" • `@{agent} <message>` — mention")
lines.append(f" • `{agent}: <message>` — colon prefix")
lines.extend([
"",
f"_Messages without prefix go to **{default}** by default._",
])
return "\n".join(lines)
def _direct_room_reply(agent_id: str) -> str:
"""Format reply for a directly-mapped room (1 agent)."""
return (
f"🤖 This room is mapped to agent: **{agent_id}**\n\n"
f"All messages are forwarded to **{agent_id}** automatically.\n"
f"No prefix needed — just write your message."
)
def _unknown_room_reply() -> str:
"""Format reply when room has no mapping."""
return (
"⚠️ This room has no agent mapping.\n\n"
"Contact an operator to configure an agent for this room."
)
# ── Bridge status reply (M4.1) ────────────────────────────────────────────────
def bridge_status_reply(snapshot: dict) -> str:
    """
    Format a concise bridge health snapshot for `!status` in the control room.

    snapshot keys (all optional with defaults):
        node_id, queue_size, queue_max, worker_count,
        room_count, mixed_room_count, operators_count,
        control_safety (dict), persistent_dedupe (dict),
        node_policy (dict), policy_store (dict),
        policy_last_export_at, policy_last_import_at, policy_db_mtime,
        nodes (dict of per-node {routed, rejected} counters)

    Returns a Markdown string; sections for absent/empty snapshot keys are
    omitted. The result is truncated to _MAX_REPLY_LEN characters.
    """
    # Core counters; "?" placeholders keep the reply renderable when the
    # snapshot is partial.
    node_id = snapshot.get("node_id", "?")
    q_size = snapshot.get("queue_size", "?")
    q_max = snapshot.get("queue_max", "?")
    workers = snapshot.get("worker_count", "?")
    rooms = snapshot.get("room_count", 0)
    mixed = snapshot.get("mixed_room_count", 0)
    ops = snapshot.get("operators_count", 0)
    # `or {}` guards against keys present but set to None.
    safety = snapshot.get("control_safety") or {}
    dedupe = snapshot.get("persistent_dedupe") or {}
    node_policy = snapshot.get("node_policy") or {}
    default_node = node_policy.get("default_node", node_id)
    allowed_nodes = node_policy.get("allowed_nodes") or []
    room_overrides = node_policy.get("room_overrides", 0)
    lines = [
        f"📡 **Bridge status** — node: `{node_id}`",
        "",
        f"**Queue:** {q_size}/{q_max} workers: {workers}",
        f"**Rooms:** {rooms} direct {mixed} mixed ops: {ops} operators",
        "",
    ]
    # M5.0: node policy (only shown when an allowlist is configured)
    if allowed_nodes:
        allowed_str = ", ".join(f"`{n}`" for n in sorted(allowed_nodes))
        lines.append(
            f"**Node policy:** default=`{default_node}` allowed={allowed_str} room_overrides={room_overrides}"
        )
    # Control safety (rate limiting + cooldown)
    if safety:
        # NOTE(review): both branches are empty strings — a status emoji pair
        # (e.g. enabled/disabled markers) appears to have been lost; confirm
        # the intended literals before relying on this indicator.
        enabled = "" if safety.get("enabled") else ""
        lines.append(
            f"**Control safety {enabled}:** "
            f"room={safety.get('room_rpm', '?')}rpm "
            f"op={safety.get('operator_rpm', '?')}rpm "
            f"cooldown={safety.get('cooldown_s', '?')}s"
        )
    # Persistent dedupe (SQLite event store)
    if dedupe:
        # NOTE(review): same empty-string pair as above — likely lost emoji.
        ok_emoji = "" if dedupe.get("ok") else ""
        pruned = dedupe.get("pruned_rows_last", 0)
        ttl = dedupe.get("ttl_h", "?")
        lines.append(
            f"**Dedupe {ok_emoji}:** ttl={ttl}h pruned_last={pruned} "
            f"db=`{dedupe.get('db_path') or 'n/a'}`"
        )
    # M6.0/M6.1: policy store status
    ps = snapshot.get("policy_store") or {}
    if ps:
        # NOTE(review): same empty-string pair as above — likely lost emoji.
        ps_ok = "" if ps.get("ok") else ""
        ps_node_count = ps.get("overrides_count", 0)
        # Fall back to the flat snapshot key when the nested dict lacks it.
        ps_agent_count = ps.get("agent_overrides_count", snapshot.get("policy_agent_overrides_count", 0))
        ps_path = ps.get("policy_store_path") or ps.get("path") or "n/a"
        lines.append(
            f"**Policy store {ps_ok}:** node_overrides={ps_node_count} "
            f"agent_overrides={ps_agent_count} db=`{ps_path}`"
        )
    # M6.2: last export/import timestamps + DB mtime (unix seconds → UTC text)
    _last_export = snapshot.get("policy_last_export_at")
    _last_import = snapshot.get("policy_last_import_at")
    _db_mtime = snapshot.get("policy_db_mtime")
    _snap_parts: list = []
    if _last_export:
        _snap_parts.append(f"last_export=`{_fmt_ts(_last_export)}`")
    if _last_import:
        _snap_parts.append(f"last_import=`{_fmt_ts(_last_import)}`")
    if _db_mtime:
        _snap_parts.append(f"db_mtime=`{_fmt_ts(_db_mtime)}`")
    if _snap_parts:
        lines.append("**Policy snapshots:** " + " ".join(_snap_parts))
    # M5.1: per-node routed/rejected breakdown, sorted for stable output
    node_stats = snapshot.get("nodes") or {}
    if node_stats:
        lines.append("\n**Per-node stats:**")
        for nid in sorted(node_stats):
            ns = node_stats[nid]
            lines.append(
                f" `{nid}`: routed={ns.get('routed', 0)} rejected={ns.get('rejected', 0)}"
            )
    reply = "\n".join(lines)
    # Hard cap the reply; the Matrix event would accept more, this keeps it tidy.
    # NOTE(review): the appended suffix is an empty string — an ellipsis
    # literal ("…"/"...") appears to have been lost here; confirm intended.
    if len(reply) > _MAX_REPLY_LEN:
        reply = reply[:_MAX_REPLY_LEN - 3] + ""
    return reply

View File

@@ -0,0 +1,213 @@
"""
event_store — M2.3: Persistent event deduplication via SQLite.
Stores processed Matrix event_ids so that bridge restarts do not reprocess
events still returned by /sync (within TTL window).
Schema:
processed_events (room_id, event_id, ts, sender_hash)
PK: (room_id, event_id)
IDX: idx_processed_events_ts (ts)
Design notes:
- Uses aiosqlite for non-blocking async access from the ingress event loop.
- Prune is best-effort: failures are logged but do NOT abort processing.
- If the DB is unavailable (init error, corruption), EventStore degrades to
a no-op: is_processed() returns False, mark_processed() is a no-op.
The in-memory LRU dedupe (H1) continues to protect within a single run.
- WAL mode is enabled for better concurrent read performance.
"""
from __future__ import annotations
import logging
import time
from pathlib import Path
from typing import Optional, Tuple
try:
import aiosqlite
_AIOSQLITE_OK = True
except ImportError: # pragma: no cover
aiosqlite = None # type: ignore
_AIOSQLITE_OK = False
logger = logging.getLogger(__name__)
_SCHEMA = """
CREATE TABLE IF NOT EXISTS processed_events (
room_id TEXT NOT NULL,
event_id TEXT NOT NULL,
ts INTEGER NOT NULL,
sender_hash TEXT,
PRIMARY KEY (room_id, event_id)
);
CREATE INDEX IF NOT EXISTS idx_processed_events_ts ON processed_events (ts);
"""
class EventStore:
    """
    Async SQLite-backed deduplication store for Matrix event_ids.

    Degrades to a no-op when aiosqlite is missing or the DB cannot be opened:
    is_processed() returns False and mark_processed() returns False, so the
    in-memory LRU dedupe (H1) remains the only protection within a single run.

    Usage:
        store = EventStore("/app/data/matrix_bridge.db", ttl_h=48)
        await store.open()
        ...
        hit = await store.is_processed(room_id, event_id)
        if not hit:
            await store.mark_processed(room_id, event_id, sender_hash)
        ...
        pruned = await store.prune(batch=5000)
        await store.close()
    """

    def __init__(
        self,
        db_path: str,
        ttl_h: int = 48,
        prune_batch: int = 5000,
    ) -> None:
        # Configuration
        self.db_path = db_path
        self.ttl_h = ttl_h                # retention window in hours
        self.prune_batch = prune_batch    # max rows deleted per prune pass
        # Runtime state
        self._db: Optional["aiosqlite.Connection"] = None
        self._ok: bool = False            # False → degraded (no-op) mode
        self._last_prune_at: Optional[float] = None
        self._pruned_rows_last: int = 0

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    async def open(self) -> bool:
        """
        Open the SQLite connection and apply the schema.

        Returns True on success; False on failure (store stays degraded).
        """
        if not _AIOSQLITE_OK:
            logger.warning("aiosqlite not available — persistent dedupe disabled")
            return False
        try:
            Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
            self._db = await aiosqlite.connect(self.db_path)
            # WAL mode: better concurrent read, non-blocking writes
            await self._db.execute("PRAGMA journal_mode=WAL")
            await self._db.execute("PRAGMA synchronous=NORMAL")
            await self._db.executescript(_SCHEMA)
            await self._db.commit()
            self._ok = True
            logger.info("EventStore opened: %s (ttl_h=%d)", self.db_path, self.ttl_h)
            return True
        except Exception as exc:
            logger.error("EventStore.open failed — degraded: %s", exc)
            self._ok = False
            return False

    async def close(self) -> None:
        """Close the SQLite connection gracefully (errors logged, never raised)."""
        if self._db is not None:
            try:
                await self._db.close()
            except Exception as exc:  # pragma: no cover
                logger.warning("EventStore.close error: %s", exc)
        self._db = None
        self._ok = False

    # ── Core operations ───────────────────────────────────────────────────────

    async def is_processed(self, room_id: str, event_id: str) -> bool:
        """
        Return True if (room_id, event_id) has already been processed.

        Safe to call even when degraded (returns False → no false deduplication).
        """
        if not self._ok or self._db is None:
            return False
        try:
            async with self._db.execute(
                "SELECT 1 FROM processed_events WHERE room_id=? AND event_id=? LIMIT 1",
                (room_id, event_id),
            ) as cursor:
                row = await cursor.fetchone()
                return row is not None
        except Exception as exc:
            logger.warning("EventStore.is_processed error (degraded): %s", exc)
            return False

    async def mark_processed(
        self,
        room_id: str,
        event_id: str,
        sender_hash: str = "",
    ) -> bool:
        """
        Insert (room_id, event_id) as processed.

        Returns True when a new row was inserted; False when the row already
        existed (INSERT OR IGNORE suppressed a duplicate) or on error/degraded
        mode. Previously this returned True even for duplicates, contradicting
        the documented contract and inflating insert metrics.
        """
        if not self._ok or self._db is None:
            return False
        ts = int(time.time())
        try:
            cursor = await self._db.execute(
                "INSERT OR IGNORE INTO processed_events (room_id, event_id, ts, sender_hash) "
                "VALUES (?, ?, ?, ?)",
                (room_id, event_id, ts, sender_hash or None),
            )
            await self._db.commit()
            # rowcount == 0 ⇔ the OR IGNORE clause hit an existing PK.
            return cursor.rowcount > 0
        except Exception as exc:
            logger.warning("EventStore.mark_processed error (degraded): %s", exc)
            return False

    # ── Prune ─────────────────────────────────────────────────────────────────

    async def prune(self, batch: Optional[int] = None) -> int:
        """
        Delete events older than ttl_h.

        Returns the number of rows deleted (0 on error or degraded).
        Uses a LIMITed subquery batch to avoid long locks on large tables.
        """
        if not self._ok or self._db is None:
            return 0
        cutoff = int(time.time()) - self.ttl_h * 3600
        effective_batch = batch or self.prune_batch
        deleted = 0
        try:
            # SQLite DELETE with LIMIT requires compiling with
            # SQLITE_ENABLE_UPDATE_DELETE_LIMIT, which may not be available.
            # Use a subquery approach instead.
            cursor = await self._db.execute(
                "DELETE FROM processed_events "
                "WHERE rowid IN ("
                "  SELECT rowid FROM processed_events WHERE ts < ? LIMIT ?"
                ")",
                (cutoff, effective_batch),
            )
            await self._db.commit()
            # cursor.rowcount carries the DELETE's affected-row count directly;
            # cheaper and race-free compared to a separate "SELECT changes()".
            deleted = max(cursor.rowcount, 0)
            self._last_prune_at = time.time()
            self._pruned_rows_last = deleted
            if deleted:
                logger.info("EventStore pruned %d rows (cutoff=%d)", deleted, cutoff)
        except Exception as exc:
            logger.warning("EventStore.prune error: %s", exc)
        return deleted

    # ── Health / introspection ─────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Snapshot for /health: enabled/ok flags, db path, TTL, prune stats."""
        return {
            "enabled": self._ok,
            "db_path": self.db_path,
            "ttl_h": self.ttl_h,
            "ok": self._ok,
            "last_prune_at": self._last_prune_at,
            "pruned_rows_last": self._pruned_rows_last,
        }

File diff suppressed because it is too large Load Diff

View File

@@ -33,6 +33,9 @@ except ImportError: # pragma: no cover
from .config import BridgeConfig, load_config from .config import BridgeConfig, load_config
from .control import ControlConfig, parse_control_config from .control import ControlConfig, parse_control_config
from .control_limiter import ControlRateLimiter
from .event_store import EventStore
from .node_policy import parse_node_policy
from .ingress import MatrixIngressLoop from .ingress import MatrixIngressLoop
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
from .rate_limit import InMemoryRateLimiter from .rate_limit import InMemoryRateLimiter
@@ -69,7 +72,7 @@ if _PROM_OK:
_invoke_latency = Histogram( _invoke_latency = Histogram(
"matrix_bridge_invoke_duration_seconds", "matrix_bridge_invoke_duration_seconds",
"Latency of DAGI Router infer call", "Latency of DAGI Router infer call",
["agent_id"], ["agent_id", "node_id"], # M5.1: per-node latency breakdown
buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0], buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
) )
_send_latency = Histogram( _send_latency = Histogram(
@@ -80,7 +83,8 @@ if _PROM_OK:
) )
_bridge_up = Gauge( _bridge_up = Gauge(
"matrix_bridge_up", "matrix_bridge_up",
"1 if bridge started successfully", "1 if bridge started successfully; 0 on config error",
["node_id"], # M7.1: per-node label for multi-node deployments
) )
_rate_limiter_active_rooms = Gauge( _rate_limiter_active_rooms = Gauge(
"matrix_bridge_rate_limiter_active_rooms", "matrix_bridge_rate_limiter_active_rooms",
@@ -106,10 +110,11 @@ if _PROM_OK:
["agent_id"], ["agent_id"],
buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0], buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
) )
# M2.2: Mixed room routing metrics # M2.2: Mixed room routing — reason breakdown (slash/mention/name/default/direct)
_routed_total = Counter( # M7.1: Renamed from matrix_bridge_routed_total to avoid collision with M5.0 counter
"matrix_bridge_routed_total", _routing_reasons_total = Counter(
"Successful message routing by reason", "matrix_bridge_routing_reasons_total",
"Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
["agent_id", "reason"], ["agent_id", "reason"],
) )
_route_rejected_total = Counter( _route_rejected_total = Counter(
@@ -127,20 +132,74 @@ if _PROM_OK:
"Total control commands received from authorized operators", "Total control commands received from authorized operators",
["sender", "verb", "subcommand"], ["sender", "verb", "subcommand"],
) )
_control_rate_limited_total = Counter(
"matrix_bridge_control_rate_limited_total",
"Total control commands rejected by rate limiter or cooldown",
["scope"],
)
_dedupe_persistent_hits_total = Counter(
"matrix_bridge_dedupe_persistent_hits_total",
"Total events dropped by persistent (SQLite) deduplication",
["room_id"],
)
_dedupe_persistent_inserts_total = Counter(
"matrix_bridge_dedupe_persistent_inserts_total",
"Total events marked as processed in persistent dedupe store",
)
# M5.0: node-aware routing — primary routed counter (unique name, no collision)
_routed_total = Counter(
"matrix_bridge_routed_total",
"Total messages successfully routed, by agent, resolved node, and node source",
["agent_id", "node_id", "source"],
)
_node_rejected_total = Counter(
"matrix_bridge_node_rejected_total",
"Total messages with rejected (non-allowlisted) node kwarg",
["node_id"],
)
# M8.0: soft-failover metrics
_failover_total = Counter(
"matrix_bridge_failover_total",
"Total successful soft-failovers by node transition and reason",
["from_node", "to_node", "reason"],
)
_node_health_state = Gauge(
"matrix_bridge_node_health_state",
"Node health state: 1=healthy 0.5=degraded 0=down",
["node_id"],
)
# M8.1: sticky routing metrics
_sticky_set_total = Counter(
"matrix_bridge_sticky_node_total",
"Total sticky routing entries set after failover, by preferred node and scope",
["node_id", "scope"],
)
_sticky_active = Gauge(
"matrix_bridge_sticky_node_active",
"Current count of active sticky routing entries",
[],
)
# ── Startup state ───────────────────────────────────────────────────────────── # ── Startup state ─────────────────────────────────────────────────────────────
_START_TIME = time.monotonic() _START_TIME = time.monotonic()
_cfg: Optional[BridgeConfig] = None _cfg: Optional[BridgeConfig] = None
# M5.1: in-memory per-node counters (lightweight, for !status reply)
from collections import defaultdict as _defaultdict
_node_stats: Dict[str, Dict[str, int]] = _defaultdict(lambda: {"routed": 0, "rejected": 0})
_config_error: Optional[str] = None _config_error: Optional[str] = None
_matrix_reachable: Optional[bool] = None _matrix_reachable: Optional[bool] = None
_gateway_reachable: Optional[bool] = None _gateway_reachable: Optional[bool] = None
_room_map: Optional[RoomMappingConfig] = None _room_map: Optional[RoomMappingConfig] = None
_mixed_room_config: Optional[MixedRoomConfig] = None _mixed_room_config: Optional[MixedRoomConfig] = None
_control_config: Optional[ControlConfig] = None _control_config: Optional[ControlConfig] = None
_event_store: Optional[EventStore] = None
_rate_limiter: Optional[InMemoryRateLimiter] = None _rate_limiter: Optional[InMemoryRateLimiter] = None
_ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size _ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size
_ingress_task: Optional[asyncio.Task] = None _ingress_task: Optional[asyncio.Task] = None
_ingress_stop: Optional[asyncio.Event] = None _ingress_stop: Optional[asyncio.Event] = None
_sticky_cache: Optional[Any] = None # M8.1: StickyNodeCache instance
_confirm_store: Optional[Any] = None # M9.0: ConfirmStore instance
_dummy_http_client: Optional[Any] = None # M11: soak inject endpoint (debug only)
async def _probe_url(url: str, timeout: float = 5.0) -> bool: async def _probe_url(url: str, timeout: float = 5.0) -> bool:
@@ -230,7 +289,7 @@ async def lifespan(app_: Any):
else: else:
logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url) logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
if _PROM_OK: if _PROM_OK:
_bridge_up.set(1) _bridge_up.labels(node_id=_cfg.node_id or "").set(1) # M7.1: labeled
# Start ingress loop (fire-and-forget asyncio task) # Start ingress loop (fire-and-forget asyncio task)
_has_rooms = (_room_map and _room_map.total_mappings > 0) or ( _has_rooms = (_room_map and _room_map.total_mappings > 0) or (
@@ -263,9 +322,9 @@ async def lifespan(app_: Any):
_rate_limiter_active_rooms.set(stats["active_rooms"]) _rate_limiter_active_rooms.set(stats["active_rooms"])
_rate_limiter_active_senders.set(stats["active_senders"]) _rate_limiter_active_senders.set(stats["active_senders"])
def _on_invoke_latency(agent_id: str, duration_s: float) -> None: def _on_invoke_latency(agent_id: str, duration_s: float, node_id: str = "") -> None:
if _PROM_OK: if _PROM_OK:
_invoke_latency.labels(agent_id=agent_id).observe(duration_s) _invoke_latency.labels(agent_id=agent_id, node_id=node_id or "unknown").observe(duration_s)
def _on_send_latency(agent_id: str, duration_s: float) -> None: def _on_send_latency(agent_id: str, duration_s: float) -> None:
if _PROM_OK: if _PROM_OK:
@@ -287,7 +346,7 @@ async def lifespan(app_: Any):
# M2.2 callbacks # M2.2 callbacks
def _on_routed(agent_id: str, reason: str) -> None: def _on_routed(agent_id: str, reason: str) -> None:
if _PROM_OK: if _PROM_OK:
_routed_total.labels(agent_id=agent_id, reason=reason).inc() _routing_reasons_total.labels(agent_id=agent_id, reason=reason).inc() # M7.1: renamed
def _on_route_rejected(room_id: str, reason: str) -> None: def _on_route_rejected(room_id: str, reason: str) -> None:
if _PROM_OK: if _PROM_OK:
@@ -300,6 +359,164 @@ async def lifespan(app_: Any):
sender=sender, verb=verb, subcommand=subcommand sender=sender, verb=verb, subcommand=subcommand
).inc() ).inc()
# M3.4: control safety rate limiter
_control_limiter = ControlRateLimiter(
room_rpm=_cfg.control_room_rpm,
operator_rpm=_cfg.control_operator_rpm,
run_next_rpm=_cfg.control_run_next_rpm,
cooldown_s=_cfg.control_cooldown_s,
) if _control_config and _control_config.is_enabled else None
def _on_control_rate_limited(scope: str) -> None:
if _PROM_OK:
_control_rate_limited_total.labels(scope=scope).inc()
# M2.3: Persistent event deduplication
_prune_task: Optional[asyncio.Task] = None
if _cfg.persistent_dedupe:
import os
db_path = os.path.join(_cfg.bridge_data_dir, "matrix_bridge.db")
_event_store = EventStore(
db_path=db_path,
ttl_h=_cfg.processed_events_ttl_h,
prune_batch=_cfg.processed_events_prune_batch,
)
store_ok = await _event_store.open()
if store_ok:
logger.info(
"✅ Persistent dedupe: %s (ttl_h=%d)",
db_path, _cfg.processed_events_ttl_h,
)
# Best-effort prune on startup
pruned = await _event_store.prune()
if pruned:
logger.info("Startup prune removed %d stale events", pruned)
# Periodic prune task
if _cfg.processed_events_prune_interval_s > 0:
async def _prune_loop() -> None:
while True:
await asyncio.sleep(_cfg.processed_events_prune_interval_s)
if _event_store:
await _event_store.prune()
_prune_task = asyncio.create_task(_prune_loop(), name="event_store_prune")
else:
logger.warning("⚠️ EventStore init failed — persistent dedupe disabled (degraded)")
_event_store = None
else:
logger.info("Persistent dedupe disabled (PERSISTENT_DEDUPE=0)")
def _on_dedupe_hit(room_id: str, agent_id: str) -> None:
if _PROM_OK:
_dedupe_persistent_hits_total.labels(room_id=room_id).inc()
def _on_dedupe_insert() -> None:
if _PROM_OK:
_dedupe_persistent_inserts_total.inc()
# M5.0: node-aware routing policy
_node_policy = parse_node_policy(
raw_allowed=_cfg.bridge_allowed_nodes,
default_node=_cfg.bridge_default_node,
raw_room_map=_cfg.bridge_room_node_map,
)
logger.info(
"✅ Node policy: default=%s allowed=%s room_overrides=%d",
_node_policy.default_node,
sorted(_node_policy.allowed_nodes),
len(_node_policy.room_node_map),
)
# M6.0: Persistent policy store for dynamic room-node overrides
_policy_store: Optional[Any] = None
try:
from .policy_store import PolicyStore as _PolicyStore
import os
_ps_path = os.path.join(_cfg.bridge_data_dir, "policy_overrides.db")
_policy_store = _PolicyStore(db_path=_ps_path)
_policy_store.open()
logger.info(
"✅ Policy store: %s (%d overrides)",
_ps_path, _policy_store.count_overrides(),
)
except Exception as _ps_exc:
logger.warning("Policy store init failed (non-fatal): %s", _ps_exc)
_policy_store = None
def _on_node_selected(agent_id: str, node_id: str, source: str) -> None:
if _PROM_OK:
_routed_total.labels(agent_id=agent_id, node_id=node_id, source=source).inc()
_node_stats[node_id]["routed"] += 1
def _on_node_rejected(rejected_node: str) -> None:
if _PROM_OK:
_node_rejected_total.labels(node_id=rejected_node).inc()
_node_stats[rejected_node]["rejected"] += 1
# M8.0: Node health tracker + soft-failover
from .node_health import NodeHealthTracker as _NodeHealthTracker, parse_node_health_config as _parse_nhc
_health_cfg = _parse_nhc(
fail_consecutive=_cfg.node_fail_consecutive,
lat_ewma_s=_cfg.node_lat_ewma_s,
ewma_alpha=_cfg.node_ewma_alpha,
)
_node_health_tracker = _NodeHealthTracker(_health_cfg)
logger.info(
"✅ Node health tracker: fail_consecutive=%d lat_ewma_s=%.1f ewma_alpha=%.2f",
_cfg.node_fail_consecutive, _cfg.node_lat_ewma_s, _cfg.node_ewma_alpha,
)
def _on_failover(from_node: str, to_node: str, reason: str) -> None:
if _PROM_OK:
_failover_total.labels(
from_node=from_node, to_node=to_node, reason=reason
).inc()
if _PROM_OK:
_update_health_gauges()
logger.info("⚡ Failover: %s%s reason=%s", from_node, to_node, reason)
def _update_health_gauges() -> None:
if not _PROM_OK or _node_health_tracker is None or _node_policy is None:
return
_STATE_MAP = {"healthy": 1.0, "degraded": 0.5, "down": 0.0}
for nid in _node_policy.allowed_nodes:
state = _node_health_tracker.state(nid)
_node_health_state.labels(node_id=nid).set(_STATE_MAP.get(state, 1.0))
# M8.1: Sticky failover cache
from .sticky_cache import StickyNodeCache as _StickyNodeCache
global _sticky_cache
if _cfg.failover_sticky_ttl_s > 0:
_sticky_cache = _StickyNodeCache(ttl_s=_cfg.failover_sticky_ttl_s)
logger.info("✅ Sticky failover cache: ttl=%.0fs", _cfg.failover_sticky_ttl_s)
else:
_sticky_cache = None
logger.info(" Sticky failover disabled (FAILOVER_STICKY_TTL_S=0)")
# M9.0: Confirm store
from .confirm_store import ConfirmStore as _ConfirmStore
global _confirm_store
if _cfg.confirm_ttl_s > 0:
_confirm_store = _ConfirmStore(ttl_s=_cfg.confirm_ttl_s)
logger.info("✅ Confirm store: ttl=%.0fs", _cfg.confirm_ttl_s)
else:
_confirm_store = None
logger.info(" Confirm store disabled (CONFIRM_TTL_S=0)")
# M11: debug inject client (only created when inject is enabled)
global _dummy_http_client
if _cfg.debug_inject_enabled and _HTTPX_OK:
_dummy_http_client = _httpx.AsyncClient(timeout=30.0)
logger.warning(
"⚠️ DEBUG_INJECT_ENABLED=true — synthetic event injection active. "
"NEVER use in production!"
)
def _on_sticky_set(node_id: str, scope: str) -> None:
if _PROM_OK:
_sticky_set_total.labels(node_id=node_id, scope=scope).inc()
if _sticky_cache is not None:
_sticky_active.labels().set(_sticky_cache.active_count())
ingress = MatrixIngressLoop( ingress = MatrixIngressLoop(
matrix_homeserver_url=_cfg.matrix_homeserver_url, matrix_homeserver_url=_cfg.matrix_homeserver_url,
matrix_access_token=_cfg.matrix_access_token, matrix_access_token=_cfg.matrix_access_token,
@@ -330,7 +547,38 @@ async def lifespan(app_: Any):
on_route_rejected=_on_route_rejected, on_route_rejected=_on_route_rejected,
control_config=_control_config, control_config=_control_config,
control_unauthorized_behavior=_cfg.control_unauthorized_behavior, control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
sofiia_control_token=_cfg.sofiia_control_token,
control_limiter=_control_limiter,
on_control_command=_on_control_command, on_control_command=_on_control_command,
on_control_rate_limited=_on_control_rate_limited,
event_store=_event_store,
on_dedupe_persistent_hit=_on_dedupe_hit,
on_dedupe_persistent_insert=_on_dedupe_insert,
# M4.0: agent discovery
discovery_rpm=_cfg.discovery_rpm,
# M5.0: node-aware routing
node_policy=_node_policy,
on_node_selected=_on_node_selected,
on_node_rejected=_on_node_rejected,
# M5.1: node stats getter for !status
node_stats_getter=lambda: {k: dict(v) for k, v in _node_stats.items()},
# M6.0: dynamic room-node policy store
policy_store=_policy_store,
# M6.2: data directory for policy exports/imports
bridge_data_dir=_cfg.bridge_data_dir,
# M8.0: node health tracker + failover callback
node_health_tracker=_node_health_tracker,
on_failover=_on_failover,
# M8.1: sticky failover cache
sticky_cache=_sticky_cache,
on_sticky_set=_on_sticky_set,
# M8.2: HA persistence config
ha_health_snapshot_interval_s=_cfg.ha_health_snapshot_interval_s,
ha_health_max_age_s=_cfg.ha_health_max_age_s,
# M9.0: Two-step confirmation store
confirm_store=_confirm_store,
policy_export_retention_days=_cfg.policy_export_retention_days,
policy_history_limit=_cfg.policy_history_limit,
) )
logger.info( logger.info(
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs", "✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
@@ -349,7 +597,8 @@ async def lifespan(app_: Any):
_config_error = str(exc) _config_error = str(exc)
logger.error("❌ Config error: %s", _config_error) logger.error("❌ Config error: %s", _config_error)
if _PROM_OK: if _PROM_OK:
_bridge_up.set(0) _cfg_node = _cfg.node_id if _cfg else ""
_bridge_up.labels(node_id=_cfg_node or "").set(0) # M7.1: labeled
yield yield
# Shutdown: cancel ingress loop # Shutdown: cancel ingress loop
if _ingress_stop: if _ingress_stop:
@@ -360,6 +609,23 @@ async def lifespan(app_: Any):
await asyncio.wait_for(_ingress_task, timeout=5.0) await asyncio.wait_for(_ingress_task, timeout=5.0)
except (asyncio.CancelledError, asyncio.TimeoutError): except (asyncio.CancelledError, asyncio.TimeoutError):
pass pass
# Shutdown: cancel prune task + close EventStore
if "_prune_task" in dir() and _prune_task and not _prune_task.done(): # type: ignore[name-defined]
_prune_task.cancel() # type: ignore[name-defined]
if _event_store is not None:
await _event_store.close()
# M6.0: close policy store
if "_policy_store" in dir() and _policy_store is not None: # type: ignore[name-defined]
try:
_policy_store.close() # type: ignore[name-defined]
except Exception: # noqa: BLE001
pass
# M11: close debug http client if open
if _dummy_http_client is not None:
try:
await _dummy_http_client.aclose()
except Exception: # noqa: BLE001
pass
logger.info("matrix-bridge-dagi shutting down") logger.info("matrix-bridge-dagi shutting down")
# ── App ─────────────────────────────────────────────────────────────────────── # ── App ───────────────────────────────────────────────────────────────────────
@@ -435,6 +701,89 @@ async def health() -> Dict[str, Any]:
"operators_count": len(_control_config.operator_allowlist) if _control_config else 0, "operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
"unauthorized_behavior": _cfg.control_unauthorized_behavior, "unauthorized_behavior": _cfg.control_unauthorized_behavior,
}, },
"control_safety": {
"enabled": _cfg.control_room_rpm > 0 or _cfg.control_operator_rpm > 0,
"room_rpm": _cfg.control_room_rpm,
"operator_rpm": _cfg.control_operator_rpm,
"run_next_rpm": _cfg.control_run_next_rpm,
"cooldown_s": _cfg.control_cooldown_s,
},
"persistent_dedupe": _event_store.as_health_dict() if _event_store else {
"enabled": False,
"db_path": None,
"ttl_h": _cfg.processed_events_ttl_h,
"ok": False,
"last_prune_at": None,
"pruned_rows_last": 0,
},
# M6.0: policy store health
"policy_store": _health_policy_store_dict(),
# M8.1: sticky failover cache health
"sticky_cache": _health_sticky_dict(),
# M8.2: HA state persistence info
"ha_state": _health_ha_dict(),
# M9.0: confirm store
"confirm_store": _health_confirm_dict(),
}
def _health_confirm_dict() -> Dict[str, Any]:
    """Snapshot of the two-step confirmation store for /health (M9.0)."""
    store = _confirm_store
    if store is None:
        return {"enabled": False}
    # Same key order as the disabled branch plus live counters.
    info: Dict[str, Any] = {"enabled": True}
    info["pending"] = store.pending_count()
    info["ttl_s"] = store.ttl_s
    return info
def _health_ha_dict() -> Dict[str, Any]:
    """HA state-persistence snapshot for /health (M8.2)."""
    # Single fallback shape for "no loop" and "status unreadable" paths.
    fallback: Dict[str, Any] = {
        "sticky_loaded": 0,
        "health_loaded": False,
        "snapshot_interval_s": 0,
    }
    if _ingress_loop is None:
        return fallback
    try:
        status = _ingress_loop.get_status()
        return {
            "sticky_loaded": status.get("ha_sticky_loaded", 0),
            "health_loaded": status.get("ha_health_loaded", False),
            "snapshot_interval_s": status.get("ha_health_snapshot_interval_s", 0),
        }
    except Exception:  # noqa: BLE001 — health endpoint must never raise
        return fallback
def _health_sticky_dict() -> Dict[str, Any]:
    """Sticky failover cache snapshot for /health (M8.1)."""
    cache = _sticky_cache
    if cache is None:
        return {"enabled": False, "active_keys": 0, "ttl_s": 0}
    return {
        "enabled": True,
        "active_keys": cache.active_count(),
        "ttl_s": cache.ttl_s,
    }
def _health_policy_store_dict() -> Dict[str, Any]:
    """Policy store health snapshot for /health (M6.x)."""
    # Default shape when the ingress loop is absent or its status is unreadable.
    snapshot: Dict[str, Any] = {
        "ok": False,
        "path": None,
        "overrides_count": 0,
        "agent_overrides_count": 0,
        "last_export_at": None,
        "last_import_at": None,
        "db_mtime": None,
    }
    if _ingress_loop is None:
        return snapshot
    try:
        status = _ingress_loop.get_status()
        snapshot = {
            "ok": status.get("policy_store_ok", False),
            "path": status.get("policy_store_path"),
            "overrides_count": status.get("policy_overrides_count", 0),
            "agent_overrides_count": status.get("policy_agent_overrides_count", 0),  # M6.1
            "last_export_at": status.get("policy_last_export_at"),  # M6.2
            "last_import_at": status.get("policy_last_import_at"),  # M6.2
            "db_mtime": status.get("policy_db_mtime"),  # M6.2
        }
    except Exception:  # noqa: BLE001 — health endpoint must never raise
        pass
    return snapshot
@@ -464,6 +813,101 @@ async def bridge_mappings() -> Dict[str, Any]:
} }
# ── Debug / Soak (M11) ────────────────────────────────────────────────────────
@app.post("/v1/debug/inject_event")
async def debug_inject_event(body: Dict[str, Any]) -> Dict[str, Any]:
    """
    Synthetic event injection for soak/load testing.

    Enabled ONLY when DEBUG_INJECT_ENABLED=true (never in production).
    Body: { "room_id": "!room:server", "event": { Matrix event dict } }
    The event is enqueued directly into the ingress loop, bypassing Matrix poll.
    Returns: { "ok": bool, "enqueued": bool, "room_id": str, "event_id": str }
    """
    # Hard gate: HTTP 403 with a literal JSON body unless the debug flag is set.
    if _cfg is None or not _cfg.debug_inject_enabled:
        return Response(  # type: ignore[return-value]
            '{"ok":false,"error":"debug inject disabled"}',
            status_code=403,
            media_type="application/json",
        )
    if _ingress_loop is None:
        return {"ok": False, "error": "ingress loop not running"}
    room_id = body.get("room_id", "")
    event = body.get("event", {})
    if not room_id or not event:
        return {"ok": False, "error": "missing room_id or event"}
    # Ensure event has minimum required fields for ingress processing
    if not event.get("event_id"):
        import time as _time
        # NOTE(review): monotonic-microsecond ids are unique enough per process,
        # but collide across restarts/processes — confirm that is acceptable here.
        event["event_id"] = f"!inject-{int(_time.monotonic() * 1e6)}"
    if not event.get("type"):
        event["type"] = "m.room.message"
    if not event.get("content"):
        event["content"] = {"msgtype": "m.text", "body": event.get("body", "soak-ping")}
    # Build a minimal sync_resp that looks like a real Matrix /sync response
    # so _enqueue_from_sync can pick it up via extract_room_messages.
    # We bypass Matrix polling by directly calling _try_enqueue on the right mapping.
    enqueued = False
    try:
        # Find the matching room mapping (direct rooms only for soak)
        # NOTE(review): reaches into private ingress-loop attrs (_room_map,
        # _queue, _try_enqueue) — keep in sync with MatrixIngressLoop internals.
        mapping = None
        if _ingress_loop._room_map is not None:
            for m in _ingress_loop._room_map.mappings:
                if m.room_id == room_id:
                    mapping = m
                    break
        if mapping is None:
            return {"ok": False, "error": f"no mapping for room_id={room_id!r}"}
        # Build a minimal stub Matrix client — replies are discarded for soak events
        from .matrix_client import MatrixClient
        class _SoakMatrixClient(MatrixClient):  # type: ignore[misc]
            """No-op Matrix client for synthetic soak events."""
            def __init__(self) -> None:  # noqa: D107
                pass  # skip real __init__
            # Only mark_seen/send_text are stubbed; presumably those are the only
            # methods _try_enqueue touches — TODO confirm against ingress code.
            async def mark_seen(self, room_id: str, event_id: str) -> None:  # type: ignore[override]
                pass
            async def send_text(self, room_id: str, text: str,  # type: ignore[override]
                                txn: Optional[str] = None) -> None:
                pass
        _stub_client = _SoakMatrixClient()
        if _dummy_http_client is None:
            return {"ok": False, "error": "debug http client not initialised"}
        await _ingress_loop._try_enqueue(
            _stub_client,  # type: ignore[arg-type]
            _ingress_loop._queue,
            _dummy_http_client,
            event,
            mapping,
        )
        enqueued = True
    except Exception as exc:  # noqa: BLE001
        # Best-effort debug endpoint: report the failure instead of raising.
        return {"ok": False, "error": str(exc), "enqueued": False}
    return {
        "ok": True,
        "enqueued": enqueued,
        "room_id": room_id,
        "event_id": event.get("event_id"),
    }
async def _noop_send(room_id: str, text: str, txn: Optional[str] = None) -> None:
    """Discard replies from injected soak events."""
    # NOTE(review): body is intentionally empty (docstring-only function).
    # Nothing in this chunk references it — _SoakMatrixClient.send_text already
    # provides the no-op path — so this may be dead code; confirm before removing.
# ── Metrics ─────────────────────────────────────────────────────────────────── # ── Metrics ───────────────────────────────────────────────────────────────────
@app.get("/metrics") @app.get("/metrics")
async def metrics(): async def metrics():

View File

@@ -0,0 +1,224 @@
"""
Metrics Contract — Matrix Bridge DAGI
Phase M7.1
Single source of truth for all Prometheus metric names and their label sets.
Used by:
- main.py (registers metrics against this contract)
- tests/test_matrix_bridge_m71_metrics_contract.py (static validation)
- ops/prometheus/alerts/matrix-bridge-dagi.rules.yml (PromQL expressions)
- ops/grafana/dashboards/matrix-bridge-dagi.json (panel queries)
Format:
METRICS_CONTRACT: Dict[metric_name, MetricSpec]
MetricSpec fields:
kind : "counter" | "histogram" | "gauge"
labels : list of label names (empty list = no labels)
help : one-line description
phase : originating milestone
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List
@dataclass(frozen=True)
class MetricSpec:
    """Immutable spec for one Prometheus metric; the metric name is the contract key."""
    # NOTE: `help` mirrors the Prometheus HELP field name and intentionally
    # shadows the builtin — it is only ever accessed as an attribute.
    kind: str  # "counter" | "histogram" | "gauge"
    labels: List[str]  # label names; empty = no labels
    help: str  # one-line human-readable description
    phase: str = "M1"  # originating milestone for traceability
# ── Contract ──────────────────────────────────────────────────────────────────
# Keys are the full exported metric names; values describe kind/labels/help.
# Tests and alert/dashboard definitions validate against this dict (see module
# docstring), so renaming a key or label here is a breaking change.
METRICS_CONTRACT: Dict[str, MetricSpec] = {
    # ── Core message traffic ──────────────────────────────────────────────────
    "matrix_bridge_messages_received_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id"],
        help="Total Matrix messages received",
        phase="M1",
    ),
    "matrix_bridge_messages_replied_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id", "status"],
        help="Total agent replies sent to Matrix (status=ok|error)",
        phase="M1",
    ),
    "matrix_bridge_rate_limited_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id", "limit_type"],
        help="Messages dropped by rate limiter",
        phase="H1",
    ),
    "matrix_bridge_gateway_errors_total": MetricSpec(
        kind="counter",
        labels=["error_type"],
        help="Bridge errors by stage: sync_error, network_error, http_<status>, matrix_send_error, unexpected",
        phase="M1",
    ),
    # ── Latency histograms ────────────────────────────────────────────────────
    "matrix_bridge_invoke_duration_seconds": MetricSpec(
        kind="histogram",
        labels=["agent_id", "node_id"],
        help="Latency of DAGI Router infer call, per agent and node",
        phase="H3",
    ),
    "matrix_bridge_send_duration_seconds": MetricSpec(
        kind="histogram",
        labels=["agent_id"],
        help="Latency of Matrix send_text call",
        phase="H3",
    ),
    "matrix_bridge_queue_wait_seconds": MetricSpec(
        kind="histogram",
        labels=["agent_id"],
        help="Time between enqueue and worker start processing",
        phase="H3",
    ),
    # ── Queue ─────────────────────────────────────────────────────────────────
    "matrix_bridge_queue_size": MetricSpec(
        kind="gauge",
        labels=[],
        help="Current number of pending items in the work queue",
        phase="H2",
    ),
    "matrix_bridge_queue_dropped_total": MetricSpec(
        kind="counter",
        labels=["room_id", "agent_id"],
        help="Messages dropped because queue was full",
        phase="H2",
    ),
    # ── Rate limiter gauges ───────────────────────────────────────────────────
    "matrix_bridge_rate_limiter_active_rooms": MetricSpec(
        kind="gauge",
        labels=[],
        help="Rooms with activity in the current rate-limit window",
        phase="H1",
    ),
    "matrix_bridge_rate_limiter_active_senders": MetricSpec(
        kind="gauge",
        labels=[],
        help="Senders with activity in the current rate-limit window",
        phase="H1",
    ),
    # ── Routing ───────────────────────────────────────────────────────────────
    "matrix_bridge_routing_reasons_total": MetricSpec(
        kind="counter",
        labels=["agent_id", "reason"],
        help="Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
        phase="M2.2",
    ),
    "matrix_bridge_route_rejected_total": MetricSpec(
        kind="counter",
        labels=["room_id", "reason"],
        help="Messages rejected during routing (unknown agent, bad slash, no mapping, etc.)",
        phase="M2.2",
    ),
    "matrix_bridge_active_room_agent_locks": MetricSpec(
        kind="gauge",
        labels=[],
        help="Number of room-agent pairs currently holding a concurrency lock",
        phase="M2.2",
    ),
    # ── Control channel ───────────────────────────────────────────────────────
    "matrix_bridge_control_commands_total": MetricSpec(
        kind="counter",
        labels=["sender", "verb", "subcommand"],
        help="Total control commands received from authorized operators",
        phase="M3.0",
    ),
    "matrix_bridge_control_rate_limited_total": MetricSpec(
        kind="counter",
        labels=["scope"],
        help="Total control commands rejected by rate limiter or cooldown",
        phase="M3.4",
    ),
    # ── Persistent deduplication ─────────────────────────────────────────────
    "matrix_bridge_dedupe_persistent_hits_total": MetricSpec(
        kind="counter",
        labels=["room_id"],
        help="Total events dropped by persistent (SQLite) deduplication",
        phase="M2.3",
    ),
    "matrix_bridge_dedupe_persistent_inserts_total": MetricSpec(
        kind="counter",
        labels=[],
        help="Total events marked as processed in persistent dedupe store",
        phase="M2.3",
    ),
    # ── Node-aware routing (M5.0) ─────────────────────────────────────────────
    "matrix_bridge_routed_total": MetricSpec(
        kind="counter",
        labels=["agent_id", "node_id", "source"],
        help="Total messages successfully routed, by agent, resolved node, and node source",
        phase="M5.0",
    ),
    "matrix_bridge_node_rejected_total": MetricSpec(
        kind="counter",
        labels=["node_id"],
        help="Total messages with rejected (non-allowlisted) node kwarg",
        phase="M5.0",
    ),
    # ── Bridge health (M7.1) ──────────────────────────────────────────────────
    "matrix_bridge_up": MetricSpec(
        kind="gauge",
        labels=["node_id"],
        help="1 if bridge started successfully; 0 on config error",
        phase="M7.1",
    ),
    # ── Soft-failover (M8.0) ─────────────────────────────────────────────────
    "matrix_bridge_failover_total": MetricSpec(
        kind="counter",
        labels=["from_node", "to_node", "reason"],
        help="Total successful soft-failovers by node transition and reason",
        phase="M8.0",
    ),
    "matrix_bridge_node_health_state": MetricSpec(
        kind="gauge",
        labels=["node_id"],
        help="Node health state gauge: 1=healthy 0.5=degraded 0=down",
        phase="M8.0",
    ),
    # ── Sticky routing anti-flap (M8.1) ──────────────────────────────────────
    "matrix_bridge_sticky_node_total": MetricSpec(
        kind="counter",
        labels=["node_id", "scope"],
        help="Total sticky routing entries set after failover, by preferred node and scope",
        phase="M8.1",
    ),
    "matrix_bridge_sticky_node_active": MetricSpec(
        kind="gauge",
        labels=[],
        help="Current count of active sticky routing entries",
        phase="M8.1",
    ),
}

# ── Alert metric references ────────────────────────────────────────────────────
# These are the metric base-names referenced in alert rules.
# All must exist in METRICS_CONTRACT (enforced by the M7.1 contract test).
ALERT_METRIC_REFS = frozenset({
    "matrix_bridge_up",
    "matrix_bridge_gateway_errors_total",
    "matrix_bridge_messages_replied_total",
    "matrix_bridge_queue_dropped_total",
    "matrix_bridge_rate_limited_total",
    "matrix_bridge_control_rate_limited_total",
    "matrix_bridge_dedupe_persistent_hits_total",
    "matrix_bridge_invoke_duration_seconds",
})

View File

@@ -309,3 +309,25 @@ def reply_prefix(agent_id: str, is_mixed: bool) -> str:
return "" return ""
# Capitalise first letter of agent name: "sofiia" → "Sofiia" # Capitalise first letter of agent name: "sofiia" → "Sofiia"
return f"{agent_id.capitalize()}: " return f"{agent_id.capitalize()}: "
def build_override_config(
    base_config: MixedRoomConfig,
    room_id: str,
    agents: List[str],
    default_agent: str,
) -> MixedRoomConfig:
    """
    M6.1: Derive a MixedRoomConfig with a single-room override.

    Returns a copy of *base_config* whose entry for *room_id* is replaced by a
    fresh MixedRoom built from dynamic PolicyStore values; every other room is
    carried over unchanged. Used in _enqueue_from_mixed_room to inject agent
    overrides without mutating the shared base configuration.
    """
    override = MixedRoom(
        room_id=room_id,
        agents=agents,
        default_agent=default_agent,
    )
    merged_rooms = {**base_config.rooms, room_id: override}
    return MixedRoomConfig(rooms=merged_rooms)

View File

@@ -0,0 +1,262 @@
"""
NodeHealthTracker — M8.0: per-node health state tracking for soft-failover.
Tracks invoke outcomes per node and maintains:
- EWMA latency estimate
- consecutive failure counter
- last ok / last error timestamps
- derived health state: "healthy" | "degraded" | "down"
State transitions
-----------------
Any state → "down" : consecutive_failures >= fail_consecutive
Any state → "degraded" : ewma_latency_s >= lat_ewma_threshold
(and not yet "down")
"down"/"degraded" → "healthy" : record_ok() resets consecutive_failures to 0
and ewma is updated towards the actual latency
Thread safety
-------------
All mutations are protected by a threading.Lock so this can be called from
asyncio callbacks (e.g. in `_invoke_and_send` on the event loop thread).
Use `record_ok` / `record_error` from within coroutines; they are synchronous
(no blocking I/O) so they are safe to call directly without to_thread.
"""
from __future__ import annotations
import logging
import threading
import time
from dataclasses import dataclass, field
from typing import Dict, FrozenSet, Optional, Tuple
logger = logging.getLogger(__name__)

# ── State constants ────────────────────────────────────────────────────────────
NODE_STATE_HEALTHY = "healthy"
NODE_STATE_DEGRADED = "degraded"
NODE_STATE_DOWN = "down"

# Failover-triggering error classes
FAILOVER_REASON_TIMEOUT = "timeout"
FAILOVER_REASON_HTTP_5XX = "http_5xx"
FAILOVER_REASON_NETWORK = "network"


# ── Config ────────────────────────────────────────────────────────────────────
@dataclass(frozen=True)
class NodeHealthConfig:
    """
    Thresholds controlling when a node is considered degraded or down.

    fail_consecutive : int    number of consecutive failures → "down"
    lat_ewma_s       : float  EWMA latency estimate (seconds) threshold → "degraded"
    ewma_alpha       : float  EWMA smoothing factor (0..1]; higher = more reactive
    """
    fail_consecutive: int = 3
    lat_ewma_s: float = 12.0
    ewma_alpha: float = 0.3

    def __post_init__(self) -> None:
        # Fail fast on nonsensical thresholds so misconfiguration surfaces at startup.
        if not (0 < self.ewma_alpha <= 1):
            raise ValueError(f"ewma_alpha must be in (0, 1], got {self.ewma_alpha}")
        if self.fail_consecutive < 1:
            raise ValueError(f"fail_consecutive must be ≥ 1, got {self.fail_consecutive}")
        if self.lat_ewma_s <= 0:
            raise ValueError(f"lat_ewma_s must be > 0, got {self.lat_ewma_s}")


# ── Per-node state ────────────────────────────────────────────────────────────
@dataclass
class _NodeState:
    # Session counters (not persisted across restarts; see restore_node).
    invoke_ok_total: int = 0
    invoke_err_total: int = 0
    consecutive_failures: int = 0
    last_ok_ts: Optional[float] = None   # time.monotonic() of last success
    last_err_ts: Optional[float] = None  # time.monotonic() of last failure
    ewma_latency_s: Optional[float] = None  # None until first ok record


# ── Tracker ───────────────────────────────────────────────────────────────────
class NodeHealthTracker:
    """
    Thread-safe per-node health tracker.

    Usage:
        tracker = NodeHealthTracker(NodeHealthConfig())
        tracker.record_ok("NODA1", latency_s=1.4)                 # successful invoke
        tracker.record_error("NODA1", reason=FAILOVER_REASON_TIMEOUT)  # failed invoke
        state = tracker.state("NODA1")  # "healthy" | "degraded" | "down"
        fallback = tracker.pick_fallback("NODA1", allowed_nodes=frozenset({"NODA1","NODA2"}))
    """

    def __init__(self, config: Optional[NodeHealthConfig] = None) -> None:
        self._cfg = config or NodeHealthConfig()
        self._nodes: Dict[str, _NodeState] = {}
        self._lock = threading.RLock()  # RLock: re-entrant (needed for all_info → as_info_dict)

    # ── Public mutation API ────────────────────────────────────────────────────
    def record_ok(self, node_id: str, latency_s: float) -> None:
        """Record a successful invoke for node_id with given latency."""
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.invoke_ok_total += 1
            ns.consecutive_failures = 0  # any success clears the "down" streak
            ns.last_ok_ts = time.monotonic()
            if ns.ewma_latency_s is None:
                # First sample seeds the EWMA directly.
                ns.ewma_latency_s = latency_s
            else:
                alpha = self._cfg.ewma_alpha
                ns.ewma_latency_s = alpha * latency_s + (1 - alpha) * ns.ewma_latency_s

    def record_error(self, node_id: str, reason: str = "unknown") -> None:
        """Record a failed invoke for node_id (reason is for logging only)."""
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.invoke_err_total += 1
            ns.consecutive_failures += 1
            ns.last_err_ts = time.monotonic()
            logger.debug(
                "NodeHealth: node=%s consecutive_failures=%d reason=%s",
                node_id, ns.consecutive_failures, reason,
            )

    # ── Public read API ───────────────────────────────────────────────────────
    def state(self, node_id: str) -> str:
        """Return current health state for node_id."""
        with self._lock:
            return self._state_unlocked(node_id)

    def pick_fallback(
        self,
        primary: str,
        allowed_nodes: FrozenSet[str],
    ) -> Optional[str]:
        """
        Return the best alternative node for failover.

        Priority: healthy > degraded > (never down).
        Candidates are sorted for deterministic selection.
        Returns None if no acceptable fallback exists.
        """
        with self._lock:
            candidates = sorted(n for n in allowed_nodes if n != primary)
            # Prefer healthy first
            for n in candidates:
                if self._state_unlocked(n) == NODE_STATE_HEALTHY:
                    return n
            # Accept degraded if no healthy available
            for n in candidates:
                if self._state_unlocked(n) == NODE_STATE_DEGRADED:
                    return n
            # Do not failover to "down" nodes
            return None

    def as_info_dict(self, node_id: str) -> dict:
        """Return a JSON-safe status dict for one node (unseen nodes read as healthy)."""
        with self._lock:
            ns = self._nodes.get(node_id)
            if ns is None:
                return {
                    "node_id": node_id,
                    "state": NODE_STATE_HEALTHY,
                    "invoke_ok": 0,
                    "invoke_err": 0,
                    "consecutive_failures": 0,
                    "ewma_latency_s": None,
                    "last_ok_ts": None,
                    "last_err_ts": None,
                }
            return {
                "node_id": node_id,
                "state": self._state_unlocked(node_id),
                "invoke_ok": ns.invoke_ok_total,
                "invoke_err": ns.invoke_err_total,
                "consecutive_failures": ns.consecutive_failures,
                # FIX: compare with `is not None` — a genuine 0.0 EWMA must be
                # reported as 0.0, not silently collapsed to None by truthiness.
                "ewma_latency_s": round(ns.ewma_latency_s, 3) if ns.ewma_latency_s is not None else None,
                "last_ok_ts": ns.last_ok_ts,
                "last_err_ts": ns.last_err_ts,
            }

    def all_info(self, allowed_nodes: Optional[FrozenSet[str]] = None) -> Dict[str, dict]:
        """
        Return status dicts for all tracked (or specified) nodes.

        If allowed_nodes provided, also include entries for unseen nodes (state=healthy).
        """
        with self._lock:
            keys = set(self._nodes.keys())
            if allowed_nodes:
                keys |= set(allowed_nodes)
            # Re-entrant call into as_info_dict is safe: self._lock is an RLock.
            return {n: self.as_info_dict(n) for n in sorted(keys)}

    def reset(self, node_id: str) -> None:
        """Reset health state for a node (e.g. after manual recovery)."""
        with self._lock:
            self._nodes.pop(node_id, None)

    def restore_node(
        self,
        node_id: str,
        ewma_latency_s: Optional[float],
        consecutive_failures: int,
    ) -> None:
        """
        Restore persisted node state after a restart (M8.2).

        Only restores ewma_latency_s and consecutive_failures; counters
        (invoke_ok_total, invoke_err_total) start from 0 since they are
        runtime metrics for the current session. Negative failure counts
        are clamped to 0.
        """
        with self._lock:
            ns = self._get_or_create(node_id)
            ns.ewma_latency_s = ewma_latency_s
            ns.consecutive_failures = max(0, consecutive_failures)

    # ── Internal ──────────────────────────────────────────────────────────────
    def _get_or_create(self, node_id: str) -> _NodeState:
        # Caller must hold self._lock.
        if node_id not in self._nodes:
            self._nodes[node_id] = _NodeState()
        return self._nodes[node_id]

    def _state_unlocked(self, node_id: str) -> str:
        # Caller must hold self._lock. Precedence: down > degraded > healthy.
        ns = self._nodes.get(node_id)
        if ns is None:
            return NODE_STATE_HEALTHY  # unseen nodes are assumed healthy
        if ns.consecutive_failures >= self._cfg.fail_consecutive:
            return NODE_STATE_DOWN
        if (
            ns.ewma_latency_s is not None
            and ns.ewma_latency_s >= self._cfg.lat_ewma_s
        ):
            return NODE_STATE_DEGRADED
        return NODE_STATE_HEALTHY
# ── Parser (env vars → NodeHealthConfig) ──────────────────────────────────────
def parse_node_health_config(
    fail_consecutive: int = 3,
    lat_ewma_s: float = 12.0,
    ewma_alpha: float = 0.3,
) -> NodeHealthConfig:
    """Build a NodeHealthConfig from already-parsed environment values."""
    # Validation (ranges, alpha bounds) happens in NodeHealthConfig.__post_init__.
    kwargs = {
        "fail_consecutive": fail_consecutive,
        "lat_ewma_s": lat_ewma_s,
        "ewma_alpha": ewma_alpha,
    }
    return NodeHealthConfig(**kwargs)

View File

@@ -0,0 +1,179 @@
"""
node_policy — Node-aware routing for matrix-bridge-dagi.
Resolves which NODA (NODA1, NODA2, …) a message should be tagged with based on:
1. Explicit `node=X` kwarg in the message body (mixed rooms only)
2. Dynamic store override (PolicyStore, set by operators via !node set) ← M6.0
3. Static per-room mapping from BRIDGE_ROOM_NODE_MAP env
4. BRIDGE_DEFAULT_NODE (fallback)
The resolved node_id is embedded in the Router metadata so downstream
services (Router / Memory / Agent) can apply per-node policies.
This module does NOT change the HTTP endpoint called — the Router URL
stays the same.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from typing import Dict, FrozenSet, Optional, Tuple
# Regex to find 'node=X' anywhere in message text (case-insensitive).
# The \b anchors prevent partial-word matches (e.g. "mynode=X" does not match);
# the captured group is the node token itself.
_NODE_KWARG_RE = re.compile(r"\bnode=(\w+)\b", re.IGNORECASE)

# Node resolution sources, highest priority first (see NodePolicy.resolve).
NODE_SOURCE_EXPLICIT = "explicit"
NODE_SOURCE_STORE = "store"  # M6.0: dynamic PolicyStore override
NODE_SOURCE_ROOM_MAP = "room_map"
NODE_SOURCE_DEFAULT = "default"
@dataclass(frozen=True)
class NodeResolution:
    """Result of resolving the target node for a message."""
    node_id: str  # resolved node id (e.g. "NODA1")
    source: str  # one of the NODE_SOURCE_* constants
    rejected_node: Optional[str] = None  # set when explicit node was not allowlisted
@dataclass
class NodePolicy:
    """
    Node resolution policy.

    Attributes:
        allowed_nodes: Set of valid node names (uppercase).
        default_node: Fallback node when no explicit or room-map match.
        room_node_map: Optional per-room override (room_id → node_id).
    """
    allowed_nodes: FrozenSet[str]
    default_node: str
    room_node_map: Dict[str, str] = field(default_factory=dict)

    def resolve(
        self,
        room_id: str,
        explicit_node: Optional[str] = None,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """
        Resolve the target node for a message.

        Priority (highest → lowest):
          1. explicit_node kwarg (user-supplied, mixed rooms only)
          2. store_override — dynamic PolicyStore entry (M6.0)
          3. room_node_map — static BRIDGE_ROOM_NODE_MAP env entry
          4. default_node
        """
        if explicit_node is None:
            return self._fallback(room_id, store_override)
        requested = explicit_node.upper()
        if requested in self.allowed_nodes:
            return NodeResolution(node_id=requested, source=NODE_SOURCE_EXPLICIT)
        # Not allowlisted: resolve via the fallback chain, but record the
        # rejected value so callers can reply with an error hint.
        resolved = self._fallback(room_id, store_override)
        return NodeResolution(
            node_id=resolved.node_id,
            source=resolved.source,
            rejected_node=requested,
        )

    def _fallback(
        self,
        room_id: str,
        store_override: Optional[str] = None,
    ) -> NodeResolution:
        """Resolve node without an explicit kwarg (store → env map → default)."""
        # Candidates in priority order; each is skipped when absent or not allowlisted.
        chain = (
            (store_override, NODE_SOURCE_STORE),
            (self.room_node_map.get(room_id), NODE_SOURCE_ROOM_MAP),
        )
        for raw, source in chain:
            if raw is None:
                continue
            candidate = raw.upper()
            if candidate in self.allowed_nodes:
                return NodeResolution(node_id=candidate, source=source)
        return NodeResolution(node_id=self.default_node, source=NODE_SOURCE_DEFAULT)

    def as_info_dict(self) -> dict:
        """Return a safe dict for health/ops snapshots (no secrets)."""
        info = {
            "default_node": self.default_node,
            "allowed_nodes": sorted(self.allowed_nodes),
            "room_overrides": len(self.room_node_map),
        }
        return info
def parse_node_policy(
    raw_allowed: str,
    default_node: str,
    raw_room_map: str,
) -> NodePolicy:
    """
    Parse node policy from env-style config strings.

    raw_allowed:  "NODA1,NODA2"
    default_node: "NODA1"
    raw_room_map: "!roomA:server=NODA2;!roomB:server=NODA1"
    """
    default = default_node.strip().upper() or "NODA1"

    names = {token.strip().upper() for token in raw_allowed.split(",") if token.strip()}
    # The default must always be reachable; an empty allowlist degenerates to
    # just the default node.
    allowed: FrozenSet[str] = frozenset(names | {default})

    room_map: Dict[str, str] = {}
    for chunk in raw_room_map.split(";"):
        left, sep, right = chunk.strip().partition("=")
        if not sep:
            # Empty or malformed entry (no '=') — ignore.
            continue
        rid, node = left.strip(), right.strip().upper()
        if rid and node:
            room_map[rid] = node

    return NodePolicy(
        allowed_nodes=allowed,
        default_node=default,
        room_node_map=room_map,
    )
def extract_node_kwarg(text: str) -> Tuple[Optional[str], str]:
    """
    Extract a 'node=X' kwarg from message text.

    Returns (node_id_or_None, cleaned_text_without_kwarg). The rest of the
    message is preserved; only the first kwarg occurrence is removed and the
    whitespace run it leaves behind is collapsed.

    Example:
        "/sofiia node=NODA2 Hello!"
        → ("NODA2", "/sofiia Hello!")
    """
    match = _NODE_KWARG_RE.search(text)
    if match is None:
        return None, text
    stripped = _NODE_KWARG_RE.sub("", text, count=1)
    # Collapse runs of whitespace introduced by the removal.
    return match.group(1).upper(), " ".join(stripped.split())
def node_rejected_reply(requested: str, allowed: FrozenSet[str]) -> str:
    """Reply when user requests a node not in the allowlist."""
    choices = ", ".join(f"`{name}`" for name in sorted(allowed))
    lines = [
        f"⚠️ Unknown node: `{requested}`",
        f"Allowed: {choices}",
        "_Example: `/sofiia node=NODA1 Hello!`_",
    ]
    return "\n".join(lines)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,149 @@
"""
StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover.
After a successful failover (primary → fallback), the bridge remembers the
fallback node per room:agent pair for `ttl_s` seconds. Subsequent messages
for the same pair skip the primary entirely and go directly to the known-good
fallback, preventing oscillation ("flapping") while the primary recovers.
Key design
----------
key = "{room_id}:{agent_id}"
ttl = FAILOVER_STICKY_TTL_S (default 300 s)
Priority in routing (when source != explicit):
1. sticky cache (temporary)
2. store override (desired long-term policy)
3. env room_node_map
4. env default
Sticky expires naturally; recovery is automatic — no operator action needed.
If the sticky node also fails, the entry is removed and normal failover logic
takes over again.
Thread safety
-------------
Uses threading.RLock — safe to call from asyncio callbacks without to_thread.
"""
from __future__ import annotations
import logging
import threading
import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)

_DEFAULT_TTL_S = 300.0


@dataclass
class _StickyEntry:
    node_id: str
    expires_at: float  # time.monotonic() deadline


class StickyNodeCache:
    """
    In-memory sticky node preference cache.

    Usage:
        cache = StickyNodeCache(ttl_s=300)
        cache.set("!room:srv:sofiia", "NODA2")   # after successful failover
        node = cache.get("!room:srv:sofiia")     # "NODA2", or None if expired/missing
        cache.delete("!room:srv:sofiia")         # if the sticky node also fails
    """

    def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
        if ttl_s <= 0:
            raise ValueError(f"ttl_s must be > 0, got {ttl_s}")
        self._ttl_s = ttl_s
        self._entries: Dict[str, _StickyEntry] = {}
        self._lock = threading.RLock()

    # ── Public API ────────────────────────────────────────────────────────────
    def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None:
        """Set sticky preference; overwrites existing entry."""
        effective_ttl = self._ttl_s if ttl_s is None else ttl_s
        deadline = time.monotonic() + effective_ttl
        with self._lock:
            self._entries[key] = _StickyEntry(node_id=node_id, expires_at=deadline)
        logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, effective_ttl)

    def get(self, key: str) -> Optional[str]:
        """
        Return sticky node_id if entry exists and not expired; else None.
        Expired entries are lazily removed on access.
        """
        with self._lock:
            entry = self._entries.get(key)
            if entry is None:
                return None
            if entry.expires_at > time.monotonic():
                return entry.node_id
            # Lazy eviction of the expired entry.
            del self._entries[key]
            logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id)
            return None

    def delete(self, key: str) -> bool:
        """Remove an entry. Returns True if it existed."""
        with self._lock:
            removed = self._entries.pop(key, None) is not None
        if removed:
            logger.debug("StickyCache.delete: key=%s", key)
        return removed

    def active_count(self) -> int:
        """Count of non-expired entries (best-effort; no eviction)."""
        cutoff = time.monotonic()
        with self._lock:
            return len([e for e in self._entries.values() if e.expires_at > cutoff])

    def active_entries(self) -> List[Tuple[str, str, float]]:
        """
        Return (key, node_id, ttl_remaining_s) for all non-expired entries.
        Useful for ops visibility in !status/!nodes.
        """
        cutoff = time.monotonic()
        with self._lock:
            live = [
                (key, entry.node_id, entry.expires_at - cutoff)
                for key, entry in self._entries.items()
                if entry.expires_at > cutoff
            ]
        live.sort(key=lambda item: item[0])
        return live

    def cleanup(self) -> int:
        """
        Remove all expired entries; returns count of removed entries.
        Call periodically (e.g. in a background task) to reclaim memory.
        """
        cutoff = time.monotonic()
        with self._lock:
            dead = [key for key, entry in self._entries.items() if entry.expires_at <= cutoff]
            for key in dead:
                del self._entries[key]
        if dead:
            logger.debug("StickyCache.cleanup: removed %d expired entries", len(dead))
        return len(dead)

    @property
    def ttl_s(self) -> float:
        return self._ttl_s
def make_sticky_key(room_id: str, agent_id: str) -> str:
    """Canonical sticky cache key for a room+agent pair ("room:agent")."""
    return ":".join((room_id, agent_id))

View File

@@ -4,3 +4,4 @@ httpx>=0.25.0
python-dotenv>=1.0.0 python-dotenv>=1.0.0
prometheus-client>=0.20.0 prometheus-client>=0.20.0
pyyaml>=6.0 pyyaml>=6.0
aiosqlite>=0.19.0