Phase6/7 runtime + Gitea smoke gate setup #1
@@ -67,6 +67,41 @@ services:
|
|||||||
- BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-}
|
- BRIDGE_CONTROL_ROOMS=${BRIDGE_CONTROL_ROOMS:-}
|
||||||
# "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts)
|
# "ignore" (silent) | "reply_error" (⛔ reply to unauthorised attempts)
|
||||||
- CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore}
|
- CONTROL_UNAUTHORIZED_BEHAVIOR=${CONTROL_UNAUTHORIZED_BEHAVIOR:-ignore}
|
||||||
|
# ── M3.1: Runbook runner token ───────────────────────────────────────
|
||||||
|
# X-Control-Token for POST /api/runbooks/internal/runs (sofiia-console)
|
||||||
|
- SOFIIA_CONTROL_TOKEN=${SOFIIA_CONTROL_TOKEN:-}
|
||||||
|
# M3.4: Control channel safety — rate limiting + cooldown
|
||||||
|
- CONTROL_ROOM_RPM=${CONTROL_ROOM_RPM:-60}
|
||||||
|
- CONTROL_OPERATOR_RPM=${CONTROL_OPERATOR_RPM:-30}
|
||||||
|
- CONTROL_RUN_NEXT_RPM=${CONTROL_RUN_NEXT_RPM:-20}
|
||||||
|
- CONTROL_COOLDOWN_S=${CONTROL_COOLDOWN_S:-2.0}
|
||||||
|
# M2.3: Persistent event deduplication
|
||||||
|
- PERSISTENT_DEDUPE=${PERSISTENT_DEDUPE:-1}
|
||||||
|
- BRIDGE_DATA_DIR=${BRIDGE_DATA_DIR:-/app/data}
|
||||||
|
- PROCESSED_EVENTS_TTL_H=${PROCESSED_EVENTS_TTL_H:-48}
|
||||||
|
- PROCESSED_EVENTS_PRUNE_BATCH=${PROCESSED_EVENTS_PRUNE_BATCH:-5000}
|
||||||
|
- PROCESSED_EVENTS_PRUNE_INTERVAL_S=${PROCESSED_EVENTS_PRUNE_INTERVAL_S:-3600}
|
||||||
|
# M4.0: agent discovery
|
||||||
|
- DISCOVERY_RPM=${DISCOVERY_RPM:-20}
|
||||||
|
# M5.0: node-aware routing
|
||||||
|
- BRIDGE_ALLOWED_NODES=${BRIDGE_ALLOWED_NODES:-NODA1}
|
||||||
|
- BRIDGE_DEFAULT_NODE=${BRIDGE_DEFAULT_NODE:-NODA1}
|
||||||
|
- BRIDGE_ROOM_NODE_MAP=${BRIDGE_ROOM_NODE_MAP:-}
|
||||||
|
# M8.0: Node health + soft-failover thresholds
|
||||||
|
- NODE_FAIL_CONSEC=${NODE_FAIL_CONSEC:-3}
|
||||||
|
- NODE_LAT_EWMA_S=${NODE_LAT_EWMA_S:-12.0}
|
||||||
|
- NODE_EWMA_ALPHA=${NODE_EWMA_ALPHA:-0.3}
|
||||||
|
# M8.1: Sticky failover TTL (0 = disabled)
|
||||||
|
- FAILOVER_STICKY_TTL_S=${FAILOVER_STICKY_TTL_S:-300}
|
||||||
|
# M8.2: HA state persistence
|
||||||
|
- HA_HEALTH_SNAPSHOT_INTERVAL_S=${HA_HEALTH_SNAPSHOT_INTERVAL_S:-60}
|
||||||
|
- HA_HEALTH_MAX_AGE_S=${HA_HEALTH_MAX_AGE_S:-600}
|
||||||
|
# M9.0: Two-step confirmation TTL for dangerous commands (0 = disabled)
|
||||||
|
- CONFIRM_TTL_S=${CONFIRM_TTL_S:-120}
|
||||||
|
- POLICY_EXPORT_RETENTION_DAYS=${POLICY_EXPORT_RETENTION_DAYS:-30}
|
||||||
|
- POLICY_HISTORY_LIMIT=${POLICY_HISTORY_LIMIT:-100}
|
||||||
|
# M11 soak: NEVER set to true in production
|
||||||
|
- DEBUG_INJECT_ENABLED=${DEBUG_INJECT_ENABLED:-false}
|
||||||
|
|
||||||
# ── M2.2: Mixed room guard rails ────────────────────────────────────
|
# ── M2.2: Mixed room guard rails ────────────────────────────────────
|
||||||
# Fail-fast if any room defines more agents than this
|
# Fail-fast if any room defines more agents than this
|
||||||
|
|||||||
986
ops/grafana/dashboards/matrix-bridge-dagi.json
Normal file
986
ops/grafana/dashboards/matrix-bridge-dagi.json
Normal file
@@ -0,0 +1,986 @@
|
|||||||
|
{
|
||||||
|
"__inputs": [
|
||||||
|
{
|
||||||
|
"name": "DS_PROMETHEUS",
|
||||||
|
"label": "Prometheus",
|
||||||
|
"description": "",
|
||||||
|
"type": "datasource",
|
||||||
|
"pluginId": "prometheus",
|
||||||
|
"pluginName": "Prometheus"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"__elements": {},
|
||||||
|
"__requires": [
|
||||||
|
{
|
||||||
|
"type": "grafana",
|
||||||
|
"id": "grafana",
|
||||||
|
"name": "Grafana",
|
||||||
|
"version": "9.0.0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "datasource",
|
||||||
|
"id": "prometheus",
|
||||||
|
"name": "Prometheus",
|
||||||
|
"version": "1.0.0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "panel",
|
||||||
|
"id": "stat",
|
||||||
|
"name": "Stat",
|
||||||
|
"version": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "panel",
|
||||||
|
"id": "timeseries",
|
||||||
|
"name": "Time series",
|
||||||
|
"version": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "panel",
|
||||||
|
"id": "gauge",
|
||||||
|
"name": "Gauge",
|
||||||
|
"version": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"annotations": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"builtIn": 1,
|
||||||
|
"datasource": {
|
||||||
|
"type": "grafana",
|
||||||
|
"uid": "-- Grafana --"
|
||||||
|
},
|
||||||
|
"enable": true,
|
||||||
|
"hide": true,
|
||||||
|
"iconColor": "rgba(0, 211, 255, 1)",
|
||||||
|
"name": "Annotations & Alerts",
|
||||||
|
"type": "dashboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"description": "Matrix Bridge DAGI \u2014 operational overview (M7.0). Traffic, latency, errors, queue, dedupe, control channel.",
|
||||||
|
"editable": true,
|
||||||
|
"fiscalYearStartMonth": 0,
|
||||||
|
"graphTooltip": 1,
|
||||||
|
"id": null,
|
||||||
|
"links": [
|
||||||
|
{
|
||||||
|
"asDropdown": false,
|
||||||
|
"icon": "doc",
|
||||||
|
"includeVars": false,
|
||||||
|
"keepTime": false,
|
||||||
|
"tags": [],
|
||||||
|
"targetBlank": true,
|
||||||
|
"title": "Runbook",
|
||||||
|
"tooltip": "matrix-bridge-dagi-ops.md",
|
||||||
|
"type": "link",
|
||||||
|
"url": "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Bridge Up",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 0,
|
||||||
|
"y": 0,
|
||||||
|
"w": 4,
|
||||||
|
"h": 4
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(matrix_bridge_up)",
|
||||||
|
"legendFormat": "up (all nodes)",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none",
|
||||||
|
"textMode": "auto",
|
||||||
|
"orientation": "auto"
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"mappings": [
|
||||||
|
{
|
||||||
|
"type": "value",
|
||||||
|
"options": {
|
||||||
|
"0": {
|
||||||
|
"text": "DOWN",
|
||||||
|
"color": "red"
|
||||||
|
},
|
||||||
|
"1": {
|
||||||
|
"text": "UP",
|
||||||
|
"color": "green"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Queue Size",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 4,
|
||||||
|
"y": 0,
|
||||||
|
"w": 4,
|
||||||
|
"h": 4
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "matrix_bridge_queue_size",
|
||||||
|
"legendFormat": "queue",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "area",
|
||||||
|
"textMode": "auto"
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 50
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 100
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Active Rate-Limiter Rooms",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 8,
|
||||||
|
"y": 0,
|
||||||
|
"w": 4,
|
||||||
|
"h": 4
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "matrix_bridge_rate_limiter_active_rooms",
|
||||||
|
"legendFormat": "rooms",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Active Room-Agent Locks",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 12,
|
||||||
|
"y": 0,
|
||||||
|
"w": 4,
|
||||||
|
"h": 4
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "matrix_bridge_active_room_agent_locks",
|
||||||
|
"legendFormat": "locks",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"colorMode": "value",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "short",
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Drops (5m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 16,
|
||||||
|
"y": 0,
|
||||||
|
"w": 4,
|
||||||
|
"h": 4
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(matrix_bridge_queue_dropped_total[5m]))",
|
||||||
|
"legendFormat": "dropped",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Errors (5m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 20,
|
||||||
|
"y": 0,
|
||||||
|
"w": 4,
|
||||||
|
"h": 4
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(increase(matrix_bridge_gateway_errors_total[5m]))",
|
||||||
|
"legendFormat": "errors",
|
||||||
|
"refId": "A",
|
||||||
|
"instant": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"reduceOptions": {
|
||||||
|
"calcs": [
|
||||||
|
"lastNotNull"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"colorMode": "background",
|
||||||
|
"graphMode": "none"
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 5
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "thresholds"
|
||||||
|
},
|
||||||
|
"unit": "short"
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Traffic: Received & Replied (rate/5m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 0,
|
||||||
|
"y": 4,
|
||||||
|
"w": 12,
|
||||||
|
"h": 8
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(matrix_bridge_messages_received_total[5m]))",
|
||||||
|
"legendFormat": "received",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"ok\"}[5m]))",
|
||||||
|
"legendFormat": "replied ok",
|
||||||
|
"refId": "B"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(matrix_bridge_messages_replied_total{status=\"error\"}[5m]))",
|
||||||
|
"legendFormat": "replied error",
|
||||||
|
"refId": "C"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": [
|
||||||
|
{
|
||||||
|
"matcher": {
|
||||||
|
"id": "byName",
|
||||||
|
"options": "replied error"
|
||||||
|
},
|
||||||
|
"properties": [
|
||||||
|
{
|
||||||
|
"id": "color",
|
||||||
|
"value": {
|
||||||
|
"mode": "fixed",
|
||||||
|
"fixedColor": "red"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Errors / Drops / Rate-Limited (rate/5m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 12,
|
||||||
|
"y": 4,
|
||||||
|
"w": 12,
|
||||||
|
"h": 8
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (error_type) (rate(matrix_bridge_gateway_errors_total[5m]))",
|
||||||
|
"legendFormat": "gw_error: {{ error_type }}",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(matrix_bridge_queue_dropped_total[5m]))",
|
||||||
|
"legendFormat": "queue_dropped",
|
||||||
|
"refId": "B"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(matrix_bridge_rate_limited_total[5m]))",
|
||||||
|
"legendFormat": "rate_limited",
|
||||||
|
"refId": "C"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by (reason) (rate(matrix_bridge_route_rejected_total[5m]))",
|
||||||
|
"legendFormat": "route_rejected: {{ reason }}",
|
||||||
|
"refId": "D"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 15,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"stacking": {
|
||||||
|
"mode": "none"
|
||||||
|
},
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 20,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Invoke Latency P50 / P95 by Node",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 0,
|
||||||
|
"y": 12,
|
||||||
|
"w": 12,
|
||||||
|
"h": 8
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
|
||||||
|
"legendFormat": "p50 {{ node_id }}",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, sum by (node_id, le) (rate(matrix_bridge_invoke_duration_seconds_bucket[5m])))",
|
||||||
|
"legendFormat": "p95 {{ node_id }}",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max",
|
||||||
|
"last"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 5,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"thresholds": {
|
||||||
|
"mode": "absolute",
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"color": "green",
|
||||||
|
"value": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "yellow",
|
||||||
|
"value": 10
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"color": "red",
|
||||||
|
"value": 20
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 21,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Queue Wait P50 / P95",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 12,
|
||||||
|
"y": 12,
|
||||||
|
"w": 12,
|
||||||
|
"h": 8
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.50, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
|
||||||
|
"legendFormat": "wait p50 {{ agent_id }}",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "histogram_quantile(0.95, sum by (agent_id, le) (rate(matrix_bridge_queue_wait_seconds_bucket[5m])))",
|
||||||
|
"legendFormat": "wait p95 {{ agent_id }}",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "s",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 5,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 30,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Node Routing: Routed & Rejected by Node (rate/5m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 0,
|
||||||
|
"y": 20,
|
||||||
|
"w": 12,
|
||||||
|
"h": 7
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (node_id) (rate(matrix_bridge_routed_total[5m]))",
|
||||||
|
"legendFormat": "routed {{ node_id }}",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by (node_id) (rate(matrix_bridge_node_rejected_total[5m]))",
|
||||||
|
"legendFormat": "rejected {{ node_id }}",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 31,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Persistent Dedupe Hits / Inserts (rate/10m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 12,
|
||||||
|
"y": 20,
|
||||||
|
"w": 12,
|
||||||
|
"h": 7
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(matrix_bridge_dedupe_persistent_hits_total[10m]))",
|
||||||
|
"legendFormat": "dedupe_hits",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "rate(matrix_bridge_dedupe_persistent_inserts_total[10m])",
|
||||||
|
"legendFormat": "dedupe_inserts",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 40,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Control Commands (rate/5m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 0,
|
||||||
|
"y": 27,
|
||||||
|
"w": 12,
|
||||||
|
"h": 7
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (verb) (rate(matrix_bridge_control_commands_total[5m]))",
|
||||||
|
"legendFormat": "cmd {{ verb }}",
|
||||||
|
"refId": "A"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum by (scope) (rate(matrix_bridge_control_rate_limited_total[5m]))",
|
||||||
|
"legendFormat": "ctrl_ratelimited {{ scope }}",
|
||||||
|
"refId": "B"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 41,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Traffic by Agent (received rate/5m)",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 12,
|
||||||
|
"y": 27,
|
||||||
|
"w": 24,
|
||||||
|
"h": 7
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (agent_id) (rate(matrix_bridge_messages_received_total[5m]))",
|
||||||
|
"legendFormat": "{{ agent_id }}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max",
|
||||||
|
"last"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 42,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Routing Reasons by Agent (rate/5m)",
|
||||||
|
"description": "M7.1: matrix_bridge_routing_reasons_total \u2014 slash/mention/name/default/direct breakdown",
|
||||||
|
"gridPos": {
|
||||||
|
"x": 0,
|
||||||
|
"y": 34,
|
||||||
|
"w": 24,
|
||||||
|
"h": 7
|
||||||
|
},
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "${datasource}"
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum by (agent_id, reason) (rate(matrix_bridge_routing_reasons_total[5m]))",
|
||||||
|
"legendFormat": "{{ agent_id }} / {{ reason }}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"options": {
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi",
|
||||||
|
"sort": "desc"
|
||||||
|
},
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "table",
|
||||||
|
"placement": "bottom",
|
||||||
|
"calcs": [
|
||||||
|
"mean",
|
||||||
|
"max"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"fieldConfig": {
|
||||||
|
"defaults": {
|
||||||
|
"unit": "reqps",
|
||||||
|
"custom": {
|
||||||
|
"lineWidth": 2,
|
||||||
|
"fillOpacity": 10,
|
||||||
|
"drawStyle": "line",
|
||||||
|
"spanNulls": false
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"mode": "palette-classic"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"overrides": []
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"refresh": "30s",
|
||||||
|
"schemaVersion": 38,
|
||||||
|
"tags": [
|
||||||
|
"matrix-bridge",
|
||||||
|
"dagi",
|
||||||
|
"daarion"
|
||||||
|
],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"current": {},
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": false,
|
||||||
|
"label": "Datasource",
|
||||||
|
"multi": false,
|
||||||
|
"name": "datasource",
|
||||||
|
"options": [],
|
||||||
|
"query": "prometheus",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"type": "datasource"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {},
|
||||||
|
"timezone": "UTC",
|
||||||
|
"title": "Matrix Bridge DAGI",
|
||||||
|
"uid": "matrix-bridge-dagi-v1",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
158
ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
Normal file
158
ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
---
|
||||||
|
# Prometheus alert rules — Matrix Bridge DAGI
|
||||||
|
# Phase M7.1 (metrics contract hardening)
|
||||||
|
#
|
||||||
|
# Metric source of truth: services/matrix-bridge-dagi/app/metrics_contract.py
|
||||||
|
# Runbook: docs/runbook/matrix-bridge-dagi-ops.md
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# promtool check rules ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
|
||||||
|
# docker run --rm -v $PWD:/w prom/prometheus:latest \
|
||||||
|
# promtool check rules /w/ops/prometheus/alerts/matrix-bridge-dagi.rules.yml
|
||||||
|
|
||||||
|
groups:
|
||||||
|
- name: matrix_bridge_dagi
|
||||||
|
interval: 30s
|
||||||
|
rules:
|
||||||
|
|
||||||
|
# ── A1: Bridge process down ─────────────────────────────────────────────
|
||||||
|
# metric: matrix_bridge_up{node_id} (Gauge, M7.1: labeled per node)
|
||||||
|
- alert: BridgeDown
|
||||||
|
expr: sum(matrix_bridge_up) == 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "Matrix Bridge DAGI is down"
|
||||||
|
description: >
|
||||||
|
`matrix_bridge_up` == 0 across all nodes — bridge process has not
|
||||||
|
started or has crashed. No messages are being processed.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a1-bridgedown"
|
||||||
|
|
||||||
|
# ── A2: Matrix sync errors spike ────────────────────────────────────────
|
||||||
|
# metric: matrix_bridge_gateway_errors_total{error_type} (Counter)
|
||||||
|
- alert: MatrixSyncErrors
|
||||||
|
expr: >
|
||||||
|
increase(matrix_bridge_gateway_errors_total{error_type="sync_error"}[5m]) > 3
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "Matrix sync errors elevated"
|
||||||
|
description: >
|
||||||
|
More than 3 Matrix `/sync` errors (error_type=sync_error) in the last
|
||||||
|
5 minutes. May indicate Matrix homeserver problems or network issues.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a2-matrixsyncerrors"
|
||||||
|
|
||||||
|
# ── A3: Gateway (Router) invoke errors spike ─────────────────────────────
|
||||||
|
# metric: matrix_bridge_messages_replied_total{status} (Counter)
|
||||||
|
- alert: GatewayInvokeErrors
|
||||||
|
expr: >
|
||||||
|
increase(matrix_bridge_messages_replied_total{status="error"}[5m]) > 5
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "Router invoke errors elevated (node={{ $labels.node_id }})"
|
||||||
|
description: >
|
||||||
|
More than 5 agent invocation errors (status=error) in the last 5 minutes.
|
||||||
|
Check Router/DeepSeek connectivity and logs.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a3-gatewayinvokeerrors"
|
||||||
|
|
||||||
|
# ── A4: Queue drops ─────────────────────────────────────────────────────
|
||||||
|
# metric: matrix_bridge_queue_dropped_total{room_id, agent_id} (Counter)
|
||||||
|
- alert: QueueDropsHigh
|
||||||
|
expr: >
|
||||||
|
rate(matrix_bridge_queue_dropped_total[5m]) > 0
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "Bridge queue is dropping messages"
|
||||||
|
description: >
|
||||||
|
`matrix_bridge_queue_dropped_total` is increasing — work queue is full
|
||||||
|
and incoming messages are being dropped. Increase
|
||||||
|
`BRIDGE_QUEUE_MAX_EVENTS` or `BRIDGE_WORKER_CONCURRENCY`.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a4-queuedrops"
|
||||||
|
|
||||||
|
# ── A5: User-level rate limiting spike ──────────────────────────────────
|
||||||
|
# metric: matrix_bridge_rate_limited_total{room_id, agent_id, limit_type} (Counter)
|
||||||
|
- alert: RateLimitedSpike
|
||||||
|
expr: >
|
||||||
|
rate(matrix_bridge_rate_limited_total[5m]) > 2
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "User rate limiting spike"
|
||||||
|
description: >
|
||||||
|
More than 2 messages/second are being rate-limited over 3 minutes.
|
||||||
|
May indicate a flood attack, misbehaving client, or limits too low.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a5-ratelimitedspike"
|
||||||
|
|
||||||
|
# ── A6: Control channel rate limiting spike ──────────────────────────────
|
||||||
|
# metric: matrix_bridge_control_rate_limited_total{scope} (Counter)
|
||||||
|
- alert: ControlRateLimitedSpike
|
||||||
|
expr: >
|
||||||
|
rate(matrix_bridge_control_rate_limited_total[5m]) > 0.5
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "Control channel rate limiting elevated"
|
||||||
|
description: >
|
||||||
|
More than 0.5 control commands/second rejected by rate limiter over
|
||||||
|
3 minutes. May indicate operator tooling issues or abuse attempt.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a6-controlratelimitedspike"
|
||||||
|
|
||||||
|
# ── A7: Persistent dedupe hit storm (resend loop) ────────────────────────
|
||||||
|
# metric: matrix_bridge_dedupe_persistent_hits_total{room_id} (Counter)
|
||||||
|
- alert: DedupeHitStorm
|
||||||
|
expr: >
|
||||||
|
rate(matrix_bridge_dedupe_persistent_hits_total[10m]) > 0.5
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "Persistent deduplication hit rate elevated"
|
||||||
|
description: >
|
||||||
|
High rate of persistent dedupe hits — may indicate a Matrix resend
|
||||||
|
storm or a client repeatedly retrying the same event_id.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a7-dedupehitstorm"
|
||||||
|
|
||||||
|
# ── A8: Invoke latency P95 high (per node) ───────────────────────────────
|
||||||
|
# metric: matrix_bridge_invoke_duration_seconds{agent_id, node_id} (Histogram)
|
||||||
|
- alert: InvokeLatencyP95High
|
||||||
|
expr: >
|
||||||
|
histogram_quantile(
|
||||||
|
0.95,
|
||||||
|
sum by (node_id, le) (
|
||||||
|
rate(matrix_bridge_invoke_duration_seconds_bucket[5m])
|
||||||
|
)
|
||||||
|
) > 15
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
team: platform
|
||||||
|
service: matrix-bridge-dagi
|
||||||
|
annotations:
|
||||||
|
summary: "Router invoke latency P95 > 15s (node={{ $labels.node_id }})"
|
||||||
|
description: >
|
||||||
|
95th percentile invoke latency for node `{{ $labels.node_id }}` exceeds
|
||||||
|
15 seconds over the last 5 minutes. Check Router load, DeepSeek API,
|
||||||
|
Ollama/Swapper queue.
|
||||||
|
runbook_url: "https://github.com/IvanTytar/microdao-daarion/blob/node1-production-snapshot-2026-02-09/docs/runbook/matrix-bridge-dagi-ops.md#a8-invokelatencyp95high"
|
||||||
401
ops/runbook-matrix-bridge-soak.md
Normal file
401
ops/runbook-matrix-bridge-soak.md
Normal file
@@ -0,0 +1,401 @@
|
|||||||
|
# matrix-bridge-dagi — Soak & Failure Rehearsal Runbook (M11)
|
||||||
|
|
||||||
|
**Phase:** M11
|
||||||
|
**Applies to:** `matrix-bridge-dagi` service on NODA1
|
||||||
|
**When to run:** Before any production traffic increase, after major code changes, or on a recurring monthly basis.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Goals
|
||||||
|
|
||||||
|
| Goal | Measurable pass criterion |
|
||||||
|
|------|--------------------------|
|
||||||
|
| Latency under load | p95 invoke < 5 000 ms |
|
||||||
|
| Queue stability | drop rate < 1% |
|
||||||
|
| Failover correctness | failover fires on NODA1 outage; NODA2 serves all remaining messages |
|
||||||
|
| Sticky anti-flap | sticky set after first failover; no re-tries to degraded node |
|
||||||
|
| Restart recovery | sticky + health snapshot reloads within 10 s of restart |
|
||||||
|
| Policy operations safe under load | `!policy history` / `!policy change` work while messages in-flight |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Prerequisites
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On NODA1 or local machine with network access to bridge
|
||||||
|
pip install httpx
|
||||||
|
|
||||||
|
# Verify bridge is up
|
||||||
|
curl -s http://localhost:9400/health | jq '.ok'
|
||||||
|
# Expected: true
|
||||||
|
|
||||||
|
# Verify /metrics endpoint
|
||||||
|
curl -s http://localhost:9400/metrics | grep matrix_bridge_up
|
||||||
|
# Expected: matrix_bridge_up{...} 1
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2a. Enabling the Soak Inject Endpoint
|
||||||
|
|
||||||
|
The soak script uses `POST /v1/debug/inject_event` which is **disabled by default**.
|
||||||
|
Enable it only on staging/NODA1 soak runs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On NODA1 — edit docker-compose override or pass env inline:
|
||||||
|
# Option 1: temporary inline restart
|
||||||
|
DEBUG_INJECT_ENABLED=true docker-compose \
|
||||||
|
-f docker-compose.matrix-bridge-node1.yml \
|
||||||
|
up -d --no-deps matrix-bridge-dagi
|
||||||
|
|
||||||
|
# Option 2: .env file override
|
||||||
|
echo "DEBUG_INJECT_ENABLED=true" >> .env.soak
|
||||||
|
docker-compose --env-file .env.soak \
|
||||||
|
-f docker-compose.matrix-bridge-node1.yml \
|
||||||
|
up -d --no-deps matrix-bridge-dagi
|
||||||
|
|
||||||
|
# Verify it's enabled (should return 200, not 403)
|
||||||
|
curl -s -X POST http://localhost:9400/v1/debug/inject_event \
|
||||||
|
-H 'Content-Type: application/json' \
|
||||||
|
-d '{"room_id":"!test:test","event":{}}' | jq .
|
||||||
|
# Expected: {"ok":false,"error":"no mapping for room_id=..."} ← 200, not 403
|
||||||
|
|
||||||
|
# IMPORTANT: disable after soak
|
||||||
|
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d --no-deps matrix-bridge-dagi
|
||||||
|
# (DEBUG_INJECT_ENABLED defaults to false)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2b. Step 0 (WORKERS=2 / QUEUE=100) — Record True Baseline
|
||||||
|
|
||||||
|
**Goal:** snapshot the "before any tuning" numbers to have a comparison point.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 0. Confirm current config (should be defaults)
|
||||||
|
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
|
||||||
|
# Expected: {"workers": 2, "queue_max": 100}
|
||||||
|
|
||||||
|
# 1. DB path for WAL check (adjust to your BRIDGE_DATA_DIR)
|
||||||
|
DB=/opt/microdao-daarion/data/matrix_bridge.db
|
||||||
|
|
||||||
|
# 2. WAL size before (manual check)
|
||||||
|
ls -lh ${DB}-wal 2>/dev/null || echo "(no WAL file yet — first run)"
|
||||||
|
sqlite3 $DB "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || echo "(no sqlite3)"
|
||||||
|
|
||||||
|
# 3. Run Step 0 soak
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 100 \
|
||||||
|
--concurrency 4 \
|
||||||
|
--agent sofiia \
|
||||||
|
--room-id "!your-room-id:your-server" \
|
||||||
|
--max-p95-ms 5000 \
|
||||||
|
--max-drop-rate 0.001 \
|
||||||
|
--db-path $DB \
|
||||||
|
--report-file /tmp/soak_step0_baseline.json
|
||||||
|
|
||||||
|
# 4. Record result in "Baseline numbers" table (section 10) below.
|
||||||
|
jq '.summary, .latency_ms, .metrics_delta, .wal' /tmp/soak_step0_baseline.json
|
||||||
|
```
|
||||||
|
|
||||||
|
**v1 Go/No-Go thresholds for Step 0:**
|
||||||
|
|
||||||
|
| Metric | Green ✅ | Yellow ⚠️ | Red ❌ |
|
||||||
|
|--------|---------|-----------|-------|
|
||||||
|
| `p95_invoke_ms` | < 3000 | 3000–5000 | > 5000 |
|
||||||
|
| `drop_rate` | 0.00% (mandatory) | — | > 0.1% |
|
||||||
|
| `error_rate` | < 1% | 1–3% | > 3% |
|
||||||
|
| `failovers` | 0 | — | ≥ 1 without cause |
|
||||||
|
| WAL delta | < 2 MB | 2–10 MB | > 10 MB |
|
||||||
|
|
||||||
|
**If Step 0 is Green → proceed to Step 1 tuning.**
|
||||||
|
**If Step 0 is Yellow/Red → investigate before touching WORKER_CONCURRENCY.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2c. Step 1 (WORKERS=4 / QUEUE=200) — Tune-1
|
||||||
|
|
||||||
|
**Goal:** verify that doubling workers gives headroom without Router saturation.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Apply tuning
|
||||||
|
WORKER_CONCURRENCY=4 QUEUE_MAX_EVENTS=200 docker-compose \
|
||||||
|
-f docker-compose.matrix-bridge-node1.yml \
|
||||||
|
--env-file .env.soak \
|
||||||
|
up -d --no-deps matrix-bridge-dagi
|
||||||
|
|
||||||
|
sleep 3
|
||||||
|
curl -s http://localhost:9400/health | jq '{workers: .workers, queue_max: .queue.max}'
|
||||||
|
# Expected: {"workers": 4, "queue_max": 200}
|
||||||
|
|
||||||
|
# 2. Run Step 1 soak (higher concurrency to stress the new headroom)
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 100 \
|
||||||
|
--concurrency 8 \
|
||||||
|
--agent sofiia \
|
||||||
|
--room-id "!your-room-id:your-server" \
|
||||||
|
--max-p95-ms 3000 \
|
||||||
|
--max-drop-rate 0.001 \
|
||||||
|
--db-path $DB \
|
||||||
|
--report-file /tmp/soak_step1_tune1.json
|
||||||
|
|
||||||
|
# 3. Compare Step 0 vs Step 1
|
||||||
|
python3 - <<'EOF'
|
||||||
|
import json
|
||||||
|
s0 = json.load(open('/tmp/soak_step0_baseline.json'))
|
||||||
|
s1 = json.load(open('/tmp/soak_step1_tune1.json'))
|
||||||
|
for k in ('p50', 'p95', 'p99'):
|
||||||
|
    print(f"{k}: {s0['latency_ms'][k]}ms → {s1['latency_ms'][k]}ms")
|
||||||
|
print(f"drops: {s0['metrics_delta']['queue_drops']} → {s1['metrics_delta']['queue_drops']}")
|
||||||
|
print(f"WAL: {s0['wal'].get('delta_mb')} → {s1['wal'].get('delta_mb')} MB delta")
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
**Decision:**
|
||||||
|
- Step 1 Green → **freeze, tag v1.0, ship to production.**
|
||||||
|
- p95 within 5% of Step 0 → Router is bottleneck (not workers); don't go to Step 2.
|
||||||
|
- Queue drops > 0 at WORKERS=4 → try Step 2 (WORKERS=8, QUEUE=300).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Scenario A — Baseline load (100 messages, concurrency 4)
|
||||||
|
|
||||||
|
**Goal:** establish latency baseline, verify no drops under normal load.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 100 \
|
||||||
|
--concurrency 4 \
|
||||||
|
--max-p95-ms 3000 \
|
||||||
|
--report-file /tmp/soak_baseline.json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected output:**
|
||||||
|
```
|
||||||
|
matrix-bridge-dagi Soak Report ✅ PASSED
|
||||||
|
Messages: 100 concurrency=4
|
||||||
|
Latency: p50=<500ms p95=<3000ms
|
||||||
|
Queue drops: 0 (rate 0.000%)
|
||||||
|
Failovers: 0
|
||||||
|
```
|
||||||
|
|
||||||
|
**If FAILED:**
|
||||||
|
- `p95 too high` → check router `/health`, DeepSeek API latency, `docker stats`
|
||||||
|
- `drop_rate > 0` → check `QUEUE_MAX_EVENTS` env var (increase if needed), inspect bridge logs
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Scenario B — Queue saturation test
|
||||||
|
|
||||||
|
**Goal:** confirm drop metric fires cleanly and bridge doesn't crash.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Reduce queue via env override, then flood:
|
||||||
|
QUEUE_MAX_EVENTS=5 docker-compose -f docker-compose.matrix-bridge-node1.yml \
|
||||||
|
up -d matrix-bridge-dagi
|
||||||
|
|
||||||
|
# Wait for restart
|
||||||
|
sleep 5
|
||||||
|
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 30 \
|
||||||
|
--concurrency 10 \
|
||||||
|
--max-drop-rate 0.99 \
|
||||||
|
--report-file /tmp/soak_queue_sat.json
|
||||||
|
|
||||||
|
# Restore normal queue size
|
||||||
|
docker-compose -f docker-compose.matrix-bridge-node1.yml up -d matrix-bridge-dagi
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected:** `queue_drops > 0`, bridge still running after the test.
|
||||||
|
|
||||||
|
**Verify in Prometheus/Grafana:**
|
||||||
|
```promql
|
||||||
|
rate(matrix_bridge_queue_dropped_total[1m])
|
||||||
|
```
|
||||||
|
Should spike and then return to 0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Scenario C — Node failover rehearsal
|
||||||
|
|
||||||
|
**Goal:** simulate NODA1 router becoming unavailable, verify NODA2 takes over.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Step 1: stop the router on NODA1 temporarily
|
||||||
|
docker pause dagi-router-node1
|
||||||
|
|
||||||
|
# Step 2: run soak against bridge (bridge will failover to NODA2)
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 20 \
|
||||||
|
--concurrency 2 \
|
||||||
|
--max-p95-ms 10000 \
|
||||||
|
--report-file /tmp/soak_failover.json
|
||||||
|
|
||||||
|
# Step 3: restore router
|
||||||
|
docker unpause dagi-router-node1
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected:**
|
||||||
|
```
|
||||||
|
Failovers: 1..20 (at least 1)
|
||||||
|
Sticky sets: 1+
|
||||||
|
Errors: 0 (fallback to NODA2 serves all messages)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Check sticky in control room:**
|
||||||
|
```
|
||||||
|
!nodes
|
||||||
|
```
|
||||||
|
Should show `NODA2` sticky with remaining TTL.
|
||||||
|
|
||||||
|
**Check health tracker:**
|
||||||
|
```
|
||||||
|
!status
|
||||||
|
```
|
||||||
|
Should show `NODA1 state=degraded|down`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Scenario D — Restart recovery
|
||||||
|
|
||||||
|
**Goal:** after restart, sticky and health state reload within one polling cycle.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# After Scenario C: sticky is set to NODA2
|
||||||
|
# Restart the bridge
|
||||||
|
docker restart dagi-matrix-bridge-node1
|
||||||
|
|
||||||
|
# Wait for startup (up to 30s)
|
||||||
|
sleep 15
|
||||||
|
|
||||||
|
# Verify sticky reloaded
|
||||||
|
curl -s http://localhost:9400/health | jq '.ha_state'
|
||||||
|
# Expected: {"sticky_loaded": N, ...}
|
||||||
|
|
||||||
|
# Verify routing still uses NODA2 sticky
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 10 \
|
||||||
|
--concurrency 2 \
|
||||||
|
--report-file /tmp/soak_restart.json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected:** p95 similar to post-failover run, `Failovers: 0` (sticky already applied).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Scenario E — Rate limit burst
|
||||||
|
|
||||||
|
**Goal:** verify rate limiting fires and bridge doesn't silently drop below-limit messages.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Set RPM very low for test, then flood from same sender
|
||||||
|
# This is best done in control room by observing !status rate_limited count
|
||||||
|
# rather than the soak script (which uses different senders per message).
|
||||||
|
|
||||||
|
# In Matrix control room:
|
||||||
|
# Send 30+ messages from the same user account in quick succession in a mixed room.
|
||||||
|
# Then:
|
||||||
|
!status
|
||||||
|
# Check: rate_limited_total increased, no queue drops.
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Scenario F — Policy operations under load
|
||||||
|
|
||||||
|
**Goal:** `!policy history`, `!policy change`, and `!policy export` work while messages are in-flight.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run a background soak
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 200 \
|
||||||
|
--concurrency 2 \
|
||||||
|
--report-file /tmp/soak_concurrent_policy.json &
|
||||||
|
|
||||||
|
# While soak is running, in Matrix control room:
|
||||||
|
!policy history limit=5
|
||||||
|
!policy export
|
||||||
|
!status
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected:** all three commands respond immediately (< 2s), soak completes without extra drops.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Prometheus / Grafana during soak
|
||||||
|
|
||||||
|
Key queries for the Grafana dashboard:
|
||||||
|
|
||||||
|
```promql
|
||||||
|
# Throughput (messages/s)
|
||||||
|
rate(matrix_bridge_routed_total[30s])
|
||||||
|
|
||||||
|
# Error rate
|
||||||
|
rate(matrix_bridge_errors_total[30s])
|
||||||
|
|
||||||
|
# p95 invoke latency per node
|
||||||
|
histogram_quantile(0.95, rate(matrix_bridge_invoke_duration_seconds_bucket[1m]))
|
||||||
|
|
||||||
|
# Queue drops rate
|
||||||
|
rate(matrix_bridge_queue_dropped_total[1m])
|
||||||
|
|
||||||
|
# Failovers
|
||||||
|
rate(matrix_bridge_failover_total[5m])
|
||||||
|
```
|
||||||
|
|
||||||
|
Use the `matrix-bridge-dagi` Grafana dashboard at:
|
||||||
|
`ops/grafana/dashboards/matrix-bridge-dagi.json`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Baseline numbers (reference)
|
||||||
|
|
||||||
|
| Metric | Cold start | Warm (sticky set) |
|
||||||
|
|--------|-----------|-------------------|
|
||||||
|
| p50 latency | ~200ms | ~150ms |
|
||||||
|
| p95 latency | ~2 000ms | ~1 500ms |
|
||||||
|
| Queue drops | 0 (queue=100) | 0 |
|
||||||
|
| Failover fires | 1 per degradation | 0 after sticky |
|
||||||
|
| Policy ops response | < 500ms | < 500ms |
|
||||||
|
|
||||||
|
*Update this table after each soak run with actual measured values.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. CI soak (mocked, no network)
|
||||||
|
|
||||||
|
For CI pipelines, use the mocked soak scenarios:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m pytest tests/test_matrix_bridge_m11_soak_scenarios.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Covers (all deterministic, no network):
|
||||||
|
- **S1** Queue saturation → drop counter
|
||||||
|
- **S2** Failover under load → on_failover callback, health tracker
|
||||||
|
- **S3** Sticky routing under burst → sticky set, burst routed to NODA2
|
||||||
|
- **S4** Multi-room isolation → separate rooms don't interfere
|
||||||
|
- **S5** Rate-limit burst → RL callback wired, no panic
|
||||||
|
- **S6** HA restart recovery → sticky + health snapshot persisted and reloaded
|
||||||
|
- **Perf baseline** 100-msg + 50-msg failover burst < 5s wall clock
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Known failure modes & mitigations
|
||||||
|
|
||||||
|
| Symptom | Likely cause | Mitigation |
|
||||||
|
|---------|-------------|------------|
|
||||||
|
| `p95 > 5000ms` | Router/LLM slow | Increase `ROUTER_TIMEOUT_S`, check DeepSeek API |
|
||||||
|
| `drop_rate > 1%` | Queue too small | Increase `QUEUE_MAX_EVENTS` |
|
||||||
|
| `failovers > 0` but errors > 0 | Both nodes degraded | Check NODA1 + NODA2 health; scale router |
|
||||||
|
| Bridge crash during soak | Memory leak / bug | `docker logs` → file GitHub issue |
|
||||||
|
| Sticky not set after failover | `FAILOVER_STICKY_TTL_S=0` | Set to 300+ |
|
||||||
|
| Restart doesn't load sticky | `HA_HEALTH_MAX_AGE_S` too small | Increase or set to 3600 |
|
||||||
476
ops/scripts/matrix_bridge_soak.py
Normal file
476
ops/scripts/matrix_bridge_soak.py
Normal file
@@ -0,0 +1,476 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 ops/scripts/matrix_bridge_soak.py \
|
||||||
|
--url http://localhost:9400 \
|
||||||
|
--messages 100 \
|
||||||
|
--concurrency 4 \
|
||||||
|
--report-file /tmp/soak_report.json
|
||||||
|
|
||||||
|
Requires: httpx (pip install httpx)
|
||||||
|
|
||||||
|
What it does:
|
||||||
|
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
|
||||||
|
(or directly to the router if --direct-router is set).
|
||||||
|
2. Measures latency (p50, p95, p99, max) per batch.
|
||||||
|
3. After the run, fetches /metrics and extracts key counters:
|
||||||
|
- matrix_bridge_queue_dropped_total
|
||||||
|
- matrix_bridge_rate_limited_total
|
||||||
|
- matrix_bridge_failover_total
|
||||||
|
- matrix_bridge_sticky_node_total
|
||||||
|
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
|
||||||
|
4. Prints a human-readable report and optionally writes JSON.
|
||||||
|
|
||||||
|
Exit codes:
|
||||||
|
0 = all pass criteria met
|
||||||
|
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
import httpx
|
||||||
|
except ImportError:
|
||||||
|
print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
|
||||||
|
_DEFAULT_MAX_P95_MS = 5000 # 5 s p95 per invoke (generous for cold start)
|
||||||
|
_DEFAULT_MAX_DROP_RATE = 0.01 # 1% queue drops allowed
|
||||||
|
|
||||||
|
|
||||||
|
# ── Metrics parsing ────────────────────────────────────────────────────────────
|
||||||
|
def _parse_counter(text: str, name: str) -> float:
|
||||||
|
"""Extract the last reported value of a Prometheus counter by name."""
|
||||||
|
for line in text.splitlines():
|
||||||
|
if line.startswith(name + " ") or line.startswith(name + "{"):
|
||||||
|
parts = line.rsplit(None, 1)
|
||||||
|
try:
|
||||||
|
return float(parts[-1])
|
||||||
|
except (ValueError, IndexError):
|
||||||
|
pass
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
|
||||||
|
"""
|
||||||
|
Approximate histogram_quantile from _bucket lines.
|
||||||
|
Returns estimated value at given quantile or None if data missing.
|
||||||
|
"""
|
||||||
|
buckets: List[tuple] = []
|
||||||
|
total_count = 0.0
|
||||||
|
for line in text.splitlines():
|
||||||
|
if f"{name}_bucket" in line and 'le="' in line:
|
||||||
|
try:
|
||||||
|
le_part = line.split('le="')[1].split('"')[0]
|
||||||
|
le = float(le_part) if le_part != "+Inf" else float("inf")
|
||||||
|
val = float(line.rsplit(None, 1)[-1])
|
||||||
|
buckets.append((le, val))
|
||||||
|
except (ValueError, IndexError):
|
||||||
|
pass
|
||||||
|
elif (f"{name}_count " in line or (name + "_count{") in line):
|
||||||
|
try:
|
||||||
|
total_count = float(line.rsplit(None, 1)[-1])
|
||||||
|
except (ValueError, IndexError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
if not buckets or total_count == 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
buckets.sort()
|
||||||
|
target = quantile * total_count
|
||||||
|
prev_le, prev_count = 0.0, 0.0
|
||||||
|
for le, count in buckets:
|
||||||
|
if count >= target:
|
||||||
|
if le == float("inf"):
|
||||||
|
return prev_le
|
||||||
|
# Linear interpolation
|
||||||
|
if count == prev_count:
|
||||||
|
return le
|
||||||
|
fraction = (target - prev_count) / (count - prev_count)
|
||||||
|
return prev_le + fraction * (le - prev_le)
|
||||||
|
prev_le, prev_count = le, count
|
||||||
|
return prev_le
|
||||||
|
|
||||||
|
|
||||||
|
# ── Soak runner ────────────────────────────────────────────────────────────────
|
||||||
|
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """Probe the debug inject endpoint before starting the soak.

    Sends a throwaway event and inspects the response to confirm the
    endpoint is reachable AND enabled.  Returns "" when the soak may
    proceed, otherwise a human-readable error describing what to fix.
    """
    endpoint = f"{url.rstrip('/')}/v1/debug/inject_event"
    probe_event = {
        "event_id": "!preflight",
        "sender": "@soak:test",
        "content": {"msgtype": "m.text", "body": "ping"},
    }
    try:
        resp = await client.post(
            endpoint,
            json={"room_id": room_id, "event": probe_event},
            timeout=5.0,
        )
        if resp.status_code == 403:
            # Endpoint exists but the feature flag is off.
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                " NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        data = resp.json()
        if not data.get("ok") and "no mapping" in data.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        return f"❌ Preflight failed: {exc}"
|
||||||
|
|
||||||
|
|
||||||
|
async def _check_wal(db_path: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Run WAL size + checkpoint check on the bridge policy DB.
|
||||||
|
Returns dict with wal_bytes, wal_mb, checkpoint_result.
|
||||||
|
Requires sqlite3 CLI on PATH; gracefully skips if unavailable.
|
||||||
|
"""
|
||||||
|
import subprocess, shutil
|
||||||
|
result: Dict[str, Any] = {"db_path": db_path, "ok": False}
|
||||||
|
|
||||||
|
wal_path = db_path + "-wal"
|
||||||
|
try:
|
||||||
|
wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
|
||||||
|
result["wal_bytes"] = wal_bytes
|
||||||
|
result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
|
||||||
|
except OSError:
|
||||||
|
result["wal_bytes"] = -1
|
||||||
|
result["wal_mb"] = -1
|
||||||
|
|
||||||
|
if shutil.which("sqlite3"):
|
||||||
|
try:
|
||||||
|
cp = subprocess.run(
|
||||||
|
["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
|
||||||
|
capture_output=True, text=True, timeout=5,
|
||||||
|
)
|
||||||
|
# Output: busy|log|checkpointed (3 ints)
|
||||||
|
parts = cp.stdout.strip().split("|")
|
||||||
|
if len(parts) == 3:
|
||||||
|
result["wal_checkpoint"] = {
|
||||||
|
"busy": int(parts[0]), "log": int(parts[1]), "checkpointed": int(parts[2]),
|
||||||
|
}
|
||||||
|
result["ok"] = True
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
result["ok"] = False
|
||||||
|
else:
|
||||||
|
result["sqlite3_missing"] = True
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.
    Returns (latency_ms: float, status_code: int, error: str|None).
    """
    # Monotonic-microsecond suffix keeps event_ids unique per message.
    synthetic_event = {
        "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
        "sender": sender,
        "type": "m.room.message",
        "content": {"msgtype": "m.text", "body": message},
    }
    started = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": synthetic_event},
            timeout=30.0,
        )
    except httpx.TimeoutException:
        return (time.monotonic() - started) * 1000, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        return (time.monotonic() - started) * 1000, 0, str(exc)

    elapsed_ms = (time.monotonic() - started) * 1000
    if resp.status_code >= 500:
        return elapsed_ms, resp.status_code, f"HTTP {resp.status_code}"
    return elapsed_ms, resp.status_code, None
|
||||||
|
|
||||||
|
|
||||||
|
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """Return the bridge /health JSON body, or {} on any failure."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/health", timeout=10.0)
        if resp.status_code != 200:
            return {}
        return resp.json()
    except Exception:  # noqa: BLE001
        # Best-effort probe: callers treat {} as "health unknown".
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """Return the raw /metrics exposition text, or "" on any failure."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0)
        if resp.status_code != 200:
            return ""
        return resp.text
    except Exception:  # noqa: BLE001
        # Best-effort scrape: "" makes the counter parsers yield 0.0.
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _percentile(values: List[float], p: float) -> float:
|
||||||
|
if not values:
|
||||||
|
return 0.0
|
||||||
|
sv = sorted(values)
|
||||||
|
idx = int(len(sv) * p / 100)
|
||||||
|
return sv[min(idx, len(sv) - 1)]
|
||||||
|
|
||||||
|
|
||||||
|
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """Execute one soak run against the bridge and return a report dict.

    Flow: preflight the inject endpoint → snapshot WAL/health/metrics →
    fire *n_messages* synthetic events via semaphore-bounded worker
    coroutines → re-scrape metrics/health/WAL → compute client-side
    latency percentiles and counter deltas → evaluate pass criteria
    (p95 threshold, drop rate, WAL growth).

    Returns: full report dict with keys ``wal``, ``summary``,
    ``latency_ms``, ``metrics_delta``, ``prometheus_invoke_p95_ms``,
    ``health_before``/``health_after``, ``pass_criteria``, ``passed``,
    ``failures``.  On preflight failure a short dict with ``ok=False``
    and ``passed=False`` is returned instead (no summary/latency keys).
    """
    results: List[tuple] = []  # (latency_ms, status_code, error) per message
    semaphore = asyncio.Semaphore(concurrency)

    async with httpx.AsyncClient() as client:
        # Pre-check: inject endpoint + health
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            # NOTE: this short-form dict lacks summary/latency_ms keys —
            # callers must check "passed" before pretty-printing.
            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}

        # WAL check before soak
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")

        # Pre-check: health
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)

        # Counter snapshots so post-run deltas isolate this soak's effect.
        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")

        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")

        t_start = time.monotonic()

        async def worker(i: int):
            # Semaphore caps in-flight requests at `concurrency`.
            async with semaphore:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f" [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")

        await asyncio.gather(*[worker(i) for i in range(n_messages)])

        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)

        # WAL check after soak
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)")

        latencies = [r[0] for r in results]
        errors = [r for r in results if r[2] is not None]
        successes = len(results) - len(errors)
        error_rate = len(errors) / len(results) if results else 0.0

        drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
        rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
        fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
        sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")

        delta_drops = drops_after - drops_before
        delta_rl = rl_after - rl_before
        delta_fo = fo_after - fo_before

        p50 = _percentile(latencies, 50)
        p95 = _percentile(latencies, 95)
        p99 = _percentile(latencies, 99)
        p_max = max(latencies) if latencies else 0.0

        # Histogram quantile from Prometheus
        hist_p95 = _parse_histogram_quantile(
            metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
        )
        hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None

        drop_rate = delta_drops / len(results) if results else 0.0

        report = {
            "wal": {
                "before_mb": wal_before.get("wal_mb"),
                "after_mb": wal_after.get("wal_mb"),
                "delta_mb": round(
                    (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
                ) if wal_before and wal_after else None,
                "checkpoint_after": wal_after.get("wal_checkpoint"),
                "threshold_mb": 10,
            },
            "summary": {
                "total_messages": n_messages,
                "concurrency": concurrency,
                "elapsed_s": round(elapsed_s, 2),
                "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
                "successes": successes,
                "errors": len(errors),
                "error_rate": round(error_rate, 4),
            },
            "latency_ms": {
                "p50": round(p50, 1),
                "p95": round(p95, 1),
                "p99": round(p99, 1),
                "max": round(p_max, 1),
            },
            "metrics_delta": {
                "queue_drops": int(delta_drops),
                "rate_limited": int(delta_rl),
                "failovers": int(delta_fo),
                "sticky_sets": int(sticky_after),
                "drop_rate": round(drop_rate, 4),
            },
            # NOTE(review): falsy check — a genuine 0.0 p95 would be
            # reported as None here; `is not None` was likely intended.
            "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms else None,
            "health_before": health_before.get("ok"),
            "health_after": health_after.get("ok"),
            "pass_criteria": {
                "max_p95_ms": max_p95_ms,
                "max_drop_rate": max_drop_rate,
            },
        }

        # Pass/fail evaluation
        failures = []
        if p95 > max_p95_ms:
            failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
        if drop_rate > max_drop_rate:
            failures.append(
                f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
            )
        wal_delta = report["wal"]["delta_mb"]
        if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
            failures.append(
                f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
                "— possible SQLite write pressure (Bottleneck #2)"
            )

        report["passed"] = len(failures) == 0
        report["failures"] = failures
        return report
|
||||||
|
|
||||||
|
|
||||||
|
def _print_report(r: Dict[str, Any]) -> None:
|
||||||
|
s = r["summary"]
|
||||||
|
l = r["latency_ms"]
|
||||||
|
m = r["metrics_delta"]
|
||||||
|
passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
|
||||||
|
|
||||||
|
w = r.get("wal", {})
|
||||||
|
print()
|
||||||
|
print("=" * 60)
|
||||||
|
print(f" matrix-bridge-dagi Soak Report {passed}")
|
||||||
|
print("=" * 60)
|
||||||
|
print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
|
||||||
|
print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
|
||||||
|
print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
|
||||||
|
print()
|
||||||
|
print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
|
||||||
|
f"p99={l['p99']}ms max={l['max']}ms")
|
||||||
|
if r["prometheus_invoke_p95_ms"] is not None:
|
||||||
|
print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
|
||||||
|
print()
|
||||||
|
print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
|
||||||
|
print(f" Rate-limited: {m['rate_limited']}")
|
||||||
|
print(f" Failovers: {m['failovers']}")
|
||||||
|
print(f" Sticky sets: {m['sticky_sets']}")
|
||||||
|
if w.get("before_mb") is not None:
|
||||||
|
wal_delta_str = (
|
||||||
|
f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
|
||||||
|
)
|
||||||
|
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
|
||||||
|
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
|
||||||
|
print()
|
||||||
|
if r["failures"]:
|
||||||
|
for f in r["failures"]:
|
||||||
|
print(f" ❌ {f}")
|
||||||
|
else:
|
||||||
|
print(" All pass criteria met.")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
parser = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
|
||||||
|
parser.add_argument("--url", default="http://localhost:9400",
|
||||||
|
help="Bridge base URL (default: http://localhost:9400)")
|
||||||
|
parser.add_argument("--messages", type=int, default=100,
|
||||||
|
help="Total messages to send (default: 100)")
|
||||||
|
parser.add_argument("--concurrency", type=int, default=4,
|
||||||
|
help="Concurrent requests (default: 4)")
|
||||||
|
parser.add_argument("--agent-id", default="sofiia",
|
||||||
|
help="Agent id for synthetic events (default: sofiia)")
|
||||||
|
parser.add_argument("--room-id", default="!soak-room:home.invalid",
|
||||||
|
help="Room id for synthetic events")
|
||||||
|
parser.add_argument("--sender", default="@soak-user:home.invalid",
|
||||||
|
help="Sender for synthetic events")
|
||||||
|
parser.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
|
||||||
|
help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
|
||||||
|
parser.add_argument("--max-drop-rate",type=float, default=_DEFAULT_MAX_DROP_RATE,
|
||||||
|
help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
|
||||||
|
parser.add_argument("--report-file", default="",
|
||||||
|
help="Optional path to write JSON report")
|
||||||
|
parser.add_argument("--db-path", default="",
|
||||||
|
help="Path to policy_store.db for WAL check "
|
||||||
|
"(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
report = asyncio.run(run_soak(
|
||||||
|
url=args.url,
|
||||||
|
n_messages=args.messages,
|
||||||
|
concurrency=args.concurrency,
|
||||||
|
agent_id=args.agent_id,
|
||||||
|
room_id=args.room_id,
|
||||||
|
sender=args.sender,
|
||||||
|
max_p95_ms=args.max_p95_ms,
|
||||||
|
max_drop_rate=args.max_drop_rate,
|
||||||
|
db_path=args.db_path,
|
||||||
|
))
|
||||||
|
_print_report(report)
|
||||||
|
|
||||||
|
if args.report_file:
|
||||||
|
with open(args.report_file, "w", encoding="utf-8") as fh:
|
||||||
|
json.dump(report, fh, indent=2)
|
||||||
|
print(f"\n Report saved: {args.report_file}")
|
||||||
|
|
||||||
|
return 0 if report["passed"] else 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0)
|
matrix-bridge-dagi — configuration and validation (M2.1 + M2.2 + M3.0 + M3.1)
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
@@ -54,6 +54,54 @@ class BridgeConfig:
|
|||||||
# "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt)
|
# "ignore" | "reply_error" (send ⛔ to room on unauthorized attempt)
|
||||||
control_unauthorized_behavior: str
|
control_unauthorized_behavior: str
|
||||||
|
|
||||||
|
# M3.1: Runbook runner — sofiia-console control token
|
||||||
|
sofiia_control_token: str # X-Control-Token for /api/runbooks/internal/runs
|
||||||
|
|
||||||
|
# M3.4: Control channel safety — rate limiting + cooldown
|
||||||
|
control_room_rpm: int # Max commands per room per minute (0 = unlimited)
|
||||||
|
control_operator_rpm: int # Max commands per operator per minute
|
||||||
|
control_run_next_rpm: int # Max !runbook next calls per run_id per minute
|
||||||
|
control_cooldown_s: float # Anti-double-click debounce per (operator, verb, subcmd)
|
||||||
|
|
||||||
|
# M2.3: Persistent event deduplication
|
||||||
|
persistent_dedupe: bool # Enable SQLite-backed dedupe across restarts
|
||||||
|
bridge_data_dir: str # Directory for SQLite DB and other bridge data
|
||||||
|
processed_events_ttl_h: int # TTL for processed events (hours)
|
||||||
|
processed_events_prune_batch: int # Max rows to prune per prune run
|
||||||
|
processed_events_prune_interval_s: int # Prune interval in seconds (0 = disable periodic)
|
||||||
|
|
||||||
|
# M4.0: agent discovery
|
||||||
|
discovery_rpm: int # Max !agents replies per room per minute (0 = unlimited)
|
||||||
|
|
||||||
|
# M5.0: node-aware routing
|
||||||
|
bridge_allowed_nodes: str # Comma-separated: "NODA1,NODA2"
|
||||||
|
bridge_default_node: str # Default node when none specified
|
||||||
|
bridge_room_node_map: str # Optional: "!roomA:server=NODA2;!roomB:server=NODA1"
|
||||||
|
|
||||||
|
# M8.0: node health + soft-failover thresholds
|
||||||
|
node_fail_consecutive: int # Consecutive failures before node marked "down"
|
||||||
|
node_lat_ewma_s: float # EWMA latency threshold (seconds) → "degraded"
|
||||||
|
node_ewma_alpha: float # EWMA smoothing factor (0..1)
|
||||||
|
|
||||||
|
# M8.1: sticky failover cache
|
||||||
|
failover_sticky_ttl_s: float # Seconds to hold sticky node preference after failover (0 = disabled)
|
||||||
|
|
||||||
|
# M8.2: HA state persistence
|
||||||
|
ha_health_snapshot_interval_s: int # Seconds between node health writes to DB (0 = disabled)
|
||||||
|
ha_health_max_age_s: int # Max age of health snapshot to load on startup (seconds)
|
||||||
|
|
||||||
|
# M9.0: Two-step confirmation TTL
|
||||||
|
confirm_ttl_s: float # Seconds a pending !confirm nonce is valid (0 = disabled)
|
||||||
|
|
||||||
|
# M10.0: Policy export retention
|
||||||
|
policy_export_retention_days: int # Days to keep policy exports (0 = keep forever)
|
||||||
|
|
||||||
|
# M10.2: Policy change history
|
||||||
|
policy_history_limit: int # Max rows in policy_changes table (0 = unlimited)
|
||||||
|
|
||||||
|
# M11 soak: synthetic event injection (NEVER enable in production)
|
||||||
|
debug_inject_enabled: bool # POST /v1/debug/inject_event (default: False)
|
||||||
|
|
||||||
# Service identity
|
# Service identity
|
||||||
node_id: str
|
node_id: str
|
||||||
build_sha: str
|
build_sha: str
|
||||||
@@ -99,6 +147,35 @@ def load_config() -> BridgeConfig:
|
|||||||
bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""),
|
bridge_operator_allowlist=_optional("BRIDGE_OPERATOR_ALLOWLIST", ""),
|
||||||
bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""),
|
bridge_control_rooms=_optional("BRIDGE_CONTROL_ROOMS", ""),
|
||||||
control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"),
|
control_unauthorized_behavior=_optional("CONTROL_UNAUTHORIZED_BEHAVIOR", "ignore"),
|
||||||
|
sofiia_control_token=_optional("SOFIIA_CONTROL_TOKEN", ""),
|
||||||
|
control_room_rpm=max(0, int(_optional("CONTROL_ROOM_RPM", "60"))),
|
||||||
|
control_operator_rpm=max(0, int(_optional("CONTROL_OPERATOR_RPM", "30"))),
|
||||||
|
control_run_next_rpm=max(0, int(_optional("CONTROL_RUN_NEXT_RPM", "20"))),
|
||||||
|
control_cooldown_s=max(0.0, float(_optional("CONTROL_COOLDOWN_S", "2.0"))),
|
||||||
|
persistent_dedupe=_optional("PERSISTENT_DEDUPE", "1").strip() not in ("0", "false", ""),
|
||||||
|
bridge_data_dir=_optional("BRIDGE_DATA_DIR", "/app/data"),
|
||||||
|
processed_events_ttl_h=max(1, int(_optional("PROCESSED_EVENTS_TTL_H", "48"))),
|
||||||
|
processed_events_prune_batch=max(1, int(_optional("PROCESSED_EVENTS_PRUNE_BATCH", "5000"))),
|
||||||
|
processed_events_prune_interval_s=max(0, int(_optional("PROCESSED_EVENTS_PRUNE_INTERVAL_S", "3600"))),
|
||||||
|
discovery_rpm=max(0, int(_optional("DISCOVERY_RPM", "20"))),
|
||||||
|
bridge_allowed_nodes=_optional("BRIDGE_ALLOWED_NODES", "NODA1"),
|
||||||
|
bridge_default_node=_optional("BRIDGE_DEFAULT_NODE", "NODA1"),
|
||||||
|
bridge_room_node_map=_optional("BRIDGE_ROOM_NODE_MAP", ""),
|
||||||
|
# M8.0: node health thresholds
|
||||||
|
node_fail_consecutive=max(1, int(_optional("NODE_FAIL_CONSEC", "3"))),
|
||||||
|
node_lat_ewma_s=max(0.5, float(_optional("NODE_LAT_EWMA_S", "12.0"))),
|
||||||
|
node_ewma_alpha=min(1.0, max(0.01, float(_optional("NODE_EWMA_ALPHA", "0.3")))),
|
||||||
|
# M8.1: sticky failover TTL (0 = disabled)
|
||||||
|
failover_sticky_ttl_s=max(0.0, float(_optional("FAILOVER_STICKY_TTL_S", "300.0"))),
|
||||||
|
# M8.2: HA state persistence
|
||||||
|
ha_health_snapshot_interval_s=max(0, int(_optional("HA_HEALTH_SNAPSHOT_INTERVAL_S", "60"))),
|
||||||
|
ha_health_max_age_s=max(0, int(_optional("HA_HEALTH_MAX_AGE_S", "600"))),
|
||||||
|
# M9.0: Two-step confirmation TTL (0 = disabled)
|
||||||
|
confirm_ttl_s=max(0.0, float(_optional("CONFIRM_TTL_S", "120.0"))),
|
||||||
|
policy_export_retention_days=max(0, int(_optional("POLICY_EXPORT_RETENTION_DAYS", "30"))),
|
||||||
|
policy_history_limit=max(0, int(_optional("POLICY_HISTORY_LIMIT", "100"))),
|
||||||
|
debug_inject_enabled=_optional("DEBUG_INJECT_ENABLED", "false").lower()
|
||||||
|
in ("1", "true", "yes"),
|
||||||
node_id=_optional("NODE_ID", "NODA1"),
|
node_id=_optional("NODE_ID", "NODA1"),
|
||||||
build_sha=_optional("BUILD_SHA", "dev"),
|
build_sha=_optional("BUILD_SHA", "dev"),
|
||||||
build_time=_optional("BUILD_TIME", "local"),
|
build_time=_optional("BUILD_TIME", "local"),
|
||||||
|
|||||||
167
services/matrix-bridge-dagi/app/confirm_store.py
Normal file
167
services/matrix-bridge-dagi/app/confirm_store.py
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
"""
|
||||||
|
confirm_store — M9.0: Two-step confirmation for dangerous control commands.
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. Operator issues a dangerous command (e.g. !node set, !policy import mode=replace).
|
||||||
|
2. Bridge calls ConfirmStore.add(..., callback=<coroutine>) → returns a nonce.
|
||||||
|
3. Bridge replies: "Type !confirm <nonce> within Ns to apply."
|
||||||
|
4. Operator sends !confirm <nonce>.
|
||||||
|
5. Bridge calls ConfirmStore.pop(nonce, sender_hash) → returns PendingConfirmation.
|
||||||
|
6. Bridge executes callback() → (reply_text, diff_summary).
|
||||||
|
7. Audit trail: matrix.control.intent / matrix.control.confirmed / matrix.control.applied.
|
||||||
|
|
||||||
|
Safety:
|
||||||
|
- One pending entry per sender (new request replaces old).
|
||||||
|
- Nonce is sender-bound: wrong sender_hash → pop returns None.
|
||||||
|
- TTL enforced via monotonic time; expired entries not returned.
|
||||||
|
- Nonce: 6 uppercase alphanumeric (NONCE_LEN chars from NONCE_CHARS).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import secrets
|
||||||
|
import string
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
NONCE_LEN = 6
|
||||||
|
NONCE_CHARS = string.ascii_uppercase + string.digits
|
||||||
|
|
||||||
|
_DEFAULT_TTL_S = 120.0
|
||||||
|
|
||||||
|
|
||||||
|
def make_nonce() -> str:
|
||||||
|
"""Generate a cryptographically random 6-char uppercase alphanumeric nonce."""
|
||||||
|
return "".join(secrets.choice(NONCE_CHARS) for _ in range(NONCE_LEN))
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PendingConfirmation:
|
||||||
|
"""A pending two-step confirmation waiting for !confirm <nonce>."""
|
||||||
|
nonce: str
|
||||||
|
sender_hash: str
|
||||||
|
verb: str # e.g. "node.set", "room.agents set", "policy.import"
|
||||||
|
normalized_args: str # human-readable args for audit
|
||||||
|
action_summary: str # "!node set room=!x:s node=NODA2"
|
||||||
|
room_id: str # Matrix room_id where the intent was issued
|
||||||
|
callback: Callable[[], Awaitable[Tuple[str, str]]] # async () → (reply_text, diff_summary)
|
||||||
|
expires_at: float # time.monotonic() deadline
|
||||||
|
|
||||||
|
|
||||||
|
class ConfirmStore:
|
||||||
|
"""
|
||||||
|
In-memory, thread-safe store for pending two-step confirmation entries.
|
||||||
|
|
||||||
|
One pending entry per sender at a time. If the same sender issues a new
|
||||||
|
dangerous command before confirming the previous one, the old entry is
|
||||||
|
replaced (new nonce issued).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
|
||||||
|
self.ttl_s = ttl_s
|
||||||
|
self._lock = threading.RLock()
|
||||||
|
self._by_nonce: Dict[str, PendingConfirmation] = {}
|
||||||
|
self._by_sender: Dict[str, str] = {} # sender_hash → nonce
|
||||||
|
|
||||||
|
# ── Public API ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def add(
|
||||||
|
self,
|
||||||
|
sender_hash: str,
|
||||||
|
verb: str,
|
||||||
|
normalized_args: str,
|
||||||
|
action_summary: str,
|
||||||
|
room_id: str,
|
||||||
|
callback: Callable[[], Awaitable[Tuple[str, str]]],
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Create a pending confirmation entry. Returns the nonce string.
|
||||||
|
|
||||||
|
If the sender already has a pending entry it is replaced (old nonce
|
||||||
|
becomes invalid immediately).
|
||||||
|
"""
|
||||||
|
nonce = make_nonce()
|
||||||
|
expires_at = time.monotonic() + self.ttl_s
|
||||||
|
entry = PendingConfirmation(
|
||||||
|
nonce=nonce,
|
||||||
|
sender_hash=sender_hash,
|
||||||
|
verb=verb,
|
||||||
|
normalized_args=normalized_args,
|
||||||
|
action_summary=action_summary,
|
||||||
|
room_id=room_id,
|
||||||
|
callback=callback,
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
with self._lock:
|
||||||
|
# Evict any previous pending entry for this sender
|
||||||
|
old_nonce = self._by_sender.get(sender_hash)
|
||||||
|
if old_nonce:
|
||||||
|
self._by_nonce.pop(old_nonce, None)
|
||||||
|
self._by_nonce[nonce] = entry
|
||||||
|
self._by_sender[sender_hash] = nonce
|
||||||
|
return nonce
|
||||||
|
|
||||||
|
def pop(self, nonce: str, sender_hash: str) -> Optional[PendingConfirmation]:
|
||||||
|
"""
|
||||||
|
Retrieve and atomically remove a pending confirmation.
|
||||||
|
|
||||||
|
Returns None if:
|
||||||
|
- nonce does not exist,
|
||||||
|
- sender_hash does not match the entry owner,
|
||||||
|
- or the entry has expired.
|
||||||
|
"""
|
||||||
|
nonce = nonce.upper()
|
||||||
|
with self._lock:
|
||||||
|
entry = self._by_nonce.get(nonce)
|
||||||
|
if entry is None:
|
||||||
|
return None
|
||||||
|
if entry.sender_hash != sender_hash:
|
||||||
|
# Wrong sender — deny without disclosing any detail
|
||||||
|
return None
|
||||||
|
if time.monotonic() > entry.expires_at:
|
||||||
|
# Expired — clean up and deny
|
||||||
|
self._by_nonce.pop(nonce, None)
|
||||||
|
self._by_sender.pop(entry.sender_hash, None)
|
||||||
|
return None
|
||||||
|
# Valid confirmation — consume the entry
|
||||||
|
self._by_nonce.pop(nonce)
|
||||||
|
self._by_sender.pop(sender_hash, None)
|
||||||
|
return entry
|
||||||
|
|
||||||
|
def pending_nonce(self, sender_hash: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Return the current pending nonce for a sender (non-destructive peek).
|
||||||
|
Returns None if no entry or the entry has expired.
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
nonce = self._by_sender.get(sender_hash)
|
||||||
|
if nonce is None:
|
||||||
|
return None
|
||||||
|
entry = self._by_nonce.get(nonce)
|
||||||
|
if entry is None or time.monotonic() > entry.expires_at:
|
||||||
|
# Lazy eviction
|
||||||
|
self._by_nonce.pop(nonce, None)
|
||||||
|
self._by_sender.pop(sender_hash, None)
|
||||||
|
return None
|
||||||
|
return nonce
|
||||||
|
|
||||||
|
def pending_count(self) -> int:
|
||||||
|
"""Number of non-expired pending entries (for /health, metrics)."""
|
||||||
|
now = time.monotonic()
|
||||||
|
with self._lock:
|
||||||
|
return sum(1 for e in self._by_nonce.values() if now <= e.expires_at)
|
||||||
|
|
||||||
|
def cleanup(self) -> int:
|
||||||
|
"""Eagerly remove all expired entries. Returns count removed."""
|
||||||
|
now = time.monotonic()
|
||||||
|
removed = 0
|
||||||
|
with self._lock:
|
||||||
|
expired_nonces = [
|
||||||
|
n for n, e in self._by_nonce.items() if now > e.expires_at
|
||||||
|
]
|
||||||
|
for n in expired_nonces:
|
||||||
|
entry = self._by_nonce.pop(n)
|
||||||
|
self._by_sender.pop(entry.sender_hash, None)
|
||||||
|
removed += 1
|
||||||
|
return removed
|
||||||
@@ -23,18 +23,124 @@ Audit events emitted:
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict, FrozenSet, List, Optional, Tuple
|
from typing import Any, Dict, FrozenSet, List, Optional, Tuple
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ── Constants ─────────────────────────────────────────────────────────────────
|
# ── Constants ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
# Supported control verbs (M3.1+ will implement them fully)
|
# Supported control verbs
|
||||||
VERB_RUNBOOK = "runbook"
|
VERB_RUNBOOK = "runbook"
|
||||||
VERB_STATUS = "status"
|
VERB_STATUS = "status"
|
||||||
|
VERB_NODES = "nodes" # M5.1: node policy overview
|
||||||
|
VERB_NODE = "node" # M6.0: dynamic room-node override commands
|
||||||
|
VERB_ROOM = "room" # M6.1: dynamic mixed room agent overrides
|
||||||
|
VERB_POLICY = "policy" # M6.2: policy snapshot export/import
|
||||||
|
VERB_CONFIRM = "confirm" # M9.0: two-step confirmation for dangerous commands
|
||||||
VERB_HELP = "help"
|
VERB_HELP = "help"
|
||||||
|
|
||||||
KNOWN_VERBS: FrozenSet[str] = frozenset({VERB_RUNBOOK, VERB_STATUS, VERB_HELP})
|
KNOWN_VERBS: FrozenSet[str] = frozenset({
|
||||||
|
VERB_RUNBOOK, VERB_STATUS, VERB_NODES, VERB_NODE,
|
||||||
|
VERB_ROOM, VERB_POLICY, VERB_CONFIRM, VERB_HELP,
|
||||||
|
})
|
||||||
|
|
||||||
|
# ── M9.0: Dangerous command detection ─────────────────────────────────────────
|
||||||
|
|
||||||
|
def is_dangerous_cmd(cmd: "ControlCommand") -> bool:
|
||||||
|
"""
|
||||||
|
Return True if the command requires two-step confirmation before applying.
|
||||||
|
|
||||||
|
Dangerous verbs:
|
||||||
|
!node set room=... node=... — changes room routing
|
||||||
|
!room agents set room=... agents=... — replaces all agents for a room
|
||||||
|
!policy import ... — overwrites policy DB (both modes)
|
||||||
|
"""
|
||||||
|
v = cmd.verb
|
||||||
|
sub = (cmd.subcommand or "").strip().lower()
|
||||||
|
if v == VERB_NODE and sub == "set":
|
||||||
|
return True
|
||||||
|
if v == VERB_ROOM and sub == "agents" and cmd.args and cmd.args[0].lower() == "set":
|
||||||
|
return True
|
||||||
|
if v == VERB_POLICY and sub == "import":
|
||||||
|
return True
|
||||||
|
# M10.0: prune_exports is dangerous only when dry_run=0 (actual deletion)
|
||||||
|
if v == VERB_POLICY and sub == "prune_exports":
|
||||||
|
dry_raw = cmd.kwargs.get("dry_run", "1").strip()
|
||||||
|
is_dry = dry_raw not in ("0", "false", "no")
|
||||||
|
return not is_dry
|
||||||
|
# M10.1: restore is always dangerous (no dry_run option)
|
||||||
|
if v == VERB_POLICY and sub == "restore":
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def build_normalized_args(cmd: "ControlCommand") -> str:
|
||||||
|
"""
|
||||||
|
Build a human-readable normalized representation of the command args.
|
||||||
|
Used in audit events and confirmation prompts.
|
||||||
|
"""
|
||||||
|
parts: list[str] = []
|
||||||
|
# For !room agents set, skip the "set" positional from args display
|
||||||
|
skip_first_arg = cmd.verb == VERB_ROOM and cmd.subcommand == "agents"
|
||||||
|
for i, a in enumerate(cmd.args):
|
||||||
|
if skip_first_arg and i == 0:
|
||||||
|
continue
|
||||||
|
parts.append(a)
|
||||||
|
for k, v in sorted(cmd.kwargs.items()):
|
||||||
|
parts.append(f"{k}={v}")
|
||||||
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def confirm_intent_reply(action_summary: str, nonce: str, ttl_s: int) -> str:
|
||||||
|
"""Reply when a dangerous command is held pending confirmation (M9.0)."""
|
||||||
|
return (
|
||||||
|
f"⚠️ **Confirm required**\n"
|
||||||
|
f"Action: `{action_summary}`\n"
|
||||||
|
f"Type `!confirm {nonce}` within {ttl_s}s to apply.\n"
|
||||||
|
f"_(Only you can confirm this action.)_"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def confirm_success_reply(action_result: str) -> str:
|
||||||
|
"""Reply when a confirmation is accepted and the action applied (M9.0)."""
|
||||||
|
return f"✅ Confirmed and applied.\n{action_result}"
|
||||||
|
|
||||||
|
|
||||||
|
def confirm_expired_reply() -> str:
|
||||||
|
"""Reply when the nonce is invalid, expired, or from a different sender (M9.0)."""
|
||||||
|
return (
|
||||||
|
"❌ Invalid or expired confirmation code. "
|
||||||
|
"The action was **not** applied.\n"
|
||||||
|
"Re-issue the original command to get a new code."
|
||||||
|
)
|
||||||
|
|
||||||
|
# M6.1: !room subcommand + actions
|
||||||
|
ROOM_SUBCMD_AGENTS = "agents"
|
||||||
|
ROOM_ACTION_SET = "set"
|
||||||
|
ROOM_ACTION_ADD = "add"
|
||||||
|
ROOM_ACTION_REMOVE = "remove"
|
||||||
|
ROOM_ACTION_GET = "get"
|
||||||
|
ROOM_ACTION_LIST = "list"
|
||||||
|
ROOM_ACTION_UNSET = "unset" # remove full override
|
||||||
|
_VALID_ROOM_ACTIONS = frozenset({
|
||||||
|
ROOM_ACTION_SET, ROOM_ACTION_ADD, ROOM_ACTION_REMOVE,
|
||||||
|
ROOM_ACTION_GET, ROOM_ACTION_LIST, ROOM_ACTION_UNSET,
|
||||||
|
})
|
||||||
|
|
||||||
|
# M6.0: !node subcommands
|
||||||
|
NODE_SUBCMD_SET = "set"
|
||||||
|
NODE_SUBCMD_UNSET = "unset"
|
||||||
|
NODE_SUBCMD_GET = "get"
|
||||||
|
NODE_SUBCMD_LIST = "list"
|
||||||
|
_VALID_NODE_SUBCMDS = frozenset({NODE_SUBCMD_SET, NODE_SUBCMD_UNSET, NODE_SUBCMD_GET, NODE_SUBCMD_LIST})
|
||||||
|
|
||||||
|
# Runbook subcommands (M3.x)
|
||||||
|
SUBCOMMAND_START = "start" # M3.1 — implemented
|
||||||
|
SUBCOMMAND_NEXT = "next" # M3.2 — implemented
|
||||||
|
SUBCOMMAND_COMPLETE = "complete" # M3.2 — implemented
|
||||||
|
SUBCOMMAND_EVIDENCE = "evidence" # M3.3 — implemented
|
||||||
|
SUBCOMMAND_STATUS = "status" # M3.3 — implemented
|
||||||
|
SUBCOMMAND_POST_REVIEW = "post_review" # M3.3 — implemented
|
||||||
|
|
||||||
# Max command line length to guard against garbage injection
|
# Max command line length to guard against garbage injection
|
||||||
_MAX_CMD_LEN = 512
|
_MAX_CMD_LEN = 512
|
||||||
@@ -225,10 +331,814 @@ def check_authorization(
|
|||||||
# ── Reply helpers ─────────────────────────────────────────────────────────────
|
# ── Reply helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def not_implemented_reply(cmd: ControlCommand) -> str:
|
def not_implemented_reply(cmd: ControlCommand) -> str:
|
||||||
"""Reply for known commands not yet implemented (M3.0 stub)."""
|
"""Reply for known commands not yet implemented."""
|
||||||
return (
|
return (
|
||||||
f"✅ Command acknowledged: `{cmd.raw}`\n"
|
f"✅ Command acknowledged: `{cmd.raw}`\n"
|
||||||
f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending (M3.1+)."
|
f"⏳ `!{cmd.verb} {cmd.subcommand}` — implementation pending."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def next_usage_reply() -> str:
|
||||||
|
"""Reply when !runbook next is called without a run_id."""
|
||||||
|
return (
|
||||||
|
"⚠️ Usage: `!runbook next <run_id>`\n"
|
||||||
|
"Example: `!runbook next abc-123`"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def complete_usage_reply() -> str:
|
||||||
|
"""Reply when !runbook complete is missing required args."""
|
||||||
|
return (
|
||||||
|
"⚠️ Usage: `!runbook complete <run_id> step=<n> status=ok|warn|fail [notes=...]`\n"
|
||||||
|
"Example: `!runbook complete abc-123 step=3 status=ok notes=done`\n"
|
||||||
|
"Notes with spaces: join without quotes — `notes=done_and_verified`."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def start_usage_reply() -> str:
|
||||||
|
"""Reply when !runbook start is called with missing/invalid runbook_path."""
|
||||||
|
return (
|
||||||
|
"⚠️ Usage: `!runbook start <runbook_path> [node=NODA1]`\n"
|
||||||
|
"Example: `!runbook start runbooks/rehearsal-v1-checklist.md node=NODA1`\n"
|
||||||
|
"runbook_path must be a relative path without `..`."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def runbook_started_reply(run_id: str, steps_total: int, status: str) -> str:
|
||||||
|
"""Success reply after sofiia-console creates a runbook run."""
|
||||||
|
return (
|
||||||
|
f"✅ runbook started: `run_id={run_id}` steps={steps_total} status={status}\n"
|
||||||
|
f"Next: `!runbook next {run_id}`"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def runbook_start_error_reply(reason: str) -> str:
|
||||||
|
"""Error reply when sofiia-console returns a non-2xx or connection error."""
|
||||||
|
return f"❌ failed to start runbook: {reason}"
|
||||||
|
|
||||||
|
|
||||||
|
# ── M3.2 reply helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Max chars of instructions_md to include in Matrix message before truncating
|
||||||
|
_INSTRUCTIONS_EXCERPT_MAX = 1500
|
||||||
|
|
||||||
|
|
||||||
|
def next_manual_reply(
|
||||||
|
run_id: str,
|
||||||
|
step_index: int,
|
||||||
|
steps_total: Optional[int],
|
||||||
|
title: str,
|
||||||
|
instructions_md: str,
|
||||||
|
) -> str:
|
||||||
|
"""Reply for a manual step returned by !runbook next."""
|
||||||
|
step_label = f"Step {step_index + 1}"
|
||||||
|
if steps_total:
|
||||||
|
step_label += f"/{steps_total}"
|
||||||
|
|
||||||
|
excerpt = instructions_md.strip()
|
||||||
|
truncated = False
|
||||||
|
if len(excerpt) > _INSTRUCTIONS_EXCERPT_MAX:
|
||||||
|
excerpt = excerpt[:_INSTRUCTIONS_EXCERPT_MAX].rsplit("\n", 1)[0]
|
||||||
|
truncated = True
|
||||||
|
|
||||||
|
parts = [
|
||||||
|
f"🧭 {step_label}: **{title}**",
|
||||||
|
"",
|
||||||
|
excerpt,
|
||||||
|
]
|
||||||
|
if truncated:
|
||||||
|
parts.append("_...(truncated — open in console for full instructions)_")
|
||||||
|
parts += [
|
||||||
|
"",
|
||||||
|
f"Complete: `!runbook complete {run_id} step={step_index} status=ok`",
|
||||||
|
]
|
||||||
|
return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def next_auto_reply(
|
||||||
|
run_id: str,
|
||||||
|
step_index: int,
|
||||||
|
action_type: str,
|
||||||
|
step_status: str,
|
||||||
|
duration_ms: Optional[int],
|
||||||
|
completed: bool,
|
||||||
|
) -> str:
|
||||||
|
"""Reply for an auto step (http_check/script) completed by !runbook next."""
|
||||||
|
emoji = {"ok": "✅", "warn": "⚠️", "fail": "❌"}.get(step_status, "ℹ️")
|
||||||
|
dur = f" duration={duration_ms}ms" if duration_ms is not None else ""
|
||||||
|
header = f"{emoji} step {step_index + 1} ({action_type}) {step_status}{dur}"
|
||||||
|
|
||||||
|
if completed:
|
||||||
|
return (
|
||||||
|
f"{header}\n"
|
||||||
|
"🎉 All steps completed!\n"
|
||||||
|
f"Get evidence: `!runbook evidence {run_id}`"
|
||||||
|
)
|
||||||
|
return f"{header}\nNext: `!runbook next {run_id}`"
|
||||||
|
|
||||||
|
|
||||||
|
def next_error_reply(run_id: str, reason: str) -> str:
|
||||||
|
"""Error reply when !runbook next fails."""
|
||||||
|
return f"❌ failed to advance runbook: {reason}"
|
||||||
|
|
||||||
|
|
||||||
|
def complete_ok_reply(run_id: str, step_index: int, status: str, run_completed: bool) -> str:
|
||||||
|
"""Success reply after !runbook complete."""
|
||||||
|
emoji = {"ok": "✅", "warn": "⚠️", "fail": "❌", "skipped": "⏭️"}.get(status, "✅")
|
||||||
|
line1 = f"{emoji} recorded step {step_index + 1}: {status}"
|
||||||
|
if run_completed:
|
||||||
|
return f"{line1}\n🎉 All steps completed!\nGet evidence: `!runbook evidence {run_id}`"
|
||||||
|
return f"{line1}\nNext: `!runbook next {run_id}`"
|
||||||
|
|
||||||
|
|
||||||
|
def complete_error_reply(run_id: str, reason: str) -> str:
|
||||||
|
"""Error reply when !runbook complete fails."""
|
||||||
|
return f"❌ failed to complete step: {reason}"
|
||||||
|
|
||||||
|
|
||||||
|
# ── M3.3 reply helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def status_usage_reply() -> str:
|
||||||
|
return (
|
||||||
|
"⚠️ Usage: `!runbook status <run_id>`\n"
|
||||||
|
"Example: `!runbook status abc-123`"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def evidence_usage_reply() -> str:
|
||||||
|
return (
|
||||||
|
"⚠️ Usage: `!runbook evidence <run_id>`\n"
|
||||||
|
"Example: `!runbook evidence abc-123`"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def post_review_usage_reply() -> str:
|
||||||
|
return (
|
||||||
|
"⚠️ Usage: `!runbook post_review <run_id>`\n"
|
||||||
|
"Example: `!runbook post_review abc-123`"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def status_reply(run: dict) -> str:
|
||||||
|
"""Format !runbook status reply from a get_run response."""
|
||||||
|
run_id = run.get("run_id", "?")
|
||||||
|
status = run.get("status", "?")
|
||||||
|
current = run.get("current_step", 0)
|
||||||
|
steps_total = run.get("steps_total") or len(run.get("steps", []))
|
||||||
|
runbook_path = run.get("runbook_path", "?")
|
||||||
|
node_id = run.get("node_id", "?")
|
||||||
|
evidence_path = run.get("evidence_path")
|
||||||
|
|
||||||
|
# Count warn/fail steps
|
||||||
|
steps = run.get("steps", [])
|
||||||
|
warn_count = sum(1 for s in steps if s.get("status") == "warn")
|
||||||
|
fail_count = sum(1 for s in steps if s.get("status") == "fail")
|
||||||
|
|
||||||
|
status_emoji = {
|
||||||
|
"running": "🔄", "completed": "✅", "aborted": "🛑", "paused": "⏸️",
|
||||||
|
}.get(status, "ℹ️")
|
||||||
|
|
||||||
|
step_label = f"{current}/{steps_total}" if steps_total else str(current)
|
||||||
|
lines = [
|
||||||
|
f"{status_emoji} `run_id={run_id}` status={status} step={step_label}",
|
||||||
|
f"runbook: `{runbook_path}` node: {node_id}",
|
||||||
|
]
|
||||||
|
if warn_count or fail_count:
|
||||||
|
lines.append(f"warn={warn_count} fail={fail_count}")
|
||||||
|
if evidence_path:
|
||||||
|
lines.append(f"evidence: `{evidence_path}`")
|
||||||
|
|
||||||
|
if status == "completed" and not evidence_path:
|
||||||
|
lines.append(f"Get evidence: `!runbook evidence {run_id}`")
|
||||||
|
elif status == "completed" and evidence_path:
|
||||||
|
lines.append(f"Post-review: `!runbook post_review {run_id}`")
|
||||||
|
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def status_error_reply(run_id: str, reason: str) -> str:
|
||||||
|
return f"❌ failed to get status: {reason}"
|
||||||
|
|
||||||
|
|
||||||
|
def evidence_reply(result: dict) -> str:
    """Success reply after !runbook evidence."""
    artifact = result.get("evidence_path", "?")
    nbytes = result.get("bytes", 0)
    rid = result.get("run_id", "")
    created = result.get("created_at", "")

    out = [f"📄 evidence created: `{artifact}` (bytes={nbytes})"]
    if created:
        out.append(f"created_at: {created}")
    if rid:
        # Point the operator at the follow-up command.
        out.append(f"Next: `!runbook post_review {rid}`")
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def evidence_error_reply(run_id: str, reason: str) -> str:
    """Error reply for a failed evidence generation (run_id kept for API symmetry)."""
    return "❌ failed to generate evidence: " + reason
|
||||||
|
|
||||||
|
|
||||||
|
def post_review_reply(result: dict) -> str:
    """Success reply after !runbook post_review."""
    artifact = result.get("path", "?")
    nbytes = result.get("bytes", 0)
    created = result.get("created_at", "")

    out = [f"🧾 post-review created: `{artifact}` (bytes={nbytes})"]
    if created:
        out.append(f"created_at: {created}")
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def post_review_error_reply(run_id: str, reason: str) -> str:
    """Error reply for a failed post-review generation (run_id kept for API symmetry)."""
    return "❌ failed to generate post-review: " + reason
|
||||||
|
|
||||||
|
|
||||||
|
# ── M3.4 safety helpers ───────────────────────────────────────────────────────

#: Maximum length of notes/free-text operator input accepted before truncation.
MAX_NOTES_LEN: int = 500

#: Control characters (U+0000–U+001F minus tab/LF/CR) that must be stripped.
#: Tab (9), LF (10) and CR (13) are deliberately preserved.
_CTRL_CHARS = "".join(chr(i) for i in range(32) if i not in (9, 10, 13))


def sanitize_notes(notes: str) -> str:
    """
    Strip control characters and truncate notes to MAX_NOTES_LEN.

    Safe to call with any string; returns empty string for falsy input.
    Truncated output gets an ellipsis appended, so the result may be up to
    MAX_NOTES_LEN + 1 characters long.
    """
    if not notes:
        return ""
    # str.translate does the whole removal in one C-level pass.
    cleaned = notes.translate(str.maketrans("", "", _CTRL_CHARS))
    if len(cleaned) > MAX_NOTES_LEN:
        cleaned = cleaned[:MAX_NOTES_LEN] + "…"
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def rate_limited_reply(scope: str, retry_after_s: float) -> str:
    """Reply when a control command is rejected by rate limiter or cooldown."""
    if retry_after_s >= 1:
        wait = f"{retry_after_s:.0f}s"
    else:
        wait = "a moment"
    return f"⏳ rate limited ({scope}), retry after {wait}"
|
||||||
|
|
||||||
|
|
||||||
|
def status_not_available_reply() -> str:
    """Fallback reply when the bridge cannot report health yet."""
    message = "⚠️ Bridge status not available (service initialising or config missing)."
    return message
|
||||||
|
|
||||||
|
|
||||||
|
# M5.1: !nodes reply
# Cap on how many per-room override rows are rendered before eliding the rest.
_MAX_ROOM_OVERRIDES_SHOWN = 10


def nodes_reply(
    policy_info: dict,
    node_stats: Optional[dict] = None,
    sticky_info: Optional[dict] = None,
) -> str:
    """
    Compact reply for `!nodes` in control room.

    policy_info: from NodePolicy.as_info_dict()
    node_stats: optional dict {node_id: {"routed": N, "rejected": M, "health": ..., ...}}
    sticky_info: optional dict from StickyNodeCache (M8.1)
    """
    default = policy_info.get("default_node", "?")
    allowed = sorted(policy_info.get("allowed_nodes") or [])
    # room_overrides may be a dict (room_id → node) or a bare int count —
    # both shapes are handled below.
    overrides = policy_info.get("room_overrides", {}) or {}

    allowed_str = ", ".join(f"`{n}`" for n in allowed)
    lines = [
        "🌐 **Node policy**",
        f"Default: `{default}` Allowed: {allowed_str}",
    ]

    if isinstance(overrides, dict) and overrides:
        lines.append(f"\n**Room overrides** ({len(overrides)}):")
        # Show at most _MAX_ROOM_OVERRIDES_SHOWN rows, then an elision marker.
        items = list(overrides.items())[:_MAX_ROOM_OVERRIDES_SHOWN]
        for room_id, node in items:
            lines.append(f" `{room_id}` → `{node}`")
        if len(overrides) > _MAX_ROOM_OVERRIDES_SHOWN:
            lines.append(f" _(+{len(overrides) - _MAX_ROOM_OVERRIDES_SHOWN} more)_")
    elif isinstance(overrides, int):
        # as_info_dict returns room_overrides as int count, not dict
        # NOTE(review): `overrides = ... or {}` above already turns a 0 count
        # into {}, so this branch only ever sees non-zero ints and the else
        # arm below looks unreachable — confirm as_info_dict's contract.
        if overrides:
            lines.append(f"\nRoom overrides: {overrides}")
        else:
            lines.append("\nNo room overrides configured.")
    else:
        lines.append("\nNo room overrides configured.")

    if node_stats:
        lines.append("\n**Per-node stats** (since last restart):")
        for node_id in sorted(node_stats):
            ns = node_stats[node_id]
            routed = ns.get("routed", 0)
            rejected = ns.get("rejected", 0)
            health = ns.get("health", "")
            ewma = ns.get("ewma_latency_s")
            consec = ns.get("consecutive_failures", 0)
            stat_parts = [f"routed={routed}", f"rejected={rejected}"]
            # Optional fields are rendered only when present/meaningful.
            if health:
                stat_parts.append(f"health={health}")
            if ewma is not None:
                stat_parts.append(f"ewma={ewma:.2f}s")
            if consec:
                stat_parts.append(f"consec_fail={consec}")
            lines.append(f" `{node_id}`: " + " ".join(stat_parts))

    # M8.1: sticky cache section
    if sticky_info is not None:
        active = sticky_info.get("active_keys", 0)
        ttl = sticky_info.get("ttl_s", 0)
        if active:
            lines.append(f"\n**Sticky routing** (anti-flap): {active} active ttl={ttl:.0f}s")
            for entry in sticky_info.get("entries", []):
                rem = entry.get("remaining_s", 0)
                # entries are expected to carry 'key' and 'node' — KeyError
                # here would mean a malformed StickyNodeCache snapshot.
                lines.append(
                    f" `{entry['key']}` → `{entry['node']}` ({rem:.0f}s left)"
                )
            if sticky_info.get("truncated"):
                lines.append(f" _(+{sticky_info['truncated']} more)_")
        else:
            lines.append(f"\nSticky routing: none active ttl={ttl:.0f}s")

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
# ── M6.0: !node subcommand parser + reply helpers ──────────────────────────────

import re as _re

_ROOM_KWARG_RE = _re.compile(r"\broom=(\S+)", _re.IGNORECASE)
_NODE_VAL_RE = _re.compile(r"\bnode=(\w+)", _re.IGNORECASE)
_ROOM_ID_RE = _re.compile(r"^![a-zA-Z0-9._\-]+:[a-zA-Z0-9._\-]+$")


def parse_node_cmd(args_text: str) -> Tuple[str, Optional[str], Optional[str]]:
    """
    Parse `!node <subcommand> [room=...] [node=...]` arguments.

    Returns (subcmd, room_id_or_None, node_id_or_None).
    subcmd is lower-cased; node_id is upper-cased.
    """
    pieces = args_text.strip().split(None, 1)
    if not pieces:
        return ("", None, None)

    verb = pieces[0].lower()
    tail = pieces[1] if len(pieces) > 1 else ""

    # Pull the keyword arguments out of whatever remains after the verb.
    room_match = _ROOM_KWARG_RE.search(tail)
    node_match = _NODE_VAL_RE.search(tail)
    room_id = room_match.group(1) if room_match else None
    node_id = node_match.group(1).upper() if node_match else None
    return (verb, room_id, node_id)
|
||||||
|
|
||||||
|
|
||||||
|
def node_cmd_validate_room(room_id: str) -> bool:
    """Return True if room_id matches basic Matrix room ID format."""
    if not room_id:
        return False
    return _ROOM_ID_RE.match(room_id) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def node_cmd_reply_set(room_id: str, node_id: str) -> str:
    """Confirmation reply after a `!node set` override is stored."""
    return "✅ Override set: `{}` → `{}`".format(room_id, node_id)
|
||||||
|
|
||||||
|
|
||||||
|
def node_cmd_reply_unset_ok(room_id: str) -> str:
    """Confirmation reply after a `!node unset` removes an override."""
    return "✅ Override removed for `{}`".format(room_id)
|
||||||
|
|
||||||
|
|
||||||
|
def node_cmd_reply_unset_not_found(room_id: str) -> str:
    """Informational reply when `!node unset` targets a room with no override."""
    return "ℹ️ No override was set for `{}`".format(room_id)
|
||||||
|
|
||||||
|
|
||||||
|
def node_cmd_reply_get(
    room_id: str,
    node_id: Optional[str],
    env_node: Optional[str],
    default_node: str,
) -> str:
    """Reply for `!node get`: show each override layer and the effective node."""
    out = [f"📌 **Node info for** `{room_id}`"]
    if node_id:
        out.append(f"Dynamic override: `{node_id}` _(set by operator)_")
    else:
        out.append("Dynamic override: _none_")
    if env_node:
        out.append(f"Env map: `{env_node}`")
    out.append(f"Default: `{default_node}`")
    # Precedence: operator override > env map > default.
    winner = node_id or env_node or default_node
    out.append(f"\nEffective node: **`{winner}`**")
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def node_cmd_reply_list(
    overrides: List[Tuple[str, str, int]],
    total: int,
) -> str:
    """
    Format the `!node list` reply.

    overrides: (room_id, node_id, updated_at_unix) tuples, already truncated
               to the display limit by the caller.
    total: full override count (may exceed len(overrides)).
    """
    import datetime

    lines = [f"📋 **Dynamic node overrides** ({total} total)"]
    if not overrides:
        lines.append("_None set._")
    else:
        for room_id, node_id, updated_at in overrides:
            # utcfromtimestamp() is deprecated since Python 3.12; the
            # tz-aware form renders identically with this format string.
            dt = datetime.datetime.fromtimestamp(updated_at, tz=datetime.timezone.utc)
            ts = dt.strftime("%Y-%m-%d %H:%M")
            lines.append(f" `{room_id}` → `{node_id}` _(at {ts} UTC)_")
        if total > len(overrides):
            lines.append(f" _(+{total - len(overrides)} more)_")
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def node_cmd_reply_error(msg: str) -> str:
    """Error reply for malformed `!node` commands, with usage examples."""
    usage = "\n".join([
        "Usage:",
        " `!node set room=!room:server node=NODA2`",
        " `!node unset room=!room:server`",
        " `!node get room=!room:server`",
        " `!node list`",
    ])
    return f"❌ {msg}\n\n" + usage
|
||||||
|
|
||||||
|
|
||||||
|
# ── M6.1: !room agents reply helpers ──────────────────────────────────────────

# Kwarg extractors for the `!room agents ...` command family.
# NOTE(review): these patterns are not referenced by parse_room_agents_cmd,
# which receives pre-parsed kwargs — presumably consumed by the command
# dispatcher elsewhere in this module; confirm before removing.
_AGENTS_KWARG_RE = _re.compile(r"\bagents=(\S+)", _re.IGNORECASE)
_AGENT_KWARG_RE = _re.compile(r"\bagent=(\w+)", _re.IGNORECASE)
_DEFAULT_KWARG_RE = _re.compile(r"\bdefault=(\w+)", _re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_room_agents_cmd(
    subcommand: str,
    args: tuple,
    kwargs: Dict[str, str],
) -> Tuple[str, Optional[str], Optional[List[str]], Optional[str], Optional[str]]:
    """
    Parse !room agents <action> [room=...] [agents=...] [agent=...] [default=...] args.

    Returns (action, room_id, agents_or_None, single_agent_or_None, default_agent_or_None).
    action: the ROOM_ACTION_* constant (from args[0] or subcommand)
    room_id: from kwargs["room"]
    agents: from kwargs["agents"] as a list (for set command)
    single_agent: from kwargs["agent"] (for add/remove)
    default_agent: from kwargs["default"]
    """
    # Prefer the explicit action token (args[0] when subcommand == "agents");
    # otherwise fall back to the subcommand itself.
    first = args[0].lower() if args else ""
    action = first.strip() or subcommand.lower()

    room_id = kwargs.get("room")

    # agents= accepts a comma-separated list; normalise each entry to lowercase.
    raw = kwargs.get("agents", "")
    agents: Optional[List[str]] = None
    if raw:
        agents = [piece.strip().lower() for piece in raw.split(",") if piece.strip()]

    one_agent = kwargs.get("agent", "").strip().lower() or None
    default_agent = kwargs.get("default", "").strip().lower() or None
    return action, room_id, agents, one_agent, default_agent
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_set(room_id: str, agents: List[str], default_agent: str) -> str:
    """Confirmation after `!room agents set` stores a per-room agent override."""
    listing = ", ".join(f"`{name}`" for name in sorted(agents))
    return (
        f"✅ Agent override set for `{room_id}`\n"
        f"Agents: {listing}\n"
        f"Default: `{default_agent}`"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_add(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str:
    """Confirmation after `!room agents add`; shows the updated agent set."""
    listing = ", ".join(f"`{name}`" for name in sorted(agents))
    reply = f"✅ Agent `{agent}` added to `{room_id}`\nCurrent agents: {listing}"
    if default_agent:
        reply += f"\nDefault: `{default_agent}`"
    return reply
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_remove(room_id: str, agent: str, agents: List[str], default_agent: Optional[str]) -> str:
    """Confirmation after `!room agents remove`; notes when the override is cleared."""
    # Empty remaining set means the whole override was dropped.
    if not agents:
        return f"✅ Agent `{agent}` removed — no agents left, override cleared for `{room_id}`"
    listing = ", ".join(f"`{name}`" for name in sorted(agents))
    reply = f"✅ Agent `{agent}` removed from `{room_id}`\nRemaining: {listing}"
    if default_agent:
        reply += f"\nDefault: `{default_agent}`"
    return reply
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_unset_ok(room_id: str) -> str:
    """Confirmation after `!room agents unset` clears a room's agent override."""
    return "✅ Agent override cleared for `{}` (using env/default config)".format(room_id)
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_unset_not_found(room_id: str) -> str:
    """Informational reply when `!room agents unset` finds no override to clear."""
    return "ℹ️ No agent override was set for `{}`".format(room_id)
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_get(
    room_id: str,
    override_agents: Optional[List[str]],
    override_default: Optional[str],
    env_agents: Optional[List[str]],
    env_default: Optional[str],
) -> str:
    """Reply for `!room agents get`: override + env layers, then the effective policy."""

    def _fmt(agents: List[str]) -> str:
        # Sorted backtick listing, e.g. "`a`, `b`".
        return ", ".join(f"`{a}`" for a in sorted(agents))

    out = [f"📌 **Agent policy for** `{room_id}`"]
    if override_agents:
        out.append(f"Dynamic override: {_fmt(override_agents)} default=`{override_default or '?'}`")
    else:
        out.append("Dynamic override: _none_")
    if env_agents:
        out.append(f"Env config: {_fmt(env_agents)} default=`{env_default or '?'}`")
    else:
        out.append("Env config: _not configured_")

    # Precedence: dynamic override > env config.
    winners = override_agents or env_agents or []
    winner_default = override_default or env_default or "?"
    out.append(f"\nEffective agents: **{_fmt(winners)}** default=**`{winner_default}`**")
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_list(
    overrides: List[Tuple[str, List[str], Optional[str], int]],
    total: int,
) -> str:
    """
    Format the `!room agents list` reply.

    overrides: (room_id, agents, default_agent, updated_at_unix) tuples,
               already truncated to the display limit by the caller.
    total: full override count (may exceed len(overrides)).
    """
    import datetime

    lines = [f"📋 **Dynamic agent overrides** ({total} total)"]
    if not overrides:
        lines.append("_None set._")
    else:
        for room_id, agents, default_agent, updated_at in overrides:
            # utcfromtimestamp() is deprecated since Python 3.12; the
            # tz-aware form renders identically with this format string.
            dt = datetime.datetime.fromtimestamp(updated_at, tz=datetime.timezone.utc)
            ts = dt.strftime("%Y-%m-%d %H:%M")
            agents_str = ", ".join(agents)
            lines.append(f" `{room_id}`: [{agents_str}] default=`{default_agent or '?'}` _(at {ts} UTC)_")
        if total > len(overrides):
            lines.append(f" _(+{total - len(overrides)} more)_")
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def room_agents_reply_error(msg: str) -> str:
    """Error reply for malformed `!room agents` commands, with usage examples."""
    usage = "\n".join([
        "Usage:",
        " `!room agents set room=!X agents=sofiia,helion [default=sofiia]`",
        " `!room agents add room=!X agent=druid`",
        " `!room agents remove room=!X agent=helion`",
        " `!room agents get room=!X`",
        " `!room agents unset room=!X`",
        " `!room agents list`",
    ])
    return f"❌ {msg}\n\n" + usage
|
||||||
|
|
||||||
|
|
||||||
|
# ── M6.2: !policy export/import reply helpers + path validator ────────────────

import os as _os
import json as _json  # NOTE(review): unused in this visible section — presumably used elsewhere in the module; confirm before removing


# Subdirectory (under the bridge data dir) where policy snapshot files live.
POLICY_EXPORTS_SUBDIR = "policy_exports"
|
||||||
|
|
||||||
|
|
||||||
|
def validate_export_path(exports_dir: str, filename: str) -> Optional[str]:
    """
    Validate and resolve an export filename to an absolute path.

    Security: only allow simple filenames (no slashes, no `..`).
    Returns the absolute safe path, or None if invalid.

    Note: the returned value is ``exports_dir`` joined with ``filename``
    (not the realpath), so it is absolute only when ``exports_dir`` is.
    """
    if not filename:
        return None
    # Reject anything with directory separators or traversal sequences
    if "/" in filename or "\\" in filename or ".." in filename:
        return None
    # Only allow safe characters: alphanumeric, dash, underscore, dot
    if not _re.match(r"^[a-zA-Z0-9._\-]+$", filename):
        return None
    full_path = _os.path.join(exports_dir, filename)
    try:
        # Defence in depth: resolve symlinks and verify the target still
        # lives strictly inside the exports directory.
        resolved = _os.path.realpath(full_path)
        exports_resolved = _os.path.realpath(exports_dir)
        if not resolved.startswith(exports_resolved + _os.sep):
            return None
    except Exception:  # noqa: BLE001
        # Any realpath failure (OS-level error, bad encoding) → treat as invalid.
        return None
    return full_path
|
||||||
|
|
||||||
|
|
||||||
|
def policy_export_reply(path: str, node_count: int, agent_count: int) -> str:
    """
    Success reply for !policy export.

    Shows the snapshot file's basename (not the full path) plus the number of
    node/agent overrides captured. Fix: the computed ``filename`` was never
    interpolated into the File line.
    """
    filename = _os.path.basename(path)
    return (
        f"✅ **Policy exported**\n"
        f"File: `{filename}`\n"
        f"Node overrides: {node_count} Agent overrides: {agent_count}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def policy_import_dry_run_reply(stats: dict, mode: str) -> str:
    """Preview reply for !policy import with dry_run=1 — nothing was applied."""
    node_row = "+{} ~{} -{}".format(
        stats.get("node_added", 0), stats.get("node_updated", 0), stats.get("node_deleted", 0)
    )
    agent_row = "+{} ~{} -{}".format(
        stats.get("agent_added", 0), stats.get("agent_updated", 0), stats.get("agent_deleted", 0)
    )
    return (
        f"🔍 **Import dry-run** (mode=`{mode}`, no changes applied)\n"
        f"Node overrides: {node_row}\n"
        f"Agent overrides: {agent_row}\n"
        "_Use `dry_run=0` to apply._"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def format_import_diff(diff: Any) -> str:
    """
    Render an ImportDiff as a human-readable Markdown string (M9.1).

    `diff` is an ImportDiff instance from policy_store.
    """

    def _summarise(added: int, updated: int, deleted: int) -> str:
        # One comma-joined row per category; deletions carry a warning marker.
        parts: List[str] = []
        if added:
            parts.append(f"+{added} added")
        if updated:
            parts.append(f"~{updated} updated")
        if deleted:
            parts.append(f"-{deleted} deleted ⚠️")
        return ", ".join(parts) if parts else "no changes"

    out: List[str] = [
        "**Node overrides:** " + _summarise(diff.node_added, diff.node_updated, diff.node_deleted),
        "**Agent overrides:** " + _summarise(diff.agent_added, diff.agent_updated, diff.agent_deleted),
    ]

    # Sample of affected rooms, with an elision marker for the remainder.
    if getattr(diff, "sample_keys", None):
        shown = ", ".join(f"`{k}`" for k in diff.sample_keys)
        extra = diff.total_changes() - len(diff.sample_keys)
        tail = f" _(+{extra} more)_" if extra > 0 else ""
        out.append(f"**Affected rooms:** {shown}{tail}")

    # Replace danger banner.
    if getattr(diff, "is_replace", False):
        out.append("⚠️ **REPLACE mode** — existing overrides NOT in the file will be **deleted**.")

    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def policy_import_intent_reply(
    diff: Any,
    action_summary: str,
    nonce: str,
    ttl_s: int,
) -> str:
    """Reply for !policy import intent with diff preview (M9.1)."""
    out = [
        "⚠️ **Confirm required**",
        f"Action: `{action_summary}`",
        "",
        "**Preview:**",
        format_import_diff(diff),
        "",
    ]
    # A no-op import still needs an explicit confirm; flag it clearly.
    if diff.total_changes() == 0:
        out.append("_(No policy changes would be made.)_")
        out.append("")
    out.append(f"Type `!confirm {nonce}` within {ttl_s}s to apply.")
    out.append("_(Only you can confirm. If the file changes, this confirm will be rejected.)_")
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def policy_import_reply(stats: dict, mode: str) -> str:
    """Success reply after !policy import is applied; summarises counter deltas."""
    node_row = "+{} ~{} -{}".format(
        stats.get("node_added", 0), stats.get("node_updated", 0), stats.get("node_deleted", 0)
    )
    agent_row = "+{} ~{} -{}".format(
        stats.get("agent_added", 0), stats.get("agent_updated", 0), stats.get("agent_deleted", 0)
    )
    return (
        f"✅ **Policy imported** (mode=`{mode}`)\n"
        f"Node overrides: {node_row}\n"
        f"Agent overrides: {agent_row}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def policy_restore_intent_reply(
    diff: Any,
    action_summary: str,
    nonce: str,
    ttl_s: int,
) -> str:
    """Reply for !policy restore intent — rollback preview + confirm prompt (M10.1)."""
    preview = format_import_diff(diff)
    return (
        "🔄 **Policy restore (rollback) preview**\n"
        f"{preview}\n\n"
        f"⚠️ **Rollback action:** `{action_summary}`\n\n"
        f"Type `!confirm {nonce}` to apply restore (expires in {ttl_s}s)"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def policy_restore_applied_reply(
    stats: Any,
    mode: str,
    autobackup_basename: str = "",
) -> str:
    """
    Reply after !policy restore is confirmed and applied (M10.1).

    stats: dict of node_/agent_ added/updated/deleted counters; any non-dict
           value is treated as all-zero counts.
    autobackup_basename: when non-empty, a pre-restore backup note is appended.
    """
    # Hoist the shape check once instead of repeating isinstance() per counter.
    counters = stats if isinstance(stats, dict) else {}
    n_a = counters.get("node_added", 0)
    n_u = counters.get("node_updated", 0)
    n_d = counters.get("node_deleted", 0)
    a_a = counters.get("agent_added", 0)
    a_u = counters.get("agent_updated", 0)
    a_d = counters.get("agent_deleted", 0)
    backup_line = (
        f"\n\n💾 Pre-restore backup saved: `{autobackup_basename}`"
        if autobackup_basename else ""
    )
    return (
        f"✅ **Policy restored** (mode={mode})\n"
        f"Node overrides: +{n_a} ~{n_u} -{n_d}\n"
        f"Agent overrides: +{a_a} ~{a_u} -{a_d}"
        f"{backup_line}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def policy_history_reply(changes: List[Any]) -> str:
    """
    Format policy_changes records for !policy history reply (M10.2).

    Each line: #{n}. [id:NN] [YYYY-MM-DD HH:MM] verb/mode +Xn ~Yn -Zn `file` op:`hash8` [⚠️]
    Use !policy change id=NN to see full details.
    """
    if not changes:
        return "📋 **Policy change history**\nNo policy changes recorded yet."

    out = ["📋 **Policy change history** (most recent first)\n"]
    for idx, rec in enumerate(changes, 1):
        flag = " ⚠️" if rec.is_destructive else ""
        # Keep long filenames readable in chat.
        name = rec.source_file
        if len(name) > 40:
            name = name[:40] + "…"
        out.append(
            f"{idx}. [id:{rec.id}] [{rec.when_str()}] `{rec.verb}/{rec.mode}`"
            f" {rec.changes_short()}{flag}"
            f" `{name}`"
            f" op:`{rec.sender_hash[:8]}`"
        )
    out.append("\nUse `!policy change id=<n>` for full details of a specific change.")
    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def policy_change_detail_reply(change: Any) -> str:
    """
    Format full details of a single PolicyChange for !policy change id=<n> (M10.3).
    """
    destructive = "⚠️ Yes" if change.is_destructive else "No"
    # Keep long filenames readable in chat.
    name = change.source_file
    if len(name) > 60:
        name = name[:60] + "…"
    sections = [
        f"🔍 **Policy change #{change.id}**\n",
        f"**Verb:** `{change.verb}`",
        f"**Mode:** `{change.mode}`",
        f"**Applied:** {change.when_str()} UTC",
        f"**Operator:** op:`{change.sender_hash[:8]}`",
        f"**File:** `{name}`",
        f"**Destructive:** {destructive}",
        "",
        "**Changes:**",
        f" Nodes: +{change.node_added} added ~{change.node_updated} updated -{change.node_deleted} deleted",
        f" Agents: +{change.agent_added} added ~{change.agent_updated} updated -{change.agent_deleted} deleted",
        "",
        "**Summary:**",
        f" {change.diff_summary}",
    ]
    return "\n".join(sections)
|
||||||
|
|
||||||
|
|
||||||
|
def policy_prune_preview_reply(result: Any, retention_days: int) -> str:
    """Reply for !policy prune_exports dry_run=1 — preview of what would be pruned (M10.0)."""
    header = f"🗑️ **Policy exports prune preview** (retention={retention_days}d)\n"
    if result.count == 0:
        return header + "No files older than the retention window found. Nothing to prune."

    shown = result.sample_filenames(5)
    listing = "\n".join(f" - `{f}`" for f in shown)
    hidden = result.count - len(shown)
    tail = f"\n _(+{hidden} more)_" if hidden > 0 else ""
    size_kb = result.total_bytes // 1024
    return (
        header
        + f"Would delete **{result.count}** file(s) (~{size_kb} KB):\n"
        + f"{listing}{tail}\n\n"
        + "To actually prune: `!policy prune_exports dry_run=0`"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def policy_prune_applied_reply(result: Any, retention_days: int) -> str:
    """Reply after !policy prune_exports dry_run=0 is confirmed and applied (M10.0)."""
    if result.count == 0:
        return (
            f"🗑️ **Policy exports pruned** (retention={retention_days}d)\n"
            "No files matched the retention window."
        )
    freed_kb = result.total_bytes // 1024
    return (
        f"✅ **Policy exports pruned** (retention={retention_days}d)\n"
        f"Deleted **{result.count}** file(s) (~{freed_kb} KB freed)."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def policy_cmd_error(msg: str) -> str:
    """Error reply for malformed `!policy` commands, with usage examples."""
    usage = "\n".join([
        "Usage:",
        " `!policy export`",
        " `!policy import path=policy-YYYYMMDD-HHMMSS.json [mode=merge|replace] [dry_run=0]`",
    ])
    return f"❌ {msg}\n\n" + usage
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -252,12 +1162,26 @@ def help_reply() -> str:
|
|||||||
"""Brief help text."""
|
"""Brief help text."""
|
||||||
return (
|
return (
|
||||||
"**DAGI Bridge — Control Commands**\n\n"
|
"**DAGI Bridge — Control Commands**\n\n"
|
||||||
"`!runbook start <path> [node=NODA1]` — Start a runbook run\n"
|
"`!runbook start <path> [node=NODA1]` — Start a runbook run ✅\n"
|
||||||
"`!runbook next <run_id>` — Advance to next step\n"
|
"`!runbook next <run_id>` — Advance to next step ✅\n"
|
||||||
"`!runbook complete <run_id> step=<n> status=ok` — Mark step complete\n"
|
"`!runbook complete <run_id> step=<n> status=ok [notes=...]` — Mark step complete ✅\n"
|
||||||
"`!runbook evidence <run_id>` — Get evidence artifact path\n"
|
"`!runbook status <run_id>` — Show run status ✅\n"
|
||||||
"`!runbook status <run_id>` — Show current run state\n"
|
"`!runbook evidence <run_id>` — Generate release evidence ✅\n"
|
||||||
"`!status` — Bridge health summary\n"
|
"`!runbook post_review <run_id>` — Generate post-release review ✅\n"
|
||||||
|
"`!status` — Bridge health summary ✅\n"
|
||||||
|
"`!nodes` — Node policy overview ✅\n"
|
||||||
|
"`!node set room=!room:server node=NODA2` — Set room-node override ✅\n"
|
||||||
|
"`!node unset room=!room:server` — Remove room-node override ✅\n"
|
||||||
|
"`!node get room=!room:server` — Show current override ✅\n"
|
||||||
|
"`!node list` — List dynamic overrides (top 10) ✅\n"
|
||||||
|
"`!room agents set room=!X agents=sofiia,helion [default=sofiia]` — Set agent list ✅\n"
|
||||||
|
"`!room agents add room=!X agent=druid` — Add agent to room ✅\n"
|
||||||
|
"`!room agents remove room=!X agent=helion` — Remove agent from room ✅\n"
|
||||||
|
"`!room agents get room=!X` — Show current agent policy ✅\n"
|
||||||
|
"`!room agents list` — List all rooms with agent overrides ✅\n"
|
||||||
|
"`!room agents unset room=!X` — Remove all agent overrides for room ✅\n"
|
||||||
|
"`!policy export` — Export policy snapshot to file ✅\n"
|
||||||
|
"`!policy import path=<file> [mode=merge|replace] [dry_run=0]` — Import policy snapshot ✅\n"
|
||||||
"`!help` — This message\n\n"
|
"`!help` — This message\n\n"
|
||||||
"_Only authorised operators can issue control commands._"
|
"_Only authorised operators can issue control commands._"
|
||||||
)
|
)
|
||||||
|
|||||||
138
services/matrix-bridge-dagi/app/control_limiter.py
Normal file
138
services/matrix-bridge-dagi/app/control_limiter.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
"""
|
||||||
|
control_limiter — M3.4: Rate limiting + cooldown for Matrix control channel.
|
||||||
|
|
||||||
|
Protection layers:
|
||||||
|
1. Per-room sliding window — CONTROL_ROOM_RPM (default 60)
|
||||||
|
2. Per-operator sliding window — CONTROL_OPERATOR_RPM (default 30)
|
||||||
|
3. Per-run sliding window — CONTROL_RUN_NEXT_RPM (default 20, only !runbook next)
|
||||||
|
4. Per-operator cooldown — CONTROL_COOLDOWN_S (default 2s, anti-double-click)
|
||||||
|
|
||||||
|
All state is in-memory (lost on restart), which is intentional — limits reset with the bridge.
|
||||||
|
|
||||||
|
Thread safety: not needed (asyncio single-threaded event loop).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
# Sentinel value for "unlimited" (rpm == 0 → skip check)
|
||||||
|
_UNLIMITED = 0
|
||||||
|
|
||||||
|
|
||||||
|
class ControlRateLimiter:
|
||||||
|
"""
|
||||||
|
Sliding-window rate limiter + cooldown for the Matrix control channel.
|
||||||
|
|
||||||
|
All rpm values are requests-per-minute over a 60-second rolling window.
|
||||||
|
cooldown_s is a per-{operator, verb, subcommand} debounce window (anti-double-click).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
room_rpm: int = 60,
|
||||||
|
operator_rpm: int = 30,
|
||||||
|
run_next_rpm: int = 20,
|
||||||
|
cooldown_s: float = 2.0,
|
||||||
|
) -> None:
|
||||||
|
self.room_rpm = room_rpm
|
||||||
|
self.operator_rpm = operator_rpm
|
||||||
|
self.run_next_rpm = run_next_rpm
|
||||||
|
self.cooldown_s = cooldown_s
|
||||||
|
|
||||||
|
# Sliding-window storage: key → deque[float] (monotonic timestamps)
|
||||||
|
self._room_windows: Dict[str, deque] = defaultdict(deque)
|
||||||
|
self._op_windows: Dict[str, deque] = defaultdict(deque)
|
||||||
|
self._run_windows: Dict[str, deque] = defaultdict(deque)
|
||||||
|
|
||||||
|
# Cooldown: (sender_hash, verb, subcommand) → last accepted timestamp
|
||||||
|
self._cooldown_times: Dict[str, float] = {}
|
||||||
|
|
||||||
|
# ── Sliding window helpers ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _check_window(
|
||||||
|
windows: Dict[str, deque],
|
||||||
|
key: str,
|
||||||
|
rpm: int,
|
||||||
|
) -> Tuple[bool, float]:
|
||||||
|
"""
|
||||||
|
Sliding-window check over a 60-second window.
|
||||||
|
|
||||||
|
Returns (allowed, retry_after_seconds).
|
||||||
|
If rpm == 0, always allowed.
|
||||||
|
"""
|
||||||
|
if rpm == _UNLIMITED:
|
||||||
|
return True, 0.0
|
||||||
|
|
||||||
|
now = time.monotonic()
|
||||||
|
window = windows[key]
|
||||||
|
cutoff = now - 60.0
|
||||||
|
|
||||||
|
# Evict expired entries
|
||||||
|
while window and window[0] < cutoff:
|
||||||
|
window.popleft()
|
||||||
|
|
||||||
|
if len(window) >= rpm:
|
||||||
|
# Time until oldest entry expires
|
||||||
|
retry_after = max(0.0, 60.0 - (now - window[0]))
|
||||||
|
return False, retry_after
|
||||||
|
|
||||||
|
window.append(now)
|
||||||
|
return True, 0.0
|
||||||
|
|
||||||
|
# ── Public check methods ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def check_room(self, room_id: str) -> Tuple[bool, float]:
|
||||||
|
"""Per-room rate limit check. Returns (allowed, retry_after_s)."""
|
||||||
|
return self._check_window(self._room_windows, room_id, self.room_rpm)
|
||||||
|
|
||||||
|
def check_operator(self, sender_hash: str) -> Tuple[bool, float]:
|
||||||
|
"""Per-operator rate limit check. Returns (allowed, retry_after_s)."""
|
||||||
|
return self._check_window(self._op_windows, sender_hash, self.operator_rpm)
|
||||||
|
|
||||||
|
def check_run_next(self, run_id: str) -> Tuple[bool, float]:
|
||||||
|
"""
|
||||||
|
Per-run rate limit for !runbook next — prevents rapid-fire advancement.
|
||||||
|
Returns (allowed, retry_after_s).
|
||||||
|
"""
|
||||||
|
return self._check_window(self._run_windows, run_id, self.run_next_rpm)
|
||||||
|
|
||||||
|
def check_cooldown(
|
||||||
|
self,
|
||||||
|
sender_hash: str,
|
||||||
|
verb: str,
|
||||||
|
subcommand: str,
|
||||||
|
) -> Tuple[bool, float]:
|
||||||
|
"""
|
||||||
|
Anti-double-click cooldown per (operator, verb, subcommand).
|
||||||
|
|
||||||
|
Returns (allowed, wait_s). On first call → records timestamp and allows.
|
||||||
|
On subsequent calls within cooldown_s → blocks and returns remaining wait.
|
||||||
|
"""
|
||||||
|
if self.cooldown_s <= 0:
|
||||||
|
return True, 0.0
|
||||||
|
|
||||||
|
key = f"{sender_hash}:{verb}:{subcommand}"
|
||||||
|
now = time.monotonic()
|
||||||
|
last = self._cooldown_times.get(key)
|
||||||
|
|
||||||
|
if last is not None:
|
||||||
|
elapsed = now - last
|
||||||
|
if elapsed < self.cooldown_s:
|
||||||
|
return False, self.cooldown_s - elapsed
|
||||||
|
|
||||||
|
self._cooldown_times[key] = now
|
||||||
|
return True, 0.0
|
||||||
|
|
||||||
|
# ── Summary ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def as_health_dict(self) -> dict:
|
||||||
|
return {
|
||||||
|
"room_rpm": self.room_rpm,
|
||||||
|
"operator_rpm": self.operator_rpm,
|
||||||
|
"run_next_rpm": self.run_next_rpm,
|
||||||
|
"cooldown_s": self.cooldown_s,
|
||||||
|
}
|
||||||
296
services/matrix-bridge-dagi/app/control_runner.py
Normal file
296
services/matrix-bridge-dagi/app/control_runner.py
Normal file
@@ -0,0 +1,296 @@
|
|||||||
|
"""
|
||||||
|
control_runner — M3.1 + M3.2 + M3.3
|
||||||
|
|
||||||
|
Thin async HTTP client that calls the sofiia-console internal runbook API
|
||||||
|
on behalf of the Matrix bridge control channel.
|
||||||
|
|
||||||
|
All functions are stateless; callers supply the pre-built AsyncClient.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Runbook path guards (fail-fast in the bridge, before calling the console)
# Maximum accepted length for an operator-supplied runbook path.
_MAX_PATH_LEN = 256
# Segments that indicate traversal or home-dir expansion attempts.
_FORBIDDEN_SEGMENTS = {"..", "~"}


class RunnerError(Exception):
    """Raised when the sofiia-console returns an error or call fails."""


def validate_runbook_path(path: str) -> Optional[str]:
    """
    Validate an operator-supplied runbook path before calling the console.

    Returns None when the path is acceptable, or a short error string
    describing the first problem found.

    Checks: non-empty, max length, no absolute paths (either separator
    style, including drive letters), no traversal segments ("..", "~").
    """
    path = path.strip()
    if not path:
        return "runbook_path is required"
    if len(path) > _MAX_PATH_LEN:
        return f"runbook_path too long (max {_MAX_PATH_LEN} chars)"
    # Normalize separators BEFORE the absolute-path check so Windows-style
    # paths ("\\etc\\x", "C:\\x") cannot slip past the "/" prefix guard
    # (previously the check ran on the raw path and missed backslashes).
    normalized = path.replace("\\", "/")
    if normalized.startswith("/"):
        return "absolute paths are not allowed"
    # Drive-letter absolute paths (e.g. "C:/runbooks/x") are absolute too.
    if len(normalized) >= 2 and normalized[1] == ":" and normalized[0].isalpha():
        return "absolute paths are not allowed"
    for segment in normalized.split("/"):
        if segment in _FORBIDDEN_SEGMENTS:
            return f"forbidden path segment: {segment!r}"
    return None
|
||||||
|
|
||||||
|
|
||||||
|
async def start_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    runbook_path: str,
    operator_id: str,
    node_id: str = "NODA1",
    timeout: float = 15.0,
) -> dict:
    """
    Start a runbook run via POST /api/runbooks/internal/runs.

    Returns the console payload: {run_id, status, current_step, steps_total}.
    Raises RunnerError on connection failure, any non-200 status, or a
    response body that is not valid JSON.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs"
    body = {
        "runbook_path": runbook_path,
        "operator_id": operator_id,
        "node_id": node_id,
    }

    try:
        resp = await http_client.post(
            endpoint,
            json=body,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_error_detail(resp: httpx.Response) -> str:
|
||||||
|
"""Extract a short error detail from an httpx response (safe: never raises)."""
|
||||||
|
try:
|
||||||
|
body = resp.json()
|
||||||
|
if isinstance(body, dict) and body.get("detail"):
|
||||||
|
return str(body["detail"])[:200]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return (resp.text or "")[:200]
|
||||||
|
except Exception:
|
||||||
|
return "<no detail>"
|
||||||
|
|
||||||
|
|
||||||
|
async def get_runbook_run(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 10.0,
) -> dict:
    """
    Fetch one run (with its steps) via GET /api/runbooks/internal/runs/{run_id}.

    Raises RunnerError on connection failure, unknown run (404), any other
    non-200 status, or an unparseable body.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}"

    try:
        resp = await http_client.get(
            endpoint,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
# ── M3.2 ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def next_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    operator_id: str = "",
    timeout: float = 30.0,
) -> dict:
    """
    Advance a run via POST /api/runbooks/internal/runs/{run_id}/next.

    The console answers with either a manual step
    {type:"manual", step_index, title, section, instructions_md, steps_total?}
    or an executed check
    {type:"http_check"|"script", step_index, title, result, step_status,
    next_step, completed}.

    Raises RunnerError on connection failure, 404 (run missing or not
    active), any other non-200 status, or an unparseable body.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/next"
    # operator_id is optional on the console side; omit it entirely when blank.
    body = {"operator_id": operator_id} if operator_id else {}

    try:
        resp = await http_client.post(
            endpoint,
            json=body,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"run not found or not active: {_extract_error_detail(resp)}")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
async def complete_runbook_step(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    step_index: int,
    status: str,
    notes: str = "",
    operator_id: str = "",
    timeout: float = 15.0,
) -> dict:
    """
    Record a step outcome via
    POST /api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete.

    Returns: {ok, run_id, step_index, status, next_step, steps_total, run_completed}

    Raises RunnerError on connection failure, 404 (run/step missing or not
    the current step), any other non-200 status, or an unparseable body.
    """
    base = console_url.rstrip("/")
    endpoint = f"{base}/api/runbooks/internal/runs/{run_id}/steps/{step_index}/complete"

    body: dict = {"status": status}
    # Optional fields are omitted from the payload when blank.
    if notes:
        body["notes"] = notes
    if operator_id:
        body["operator_id"] = operator_id

    try:
        resp = await http_client.post(
            endpoint,
            json=body,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"step not found or not current: {_extract_error_detail(resp)}")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
# ── M3.3 ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def generate_evidence(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    Trigger evidence generation via
    POST /api/runbooks/internal/runs/{run_id}/evidence.

    Returns: {evidence_path, bytes, created_at, run_id}

    Raises RunnerError on connection failure, unknown run (404), any other
    non-200 status, or an unparseable body.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/evidence"

    try:
        resp = await http_client.post(
            endpoint,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_post_review(
    http_client: httpx.AsyncClient,
    console_url: str,
    control_token: str,
    run_id: str,
    timeout: float = 30.0,
) -> dict:
    """
    Trigger post-review generation via
    POST /api/runbooks/internal/runs/{run_id}/post_review.

    Returns: {path, bytes, created_at, run_id}

    Raises RunnerError on connection failure, unknown run (404), any other
    non-200 status, or an unparseable body.
    """
    endpoint = f"{console_url.rstrip('/')}/api/runbooks/internal/runs/{run_id}/post_review"

    try:
        resp = await http_client.post(
            endpoint,
            headers={"X-Control-Token": control_token},
            timeout=timeout,
        )
    except httpx.RequestError as exc:
        raise RunnerError(f"connection error: {exc}") from exc

    if resp.status_code == 404:
        raise RunnerError(f"run {run_id!r} not found")
    if resp.status_code != 200:
        raise RunnerError(f"HTTP {resp.status_code}: {_extract_error_detail(resp)}")

    try:
        return resp.json()
    except Exception as exc:
        raise RunnerError(f"invalid JSON response: {exc}") from exc
|
||||||
210
services/matrix-bridge-dagi/app/discovery.py
Normal file
210
services/matrix-bridge-dagi/app/discovery.py
Normal file
@@ -0,0 +1,210 @@
|
|||||||
|
"""
|
||||||
|
discovery — M4.0: Agent discovery helpers for Matrix user rooms.
|
||||||
|
|
||||||
|
Provides formatted replies for `!agents` and `!agents status` commands.
|
||||||
|
These commands are available to all room members (no auth required) and
|
||||||
|
are processed BEFORE routing to the LLM agent.
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- Mixed rooms: list all agents, default, usage examples
|
||||||
|
- Direct rooms: show single agent mapping
|
||||||
|
- Unknown rooms: "no mapping" notice
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from .mixed_routing import MixedRoomConfig
|
||||||
|
from .room_mapping import RoomMappingConfig # noqa: F401 — used in type hints
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_ts(ts: int) -> str:
|
||||||
|
"""Format a Unix timestamp as compact UTC string."""
|
||||||
|
try:
|
||||||
|
return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
return str(ts)
|
||||||
|
|
||||||
|
# Discovery command prefix
|
||||||
|
DISCOVERY_CMD = "!agents"
|
||||||
|
|
||||||
|
# Reply length cap (Matrix message, not truncated — kept short by design)
|
||||||
|
_MAX_REPLY_LEN = 3500
|
||||||
|
|
||||||
|
|
||||||
|
def is_discovery_message(text: str) -> bool:
|
||||||
|
"""Return True if the message is a !agents discovery command."""
|
||||||
|
lower = text.strip().lower()
|
||||||
|
return lower == DISCOVERY_CMD or lower.startswith(DISCOVERY_CMD + " ")
|
||||||
|
|
||||||
|
|
||||||
|
def agents_reply(
    room_id: str,
    room_map: RoomMappingConfig,
    mixed_room_config: Optional[MixedRoomConfig],
) -> str:
    """
    Compose the `!agents` discovery reply for a room.

    Resolution order: mixed room (multi-agent listing) → direct mapping
    (single agent) → no-mapping notice.
    """
    # Mixed rooms take precedence over any direct mapping.
    if mixed_room_config and mixed_room_config.is_mixed(room_id):
        mixed = mixed_room_config.rooms.get(room_id)
        if mixed is not None:
            return _mixed_room_reply(room_id, mixed)

    direct_agent = room_map.agent_for_room(room_id)
    if direct_agent is None:
        return _unknown_room_reply()
    return _direct_room_reply(direct_agent)
|
||||||
|
|
||||||
|
|
||||||
|
def _mixed_room_reply(room_id: str, room) -> str:
|
||||||
|
"""Format reply for a mixed room."""
|
||||||
|
agents = room.agents
|
||||||
|
default = room.default_agent or (agents[0] if agents else "?")
|
||||||
|
agent_list = ", ".join(f"**{a}**" for a in agents)
|
||||||
|
|
||||||
|
lines = [
|
||||||
|
f"🤖 **Agents available in this room:** {agent_list}",
|
||||||
|
f"⭐ **Default:** {default}",
|
||||||
|
"",
|
||||||
|
"**How to address an agent:**",
|
||||||
|
]
|
||||||
|
for agent in agents[:5]: # show max 5 examples
|
||||||
|
lines.append(f" • `/{agent} <message>` — slash command")
|
||||||
|
lines.append(f" • `@{agent} <message>` — mention")
|
||||||
|
lines.append(f" • `{agent}: <message>` — colon prefix")
|
||||||
|
lines.extend([
|
||||||
|
"",
|
||||||
|
f"_Messages without prefix go to **{default}** by default._",
|
||||||
|
])
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _direct_room_reply(agent_id: str) -> str:
|
||||||
|
"""Format reply for a directly-mapped room (1 agent)."""
|
||||||
|
return (
|
||||||
|
f"🤖 This room is mapped to agent: **{agent_id}**\n\n"
|
||||||
|
f"All messages are forwarded to **{agent_id}** automatically.\n"
|
||||||
|
f"No prefix needed — just write your message."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _unknown_room_reply() -> str:
|
||||||
|
"""Format reply when room has no mapping."""
|
||||||
|
return (
|
||||||
|
"⚠️ This room has no agent mapping.\n\n"
|
||||||
|
"Contact an operator to configure an agent for this room."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Bridge status reply (M4.1) ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def bridge_status_reply(snapshot: dict) -> str:
    """
    Format a concise bridge health snapshot for `!status` in control room.

    snapshot keys (all optional with defaults):
      node_id, queue_size, queue_max, worker_count,
      room_count, mixed_room_count, operators_count,
      control_safety (dict), persistent_dedupe (dict),
      dedupe_hits, dedupe_inserts

    Sections with missing/empty snapshot data are simply omitted; the final
    reply is capped at _MAX_REPLY_LEN characters.
    """
    node_id = snapshot.get("node_id", "?")
    q_size = snapshot.get("queue_size", "?")
    q_max = snapshot.get("queue_max", "?")
    workers = snapshot.get("worker_count", "?")
    rooms = snapshot.get("room_count", 0)
    mixed = snapshot.get("mixed_room_count", 0)
    ops = snapshot.get("operators_count", 0)

    # `or {}` guards against explicit None values in the snapshot.
    safety = snapshot.get("control_safety") or {}
    dedupe = snapshot.get("persistent_dedupe") or {}

    node_policy = snapshot.get("node_policy") or {}
    default_node = node_policy.get("default_node", node_id)
    allowed_nodes = node_policy.get("allowed_nodes") or []
    room_overrides = node_policy.get("room_overrides", 0)

    lines = [
        f"📡 **Bridge status** — node: `{node_id}`",
        "",
        f"**Queue:** {q_size}/{q_max} workers: {workers}",
        f"**Rooms:** {rooms} direct {mixed} mixed ops: {ops} operators",
        "",
    ]

    # M5.0: node policy (only shown when an allow-list is configured)
    if allowed_nodes:
        allowed_str = ", ".join(f"`{n}`" for n in sorted(allowed_nodes))
        lines.append(
            f"**Node policy:** default=`{default_node}` allowed={allowed_str} room_overrides={room_overrides}"
        )

    # Control safety (rate limits + cooldown summary)
    if safety:
        enabled = "✅" if safety.get("enabled") else "⬜"
        lines.append(
            f"**Control safety {enabled}:** "
            f"room={safety.get('room_rpm', '?')}rpm "
            f"op={safety.get('operator_rpm', '?')}rpm "
            f"cooldown={safety.get('cooldown_s', '?')}s"
        )

    # Persistent dedupe (SQLite store health)
    if dedupe:
        ok_emoji = "✅" if dedupe.get("ok") else "❌"
        pruned = dedupe.get("pruned_rows_last", 0)
        ttl = dedupe.get("ttl_h", "?")
        lines.append(
            f"**Dedupe {ok_emoji}:** ttl={ttl}h pruned_last={pruned} "
            f"db=`{dedupe.get('db_path') or 'n/a'}`"
        )

    # M6.0/M6.1: policy store status
    ps = snapshot.get("policy_store") or {}
    if ps:
        ps_ok = "✅" if ps.get("ok") else "❌"
        ps_node_count = ps.get("overrides_count", 0)
        # Fall back to the flat snapshot key for older payloads.
        ps_agent_count = ps.get("agent_overrides_count", snapshot.get("policy_agent_overrides_count", 0))
        ps_path = ps.get("policy_store_path") or ps.get("path") or "n/a"
        lines.append(
            f"**Policy store {ps_ok}:** node_overrides={ps_node_count} "
            f"agent_overrides={ps_agent_count} db=`{ps_path}`"
        )

    # M6.2: last export/import timestamps + DB mtime (each optional)
    _last_export = snapshot.get("policy_last_export_at")
    _last_import = snapshot.get("policy_last_import_at")
    _db_mtime = snapshot.get("policy_db_mtime")
    _snap_parts: list = []
    if _last_export:
        _snap_parts.append(f"last_export=`{_fmt_ts(_last_export)}`")
    if _last_import:
        _snap_parts.append(f"last_import=`{_fmt_ts(_last_import)}`")
    if _db_mtime:
        _snap_parts.append(f"db_mtime=`{_fmt_ts(_db_mtime)}`")
    if _snap_parts:
        lines.append("**Policy snapshots:** " + " ".join(_snap_parts))

    # M5.1: per-node routed/rejected breakdown, sorted for stable output
    node_stats = snapshot.get("nodes") or {}
    if node_stats:
        lines.append("\n**Per-node stats:**")
        for nid in sorted(node_stats):
            ns = node_stats[nid]
            lines.append(
                f" `{nid}`: routed={ns.get('routed', 0)} rejected={ns.get('rejected', 0)}"
            )

    reply = "\n".join(lines)
    # Hard cap: truncate with an ellipsis rather than failing to send.
    if len(reply) > _MAX_REPLY_LEN:
        reply = reply[:_MAX_REPLY_LEN - 3] + "…"
    return reply
|
||||||
213
services/matrix-bridge-dagi/app/event_store.py
Normal file
213
services/matrix-bridge-dagi/app/event_store.py
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
"""
|
||||||
|
event_store — M2.3: Persistent event deduplication via SQLite.
|
||||||
|
|
||||||
|
Stores processed Matrix event_ids so that bridge restarts do not reprocess
|
||||||
|
events still returned by /sync (within TTL window).
|
||||||
|
|
||||||
|
Schema:
|
||||||
|
processed_events (room_id, event_id, ts, sender_hash)
|
||||||
|
PK: (room_id, event_id)
|
||||||
|
IDX: idx_processed_events_ts (ts)
|
||||||
|
|
||||||
|
Design notes:
|
||||||
|
- Uses aiosqlite for non-blocking async access from the ingress event loop.
|
||||||
|
- Prune is best-effort: failures are logged but do NOT abort processing.
|
||||||
|
- If the DB is unavailable (init error, corruption), EventStore degrades to
|
||||||
|
a no-op: is_processed() returns False, mark_processed() is a no-op.
|
||||||
|
The in-memory LRU dedupe (H1) continues to protect within a single run.
|
||||||
|
- WAL mode is enabled for better concurrent read performance.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
try:
|
||||||
|
import aiosqlite
|
||||||
|
_AIOSQLITE_OK = True
|
||||||
|
except ImportError: # pragma: no cover
|
||||||
|
aiosqlite = None # type: ignore
|
||||||
|
_AIOSQLITE_OK = False
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_SCHEMA = """
|
||||||
|
CREATE TABLE IF NOT EXISTS processed_events (
|
||||||
|
room_id TEXT NOT NULL,
|
||||||
|
event_id TEXT NOT NULL,
|
||||||
|
ts INTEGER NOT NULL,
|
||||||
|
sender_hash TEXT,
|
||||||
|
PRIMARY KEY (room_id, event_id)
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_processed_events_ts ON processed_events (ts);
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class EventStore:
    """
    Async SQLite-backed deduplication store for Matrix event_ids.

    Usage:
        store = EventStore("/app/data/matrix_bridge.db", ttl_h=48)
        await store.open()
        ...
        hit = await store.is_processed(room_id, event_id)
        if not hit:
            await store.mark_processed(room_id, event_id, sender_hash)
        ...
        pruned = await store.prune(batch=5000)
        await store.close()

    All operations degrade to safe no-ops when the DB is unavailable.
    """

    def __init__(
        self,
        db_path: str,
        ttl_h: int = 48,
        prune_batch: int = 5000,
    ) -> None:
        # Filesystem location of the SQLite database file.
        self.db_path = db_path
        # Rows older than this many hours become eligible for pruning.
        self.ttl_h = ttl_h
        # Default max rows deleted per prune() call (bounds lock time).
        self.prune_batch = prune_batch
        self._db: Optional["aiosqlite.Connection"] = None
        # False → degraded mode: every operation becomes a no-op.
        self._ok: bool = False
        self._last_prune_at: Optional[float] = None
        self._pruned_rows_last: int = 0

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    async def open(self) -> bool:
        """
        Open the SQLite connection and apply schema.

        Returns True on success; False on failure (degraded mode).
        """
        if not _AIOSQLITE_OK:
            logger.warning("aiosqlite not available — persistent dedupe disabled")
            return False
        try:
            Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
            self._db = await aiosqlite.connect(self.db_path)
            # WAL mode: better concurrent read, non-blocking writes
            await self._db.execute("PRAGMA journal_mode=WAL")
            await self._db.execute("PRAGMA synchronous=NORMAL")
            await self._db.executescript(_SCHEMA)
            await self._db.commit()
            self._ok = True
            logger.info("EventStore opened: %s (ttl_h=%d)", self.db_path, self.ttl_h)
            return True
        except Exception as exc:
            logger.error("EventStore.open failed — degraded: %s", exc)
            self._ok = False
            return False

    async def close(self) -> None:
        """Close the SQLite connection gracefully and enter degraded mode."""
        if self._db is not None:
            try:
                await self._db.close()
            except Exception as exc:  # pragma: no cover
                logger.warning("EventStore.close error: %s", exc)
        self._db = None
        self._ok = False

    # ── Core operations ───────────────────────────────────────────────────────

    async def is_processed(self, room_id: str, event_id: str) -> bool:
        """
        Return True if (room_id, event_id) has already been processed.

        Safe to call even when degraded (returns False → no false deduplication).
        """
        if not self._ok or self._db is None:
            return False
        try:
            async with self._db.execute(
                "SELECT 1 FROM processed_events WHERE room_id=? AND event_id=? LIMIT 1",
                (room_id, event_id),
            ) as cursor:
                row = await cursor.fetchone()
                return row is not None
        except Exception as exc:
            logger.warning("EventStore.is_processed error (degraded): %s", exc)
            return False

    async def mark_processed(
        self,
        room_id: str,
        event_id: str,
        sender_hash: str = "",
    ) -> bool:
        """
        Insert (room_id, event_id) as processed.

        Returns True when a new row was inserted; False when the row already
        existed, when degraded, or on error. Uses INSERT OR IGNORE so
        duplicate inserts never raise.
        """
        if not self._ok or self._db is None:
            return False
        ts = int(time.time())
        try:
            cursor = await self._db.execute(
                "INSERT OR IGNORE INTO processed_events (room_id, event_id, ts, sender_hash) "
                "VALUES (?, ?, ?, ?)",
                (room_id, event_id, ts, sender_hash or None),
            )
            await self._db.commit()
            # Fix: rowcount is 0 when OR IGNORE skipped an existing row, so the
            # return value now matches the documented "False if already exists"
            # contract (previously this returned True unconditionally).
            return cursor.rowcount > 0
        except Exception as exc:
            logger.warning("EventStore.mark_processed error (degraded): %s", exc)
            return False

    # ── Prune ─────────────────────────────────────────────────────────────────

    async def prune(self, batch: Optional[int] = None) -> int:
        """
        Delete events older than ttl_h.

        Returns the number of rows deleted (0 on error or degraded).
        Uses LIMIT batch to avoid long locks on large tables.
        """
        if not self._ok or self._db is None:
            return 0

        cutoff = int(time.time()) - self.ttl_h * 3600
        effective_batch = batch or self.prune_batch
        deleted = 0

        try:
            # SQLite DELETE with LIMIT requires compiling with
            # SQLITE_ENABLE_UPDATE_DELETE_LIMIT, which may not be available.
            # Use a rowid subquery instead.
            cursor = await self._db.execute(
                "DELETE FROM processed_events "
                "WHERE rowid IN ("
                " SELECT rowid FROM processed_events WHERE ts < ? LIMIT ?"
                ")",
                (cutoff, effective_batch),
            )
            await self._db.commit()
            # Fix: read the DELETE cursor's own rowcount instead of issuing a
            # separate "SELECT changes()" after COMMIT — the old approach could
            # report the count of a different interleaved statement on this
            # connection.
            deleted = max(cursor.rowcount, 0)
            self._last_prune_at = time.time()
            self._pruned_rows_last = deleted
            if deleted:
                logger.info("EventStore pruned %d rows (cutoff=%d)", deleted, cutoff)
        except Exception as exc:
            logger.warning("EventStore.prune error: %s", exc)

        return deleted

    # ── Health / introspection ─────────────────────────────────────────────────

    def as_health_dict(self) -> dict:
        """Snapshot of store state for the /health endpoint."""
        return {
            "enabled": self._ok,
            "db_path": self.db_path,
            "ttl_h": self.ttl_h,
            "ok": self._ok,
            "last_prune_at": self._last_prune_at,
            "pruned_rows_last": self._pruned_rows_last,
        }
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -33,6 +33,9 @@ except ImportError: # pragma: no cover
|
|||||||
|
|
||||||
from .config import BridgeConfig, load_config
|
from .config import BridgeConfig, load_config
|
||||||
from .control import ControlConfig, parse_control_config
|
from .control import ControlConfig, parse_control_config
|
||||||
|
from .control_limiter import ControlRateLimiter
|
||||||
|
from .event_store import EventStore
|
||||||
|
from .node_policy import parse_node_policy
|
||||||
from .ingress import MatrixIngressLoop
|
from .ingress import MatrixIngressLoop
|
||||||
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
|
from .mixed_routing import MixedRoomConfig, parse_mixed_room_map
|
||||||
from .rate_limit import InMemoryRateLimiter
|
from .rate_limit import InMemoryRateLimiter
|
||||||
@@ -69,7 +72,7 @@ if _PROM_OK:
|
|||||||
_invoke_latency = Histogram(
|
_invoke_latency = Histogram(
|
||||||
"matrix_bridge_invoke_duration_seconds",
|
"matrix_bridge_invoke_duration_seconds",
|
||||||
"Latency of DAGI Router infer call",
|
"Latency of DAGI Router infer call",
|
||||||
["agent_id"],
|
["agent_id", "node_id"], # M5.1: per-node latency breakdown
|
||||||
buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
|
buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 45.0],
|
||||||
)
|
)
|
||||||
_send_latency = Histogram(
|
_send_latency = Histogram(
|
||||||
@@ -80,7 +83,8 @@ if _PROM_OK:
|
|||||||
)
|
)
|
||||||
_bridge_up = Gauge(
|
_bridge_up = Gauge(
|
||||||
"matrix_bridge_up",
|
"matrix_bridge_up",
|
||||||
"1 if bridge started successfully",
|
"1 if bridge started successfully; 0 on config error",
|
||||||
|
["node_id"], # M7.1: per-node label for multi-node deployments
|
||||||
)
|
)
|
||||||
_rate_limiter_active_rooms = Gauge(
|
_rate_limiter_active_rooms = Gauge(
|
||||||
"matrix_bridge_rate_limiter_active_rooms",
|
"matrix_bridge_rate_limiter_active_rooms",
|
||||||
@@ -106,10 +110,11 @@ if _PROM_OK:
|
|||||||
["agent_id"],
|
["agent_id"],
|
||||||
buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
|
buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 30.0],
|
||||||
)
|
)
|
||||||
# M2.2: Mixed room routing metrics
|
# M2.2: Mixed room routing — reason breakdown (slash/mention/name/default/direct)
|
||||||
_routed_total = Counter(
|
# M7.1: Renamed from matrix_bridge_routed_total to avoid collision with M5.0 counter
|
||||||
"matrix_bridge_routed_total",
|
_routing_reasons_total = Counter(
|
||||||
"Successful message routing by reason",
|
"matrix_bridge_routing_reasons_total",
|
||||||
|
"Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
|
||||||
["agent_id", "reason"],
|
["agent_id", "reason"],
|
||||||
)
|
)
|
||||||
_route_rejected_total = Counter(
|
_route_rejected_total = Counter(
|
||||||
@@ -127,20 +132,74 @@ if _PROM_OK:
|
|||||||
"Total control commands received from authorized operators",
|
"Total control commands received from authorized operators",
|
||||||
["sender", "verb", "subcommand"],
|
["sender", "verb", "subcommand"],
|
||||||
)
|
)
|
||||||
|
_control_rate_limited_total = Counter(
|
||||||
|
"matrix_bridge_control_rate_limited_total",
|
||||||
|
"Total control commands rejected by rate limiter or cooldown",
|
||||||
|
["scope"],
|
||||||
|
)
|
||||||
|
_dedupe_persistent_hits_total = Counter(
|
||||||
|
"matrix_bridge_dedupe_persistent_hits_total",
|
||||||
|
"Total events dropped by persistent (SQLite) deduplication",
|
||||||
|
["room_id"],
|
||||||
|
)
|
||||||
|
_dedupe_persistent_inserts_total = Counter(
|
||||||
|
"matrix_bridge_dedupe_persistent_inserts_total",
|
||||||
|
"Total events marked as processed in persistent dedupe store",
|
||||||
|
)
|
||||||
|
# M5.0: node-aware routing — primary routed counter (unique name, no collision)
|
||||||
|
_routed_total = Counter(
|
||||||
|
"matrix_bridge_routed_total",
|
||||||
|
"Total messages successfully routed, by agent, resolved node, and node source",
|
||||||
|
["agent_id", "node_id", "source"],
|
||||||
|
)
|
||||||
|
_node_rejected_total = Counter(
|
||||||
|
"matrix_bridge_node_rejected_total",
|
||||||
|
"Total messages with rejected (non-allowlisted) node kwarg",
|
||||||
|
["node_id"],
|
||||||
|
)
|
||||||
|
# M8.0: soft-failover metrics
|
||||||
|
_failover_total = Counter(
|
||||||
|
"matrix_bridge_failover_total",
|
||||||
|
"Total successful soft-failovers by node transition and reason",
|
||||||
|
["from_node", "to_node", "reason"],
|
||||||
|
)
|
||||||
|
_node_health_state = Gauge(
|
||||||
|
"matrix_bridge_node_health_state",
|
||||||
|
"Node health state: 1=healthy 0.5=degraded 0=down",
|
||||||
|
["node_id"],
|
||||||
|
)
|
||||||
|
# M8.1: sticky routing metrics
|
||||||
|
_sticky_set_total = Counter(
|
||||||
|
"matrix_bridge_sticky_node_total",
|
||||||
|
"Total sticky routing entries set after failover, by preferred node and scope",
|
||||||
|
["node_id", "scope"],
|
||||||
|
)
|
||||||
|
_sticky_active = Gauge(
|
||||||
|
"matrix_bridge_sticky_node_active",
|
||||||
|
"Current count of active sticky routing entries",
|
||||||
|
[],
|
||||||
|
)
|
||||||
|
|
||||||
# ── Startup state ─────────────────────────────────────────────────────────────
|
# ── Startup state ─────────────────────────────────────────────────────────────
|
||||||
_START_TIME = time.monotonic()
|
_START_TIME = time.monotonic()
|
||||||
_cfg: Optional[BridgeConfig] = None
|
_cfg: Optional[BridgeConfig] = None
|
||||||
|
# M5.1: in-memory per-node counters (lightweight, for !status reply)
|
||||||
|
from collections import defaultdict as _defaultdict
|
||||||
|
_node_stats: Dict[str, Dict[str, int]] = _defaultdict(lambda: {"routed": 0, "rejected": 0})
|
||||||
_config_error: Optional[str] = None
|
_config_error: Optional[str] = None
|
||||||
_matrix_reachable: Optional[bool] = None
|
_matrix_reachable: Optional[bool] = None
|
||||||
_gateway_reachable: Optional[bool] = None
|
_gateway_reachable: Optional[bool] = None
|
||||||
_room_map: Optional[RoomMappingConfig] = None
|
_room_map: Optional[RoomMappingConfig] = None
|
||||||
_mixed_room_config: Optional[MixedRoomConfig] = None
|
_mixed_room_config: Optional[MixedRoomConfig] = None
|
||||||
_control_config: Optional[ControlConfig] = None
|
_control_config: Optional[ControlConfig] = None
|
||||||
|
_event_store: Optional[EventStore] = None
|
||||||
_rate_limiter: Optional[InMemoryRateLimiter] = None
|
_rate_limiter: Optional[InMemoryRateLimiter] = None
|
||||||
_ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size
|
_ingress_loop: Optional["MatrixIngressLoop"] = None # for /health queue_size
|
||||||
_ingress_task: Optional[asyncio.Task] = None
|
_ingress_task: Optional[asyncio.Task] = None
|
||||||
_ingress_stop: Optional[asyncio.Event] = None
|
_ingress_stop: Optional[asyncio.Event] = None
|
||||||
|
_sticky_cache: Optional[Any] = None # M8.1: StickyNodeCache instance
|
||||||
|
_confirm_store: Optional[Any] = None # M9.0: ConfirmStore instance
|
||||||
|
_dummy_http_client: Optional[Any] = None # M11: soak inject endpoint (debug only)
|
||||||
|
|
||||||
|
|
||||||
async def _probe_url(url: str, timeout: float = 5.0) -> bool:
|
async def _probe_url(url: str, timeout: float = 5.0) -> bool:
|
||||||
@@ -230,7 +289,7 @@ async def lifespan(app_: Any):
|
|||||||
else:
|
else:
|
||||||
logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
|
logger.warning("⚠️ DAGI Gateway NOT reachable: %s", _cfg.dagi_gateway_url)
|
||||||
if _PROM_OK:
|
if _PROM_OK:
|
||||||
_bridge_up.set(1)
|
_bridge_up.labels(node_id=_cfg.node_id or "").set(1) # M7.1: labeled
|
||||||
|
|
||||||
# Start ingress loop (fire-and-forget asyncio task)
|
# Start ingress loop (fire-and-forget asyncio task)
|
||||||
_has_rooms = (_room_map and _room_map.total_mappings > 0) or (
|
_has_rooms = (_room_map and _room_map.total_mappings > 0) or (
|
||||||
@@ -263,9 +322,9 @@ async def lifespan(app_: Any):
|
|||||||
_rate_limiter_active_rooms.set(stats["active_rooms"])
|
_rate_limiter_active_rooms.set(stats["active_rooms"])
|
||||||
_rate_limiter_active_senders.set(stats["active_senders"])
|
_rate_limiter_active_senders.set(stats["active_senders"])
|
||||||
|
|
||||||
def _on_invoke_latency(agent_id: str, duration_s: float) -> None:
|
def _on_invoke_latency(agent_id: str, duration_s: float, node_id: str = "") -> None:
|
||||||
if _PROM_OK:
|
if _PROM_OK:
|
||||||
_invoke_latency.labels(agent_id=agent_id).observe(duration_s)
|
_invoke_latency.labels(agent_id=agent_id, node_id=node_id or "unknown").observe(duration_s)
|
||||||
|
|
||||||
def _on_send_latency(agent_id: str, duration_s: float) -> None:
|
def _on_send_latency(agent_id: str, duration_s: float) -> None:
|
||||||
if _PROM_OK:
|
if _PROM_OK:
|
||||||
@@ -287,7 +346,7 @@ async def lifespan(app_: Any):
|
|||||||
# M2.2 callbacks
|
# M2.2 callbacks
|
||||||
def _on_routed(agent_id: str, reason: str) -> None:
|
def _on_routed(agent_id: str, reason: str) -> None:
|
||||||
if _PROM_OK:
|
if _PROM_OK:
|
||||||
_routed_total.labels(agent_id=agent_id, reason=reason).inc()
|
_routing_reasons_total.labels(agent_id=agent_id, reason=reason).inc() # M7.1: renamed
|
||||||
|
|
||||||
def _on_route_rejected(room_id: str, reason: str) -> None:
|
def _on_route_rejected(room_id: str, reason: str) -> None:
|
||||||
if _PROM_OK:
|
if _PROM_OK:
|
||||||
@@ -300,6 +359,164 @@ async def lifespan(app_: Any):
|
|||||||
sender=sender, verb=verb, subcommand=subcommand
|
sender=sender, verb=verb, subcommand=subcommand
|
||||||
).inc()
|
).inc()
|
||||||
|
|
||||||
|
# M3.4: control safety rate limiter
|
||||||
|
_control_limiter = ControlRateLimiter(
|
||||||
|
room_rpm=_cfg.control_room_rpm,
|
||||||
|
operator_rpm=_cfg.control_operator_rpm,
|
||||||
|
run_next_rpm=_cfg.control_run_next_rpm,
|
||||||
|
cooldown_s=_cfg.control_cooldown_s,
|
||||||
|
) if _control_config and _control_config.is_enabled else None
|
||||||
|
|
||||||
|
def _on_control_rate_limited(scope: str) -> None:
|
||||||
|
if _PROM_OK:
|
||||||
|
_control_rate_limited_total.labels(scope=scope).inc()
|
||||||
|
|
||||||
|
# M2.3: Persistent event deduplication
|
||||||
|
_prune_task: Optional[asyncio.Task] = None
|
||||||
|
if _cfg.persistent_dedupe:
|
||||||
|
import os
|
||||||
|
db_path = os.path.join(_cfg.bridge_data_dir, "matrix_bridge.db")
|
||||||
|
_event_store = EventStore(
|
||||||
|
db_path=db_path,
|
||||||
|
ttl_h=_cfg.processed_events_ttl_h,
|
||||||
|
prune_batch=_cfg.processed_events_prune_batch,
|
||||||
|
)
|
||||||
|
store_ok = await _event_store.open()
|
||||||
|
if store_ok:
|
||||||
|
logger.info(
|
||||||
|
"✅ Persistent dedupe: %s (ttl_h=%d)",
|
||||||
|
db_path, _cfg.processed_events_ttl_h,
|
||||||
|
)
|
||||||
|
# Best-effort prune on startup
|
||||||
|
pruned = await _event_store.prune()
|
||||||
|
if pruned:
|
||||||
|
logger.info("Startup prune removed %d stale events", pruned)
|
||||||
|
# Periodic prune task
|
||||||
|
if _cfg.processed_events_prune_interval_s > 0:
|
||||||
|
async def _prune_loop() -> None:
|
||||||
|
while True:
|
||||||
|
await asyncio.sleep(_cfg.processed_events_prune_interval_s)
|
||||||
|
if _event_store:
|
||||||
|
await _event_store.prune()
|
||||||
|
_prune_task = asyncio.create_task(_prune_loop(), name="event_store_prune")
|
||||||
|
else:
|
||||||
|
logger.warning("⚠️ EventStore init failed — persistent dedupe disabled (degraded)")
|
||||||
|
_event_store = None
|
||||||
|
else:
|
||||||
|
logger.info("Persistent dedupe disabled (PERSISTENT_DEDUPE=0)")
|
||||||
|
|
||||||
|
def _on_dedupe_hit(room_id: str, agent_id: str) -> None:
|
||||||
|
if _PROM_OK:
|
||||||
|
_dedupe_persistent_hits_total.labels(room_id=room_id).inc()
|
||||||
|
|
||||||
|
def _on_dedupe_insert() -> None:
|
||||||
|
if _PROM_OK:
|
||||||
|
_dedupe_persistent_inserts_total.inc()
|
||||||
|
|
||||||
|
# M5.0: node-aware routing policy
|
||||||
|
_node_policy = parse_node_policy(
|
||||||
|
raw_allowed=_cfg.bridge_allowed_nodes,
|
||||||
|
default_node=_cfg.bridge_default_node,
|
||||||
|
raw_room_map=_cfg.bridge_room_node_map,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"✅ Node policy: default=%s allowed=%s room_overrides=%d",
|
||||||
|
_node_policy.default_node,
|
||||||
|
sorted(_node_policy.allowed_nodes),
|
||||||
|
len(_node_policy.room_node_map),
|
||||||
|
)
|
||||||
|
|
||||||
|
# M6.0: Persistent policy store for dynamic room-node overrides
|
||||||
|
_policy_store: Optional[Any] = None
|
||||||
|
try:
|
||||||
|
from .policy_store import PolicyStore as _PolicyStore
|
||||||
|
import os
|
||||||
|
_ps_path = os.path.join(_cfg.bridge_data_dir, "policy_overrides.db")
|
||||||
|
_policy_store = _PolicyStore(db_path=_ps_path)
|
||||||
|
_policy_store.open()
|
||||||
|
logger.info(
|
||||||
|
"✅ Policy store: %s (%d overrides)",
|
||||||
|
_ps_path, _policy_store.count_overrides(),
|
||||||
|
)
|
||||||
|
except Exception as _ps_exc:
|
||||||
|
logger.warning("Policy store init failed (non-fatal): %s", _ps_exc)
|
||||||
|
_policy_store = None
|
||||||
|
|
||||||
|
def _on_node_selected(agent_id: str, node_id: str, source: str) -> None:
|
||||||
|
if _PROM_OK:
|
||||||
|
_routed_total.labels(agent_id=agent_id, node_id=node_id, source=source).inc()
|
||||||
|
_node_stats[node_id]["routed"] += 1
|
||||||
|
|
||||||
|
def _on_node_rejected(rejected_node: str) -> None:
|
||||||
|
if _PROM_OK:
|
||||||
|
_node_rejected_total.labels(node_id=rejected_node).inc()
|
||||||
|
_node_stats[rejected_node]["rejected"] += 1
|
||||||
|
|
||||||
|
# M8.0: Node health tracker + soft-failover
|
||||||
|
from .node_health import NodeHealthTracker as _NodeHealthTracker, parse_node_health_config as _parse_nhc
|
||||||
|
_health_cfg = _parse_nhc(
|
||||||
|
fail_consecutive=_cfg.node_fail_consecutive,
|
||||||
|
lat_ewma_s=_cfg.node_lat_ewma_s,
|
||||||
|
ewma_alpha=_cfg.node_ewma_alpha,
|
||||||
|
)
|
||||||
|
_node_health_tracker = _NodeHealthTracker(_health_cfg)
|
||||||
|
logger.info(
|
||||||
|
"✅ Node health tracker: fail_consecutive=%d lat_ewma_s=%.1f ewma_alpha=%.2f",
|
||||||
|
_cfg.node_fail_consecutive, _cfg.node_lat_ewma_s, _cfg.node_ewma_alpha,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _on_failover(from_node: str, to_node: str, reason: str) -> None:
|
||||||
|
if _PROM_OK:
|
||||||
|
_failover_total.labels(
|
||||||
|
from_node=from_node, to_node=to_node, reason=reason
|
||||||
|
).inc()
|
||||||
|
if _PROM_OK:
|
||||||
|
_update_health_gauges()
|
||||||
|
logger.info("⚡ Failover: %s → %s reason=%s", from_node, to_node, reason)
|
||||||
|
|
||||||
|
def _update_health_gauges() -> None:
|
||||||
|
if not _PROM_OK or _node_health_tracker is None or _node_policy is None:
|
||||||
|
return
|
||||||
|
_STATE_MAP = {"healthy": 1.0, "degraded": 0.5, "down": 0.0}
|
||||||
|
for nid in _node_policy.allowed_nodes:
|
||||||
|
state = _node_health_tracker.state(nid)
|
||||||
|
_node_health_state.labels(node_id=nid).set(_STATE_MAP.get(state, 1.0))
|
||||||
|
|
||||||
|
# M8.1: Sticky failover cache
|
||||||
|
from .sticky_cache import StickyNodeCache as _StickyNodeCache
|
||||||
|
global _sticky_cache
|
||||||
|
if _cfg.failover_sticky_ttl_s > 0:
|
||||||
|
_sticky_cache = _StickyNodeCache(ttl_s=_cfg.failover_sticky_ttl_s)
|
||||||
|
logger.info("✅ Sticky failover cache: ttl=%.0fs", _cfg.failover_sticky_ttl_s)
|
||||||
|
else:
|
||||||
|
_sticky_cache = None
|
||||||
|
logger.info("ℹ️ Sticky failover disabled (FAILOVER_STICKY_TTL_S=0)")
|
||||||
|
|
||||||
|
# M9.0: Confirm store
|
||||||
|
from .confirm_store import ConfirmStore as _ConfirmStore
|
||||||
|
global _confirm_store
|
||||||
|
if _cfg.confirm_ttl_s > 0:
|
||||||
|
_confirm_store = _ConfirmStore(ttl_s=_cfg.confirm_ttl_s)
|
||||||
|
logger.info("✅ Confirm store: ttl=%.0fs", _cfg.confirm_ttl_s)
|
||||||
|
else:
|
||||||
|
_confirm_store = None
|
||||||
|
logger.info("ℹ️ Confirm store disabled (CONFIRM_TTL_S=0)")
|
||||||
|
|
||||||
|
# M11: debug inject client (only created when inject is enabled)
|
||||||
|
global _dummy_http_client
|
||||||
|
if _cfg.debug_inject_enabled and _HTTPX_OK:
|
||||||
|
_dummy_http_client = _httpx.AsyncClient(timeout=30.0)
|
||||||
|
logger.warning(
|
||||||
|
"⚠️ DEBUG_INJECT_ENABLED=true — synthetic event injection active. "
|
||||||
|
"NEVER use in production!"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _on_sticky_set(node_id: str, scope: str) -> None:
|
||||||
|
if _PROM_OK:
|
||||||
|
_sticky_set_total.labels(node_id=node_id, scope=scope).inc()
|
||||||
|
if _sticky_cache is not None:
|
||||||
|
_sticky_active.labels().set(_sticky_cache.active_count())
|
||||||
|
|
||||||
ingress = MatrixIngressLoop(
|
ingress = MatrixIngressLoop(
|
||||||
matrix_homeserver_url=_cfg.matrix_homeserver_url,
|
matrix_homeserver_url=_cfg.matrix_homeserver_url,
|
||||||
matrix_access_token=_cfg.matrix_access_token,
|
matrix_access_token=_cfg.matrix_access_token,
|
||||||
@@ -330,7 +547,38 @@ async def lifespan(app_: Any):
|
|||||||
on_route_rejected=_on_route_rejected,
|
on_route_rejected=_on_route_rejected,
|
||||||
control_config=_control_config,
|
control_config=_control_config,
|
||||||
control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
|
control_unauthorized_behavior=_cfg.control_unauthorized_behavior,
|
||||||
|
sofiia_control_token=_cfg.sofiia_control_token,
|
||||||
|
control_limiter=_control_limiter,
|
||||||
on_control_command=_on_control_command,
|
on_control_command=_on_control_command,
|
||||||
|
on_control_rate_limited=_on_control_rate_limited,
|
||||||
|
event_store=_event_store,
|
||||||
|
on_dedupe_persistent_hit=_on_dedupe_hit,
|
||||||
|
on_dedupe_persistent_insert=_on_dedupe_insert,
|
||||||
|
# M4.0: agent discovery
|
||||||
|
discovery_rpm=_cfg.discovery_rpm,
|
||||||
|
# M5.0: node-aware routing
|
||||||
|
node_policy=_node_policy,
|
||||||
|
on_node_selected=_on_node_selected,
|
||||||
|
on_node_rejected=_on_node_rejected,
|
||||||
|
# M5.1: node stats getter for !status
|
||||||
|
node_stats_getter=lambda: {k: dict(v) for k, v in _node_stats.items()},
|
||||||
|
# M6.0: dynamic room-node policy store
|
||||||
|
policy_store=_policy_store,
|
||||||
|
# M6.2: data directory for policy exports/imports
|
||||||
|
bridge_data_dir=_cfg.bridge_data_dir,
|
||||||
|
# M8.0: node health tracker + failover callback
|
||||||
|
node_health_tracker=_node_health_tracker,
|
||||||
|
on_failover=_on_failover,
|
||||||
|
# M8.1: sticky failover cache
|
||||||
|
sticky_cache=_sticky_cache,
|
||||||
|
on_sticky_set=_on_sticky_set,
|
||||||
|
# M8.2: HA persistence config
|
||||||
|
ha_health_snapshot_interval_s=_cfg.ha_health_snapshot_interval_s,
|
||||||
|
ha_health_max_age_s=_cfg.ha_health_max_age_s,
|
||||||
|
# M9.0: Two-step confirmation store
|
||||||
|
confirm_store=_confirm_store,
|
||||||
|
policy_export_retention_days=_cfg.policy_export_retention_days,
|
||||||
|
policy_history_limit=_cfg.policy_history_limit,
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
|
"✅ Backpressure queue: max=%d workers=%d drain_timeout=%.1fs",
|
||||||
@@ -349,7 +597,8 @@ async def lifespan(app_: Any):
|
|||||||
_config_error = str(exc)
|
_config_error = str(exc)
|
||||||
logger.error("❌ Config error: %s", _config_error)
|
logger.error("❌ Config error: %s", _config_error)
|
||||||
if _PROM_OK:
|
if _PROM_OK:
|
||||||
_bridge_up.set(0)
|
_cfg_node = _cfg.node_id if _cfg else ""
|
||||||
|
_bridge_up.labels(node_id=_cfg_node or "").set(0) # M7.1: labeled
|
||||||
yield
|
yield
|
||||||
# Shutdown: cancel ingress loop
|
# Shutdown: cancel ingress loop
|
||||||
if _ingress_stop:
|
if _ingress_stop:
|
||||||
@@ -360,6 +609,23 @@ async def lifespan(app_: Any):
|
|||||||
await asyncio.wait_for(_ingress_task, timeout=5.0)
|
await asyncio.wait_for(_ingress_task, timeout=5.0)
|
||||||
except (asyncio.CancelledError, asyncio.TimeoutError):
|
except (asyncio.CancelledError, asyncio.TimeoutError):
|
||||||
pass
|
pass
|
||||||
|
# Shutdown: cancel prune task + close EventStore
|
||||||
|
if "_prune_task" in dir() and _prune_task and not _prune_task.done(): # type: ignore[name-defined]
|
||||||
|
_prune_task.cancel() # type: ignore[name-defined]
|
||||||
|
if _event_store is not None:
|
||||||
|
await _event_store.close()
|
||||||
|
# M6.0: close policy store
|
||||||
|
if "_policy_store" in dir() and _policy_store is not None: # type: ignore[name-defined]
|
||||||
|
try:
|
||||||
|
_policy_store.close() # type: ignore[name-defined]
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
# M11: close debug http client if open
|
||||||
|
if _dummy_http_client is not None:
|
||||||
|
try:
|
||||||
|
await _dummy_http_client.aclose()
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
logger.info("matrix-bridge-dagi shutting down")
|
logger.info("matrix-bridge-dagi shutting down")
|
||||||
|
|
||||||
# ── App ───────────────────────────────────────────────────────────────────────
|
# ── App ───────────────────────────────────────────────────────────────────────
|
||||||
@@ -435,6 +701,89 @@ async def health() -> Dict[str, Any]:
|
|||||||
"operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
|
"operators_count": len(_control_config.operator_allowlist) if _control_config else 0,
|
||||||
"unauthorized_behavior": _cfg.control_unauthorized_behavior,
|
"unauthorized_behavior": _cfg.control_unauthorized_behavior,
|
||||||
},
|
},
|
||||||
|
"control_safety": {
|
||||||
|
"enabled": _cfg.control_room_rpm > 0 or _cfg.control_operator_rpm > 0,
|
||||||
|
"room_rpm": _cfg.control_room_rpm,
|
||||||
|
"operator_rpm": _cfg.control_operator_rpm,
|
||||||
|
"run_next_rpm": _cfg.control_run_next_rpm,
|
||||||
|
"cooldown_s": _cfg.control_cooldown_s,
|
||||||
|
},
|
||||||
|
"persistent_dedupe": _event_store.as_health_dict() if _event_store else {
|
||||||
|
"enabled": False,
|
||||||
|
"db_path": None,
|
||||||
|
"ttl_h": _cfg.processed_events_ttl_h,
|
||||||
|
"ok": False,
|
||||||
|
"last_prune_at": None,
|
||||||
|
"pruned_rows_last": 0,
|
||||||
|
},
|
||||||
|
# M6.0: policy store health
|
||||||
|
"policy_store": _health_policy_store_dict(),
|
||||||
|
# M8.1: sticky failover cache health
|
||||||
|
"sticky_cache": _health_sticky_dict(),
|
||||||
|
# M8.2: HA state persistence info
|
||||||
|
"ha_state": _health_ha_dict(),
|
||||||
|
# M9.0: confirm store
|
||||||
|
"confirm_store": _health_confirm_dict(),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _health_confirm_dict() -> Dict[str, Any]:
|
||||||
|
"""Return confirm store info for /health endpoint (M9.0)."""
|
||||||
|
if _confirm_store is None:
|
||||||
|
return {"enabled": False}
|
||||||
|
return {
|
||||||
|
"enabled": True,
|
||||||
|
"pending": _confirm_store.pending_count(),
|
||||||
|
"ttl_s": _confirm_store.ttl_s,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _health_ha_dict() -> Dict[str, Any]:
|
||||||
|
"""Return HA persistence info for /health endpoint (M8.2)."""
|
||||||
|
if _ingress_loop is None:
|
||||||
|
return {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0}
|
||||||
|
try:
|
||||||
|
s = _ingress_loop.get_status()
|
||||||
|
return {
|
||||||
|
"sticky_loaded": s.get("ha_sticky_loaded", 0),
|
||||||
|
"health_loaded": s.get("ha_health_loaded", False),
|
||||||
|
"snapshot_interval_s": s.get("ha_health_snapshot_interval_s", 0),
|
||||||
|
}
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
return {"sticky_loaded": 0, "health_loaded": False, "snapshot_interval_s": 0}
|
||||||
|
|
||||||
|
|
||||||
|
def _health_sticky_dict() -> Dict[str, Any]:
|
||||||
|
"""Return sticky failover cache health for /health endpoint (M8.1)."""
|
||||||
|
if _sticky_cache is None:
|
||||||
|
return {"enabled": False, "active_keys": 0, "ttl_s": 0}
|
||||||
|
return {
|
||||||
|
"enabled": True,
|
||||||
|
"active_keys": _sticky_cache.active_count(),
|
||||||
|
"ttl_s": _sticky_cache.ttl_s,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _health_policy_store_dict() -> Dict[str, Any]:
|
||||||
|
"""Return policy store health info for /health endpoint."""
|
||||||
|
try:
|
||||||
|
if _ingress_loop is not None:
|
||||||
|
s = _ingress_loop.get_status()
|
||||||
|
return {
|
||||||
|
"ok": s.get("policy_store_ok", False),
|
||||||
|
"path": s.get("policy_store_path"),
|
||||||
|
"overrides_count": s.get("policy_overrides_count", 0),
|
||||||
|
"agent_overrides_count": s.get("policy_agent_overrides_count", 0), # M6.1
|
||||||
|
"last_export_at": s.get("policy_last_export_at"), # M6.2
|
||||||
|
"last_import_at": s.get("policy_last_import_at"), # M6.2
|
||||||
|
"db_mtime": s.get("policy_db_mtime"), # M6.2
|
||||||
|
}
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
pass
|
||||||
|
return {
|
||||||
|
"ok": False, "path": None,
|
||||||
|
"overrides_count": 0, "agent_overrides_count": 0,
|
||||||
|
"last_export_at": None, "last_import_at": None, "db_mtime": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -464,6 +813,101 @@ async def bridge_mappings() -> Dict[str, Any]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Debug / Soak (M11) ────────────────────────────────────────────────────────
|
||||||
|
@app.post("/v1/debug/inject_event")
|
||||||
|
async def debug_inject_event(body: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Synthetic event injection for soak/load testing.
|
||||||
|
|
||||||
|
Enabled ONLY when DEBUG_INJECT_ENABLED=true (never in production).
|
||||||
|
|
||||||
|
Body: { "room_id": "!room:server", "event": { Matrix event dict } }
|
||||||
|
The event is enqueued directly into the ingress loop, bypassing Matrix poll.
|
||||||
|
|
||||||
|
Returns: { "ok": bool, "enqueued": bool, "room_id": str, "event_id": str }
|
||||||
|
"""
|
||||||
|
if _cfg is None or not _cfg.debug_inject_enabled:
|
||||||
|
return Response( # type: ignore[return-value]
|
||||||
|
'{"ok":false,"error":"debug inject disabled"}',
|
||||||
|
status_code=403,
|
||||||
|
media_type="application/json",
|
||||||
|
)
|
||||||
|
if _ingress_loop is None:
|
||||||
|
return {"ok": False, "error": "ingress loop not running"}
|
||||||
|
|
||||||
|
room_id = body.get("room_id", "")
|
||||||
|
event = body.get("event", {})
|
||||||
|
if not room_id or not event:
|
||||||
|
return {"ok": False, "error": "missing room_id or event"}
|
||||||
|
|
||||||
|
# Ensure event has minimum required fields for ingress processing
|
||||||
|
if not event.get("event_id"):
|
||||||
|
import time as _time
|
||||||
|
event["event_id"] = f"!inject-{int(_time.monotonic() * 1e6)}"
|
||||||
|
if not event.get("type"):
|
||||||
|
event["type"] = "m.room.message"
|
||||||
|
if not event.get("content"):
|
||||||
|
event["content"] = {"msgtype": "m.text", "body": event.get("body", "soak-ping")}
|
||||||
|
|
||||||
|
# Build a minimal sync_resp that looks like a real Matrix /sync response
|
||||||
|
# so _enqueue_from_sync can pick it up via extract_room_messages.
|
||||||
|
# We bypass Matrix polling by directly calling _try_enqueue on the right mapping.
|
||||||
|
enqueued = False
|
||||||
|
try:
|
||||||
|
# Find the matching room mapping (direct rooms only for soak)
|
||||||
|
mapping = None
|
||||||
|
if _ingress_loop._room_map is not None:
|
||||||
|
for m in _ingress_loop._room_map.mappings:
|
||||||
|
if m.room_id == room_id:
|
||||||
|
mapping = m
|
||||||
|
break
|
||||||
|
|
||||||
|
if mapping is None:
|
||||||
|
return {"ok": False, "error": f"no mapping for room_id={room_id!r}"}
|
||||||
|
|
||||||
|
# Build a minimal stub Matrix client — replies are discarded for soak events
|
||||||
|
from .matrix_client import MatrixClient
|
||||||
|
|
||||||
|
class _SoakMatrixClient(MatrixClient): # type: ignore[misc]
|
||||||
|
"""No-op Matrix client for synthetic soak events."""
|
||||||
|
def __init__(self) -> None: # noqa: D107
|
||||||
|
pass # skip real __init__
|
||||||
|
|
||||||
|
async def mark_seen(self, room_id: str, event_id: str) -> None: # type: ignore[override]
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def send_text(self, room_id: str, text: str, # type: ignore[override]
|
||||||
|
txn: Optional[str] = None) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
_stub_client = _SoakMatrixClient()
|
||||||
|
|
||||||
|
if _dummy_http_client is None:
|
||||||
|
return {"ok": False, "error": "debug http client not initialised"}
|
||||||
|
|
||||||
|
await _ingress_loop._try_enqueue(
|
||||||
|
_stub_client, # type: ignore[arg-type]
|
||||||
|
_ingress_loop._queue,
|
||||||
|
_dummy_http_client,
|
||||||
|
event,
|
||||||
|
mapping,
|
||||||
|
)
|
||||||
|
enqueued = True
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
return {"ok": False, "error": str(exc), "enqueued": False}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"ok": True,
|
||||||
|
"enqueued": enqueued,
|
||||||
|
"room_id": room_id,
|
||||||
|
"event_id": event.get("event_id"),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _noop_send(room_id: str, text: str, txn: Optional[str] = None) -> None:
|
||||||
|
"""Discard replies from injected soak events."""
|
||||||
|
|
||||||
|
|
||||||
# ── Metrics ───────────────────────────────────────────────────────────────────
|
# ── Metrics ───────────────────────────────────────────────────────────────────
|
||||||
@app.get("/metrics")
|
@app.get("/metrics")
|
||||||
async def metrics():
|
async def metrics():
|
||||||
|
|||||||
224
services/matrix-bridge-dagi/app/metrics_contract.py
Normal file
224
services/matrix-bridge-dagi/app/metrics_contract.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
"""
|
||||||
|
Metrics Contract — Matrix Bridge DAGI
|
||||||
|
Phase M7.1
|
||||||
|
|
||||||
|
Single source of truth for all Prometheus metric names and their label sets.
|
||||||
|
Used by:
|
||||||
|
- main.py (registers metrics against this contract)
|
||||||
|
- tests/test_matrix_bridge_m71_metrics_contract.py (static validation)
|
||||||
|
- ops/prometheus/alerts/matrix-bridge-dagi.rules.yml (PromQL expressions)
|
||||||
|
- ops/grafana/dashboards/matrix-bridge-dagi.json (panel queries)
|
||||||
|
|
||||||
|
Format:
|
||||||
|
METRICS_CONTRACT: Dict[metric_name, MetricSpec]
|
||||||
|
|
||||||
|
MetricSpec fields:
|
||||||
|
kind : "counter" | "histogram" | "gauge"
|
||||||
|
labels : list of label names (empty list = no labels)
|
||||||
|
help : one-line description
|
||||||
|
phase : originating milestone
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class MetricSpec:
|
||||||
|
kind: str # "counter" | "histogram" | "gauge"
|
||||||
|
labels: List[str] # label names; empty = no labels
|
||||||
|
help: str
|
||||||
|
phase: str = "M1" # originating milestone for traceability
|
||||||
|
|
||||||
|
|
||||||
|
# ── Contract ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
METRICS_CONTRACT: Dict[str, MetricSpec] = {
|
||||||
|
|
||||||
|
# ── Core message traffic ──────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_messages_received_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["room_id", "agent_id"],
|
||||||
|
help="Total Matrix messages received",
|
||||||
|
phase="M1",
|
||||||
|
),
|
||||||
|
"matrix_bridge_messages_replied_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["room_id", "agent_id", "status"],
|
||||||
|
help="Total agent replies sent to Matrix (status=ok|error)",
|
||||||
|
phase="M1",
|
||||||
|
),
|
||||||
|
"matrix_bridge_rate_limited_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["room_id", "agent_id", "limit_type"],
|
||||||
|
help="Messages dropped by rate limiter",
|
||||||
|
phase="H1",
|
||||||
|
),
|
||||||
|
"matrix_bridge_gateway_errors_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["error_type"],
|
||||||
|
help="Bridge errors by stage: sync_error, network_error, http_<status>, matrix_send_error, unexpected",
|
||||||
|
phase="M1",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Latency histograms ────────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_invoke_duration_seconds": MetricSpec(
|
||||||
|
kind="histogram",
|
||||||
|
labels=["agent_id", "node_id"],
|
||||||
|
help="Latency of DAGI Router infer call, per agent and node",
|
||||||
|
phase="H3",
|
||||||
|
),
|
||||||
|
"matrix_bridge_send_duration_seconds": MetricSpec(
|
||||||
|
kind="histogram",
|
||||||
|
labels=["agent_id"],
|
||||||
|
help="Latency of Matrix send_text call",
|
||||||
|
phase="H3",
|
||||||
|
),
|
||||||
|
"matrix_bridge_queue_wait_seconds": MetricSpec(
|
||||||
|
kind="histogram",
|
||||||
|
labels=["agent_id"],
|
||||||
|
help="Time between enqueue and worker start processing",
|
||||||
|
phase="H3",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Queue ─────────────────────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_queue_size": MetricSpec(
|
||||||
|
kind="gauge",
|
||||||
|
labels=[],
|
||||||
|
help="Current number of pending items in the work queue",
|
||||||
|
phase="H2",
|
||||||
|
),
|
||||||
|
"matrix_bridge_queue_dropped_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["room_id", "agent_id"],
|
||||||
|
help="Messages dropped because queue was full",
|
||||||
|
phase="H2",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Rate limiter gauges ───────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_rate_limiter_active_rooms": MetricSpec(
|
||||||
|
kind="gauge",
|
||||||
|
labels=[],
|
||||||
|
help="Rooms with activity in the current rate-limit window",
|
||||||
|
phase="H1",
|
||||||
|
),
|
||||||
|
"matrix_bridge_rate_limiter_active_senders": MetricSpec(
|
||||||
|
kind="gauge",
|
||||||
|
labels=[],
|
||||||
|
help="Senders with activity in the current rate-limit window",
|
||||||
|
phase="H1",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Routing ───────────────────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_routing_reasons_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["agent_id", "reason"],
|
||||||
|
help="Message routing breakdown by agent and routing reason (slash/mention/name/default/direct)",
|
||||||
|
phase="M2.2",
|
||||||
|
),
|
||||||
|
"matrix_bridge_route_rejected_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["room_id", "reason"],
|
||||||
|
help="Messages rejected during routing (unknown agent, bad slash, no mapping, etc.)",
|
||||||
|
phase="M2.2",
|
||||||
|
),
|
||||||
|
"matrix_bridge_active_room_agent_locks": MetricSpec(
|
||||||
|
kind="gauge",
|
||||||
|
labels=[],
|
||||||
|
help="Number of room-agent pairs currently holding a concurrency lock",
|
||||||
|
phase="M2.2",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Control channel ───────────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_control_commands_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["sender", "verb", "subcommand"],
|
||||||
|
help="Total control commands received from authorized operators",
|
||||||
|
phase="M3.0",
|
||||||
|
),
|
||||||
|
"matrix_bridge_control_rate_limited_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["scope"],
|
||||||
|
help="Total control commands rejected by rate limiter or cooldown",
|
||||||
|
phase="M3.4",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Persistent deduplication ─────────────────────────────────────────────
|
||||||
|
"matrix_bridge_dedupe_persistent_hits_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["room_id"],
|
||||||
|
help="Total events dropped by persistent (SQLite) deduplication",
|
||||||
|
phase="M2.3",
|
||||||
|
),
|
||||||
|
"matrix_bridge_dedupe_persistent_inserts_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=[],
|
||||||
|
help="Total events marked as processed in persistent dedupe store",
|
||||||
|
phase="M2.3",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Node-aware routing (M5.0) ─────────────────────────────────────────────
|
||||||
|
"matrix_bridge_routed_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["agent_id", "node_id", "source"],
|
||||||
|
help="Total messages successfully routed, by agent, resolved node, and node source",
|
||||||
|
phase="M5.0",
|
||||||
|
),
|
||||||
|
"matrix_bridge_node_rejected_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["node_id"],
|
||||||
|
help="Total messages with rejected (non-allowlisted) node kwarg",
|
||||||
|
phase="M5.0",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Bridge health (M7.1) ──────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_up": MetricSpec(
|
||||||
|
kind="gauge",
|
||||||
|
labels=["node_id"],
|
||||||
|
help="1 if bridge started successfully; 0 on config error",
|
||||||
|
phase="M7.1",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Soft-failover (M8.0) ─────────────────────────────────────────────────
|
||||||
|
"matrix_bridge_failover_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["from_node", "to_node", "reason"],
|
||||||
|
help="Total successful soft-failovers by node transition and reason",
|
||||||
|
phase="M8.0",
|
||||||
|
),
|
||||||
|
"matrix_bridge_node_health_state": MetricSpec(
|
||||||
|
kind="gauge",
|
||||||
|
labels=["node_id"],
|
||||||
|
help="Node health state gauge: 1=healthy 0.5=degraded 0=down",
|
||||||
|
phase="M8.0",
|
||||||
|
),
|
||||||
|
|
||||||
|
# ── Sticky routing anti-flap (M8.1) ──────────────────────────────────────
|
||||||
|
"matrix_bridge_sticky_node_total": MetricSpec(
|
||||||
|
kind="counter",
|
||||||
|
labels=["node_id", "scope"],
|
||||||
|
help="Total sticky routing entries set after failover, by preferred node and scope",
|
||||||
|
phase="M8.1",
|
||||||
|
),
|
||||||
|
"matrix_bridge_sticky_node_active": MetricSpec(
|
||||||
|
kind="gauge",
|
||||||
|
labels=[],
|
||||||
|
help="Current count of active sticky routing entries",
|
||||||
|
phase="M8.1",
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── Alert metric references ────────────────────────────────────────────────────
|
||||||
|
# These are the metric base-names referenced in alert rules.
|
||||||
|
# All must exist in METRICS_CONTRACT.
|
||||||
|
ALERT_METRIC_REFS = frozenset({
|
||||||
|
"matrix_bridge_up",
|
||||||
|
"matrix_bridge_gateway_errors_total",
|
||||||
|
"matrix_bridge_messages_replied_total",
|
||||||
|
"matrix_bridge_queue_dropped_total",
|
||||||
|
"matrix_bridge_rate_limited_total",
|
||||||
|
"matrix_bridge_control_rate_limited_total",
|
||||||
|
"matrix_bridge_dedupe_persistent_hits_total",
|
||||||
|
"matrix_bridge_invoke_duration_seconds",
|
||||||
|
})
|
||||||
@@ -309,3 +309,25 @@ def reply_prefix(agent_id: str, is_mixed: bool) -> str:
|
|||||||
return ""
|
return ""
|
||||||
# Capitalise first letter of agent name: "sofiia" → "Sofiia"
|
# Capitalise first letter of agent name: "sofiia" → "Sofiia"
|
||||||
return f"{agent_id.capitalize()}: "
|
return f"{agent_id.capitalize()}: "
|
||||||
|
|
||||||
|
|
||||||
|
def build_override_config(
|
||||||
|
base_config: MixedRoomConfig,
|
||||||
|
room_id: str,
|
||||||
|
agents: List[str],
|
||||||
|
default_agent: str,
|
||||||
|
) -> MixedRoomConfig:
|
||||||
|
"""
|
||||||
|
M6.1: Build a temporary MixedRoomConfig that uses a dynamic store override
|
||||||
|
for room_id while keeping all other rooms from base_config unchanged.
|
||||||
|
|
||||||
|
Used in _enqueue_from_mixed_room to inject PolicyStore agent overrides
|
||||||
|
without mutating the shared base configuration.
|
||||||
|
"""
|
||||||
|
rooms = dict(base_config.rooms)
|
||||||
|
rooms[room_id] = MixedRoom(
|
||||||
|
room_id=room_id,
|
||||||
|
agents=agents,
|
||||||
|
default_agent=default_agent,
|
||||||
|
)
|
||||||
|
return MixedRoomConfig(rooms=rooms)
|
||||||
|
|||||||
262
services/matrix-bridge-dagi/app/node_health.py
Normal file
262
services/matrix-bridge-dagi/app/node_health.py
Normal file
@@ -0,0 +1,262 @@
|
|||||||
|
"""
|
||||||
|
NodeHealthTracker — M8.0: per-node health state tracking for soft-failover.
|
||||||
|
|
||||||
|
Tracks invoke outcomes per node and maintains:
|
||||||
|
- EWMA latency estimate
|
||||||
|
- consecutive failure counter
|
||||||
|
- last ok / last error timestamps
|
||||||
|
- derived health state: "healthy" | "degraded" | "down"
|
||||||
|
|
||||||
|
State transitions
|
||||||
|
-----------------
|
||||||
|
Any state → "down" : consecutive_failures >= fail_consecutive
|
||||||
|
Any state → "degraded" : ewma_latency_s >= lat_ewma_threshold
|
||||||
|
(and not yet "down")
|
||||||
|
"down"/"degraded" → "healthy" : record_ok() resets consecutive_failures to 0
|
||||||
|
and ewma is updated towards the actual latency
|
||||||
|
|
||||||
|
Thread safety
|
||||||
|
-------------
|
||||||
|
All mutations are protected by a threading.Lock so this can be called from
|
||||||
|
asyncio callbacks (e.g. in `_invoke_and_send` on the event loop thread).
|
||||||
|
Use `record_ok` / `record_error` from within coroutines; they are synchronous
|
||||||
|
(no blocking I/O) so they are safe to call directly without to_thread.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, FrozenSet, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ── State constants ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
NODE_STATE_HEALTHY = "healthy"
|
||||||
|
NODE_STATE_DEGRADED = "degraded"
|
||||||
|
NODE_STATE_DOWN = "down"
|
||||||
|
|
||||||
|
# Failover-triggering error classes
|
||||||
|
FAILOVER_REASON_TIMEOUT = "timeout"
|
||||||
|
FAILOVER_REASON_HTTP_5XX = "http_5xx"
|
||||||
|
FAILOVER_REASON_NETWORK = "network"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Config ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class NodeHealthConfig:
|
||||||
|
"""
|
||||||
|
Thresholds controlling when a node is considered degraded or down.
|
||||||
|
|
||||||
|
fail_consecutive : int number of consecutive failures → "down"
|
||||||
|
lat_ewma_s : float EWMA latency estimate (seconds) threshold → "degraded"
|
||||||
|
ewma_alpha : float EWMA smoothing factor (0..1); higher = more reactive
|
||||||
|
"""
|
||||||
|
fail_consecutive: int = 3
|
||||||
|
lat_ewma_s: float = 12.0
|
||||||
|
ewma_alpha: float = 0.3
|
||||||
|
|
||||||
|
def __post_init__(self) -> None:
|
||||||
|
if not (0 < self.ewma_alpha <= 1):
|
||||||
|
raise ValueError(f"ewma_alpha must be in (0, 1], got {self.ewma_alpha}")
|
||||||
|
if self.fail_consecutive < 1:
|
||||||
|
raise ValueError(f"fail_consecutive must be ≥ 1, got {self.fail_consecutive}")
|
||||||
|
if self.lat_ewma_s <= 0:
|
||||||
|
raise ValueError(f"lat_ewma_s must be > 0, got {self.lat_ewma_s}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Per-node state ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _NodeState:
|
||||||
|
invoke_ok_total: int = 0
|
||||||
|
invoke_err_total: int = 0
|
||||||
|
consecutive_failures: int = 0
|
||||||
|
last_ok_ts: Optional[float] = None
|
||||||
|
last_err_ts: Optional[float] = None
|
||||||
|
ewma_latency_s: Optional[float] = None # None until first ok record
|
||||||
|
|
||||||
|
|
||||||
|
# ── Tracker ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
class NodeHealthTracker:
|
||||||
|
"""
|
||||||
|
Thread-safe per-node health tracker.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
tracker = NodeHealthTracker(NodeHealthConfig())
|
||||||
|
|
||||||
|
# On successful invoke
|
||||||
|
tracker.record_ok("NODA1", latency_s=1.4)
|
||||||
|
|
||||||
|
# On failed invoke
|
||||||
|
tracker.record_error("NODA1", reason=FAILOVER_REASON_TIMEOUT)
|
||||||
|
|
||||||
|
# Read health state
|
||||||
|
state = tracker.state("NODA1") # "healthy" | "degraded" | "down"
|
||||||
|
fallback = tracker.pick_fallback("NODA1", allowed_nodes=frozenset({"NODA1","NODA2"}))
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: Optional[NodeHealthConfig] = None) -> None:
|
||||||
|
self._cfg = config or NodeHealthConfig()
|
||||||
|
self._nodes: Dict[str, _NodeState] = {}
|
||||||
|
self._lock = threading.RLock() # RLock: re-entrant (needed for all_info → as_info_dict)
|
||||||
|
|
||||||
|
# ── Public mutation API ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def record_ok(self, node_id: str, latency_s: float) -> None:
|
||||||
|
"""Record a successful invoke for node_id with given latency."""
|
||||||
|
with self._lock:
|
||||||
|
ns = self._get_or_create(node_id)
|
||||||
|
ns.invoke_ok_total += 1
|
||||||
|
ns.consecutive_failures = 0
|
||||||
|
ns.last_ok_ts = time.monotonic()
|
||||||
|
if ns.ewma_latency_s is None:
|
||||||
|
ns.ewma_latency_s = latency_s
|
||||||
|
else:
|
||||||
|
alpha = self._cfg.ewma_alpha
|
||||||
|
ns.ewma_latency_s = alpha * latency_s + (1 - alpha) * ns.ewma_latency_s
|
||||||
|
|
||||||
|
def record_error(self, node_id: str, reason: str = "unknown") -> None:
|
||||||
|
"""Record a failed invoke for node_id."""
|
||||||
|
with self._lock:
|
||||||
|
ns = self._get_or_create(node_id)
|
||||||
|
ns.invoke_err_total += 1
|
||||||
|
ns.consecutive_failures += 1
|
||||||
|
ns.last_err_ts = time.monotonic()
|
||||||
|
logger.debug(
|
||||||
|
"NodeHealth: node=%s consecutive_failures=%d reason=%s",
|
||||||
|
node_id, ns.consecutive_failures, reason,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Public read API ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def state(self, node_id: str) -> str:
|
||||||
|
"""Return current health state for node_id."""
|
||||||
|
with self._lock:
|
||||||
|
return self._state_unlocked(node_id)
|
||||||
|
|
||||||
|
def pick_fallback(
|
||||||
|
self,
|
||||||
|
primary: str,
|
||||||
|
allowed_nodes: FrozenSet[str],
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Return the best alternative node for failover.
|
||||||
|
|
||||||
|
Priority: healthy > degraded > (never down)
|
||||||
|
Returns None if no acceptable fallback exists.
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
candidates = sorted(n for n in allowed_nodes if n != primary)
|
||||||
|
# Prefer healthy first
|
||||||
|
for n in candidates:
|
||||||
|
if self._state_unlocked(n) == NODE_STATE_HEALTHY:
|
||||||
|
return n
|
||||||
|
# Accept degraded if no healthy available
|
||||||
|
for n in candidates:
|
||||||
|
if self._state_unlocked(n) == NODE_STATE_DEGRADED:
|
||||||
|
return n
|
||||||
|
# Do not failover to "down" nodes
|
||||||
|
return None
|
||||||
|
|
||||||
|
def as_info_dict(self, node_id: str) -> dict:
|
||||||
|
"""Return a JSON-safe status dict for one node."""
|
||||||
|
with self._lock:
|
||||||
|
ns = self._nodes.get(node_id)
|
||||||
|
if ns is None:
|
||||||
|
return {
|
||||||
|
"node_id": node_id,
|
||||||
|
"state": NODE_STATE_HEALTHY,
|
||||||
|
"invoke_ok": 0,
|
||||||
|
"invoke_err": 0,
|
||||||
|
"consecutive_failures": 0,
|
||||||
|
"ewma_latency_s": None,
|
||||||
|
"last_ok_ts": None,
|
||||||
|
"last_err_ts": None,
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
"node_id": node_id,
|
||||||
|
"state": self._state_unlocked(node_id),
|
||||||
|
"invoke_ok": ns.invoke_ok_total,
|
||||||
|
"invoke_err": ns.invoke_err_total,
|
||||||
|
"consecutive_failures": ns.consecutive_failures,
|
||||||
|
"ewma_latency_s": round(ns.ewma_latency_s, 3) if ns.ewma_latency_s else None,
|
||||||
|
"last_ok_ts": ns.last_ok_ts,
|
||||||
|
"last_err_ts": ns.last_err_ts,
|
||||||
|
}
|
||||||
|
|
||||||
|
def all_info(self, allowed_nodes: Optional[FrozenSet[str]] = None) -> Dict[str, dict]:
|
||||||
|
"""
|
||||||
|
Return status dicts for all tracked (or specified) nodes.
|
||||||
|
If allowed_nodes provided, also include entries for unseen nodes (state=healthy).
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
keys = set(self._nodes.keys())
|
||||||
|
if allowed_nodes:
|
||||||
|
keys |= set(allowed_nodes)
|
||||||
|
return {n: self.as_info_dict(n) for n in sorted(keys)}
|
||||||
|
|
||||||
|
def reset(self, node_id: str) -> None:
|
||||||
|
"""Reset health state for a node (e.g. after manual recovery)."""
|
||||||
|
with self._lock:
|
||||||
|
self._nodes.pop(node_id, None)
|
||||||
|
|
||||||
|
def restore_node(
|
||||||
|
self,
|
||||||
|
node_id: str,
|
||||||
|
ewma_latency_s: Optional[float],
|
||||||
|
consecutive_failures: int,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Restore persisted node state after a restart (M8.2).
|
||||||
|
|
||||||
|
Only restores ewma_latency_s and consecutive_failures; counters
|
||||||
|
(invoke_ok_total, invoke_err_total) start from 0 since they are
|
||||||
|
runtime metrics for the current session.
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
ns = self._get_or_create(node_id)
|
||||||
|
ns.ewma_latency_s = ewma_latency_s
|
||||||
|
ns.consecutive_failures = max(0, consecutive_failures)
|
||||||
|
|
||||||
|
# ── Internal ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _get_or_create(self, node_id: str) -> _NodeState:
|
||||||
|
if node_id not in self._nodes:
|
||||||
|
self._nodes[node_id] = _NodeState()
|
||||||
|
return self._nodes[node_id]
|
||||||
|
|
||||||
|
def _state_unlocked(self, node_id: str) -> str:
|
||||||
|
ns = self._nodes.get(node_id)
|
||||||
|
if ns is None:
|
||||||
|
return NODE_STATE_HEALTHY # unseen nodes are assumed healthy
|
||||||
|
|
||||||
|
if ns.consecutive_failures >= self._cfg.fail_consecutive:
|
||||||
|
return NODE_STATE_DOWN
|
||||||
|
|
||||||
|
if (
|
||||||
|
ns.ewma_latency_s is not None
|
||||||
|
and ns.ewma_latency_s >= self._cfg.lat_ewma_s
|
||||||
|
):
|
||||||
|
return NODE_STATE_DEGRADED
|
||||||
|
|
||||||
|
return NODE_STATE_HEALTHY
|
||||||
|
|
||||||
|
|
||||||
|
# ── Parser (env vars → NodeHealthConfig) ──────────────────────────────────────
|
||||||
|
|
||||||
|
def parse_node_health_config(
|
||||||
|
fail_consecutive: int = 3,
|
||||||
|
lat_ewma_s: float = 12.0,
|
||||||
|
ewma_alpha: float = 0.3,
|
||||||
|
) -> NodeHealthConfig:
|
||||||
|
"""Construct NodeHealthConfig from parsed env values."""
|
||||||
|
return NodeHealthConfig(
|
||||||
|
fail_consecutive=fail_consecutive,
|
||||||
|
lat_ewma_s=lat_ewma_s,
|
||||||
|
ewma_alpha=ewma_alpha,
|
||||||
|
)
|
||||||
179
services/matrix-bridge-dagi/app/node_policy.py
Normal file
179
services/matrix-bridge-dagi/app/node_policy.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
"""
|
||||||
|
node_policy — Node-aware routing for matrix-bridge-dagi.
|
||||||
|
|
||||||
|
Resolves which NODA (NODA1, NODA2, …) a message should be tagged with based on:
|
||||||
|
1. Explicit `node=X` kwarg in the message body (mixed rooms only)
|
||||||
|
2. Dynamic store override (PolicyStore, set by operators via !node set) ← M6.0
|
||||||
|
3. Static per-room mapping from BRIDGE_ROOM_NODE_MAP env
|
||||||
|
4. BRIDGE_DEFAULT_NODE (fallback)
|
||||||
|
|
||||||
|
The resolved node_id is embedded in the Router metadata so downstream
|
||||||
|
services (Router / Memory / Agent) can apply per-node policies.
|
||||||
|
|
||||||
|
This module does NOT change the HTTP endpoint called — the Router URL
|
||||||
|
stays the same.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, FrozenSet, Optional, Tuple
|
||||||
|
|
||||||
|
# Regex to find 'node=X' anywhere in message text (case-insensitive)
|
||||||
|
_NODE_KWARG_RE = re.compile(r"\bnode=(\w+)\b", re.IGNORECASE)
|
||||||
|
|
||||||
|
# Node resolution sources (priority order)
|
||||||
|
NODE_SOURCE_EXPLICIT = "explicit"
|
||||||
|
NODE_SOURCE_STORE = "store" # M6.0: dynamic PolicyStore override
|
||||||
|
NODE_SOURCE_ROOM_MAP = "room_map"
|
||||||
|
NODE_SOURCE_DEFAULT = "default"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class NodeResolution:
|
||||||
|
"""Result of resolving the target node for a message."""
|
||||||
|
node_id: str
|
||||||
|
source: str
|
||||||
|
rejected_node: Optional[str] = None # set when explicit node was not allowlisted
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class NodePolicy:
|
||||||
|
"""
|
||||||
|
Node resolution policy.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
allowed_nodes: Set of valid node names (uppercase).
|
||||||
|
default_node: Fallback node when no explicit or room-map match.
|
||||||
|
room_node_map: Optional per-room override (room_id → node_id).
|
||||||
|
"""
|
||||||
|
allowed_nodes: FrozenSet[str]
|
||||||
|
default_node: str
|
||||||
|
room_node_map: Dict[str, str] = field(default_factory=dict)
|
||||||
|
|
||||||
|
def resolve(
|
||||||
|
self,
|
||||||
|
room_id: str,
|
||||||
|
explicit_node: Optional[str] = None,
|
||||||
|
store_override: Optional[str] = None,
|
||||||
|
) -> NodeResolution:
|
||||||
|
"""
|
||||||
|
Resolve target node for a message.
|
||||||
|
|
||||||
|
Priority (highest → lowest):
|
||||||
|
1. explicit_node kwarg (user-supplied, mixed rooms only)
|
||||||
|
2. store_override — dynamic PolicyStore entry (M6.0)
|
||||||
|
3. room_node_map — static BRIDGE_ROOM_NODE_MAP env entry
|
||||||
|
4. default_node
|
||||||
|
"""
|
||||||
|
if explicit_node is not None:
|
||||||
|
upper = explicit_node.upper()
|
||||||
|
if upper in self.allowed_nodes:
|
||||||
|
return NodeResolution(node_id=upper, source=NODE_SOURCE_EXPLICIT)
|
||||||
|
# Rejected — report bad value and fall through to best available
|
||||||
|
fallback = self._fallback(room_id, store_override)
|
||||||
|
return NodeResolution(
|
||||||
|
node_id=fallback.node_id,
|
||||||
|
source=fallback.source,
|
||||||
|
rejected_node=upper,
|
||||||
|
)
|
||||||
|
|
||||||
|
return self._fallback(room_id, store_override)
|
||||||
|
|
||||||
|
def _fallback(
|
||||||
|
self,
|
||||||
|
room_id: str,
|
||||||
|
store_override: Optional[str] = None,
|
||||||
|
) -> NodeResolution:
|
||||||
|
"""Resolve node without an explicit kwarg (store → env map → default)."""
|
||||||
|
if store_override is not None:
|
||||||
|
upper = store_override.upper()
|
||||||
|
if upper in self.allowed_nodes:
|
||||||
|
return NodeResolution(node_id=upper, source=NODE_SOURCE_STORE)
|
||||||
|
|
||||||
|
if room_id in self.room_node_map:
|
||||||
|
mapped = self.room_node_map[room_id].upper()
|
||||||
|
if mapped in self.allowed_nodes:
|
||||||
|
return NodeResolution(node_id=mapped, source=NODE_SOURCE_ROOM_MAP)
|
||||||
|
|
||||||
|
return NodeResolution(node_id=self.default_node, source=NODE_SOURCE_DEFAULT)
|
||||||
|
|
||||||
|
def as_info_dict(self) -> dict:
|
||||||
|
"""Return a safe dict for health/ops snapshots (no secrets)."""
|
||||||
|
return {
|
||||||
|
"default_node": self.default_node,
|
||||||
|
"allowed_nodes": sorted(self.allowed_nodes),
|
||||||
|
"room_overrides": len(self.room_node_map),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_node_policy(
|
||||||
|
raw_allowed: str,
|
||||||
|
default_node: str,
|
||||||
|
raw_room_map: str,
|
||||||
|
) -> NodePolicy:
|
||||||
|
"""
|
||||||
|
Parse node policy from env-style config strings.
|
||||||
|
|
||||||
|
raw_allowed: "NODA1,NODA2"
|
||||||
|
default_node: "NODA1"
|
||||||
|
raw_room_map: "!roomA:server=NODA2;!roomB:server=NODA1"
|
||||||
|
"""
|
||||||
|
default = default_node.strip().upper() or "NODA1"
|
||||||
|
|
||||||
|
allowed: FrozenSet[str] = frozenset(
|
||||||
|
n.strip().upper() for n in raw_allowed.split(",") if n.strip()
|
||||||
|
)
|
||||||
|
if not allowed:
|
||||||
|
allowed = frozenset([default])
|
||||||
|
elif default not in allowed:
|
||||||
|
# default must always be reachable
|
||||||
|
allowed = allowed | frozenset([default])
|
||||||
|
|
||||||
|
room_map: Dict[str, str] = {}
|
||||||
|
for entry in raw_room_map.split(";"):
|
||||||
|
entry = entry.strip()
|
||||||
|
if not entry or "=" not in entry:
|
||||||
|
continue
|
||||||
|
room_id_raw, node_raw = entry.split("=", 1)
|
||||||
|
room_id = room_id_raw.strip()
|
||||||
|
node = node_raw.strip().upper()
|
||||||
|
if room_id and node:
|
||||||
|
room_map[room_id] = node
|
||||||
|
|
||||||
|
return NodePolicy(
|
||||||
|
allowed_nodes=allowed,
|
||||||
|
default_node=default,
|
||||||
|
room_node_map=room_map,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_node_kwarg(text: str) -> Tuple[Optional[str], str]:
|
||||||
|
"""
|
||||||
|
Extract 'node=X' kwarg from message text.
|
||||||
|
|
||||||
|
Returns (node_id_or_None, cleaned_text_without_kwarg).
|
||||||
|
Preserves the rest of the message — no other transformations.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
"/sofiia node=NODA2 Hello!"
|
||||||
|
→ ("NODA2", "/sofiia Hello!")
|
||||||
|
"""
|
||||||
|
m = _NODE_KWARG_RE.search(text)
|
||||||
|
if m:
|
||||||
|
node = m.group(1).upper()
|
||||||
|
cleaned = _NODE_KWARG_RE.sub("", text, count=1)
|
||||||
|
# Collapse runs of whitespace introduced by the removal
|
||||||
|
cleaned = " ".join(cleaned.split())
|
||||||
|
return node, cleaned
|
||||||
|
return None, text
|
||||||
|
|
||||||
|
|
||||||
|
def node_rejected_reply(requested: str, allowed: FrozenSet[str]) -> str:
|
||||||
|
"""Reply when user requests a node not in the allowlist."""
|
||||||
|
allowed_list = ", ".join(f"`{n}`" for n in sorted(allowed))
|
||||||
|
return (
|
||||||
|
f"⚠️ Unknown node: `{requested}`\n"
|
||||||
|
f"Allowed: {allowed_list}\n"
|
||||||
|
f"_Example: `/sofiia node=NODA1 Hello!`_"
|
||||||
|
)
|
||||||
1007
services/matrix-bridge-dagi/app/policy_store.py
Normal file
1007
services/matrix-bridge-dagi/app/policy_store.py
Normal file
File diff suppressed because it is too large
Load Diff
149
services/matrix-bridge-dagi/app/sticky_cache.py
Normal file
149
services/matrix-bridge-dagi/app/sticky_cache.py
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
"""
|
||||||
|
StickyNodeCache — M8.1: anti-flap sticky routing after soft-failover.
|
||||||
|
|
||||||
|
After a successful failover (primary → fallback), the bridge remembers the
|
||||||
|
fallback node per room:agent pair for `ttl_s` seconds. Subsequent messages
|
||||||
|
for the same pair skip the primary entirely and go directly to the known-good
|
||||||
|
fallback, preventing oscillation ("flapping") while the primary recovers.
|
||||||
|
|
||||||
|
Key design
|
||||||
|
----------
|
||||||
|
key = "{room_id}:{agent_id}"
|
||||||
|
ttl = FAILOVER_STICKY_TTL_S (default 300 s)
|
||||||
|
|
||||||
|
Priority in routing (when source != explicit):
|
||||||
|
1. sticky cache (temporary)
|
||||||
|
2. store override (desired long-term policy)
|
||||||
|
3. env room_node_map
|
||||||
|
4. env default
|
||||||
|
|
||||||
|
Sticky expires naturally; recovery is automatic — no operator action needed.
|
||||||
|
If the sticky node also fails, the entry is removed and normal failover logic
|
||||||
|
takes over again.
|
||||||
|
|
||||||
|
Thread safety
|
||||||
|
-------------
|
||||||
|
Uses threading.RLock — safe to call from asyncio callbacks without to_thread.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_DEFAULT_TTL_S = 300.0
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class _StickyEntry:
|
||||||
|
node_id: str
|
||||||
|
expires_at: float # time.monotonic() deadline
|
||||||
|
|
||||||
|
|
||||||
|
class StickyNodeCache:
|
||||||
|
"""
|
||||||
|
In-memory sticky node preference cache.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
cache = StickyNodeCache(ttl_s=300)
|
||||||
|
|
||||||
|
# After successful failover:
|
||||||
|
cache.set("!room:srv:sofiia", "NODA2")
|
||||||
|
|
||||||
|
# Before routing the next message:
|
||||||
|
node = cache.get("!room:srv:sofiia") # → "NODA2" or None if expired/missing
|
||||||
|
|
||||||
|
# If sticky node also fails:
|
||||||
|
cache.delete("!room:srv:sofiia")
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, ttl_s: float = _DEFAULT_TTL_S) -> None:
|
||||||
|
if ttl_s <= 0:
|
||||||
|
raise ValueError(f"ttl_s must be > 0, got {ttl_s}")
|
||||||
|
self._ttl_s = ttl_s
|
||||||
|
self._cache: Dict[str, _StickyEntry] = {}
|
||||||
|
self._lock = threading.RLock()
|
||||||
|
|
||||||
|
# ── Public API ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def set(self, key: str, node_id: str, ttl_s: Optional[float] = None) -> None:
|
||||||
|
"""Set sticky preference; overwrites existing entry."""
|
||||||
|
ttl = ttl_s if ttl_s is not None else self._ttl_s
|
||||||
|
with self._lock:
|
||||||
|
self._cache[key] = _StickyEntry(
|
||||||
|
node_id=node_id,
|
||||||
|
expires_at=time.monotonic() + ttl,
|
||||||
|
)
|
||||||
|
logger.debug("StickyCache.set: key=%s node=%s ttl=%.0fs", key, node_id, ttl)
|
||||||
|
|
||||||
|
def get(self, key: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Return sticky node_id if entry exists and not expired; else None.
|
||||||
|
Expired entries are lazily removed on access.
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
entry = self._cache.get(key)
|
||||||
|
if entry is None:
|
||||||
|
return None
|
||||||
|
if time.monotonic() >= entry.expires_at:
|
||||||
|
del self._cache[key]
|
||||||
|
logger.debug("StickyCache.expired: key=%s node=%s", key, entry.node_id)
|
||||||
|
return None
|
||||||
|
return entry.node_id
|
||||||
|
|
||||||
|
def delete(self, key: str) -> bool:
|
||||||
|
"""Remove an entry. Returns True if it existed."""
|
||||||
|
with self._lock:
|
||||||
|
existed = key in self._cache
|
||||||
|
self._cache.pop(key, None)
|
||||||
|
if existed:
|
||||||
|
logger.debug("StickyCache.delete: key=%s", key)
|
||||||
|
return existed
|
||||||
|
|
||||||
|
def active_count(self) -> int:
|
||||||
|
"""Count of non-expired entries (best-effort; no eviction)."""
|
||||||
|
now = time.monotonic()
|
||||||
|
with self._lock:
|
||||||
|
return sum(1 for e in self._cache.values() if e.expires_at > now)
|
||||||
|
|
||||||
|
def active_entries(self) -> List[Tuple[str, str, float]]:
|
||||||
|
"""
|
||||||
|
Return (key, node_id, ttl_remaining_s) for all non-expired entries.
|
||||||
|
Useful for ops visibility in !status/!nodes.
|
||||||
|
"""
|
||||||
|
now = time.monotonic()
|
||||||
|
with self._lock:
|
||||||
|
result = []
|
||||||
|
for k, e in self._cache.items():
|
||||||
|
remaining = e.expires_at - now
|
||||||
|
if remaining > 0:
|
||||||
|
result.append((k, e.node_id, remaining))
|
||||||
|
return sorted(result, key=lambda x: x[0])
|
||||||
|
|
||||||
|
def cleanup(self) -> int:
|
||||||
|
"""
|
||||||
|
Remove all expired entries.
|
||||||
|
Call periodically (e.g. in a background task) to reclaim memory.
|
||||||
|
Returns count of removed entries.
|
||||||
|
"""
|
||||||
|
now = time.monotonic()
|
||||||
|
with self._lock:
|
||||||
|
expired_keys = [k for k, e in self._cache.items() if e.expires_at <= now]
|
||||||
|
for k in expired_keys:
|
||||||
|
del self._cache[k]
|
||||||
|
if expired_keys:
|
||||||
|
logger.debug("StickyCache.cleanup: removed %d expired entries", len(expired_keys))
|
||||||
|
return len(expired_keys)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def ttl_s(self) -> float:
|
||||||
|
return self._ttl_s
|
||||||
|
|
||||||
|
|
||||||
|
def make_sticky_key(room_id: str, agent_id: str) -> str:
|
||||||
|
"""Canonical sticky cache key for a room+agent pair."""
|
||||||
|
return f"{room_id}:{agent_id}"
|
||||||
@@ -4,3 +4,4 @@ httpx>=0.25.0
|
|||||||
python-dotenv>=1.0.0
|
python-dotenv>=1.0.0
|
||||||
prometheus-client>=0.20.0
|
prometheus-client>=0.20.0
|
||||||
pyyaml>=6.0
|
pyyaml>=6.0
|
||||||
|
aiosqlite>=0.19.0
|
||||||
|
|||||||
Reference in New Issue
Block a user