Files
microdao-daarion/ops/scripts/matrix_bridge_soak.py
2026-03-05 07:56:30 -08:00

478 lines
18 KiB
Python

#!/usr/bin/env python3
"""
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
Usage:
python3 ops/scripts/matrix_bridge_soak.py \
--url http://localhost:9400 \
--messages 100 \
--concurrency 4 \
--report-file /tmp/soak_report.json
Requires: httpx (pip install httpx)
What it does:
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
(or directly to the router if --direct-router is set).
2. Measures latency (p50, p95, p99, max) per batch.
3. After the run, fetches /metrics and extracts key counters:
- matrix_bridge_queue_dropped_total
- matrix_bridge_rate_limited_total
- matrix_bridge_failover_total
- matrix_bridge_sticky_node_total
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
4. Prints a human-readable report and optionally writes JSON.
Exit codes:
0 = all pass criteria met
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
"""
import argparse
import asyncio
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional
try:
import httpx
except ImportError:
print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
sys.exit(2)
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
_DEFAULT_MAX_P95_MS = 5000 # 5 s p95 per invoke (generous for cold start)
_DEFAULT_MAX_DROP_RATE = 0.01 # 1% queue drops allowed
# ── Metrics parsing ────────────────────────────────────────────────────────────
def _parse_counter(text: str, name: str) -> float:
"""Extract the last reported value of a Prometheus counter by name."""
for line in text.splitlines():
if line.startswith(name + " ") or line.startswith(name + "{"):
parts = line.rsplit(None, 1)
try:
return float(parts[-1])
except (ValueError, IndexError):
pass
return 0.0
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
"""
Approximate histogram_quantile from _bucket lines.
Returns estimated value at given quantile or None if data missing.
"""
buckets: List[tuple] = []
total_count = 0.0
for line in text.splitlines():
if f"{name}_bucket" in line and 'le="' in line:
try:
le_part = line.split('le="')[1].split('"')[0]
le = float(le_part) if le_part != "+Inf" else float("inf")
val = float(line.rsplit(None, 1)[-1])
buckets.append((le, val))
except (ValueError, IndexError):
pass
elif (f"{name}_count " in line or (name + "_count{") in line):
try:
total_count = float(line.rsplit(None, 1)[-1])
except (ValueError, IndexError):
pass
if not buckets or total_count == 0:
return None
buckets.sort()
target = quantile * total_count
prev_le, prev_count = 0.0, 0.0
for le, count in buckets:
if count >= target:
if le == float("inf"):
return prev_le
# Linear interpolation
if count == prev_count:
return le
fraction = (target - prev_count) / (count - prev_count)
return prev_le + fraction * (le - prev_le)
prev_le, prev_count = le, count
return prev_le
# ── Soak runner ────────────────────────────────────────────────────────────────
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the debug inject endpoint is reachable and enabled.

    Returns "" on success, or a human-readable error message on failure.
    """
    probe = {
        "room_id": room_id,
        "event": {
            "event_id": "!preflight",
            "sender": "@soak:test",
            "content": {"msgtype": "m.text", "body": "ping"},
        },
    }
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=probe,
            timeout=5.0,
        )
        if resp.status_code == 403:
            # Bridge refuses injection unless the debug flag is set.
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                " NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        body = resp.json()
        if not body.get("ok") and "no mapping" in body.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        return f"❌ Preflight failed: {exc}"
async def _check_wal(db_path: str) -> Dict[str, Any]:
"""
Run WAL size + checkpoint check on the bridge policy DB.
Returns dict with wal_bytes, wal_mb, checkpoint_result.
Requires sqlite3 CLI on PATH; gracefully skips if unavailable.
"""
import subprocess, shutil
result: Dict[str, Any] = {"db_path": db_path, "ok": False}
wal_path = db_path + "-wal"
try:
wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
result["wal_bytes"] = wal_bytes
result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
except OSError:
result["wal_bytes"] = -1
result["wal_mb"] = -1
if shutil.which("sqlite3"):
try:
cp = subprocess.run(
["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
capture_output=True, text=True, timeout=5,
)
# Output: busy|log|checkpointed (3 ints)
parts = cp.stdout.strip().split("|")
if len(parts) == 3:
result["wal_checkpoint"] = {
"busy": int(parts[0]), "log": int(parts[1]), "checkpointed": int(parts[2]),
}
result["ok"] = True
except Exception: # noqa: BLE001
result["ok"] = False
else:
result["sqlite3_missing"] = True
return result
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.

    Returns (latency_ms: float, status_code: int, error: str|None); a
    status_code of 0 means the request never completed (timeout or
    transport error).
    """
    synthetic_event = {
        # Monotonic-microsecond suffix keeps event ids unique within a run.
        "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
        "sender": sender,
        "type": "m.room.message",
        "content": {"msgtype": "m.text", "body": message},
    }
    started = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id, "event": synthetic_event},
            timeout=30.0,
        )
    except httpx.TimeoutException:
        return (time.monotonic() - started) * 1000, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        return (time.monotonic() - started) * 1000, 0, str(exc)
    elapsed_ms = (time.monotonic() - started) * 1000
    if resp.status_code >= 500:
        return elapsed_ms, resp.status_code, f"HTTP {resp.status_code}"
    return elapsed_ms, resp.status_code, None
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """GET /health and return its JSON body; {} on non-200 or any error."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/health", timeout=10.0)
        if resp.status_code == 200:
            return resp.json()
        return {}
    except Exception:  # noqa: BLE001
        return {}
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """GET /metrics and return the raw exposition text; "" on non-200 or any error."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0)
        if resp.status_code == 200:
            return resp.text
        return ""
    except Exception:  # noqa: BLE001
        return ""
def _percentile(values: List[float], p: float) -> float:
if not values:
return 0.0
sv = sorted(values)
idx = int(len(sv) * p / 100)
return sv[min(idx, len(sv) - 1)]
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """
    Drive the full soak: preflight, inject n_messages synthetic events with
    bounded concurrency, then diff health/metrics/WAL snapshots taken
    before and after the run.

    Returns the report dict consumed by _print_report, with keys: wal,
    summary, latency_ms, metrics_delta, prometheus_invoke_p95_ms,
    health_before/health_after, pass_criteria, passed, failures.  On a
    failed preflight a short error report is returned instead.

    Fix: prometheus_invoke_p95_ms now uses an ``is not None`` check — the
    original falsy test turned a legitimate 0.0 ms histogram p95 into
    None, inconsistent with the ``is not None`` guard in _print_report.
    """
    results: List[tuple] = []
    # Semaphore caps in-flight requests at `concurrency`.
    semaphore = asyncio.Semaphore(concurrency)
    async with httpx.AsyncClient() as client:
        # Pre-check: inject endpoint must be reachable and enabled.
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}
        # WAL snapshot before soak (optional, only with --db-path).
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")
        # Snapshot health + counters so the deltas isolate this run's effect.
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)
        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")
        t_start = time.monotonic()

        async def worker(i: int):
            # One injected message per worker; appends to the shared
            # results list (safe: asyncio is single-threaded).
            async with semaphore:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f" [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")

        await asyncio.gather(*[worker(i) for i in range(n_messages)])
        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)
        # WAL snapshot after soak.
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)")
    # Everything below is pure aggregation — no client needed anymore.
    latencies = [r[0] for r in results]
    errors = [r for r in results if r[2] is not None]
    successes = len(results) - len(errors)
    error_rate = len(errors) / len(results) if results else 0.0
    drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
    rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
    fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
    sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")
    delta_drops = drops_after - drops_before
    delta_rl = rl_after - rl_before
    delta_fo = fo_after - fo_before
    p50 = _percentile(latencies, 50)
    p95 = _percentile(latencies, 95)
    p99 = _percentile(latencies, 99)
    p_max = max(latencies) if latencies else 0.0
    # Server-side invoke latency, estimated from the Prometheus histogram.
    hist_p95 = _parse_histogram_quantile(
        metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
    )
    hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None
    drop_rate = delta_drops / len(results) if results else 0.0
    report = {
        "wal": {
            "before_mb": wal_before.get("wal_mb"),
            "after_mb": wal_after.get("wal_mb"),
            "delta_mb": round(
                (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
            ) if wal_before and wal_after else None,
            "checkpoint_after": wal_after.get("wal_checkpoint"),
            "threshold_mb": 10,
        },
        "summary": {
            "total_messages": n_messages,
            "concurrency": concurrency,
            "elapsed_s": round(elapsed_s, 2),
            "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
            "successes": successes,
            "errors": len(errors),
            "error_rate": round(error_rate, 4),
        },
        "latency_ms": {
            "p50": round(p50, 1),
            "p95": round(p95, 1),
            "p99": round(p99, 1),
            "max": round(p_max, 1),
        },
        "metrics_delta": {
            "queue_drops": int(delta_drops),
            "rate_limited": int(delta_rl),
            "failovers": int(delta_fo),
            "sticky_sets": int(sticky_after),
            "drop_rate": round(drop_rate, 4),
        },
        # BUGFIX: `is not None` instead of truthiness, so a 0.0 ms p95
        # is reported rather than silently dropped.
        "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms is not None else None,
        "health_before": health_before.get("ok"),
        "health_after": health_after.get("ok"),
        "pass_criteria": {
            "max_p95_ms": max_p95_ms,
            "max_drop_rate": max_drop_rate,
        },
    }
    # Pass/fail evaluation against client-side latency and drop-rate thresholds.
    failures = []
    if p95 > max_p95_ms:
        failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
    if drop_rate > max_drop_rate:
        failures.append(
            f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
        )
    wal_delta = report["wal"]["delta_mb"]
    if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
        failures.append(
            f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
            "— possible SQLite write pressure (Bottleneck #2)"
        )
    report["passed"] = len(failures) == 0
    report["failures"] = failures
    return report
def _print_report(r: Dict[str, Any]) -> None:
s = r["summary"]
l = r["latency_ms"]
m = r["metrics_delta"]
passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
w = r.get("wal", {})
print()
print("=" * 60)
print(f" matrix-bridge-dagi Soak Report {passed}")
print("=" * 60)
print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
print()
print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
f"p99={l['p99']}ms max={l['max']}ms")
if r["prometheus_invoke_p95_ms"] is not None:
print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
print()
print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
print(f" Rate-limited: {m['rate_limited']}")
print(f" Failovers: {m['failovers']}")
print(f" Sticky sets: {m['sticky_sets']}")
if w.get("before_mb") is not None:
wal_delta_str = (
f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
)
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
print()
if r["failures"]:
for f in r["failures"]:
print(f"{f}")
else:
print(" All pass criteria met.")
print("=" * 60)
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the soak script."""
    parser = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
    parser.add_argument("--url", default="http://localhost:9400",
                        help="Bridge base URL (default: http://localhost:9400)")
    parser.add_argument("--messages", type=int, default=100,
                        help="Total messages to send (default: 100)")
    parser.add_argument("--concurrency", type=int, default=4,
                        help="Concurrent requests (default: 4)")
    parser.add_argument("--agent-id", default="sofiia",
                        help="Agent id for synthetic events (default: sofiia)")
    parser.add_argument("--room-id", default="!soak-room:home.invalid",
                        help="Room id for synthetic events")
    parser.add_argument("--sender", default="@soak-user:home.invalid",
                        help="Sender for synthetic events")
    parser.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
                        help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
    parser.add_argument("--max-drop-rate", type=float, default=_DEFAULT_MAX_DROP_RATE,
                        help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
    parser.add_argument("--report-file", default="",
                        help="Optional path to write JSON report")
    parser.add_argument("--db-path", default="",
                        help="Path to policy_store.db for WAL check "
                             "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
    return parser


def main() -> int:
    """Parse CLI args, run the soak, print (and optionally save) the report.

    Returns 0 when all pass criteria are met, 1 otherwise.
    """
    args = _build_parser().parse_args()
    report = asyncio.run(run_soak(
        url=args.url,
        n_messages=args.messages,
        concurrency=args.concurrency,
        agent_id=args.agent_id,
        room_id=args.room_id,
        sender=args.sender,
        max_p95_ms=args.max_p95_ms,
        max_drop_rate=args.max_drop_rate,
        db_path=args.db_path,
    ))
    _print_report(report)
    if args.report_file:
        with open(args.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n Report saved: {args.report_file}")
    return 0 if report["passed"] else 1


if __name__ == "__main__":
    sys.exit(main())