#!/usr/bin/env python3
|
|
"""
|
|
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
|
|
|
|
Usage:
|
|
python3 ops/scripts/matrix_bridge_soak.py \
|
|
--url http://localhost:9400 \
|
|
--messages 100 \
|
|
--concurrency 4 \
|
|
--report-file /tmp/soak_report.json
|
|
|
|
Requires: httpx (pip install httpx)
|
|
|
|
What it does:
|
|
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
|
|
(or directly to the router if --direct-router is set).
|
|
2. Measures latency (p50, p95, p99, max) per batch.
|
|
3. After the run, fetches /metrics and extracts key counters:
|
|
- matrix_bridge_queue_dropped_total
|
|
- matrix_bridge_rate_limited_total
|
|
- matrix_bridge_failover_total
|
|
- matrix_bridge_sticky_node_total
|
|
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
|
|
4. Prints a human-readable report and optionally writes JSON.
|
|
|
|
Exit codes:
|
|
0 = all pass criteria met
|
|
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
|
|
"""
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
try:
|
|
import httpx
|
|
except ImportError:
|
|
print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
|
|
sys.exit(2)
|
|
|
|
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
|
|
_DEFAULT_MAX_P95_MS = 5000 # 5 s p95 per invoke (generous for cold start)
|
|
_DEFAULT_MAX_DROP_RATE = 0.01 # 1% queue drops allowed
|
|
|
|
|
|
# ── Metrics parsing ────────────────────────────────────────────────────────────
|
|
def _parse_counter(text: str, name: str) -> float:
|
|
"""Extract the last reported value of a Prometheus counter by name."""
|
|
for line in text.splitlines():
|
|
if line.startswith(name + " ") or line.startswith(name + "{"):
|
|
parts = line.rsplit(None, 1)
|
|
try:
|
|
return float(parts[-1])
|
|
except (ValueError, IndexError):
|
|
pass
|
|
return 0.0
|
|
|
|
|
|
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
|
|
"""
|
|
Approximate histogram_quantile from _bucket lines.
|
|
Returns estimated value at given quantile or None if data missing.
|
|
"""
|
|
buckets: List[tuple] = []
|
|
total_count = 0.0
|
|
for line in text.splitlines():
|
|
if f"{name}_bucket" in line and 'le="' in line:
|
|
try:
|
|
le_part = line.split('le="')[1].split('"')[0]
|
|
le = float(le_part) if le_part != "+Inf" else float("inf")
|
|
val = float(line.rsplit(None, 1)[-1])
|
|
buckets.append((le, val))
|
|
except (ValueError, IndexError):
|
|
pass
|
|
elif (f"{name}_count " in line or (name + "_count{") in line):
|
|
try:
|
|
total_count = float(line.rsplit(None, 1)[-1])
|
|
except (ValueError, IndexError):
|
|
pass
|
|
|
|
if not buckets or total_count == 0:
|
|
return None
|
|
|
|
buckets.sort()
|
|
target = quantile * total_count
|
|
prev_le, prev_count = 0.0, 0.0
|
|
for le, count in buckets:
|
|
if count >= target:
|
|
if le == float("inf"):
|
|
return prev_le
|
|
# Linear interpolation
|
|
if count == prev_count:
|
|
return le
|
|
fraction = (target - prev_count) / (count - prev_count)
|
|
return prev_le + fraction * (le - prev_le)
|
|
prev_le, prev_count = le, count
|
|
return prev_le
|
|
|
|
|
|
# ── Soak runner ────────────────────────────────────────────────────────────────
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the inject endpoint is reachable and enabled.

    Posts a tiny probe event to /v1/debug/inject_event and inspects the
    response. Returns "" on success, a human-readable error message on failure.
    """
    probe = {
        "room_id": room_id,
        "event": {
            "event_id": "!preflight",
            "sender": "@soak:test",
            "content": {"msgtype": "m.text", "body": "ping"},
        },
    }
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=probe,
            timeout=5.0,
        )
        # 403 means the debug endpoint exists but is switched off.
        if resp.status_code == 403:
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                "    NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        body = resp.json()
        # Bridge accepts the call but has no route for this room.
        if not body.get("ok") and "no mapping" in body.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        return f"❌ Preflight failed: {exc}"
async def _check_wal(db_path: str) -> Dict[str, Any]:
    """
    Run WAL size + checkpoint check on the bridge policy DB.

    Returns a dict with wal_bytes, wal_mb and (when the sqlite3 CLI is on
    PATH) the parsed PASSIVE checkpoint result; gracefully skips the
    checkpoint step if the CLI is unavailable.
    """
    import shutil
    import subprocess

    info: Dict[str, Any] = {"db_path": db_path, "ok": False}

    # SQLite keeps the write-ahead log in a "<db>-wal" sidecar file.
    sidecar = db_path + "-wal"
    try:
        size = os.path.getsize(sidecar) if os.path.exists(sidecar) else 0
        info["wal_bytes"] = size
        info["wal_mb"] = round(size / 1_048_576, 2)
    except OSError:
        info["wal_bytes"] = -1
        info["wal_mb"] = -1

    if not shutil.which("sqlite3"):
        info["sqlite3_missing"] = True
        return info

    try:
        proc = subprocess.run(
            ["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
            capture_output=True, text=True, timeout=5,
        )
        # Checkpoint output format: "busy|log|checkpointed" (three ints).
        fields = proc.stdout.strip().split("|")
        if len(fields) == 3:
            info["wal_checkpoint"] = {
                "busy": int(fields[0]),
                "log": int(fields[1]),
                "checkpointed": int(fields[2]),
            }
            info["ok"] = True
    except Exception:  # noqa: BLE001
        info["ok"] = False

    return info
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.

    Returns (latency_ms: float, status_code: int, error: str|None);
    status_code is 0 when the request never completed (timeout/transport).
    """
    # Monotonic-derived microsecond suffix keeps event ids unique per run.
    synthetic_event = {
        "room_id": room_id,
        "event": {
            "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
            "sender": sender,
            "type": "m.room.message",
            "content": {"msgtype": "m.text", "body": message},
        },
    }
    started = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=synthetic_event,
            timeout=30.0,
        )
    except httpx.TimeoutException:
        return (time.monotonic() - started) * 1000, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        return (time.monotonic() - started) * 1000, 0, str(exc)

    elapsed_ms = (time.monotonic() - started) * 1000
    if resp.status_code >= 500:
        return elapsed_ms, resp.status_code, f"HTTP {resp.status_code}"
    return elapsed_ms, resp.status_code, None
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """Return the bridge /health JSON, or {} on any failure (incl. bad JSON)."""
    try:
        # json() stays inside the try: a malformed body must also yield {}.
        resp = await client.get(f"{url.rstrip('/')}/health", timeout=10.0)
        if resp.status_code == 200:
            return resp.json()
        return {}
    except Exception:  # noqa: BLE001
        return {}
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """Return the raw Prometheus exposition text from /metrics, or "" on failure."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0)
        if resp.status_code != 200:
            return ""
        return resp.text
    except Exception:  # noqa: BLE001
        return ""
def _percentile(values: List[float], p: float) -> float:
|
|
if not values:
|
|
return 0.0
|
|
sv = sorted(values)
|
|
idx = int(len(sv) * p / 100)
|
|
return sv[min(idx, len(sv) - 1)]
|
|
|
|
|
|
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """
    Drive the soak: preflight, inject n_messages concurrently, then diff
    Prometheus counters and evaluate pass/fail thresholds.

    Args:
        url: bridge base URL.
        n_messages: total synthetic events to inject.
        concurrency: max in-flight requests (semaphore bound).
        agent_id: agent id forwarded to _send_one.
        room_id / sender: identifiers stamped on every synthetic event.
        max_p95_ms: client-side p95 latency threshold in ms.
        max_drop_rate: allowed queue-drop rate (0..1) relative to messages sent.
        db_path: optional policy DB path; enables before/after WAL checks.

    Returns:
        Full report dict (wal / summary / latency_ms / metrics_delta /
        prometheus_invoke_p95_ms / health / pass_criteria / passed /
        failures). On preflight failure, a short dict with
        ok / error / passed / failures only.
    """
    results: List[tuple] = []
    gate = asyncio.Semaphore(concurrency)

    async with httpx.AsyncClient() as client:
        # Fail fast if the inject endpoint is unreachable or disabled.
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}

        # WAL size before soak (optional, needs --db-path).
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")

        # Snapshot health + counters so post-run values become true deltas.
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)

        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
        # FIX: snapshot sticky too — it was previously reported as an ABSOLUTE
        # counter value inside "metrics_delta", unlike every sibling entry.
        sticky_before = _parse_counter(metrics_before, "matrix_bridge_sticky_node_total")

        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")

        t_start = time.monotonic()

        async def worker(i: int):
            # Semaphore caps in-flight requests at --concurrency.
            async with gate:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f"  [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")

        await asyncio.gather(*[worker(i) for i in range(n_messages)])

        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)

        # WAL size after soak (optional).
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb', 0) - wal_before.get('wal_mb', 0), 2)} MB)")

        latencies = [r[0] for r in results]
        errors = [r for r in results if r[2] is not None]
        successes = len(results) - len(errors)
        error_rate = len(errors) / len(results) if results else 0.0

        drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
        rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
        fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
        sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")

        delta_drops = drops_after - drops_before
        delta_rl = rl_after - rl_before
        delta_fo = fo_after - fo_before
        delta_sticky = sticky_after - sticky_before  # FIX: was absolute, not a delta

        p50 = _percentile(latencies, 50)
        p95 = _percentile(latencies, 95)
        p99 = _percentile(latencies, 99)
        p_max = max(latencies) if latencies else 0.0

        # Server-side p95 estimated from Prometheus histogram buckets.
        hist_p95 = _parse_histogram_quantile(
            metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
        )
        hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None

        drop_rate = delta_drops / len(results) if results else 0.0

        report = {
            "wal": {
                "before_mb": wal_before.get("wal_mb"),
                "after_mb": wal_after.get("wal_mb"),
                "delta_mb": round(
                    (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
                ) if wal_before and wal_after else None,
                "checkpoint_after": wal_after.get("wal_checkpoint"),
                "threshold_mb": 10,
            },
            "summary": {
                "total_messages": n_messages,
                "concurrency": concurrency,
                "elapsed_s": round(elapsed_s, 2),
                "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
                "successes": successes,
                "errors": len(errors),
                "error_rate": round(error_rate, 4),
            },
            "latency_ms": {
                "p50": round(p50, 1),
                "p95": round(p95, 1),
                "p99": round(p99, 1),
                "max": round(p_max, 1),
            },
            "metrics_delta": {
                "queue_drops": int(delta_drops),
                "rate_limited": int(delta_rl),
                "failovers": int(delta_fo),
                "sticky_sets": int(delta_sticky),
                "drop_rate": round(drop_rate, 4),
            },
            # FIX: `if hist_p95_ms` collapsed a legitimate 0.0 to None; only a
            # missing histogram should yield None.
            "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms is not None else None,
            "health_before": health_before.get("ok"),
            "health_after": health_after.get("ok"),
            "pass_criteria": {
                "max_p95_ms": max_p95_ms,
                "max_drop_rate": max_drop_rate,
            },
        }

        # Pass/fail evaluation against the configured thresholds.
        failures: List[str] = []
        if p95 > max_p95_ms:
            failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
        if drop_rate > max_drop_rate:
            failures.append(
                f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
            )
        wal_delta = report["wal"]["delta_mb"]
        if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
            failures.append(
                f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
                "— possible SQLite write pressure (Bottleneck #2)"
            )

        report["passed"] = len(failures) == 0
        report["failures"] = failures
        return report
def _print_report(r: Dict[str, Any]) -> None:
|
|
s = r["summary"]
|
|
l = r["latency_ms"]
|
|
m = r["metrics_delta"]
|
|
passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
|
|
|
|
w = r.get("wal", {})
|
|
print()
|
|
print("=" * 60)
|
|
print(f" matrix-bridge-dagi Soak Report {passed}")
|
|
print("=" * 60)
|
|
print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
|
|
print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
|
|
print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
|
|
print()
|
|
print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
|
|
f"p99={l['p99']}ms max={l['max']}ms")
|
|
if r["prometheus_invoke_p95_ms"] is not None:
|
|
print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
|
|
print()
|
|
print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
|
|
print(f" Rate-limited: {m['rate_limited']}")
|
|
print(f" Failovers: {m['failovers']}")
|
|
print(f" Sticky sets: {m['sticky_sets']}")
|
|
if w.get("before_mb") is not None:
|
|
wal_delta_str = (
|
|
f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
|
|
)
|
|
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
|
|
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
|
|
print()
|
|
if r["failures"]:
|
|
for f in r["failures"]:
|
|
print(f" ❌ {f}")
|
|
else:
|
|
print(" All pass criteria met.")
|
|
print("=" * 60)
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: parse arguments, run the soak, print/save the report.

    Returns 0 when all pass criteria are met, 1 otherwise.
    """
    ap = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
    # Target + load shape
    ap.add_argument("--url", default="http://localhost:9400",
                    help="Bridge base URL (default: http://localhost:9400)")
    ap.add_argument("--messages", type=int, default=100,
                    help="Total messages to send (default: 100)")
    ap.add_argument("--concurrency", type=int, default=4,
                    help="Concurrent requests (default: 4)")
    # Synthetic event identity
    ap.add_argument("--agent-id", default="sofiia",
                    help="Agent id for synthetic events (default: sofiia)")
    ap.add_argument("--room-id", default="!soak-room:home.invalid",
                    help="Room id for synthetic events")
    ap.add_argument("--sender", default="@soak-user:home.invalid",
                    help="Sender for synthetic events")
    # Pass/fail thresholds
    ap.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
                    help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
    ap.add_argument("--max-drop-rate", type=float, default=_DEFAULT_MAX_DROP_RATE,
                    help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
    # Outputs / extras
    ap.add_argument("--report-file", default="",
                    help="Optional path to write JSON report")
    ap.add_argument("--db-path", default="",
                    help="Path to policy_store.db for WAL check "
                         "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
    args = ap.parse_args()

    report = asyncio.run(run_soak(
        url=args.url,
        n_messages=args.messages,
        concurrency=args.concurrency,
        agent_id=args.agent_id,
        room_id=args.room_id,
        sender=args.sender,
        max_p95_ms=args.max_p95_ms,
        max_drop_rate=args.max_drop_rate,
        db_path=args.db_path,
    ))
    _print_report(report)

    if args.report_file:
        with open(args.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n Report saved: {args.report_file}")

    return 0 if report["passed"] else 1


if __name__ == "__main__":
    sys.exit(main())
|