#!/usr/bin/env python3
"""
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi

Usage:
    python3 ops/scripts/matrix_bridge_soak.py \
        --url http://localhost:9400 \
        --messages 100 \
        --concurrency 4 \
        --report-file /tmp/soak_report.json

Requires: httpx (pip install httpx)

What it does:
  1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
     (or directly to the router if --direct-router is set).
  2. Measures latency (p50, p95, p99, max) per batch.
  3. After the run, fetches /metrics and extracts key counters:
     - matrix_bridge_queue_dropped_total
     - matrix_bridge_rate_limited_total
     - matrix_bridge_failover_total
     - matrix_bridge_sticky_node_total
     - matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
  4. Prints a human-readable report and optionally writes JSON.

Exit codes:
  0 = all pass criteria met
  1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
"""

import argparse
import asyncio
import itertools
import json
import os
import shutil
import subprocess
import sys
import time
from typing import Any, Dict, List, Optional

try:
    import httpx
except ImportError:
    print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
    sys.exit(2)

# ── Pass/fail defaults ─────────────────────────────────────────────────────────
_DEFAULT_MAX_P95_MS = 5000     # 5 s p95 per invoke (generous for cold start)
_DEFAULT_MAX_DROP_RATE = 0.01  # 1% queue drops allowed

# Monotonic suffix so concurrent workers never mint the same synthetic
# event_id (time.monotonic() alone has µs resolution and can collide).
_EVENT_SEQ = itertools.count()


# ── Metrics parsing ────────────────────────────────────────────────────────────
def _parse_counter(text: str, name: str) -> float:
    """Extract the last reported value of a Prometheus counter by name.

    Matches both unlabeled (``name 42``) and labeled (``name{...} 42``)
    sample lines; HELP/TYPE comment lines start with ``#`` and never match.
    Returns 0.0 when the counter is absent or unparsable.
    """
    for line in text.splitlines():
        if line.startswith(name + " ") or line.startswith(name + "{"):
            parts = line.rsplit(None, 1)
            try:
                return float(parts[-1])
            except (ValueError, IndexError):
                pass
    return 0.0


def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
    """
    Approximate histogram_quantile from _bucket lines.

    Uses linear interpolation within the bucket that crosses the target
    cumulative count, mirroring PromQL's histogram_quantile().
    NOTE: assumes a single label set per histogram — if the bridge exports
    the histogram with multiple label combinations, buckets with equal
    ``le`` values would be mixed together and the estimate is unreliable.

    Returns estimated value at given quantile or None if data missing.
    """
    buckets: List[tuple] = []
    total_count = 0.0
    for line in text.splitlines():
        if f"{name}_bucket" in line and 'le="' in line:
            try:
                le_part = line.split('le="')[1].split('"')[0]
                le = float(le_part) if le_part != "+Inf" else float("inf")
                val = float(line.rsplit(None, 1)[-1])
                buckets.append((le, val))
            except (ValueError, IndexError):
                pass
        elif (f"{name}_count " in line or (name + "_count{") in line):
            try:
                total_count = float(line.rsplit(None, 1)[-1])
            except (ValueError, IndexError):
                pass
    if not buckets or total_count == 0:
        return None
    buckets.sort()
    target = quantile * total_count
    prev_le, prev_count = 0.0, 0.0
    for le, count in buckets:
        if count >= target:
            if le == float("inf"):
                # Target falls in the open-ended bucket: the previous upper
                # bound is the best finite estimate available.
                return prev_le
            if count == prev_count:
                return le
            # Linear interpolation inside the crossing bucket.
            fraction = (target - prev_count) / (count - prev_count)
            return prev_le + fraction * (le - prev_le)
        prev_le, prev_count = le, count
    return prev_le


# ── Soak runner ────────────────────────────────────────────────────────────────
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the inject endpoint is reachable and enabled.
    Returns "" on success, error message on failure.
    """
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json={"room_id": room_id,
                  "event": {"event_id": "!preflight",
                            "sender": "@soak:test",
                            "content": {"msgtype": "m.text", "body": "ping"}}},
            timeout=5.0,
        )
        if resp.status_code == 403:
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                "   NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        data = resp.json()
        if not data.get("ok") and "no mapping" in data.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        return f"❌ Preflight failed: {exc}"


async def _check_wal(db_path: str) -> Dict[str, Any]:
    """
    Run WAL size + checkpoint check on the bridge policy DB.
    Returns dict with wal_bytes, wal_mb, checkpoint_result.
    Requires sqlite3 CLI on PATH; gracefully skips if unavailable.
    """
    result: Dict[str, Any] = {"db_path": db_path, "ok": False}
    wal_path = db_path + "-wal"
    try:
        wal_bytes = os.path.getsize(wal_path) if os.path.exists(wal_path) else 0
        result["wal_bytes"] = wal_bytes
        result["wal_mb"] = round(wal_bytes / 1_048_576, 2)
    except OSError:
        result["wal_bytes"] = -1
        result["wal_mb"] = -1
    if shutil.which("sqlite3"):
        try:
            cp = subprocess.run(
                ["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
                capture_output=True, text=True, timeout=5,
            )
            # Output: busy|log|checkpointed (3 ints)
            parts = cp.stdout.strip().split("|")
            if len(parts) == 3:
                result["wal_checkpoint"] = {
                    "busy": int(parts[0]),
                    "log": int(parts[1]),
                    "checkpointed": int(parts[2]),
                }
                result["ok"] = True
        except Exception:  # noqa: BLE001
            result["ok"] = False
    else:
        result["sqlite3_missing"] = True
    return result


async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.
    Returns (latency_ms: float, status_code: int, error: str|None).
    status_code 0 means the request never got an HTTP response.
    """
    payload = {
        "room_id": room_id,
        "event": {
            # Sequence counter appended so concurrent sends cannot collide
            # on the µs timestamp alone (duplicate ids could be deduped by
            # the bridge and silently shrink the soak).
            "event_id": f"!soak-{next(_EVENT_SEQ)}-{int(time.monotonic() * 1e6)}",
            "sender": sender,
            "type": "m.room.message",
            "content": {"msgtype": "m.text", "body": message},
        },
    }
    t0 = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=payload,
            timeout=30.0,
        )
        latency_ms = (time.monotonic() - t0) * 1000
        if resp.status_code >= 500:
            return latency_ms, resp.status_code, f"HTTP {resp.status_code}"
        return latency_ms, resp.status_code, None
    except httpx.TimeoutException:
        latency_ms = (time.monotonic() - t0) * 1000
        return latency_ms, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        latency_ms = (time.monotonic() - t0) * 1000
        return latency_ms, 0, str(exc)


async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """GET /health; returns {} on any failure (best-effort probe)."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/health", timeout=10.0)
        return resp.json() if resp.status_code == 200 else {}
    except Exception:  # noqa: BLE001
        return {}


async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """GET /metrics as raw Prometheus text; returns "" on any failure."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0)
        return resp.text if resp.status_code == 200 else ""
    except Exception:  # noqa: BLE001
        return ""


def _percentile(values: List[float], p: float) -> float:
    """Nearest-rank percentile (0..100) of values; 0.0 for empty input."""
    if not values:
        return 0.0
    sv = sorted(values)
    idx = int(len(sv) * p / 100)
    return sv[min(idx, len(sv) - 1)]


async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """Drive the full soak run and return the report dict.

    On preflight failure returns a short dict containing only
    ok/error/passed/failures (no summary/latency sections).
    """
    results: List[tuple] = []
    semaphore = asyncio.Semaphore(concurrency)
    async with httpx.AsyncClient() as client:
        # Pre-check: inject endpoint + health
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {"ok": False, "error": preflight_err,
                    "passed": False, "failures": [preflight_err]}

        # WAL check before soak
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")

        # Pre-check: health + baseline counters (so we can report deltas)
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)
        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
        sticky_before = _parse_counter(metrics_before, "matrix_bridge_sticky_node_total")
        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")

        t_start = time.monotonic()

        async def worker(i: int):
            async with semaphore:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of messages.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f"  [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")

        await asyncio.gather(*[worker(i) for i in range(n_messages)])
        elapsed_s = time.monotonic() - t_start

        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)

        # WAL check after soak
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb',0) - wal_before.get('wal_mb',0), 2)} MB)")

    latencies = [r[0] for r in results]
    errors = [r for r in results if r[2] is not None]
    successes = len(results) - len(errors)
    error_rate = len(errors) / len(results) if results else 0.0

    drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
    rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
    fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
    sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")

    delta_drops = drops_after - drops_before
    delta_rl = rl_after - rl_before
    delta_fo = fo_after - fo_before
    # FIX: report the sticky-node delta like the other counters in
    # metrics_delta; previously the absolute after-value was reported.
    delta_sticky = sticky_after - sticky_before

    p50 = _percentile(latencies, 50)
    p95 = _percentile(latencies, 95)
    p99 = _percentile(latencies, 99)
    p_max = max(latencies) if latencies else 0.0

    # Histogram quantile from Prometheus
    hist_p95 = _parse_histogram_quantile(
        metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
    )
    hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None

    drop_rate = delta_drops / len(results) if results else 0.0

    report = {
        "wal": {
            "before_mb": wal_before.get("wal_mb"),
            "after_mb": wal_after.get("wal_mb"),
            "delta_mb": round(
                (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
            ) if wal_before and wal_after else None,
            "checkpoint_after": wal_after.get("wal_checkpoint"),
            "threshold_mb": 10,
        },
        "summary": {
            "total_messages": n_messages,
            "concurrency": concurrency,
            "elapsed_s": round(elapsed_s, 2),
            "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
            "successes": successes,
            "errors": len(errors),
            "error_rate": round(error_rate, 4),
        },
        "latency_ms": {
            "p50": round(p50, 1),
            "p95": round(p95, 1),
            "p99": round(p99, 1),
            "max": round(p_max, 1),
        },
        "metrics_delta": {
            "queue_drops": int(delta_drops),
            "rate_limited": int(delta_rl),
            "failovers": int(delta_fo),
            "sticky_sets": int(delta_sticky),
            "drop_rate": round(drop_rate, 4),
        },
        # FIX: `is not None` — a genuine 0.0ms p95 must not collapse to None.
        "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms is not None else None,
        "health_before": health_before.get("ok"),
        "health_after": health_after.get("ok"),
        "pass_criteria": {
            "max_p95_ms": max_p95_ms,
            "max_drop_rate": max_drop_rate,
        },
    }

    # Pass/fail evaluation
    failures = []
    if p95 > max_p95_ms:
        failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
    if drop_rate > max_drop_rate:
        failures.append(
            f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
        )
    wal_delta = report["wal"]["delta_mb"]
    if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
        failures.append(
            f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
            "— possible SQLite write pressure (Bottleneck #2)"
        )

    report["passed"] = len(failures) == 0
    report["failures"] = failures
    return report


def _print_report(r: Dict[str, Any]) -> None:
    """Pretty-print a soak report (handles the short preflight-error dict)."""
    # FIX: run_soak returns a dict without summary/latency/metrics sections
    # when preflight fails; previously this crashed with KeyError.
    if "summary" not in r:
        print()
        print("=" * 60)
        print("  matrix-bridge-dagi Soak Report ❌ FAILED")
        print("=" * 60)
        for f in r.get("failures", []):
            print(f"  ❌ {f}")
        print("=" * 60)
        return

    s = r["summary"]
    l = r["latency_ms"]
    m = r["metrics_delta"]
    passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
    w = r.get("wal", {})
    print()
    print("=" * 60)
    print(f"  matrix-bridge-dagi Soak Report   {passed}")
    print("=" * 60)
    print(f"  Messages:     {s['total_messages']}  concurrency={s['concurrency']}")
    print(f"  Elapsed:      {s['elapsed_s']}s  ({s['throughput_rps']} rps)")
    print(f"  Successes:    {s['successes']}  errors={s['errors']}  ({s['error_rate']:.1%})")
    print()
    print(f"  Latency (client-side):  p50={l['p50']}ms  p95={l['p95']}ms  "
          f"p99={l['p99']}ms  max={l['max']}ms")
    if r["prometheus_invoke_p95_ms"] is not None:
        print(f"  Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
    print()
    print(f"  Queue drops:   {m['queue_drops']}  (rate {m['drop_rate']:.3%})")
    print(f"  Rate-limited:  {m['rate_limited']}")
    print(f"  Failovers:     {m['failovers']}")
    print(f"  Sticky sets:   {m['sticky_sets']}")
    if w.get("before_mb") is not None:
        wal_delta_str = (
            f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
        )
        wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
        print(f"  WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
    print()
    if r["failures"]:
        for f in r["failures"]:
            print(f"  ❌ {f}")
    else:
        print("  All pass criteria met.")
    print("=" * 60)


def main() -> int:
    """CLI entry point; returns process exit code (0 pass / 1 fail)."""
    parser = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
    parser.add_argument("--url", default="http://localhost:9400",
                        help="Bridge base URL (default: http://localhost:9400)")
    parser.add_argument("--messages", type=int, default=100,
                        help="Total messages to send (default: 100)")
    parser.add_argument("--concurrency", type=int, default=4,
                        help="Concurrent requests (default: 4)")
    parser.add_argument("--agent-id", default="sofiia",
                        help="Agent id for synthetic events (default: sofiia)")
    parser.add_argument("--room-id", default="!soak-room:home.invalid",
                        help="Room id for synthetic events")
    parser.add_argument("--sender", default="@soak-user:home.invalid",
                        help="Sender for synthetic events")
    parser.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
                        help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
    parser.add_argument("--max-drop-rate", type=float, default=_DEFAULT_MAX_DROP_RATE,
                        help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
    parser.add_argument("--report-file", default="",
                        help="Optional path to write JSON report")
    parser.add_argument("--db-path", default="",
                        help="Path to policy_store.db for WAL check "
                             "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
    args = parser.parse_args()

    report = asyncio.run(run_soak(
        url=args.url,
        n_messages=args.messages,
        concurrency=args.concurrency,
        agent_id=args.agent_id,
        room_id=args.room_id,
        sender=args.sender,
        max_p95_ms=args.max_p95_ms,
        max_drop_rate=args.max_drop_rate,
        db_path=args.db_path,
    ))

    _print_report(report)

    if args.report_file:
        with open(args.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n  Report saved: {args.report_file}")

    return 0 if report["passed"] else 1


if __name__ == "__main__":
    sys.exit(main())