#!/usr/bin/env python3
|
|
"""
|
|
matrix_bridge_soak.py — M11 live soak script for matrix-bridge-dagi
|
|
|
|
Usage:
|
|
python3 ops/scripts/matrix_bridge_soak.py \
|
|
--url http://localhost:9400 \
|
|
--messages 100 \
|
|
--concurrency 4 \
|
|
--report-file /tmp/soak_report.json
|
|
|
|
Requires: httpx (pip install httpx)
|
|
|
|
What it does:
|
|
1. Sends --messages synthetic messages to the bridge /v1/sync endpoint
|
|
(or directly to the router if --direct-router is set).
|
|
2. Measures latency (p50, p95, p99, max) per batch.
|
|
3. After the run, fetches /metrics and extracts key counters:
|
|
- matrix_bridge_queue_dropped_total
|
|
- matrix_bridge_rate_limited_total
|
|
- matrix_bridge_failover_total
|
|
- matrix_bridge_sticky_node_total
|
|
- matrix_bridge_invoke_duration_seconds (p50/p95 from histogram)
|
|
4. Prints a human-readable report and optionally writes JSON.
|
|
|
|
Exit codes:
|
|
0 = all pass criteria met
|
|
1 = one or more thresholds exceeded (see --max-p95-ms, --max-drop-rate)
|
|
"""
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
try:
|
|
import httpx
|
|
except ImportError:
|
|
print("ERROR: httpx not installed. Run: pip install httpx", file=sys.stderr)
|
|
sys.exit(2)
|
|
|
|
# ── Pass/fail defaults ─────────────────────────────────────────────────────────
|
|
_DEFAULT_MAX_P95_MS = 5000 # 5 s p95 per invoke (generous for cold start)
|
|
_DEFAULT_MAX_DROP_RATE = 0.01 # 1% queue drops allowed
|
|
|
|
|
|
# ── Metrics parsing ────────────────────────────────────────────────────────────
|
|
def _parse_counter(text: str, name: str) -> float:
|
|
"""Extract the last reported value of a Prometheus counter by name."""
|
|
for line in text.splitlines():
|
|
if line.startswith(name + " ") or line.startswith(name + "{"):
|
|
parts = line.rsplit(None, 1)
|
|
try:
|
|
return float(parts[-1])
|
|
except (ValueError, IndexError):
|
|
pass
|
|
return 0.0
|
|
|
|
|
|
def _parse_histogram_quantile(text: str, name: str, quantile: float) -> Optional[float]:
|
|
"""
|
|
Approximate histogram_quantile from _bucket lines.
|
|
Returns estimated value at given quantile or None if data missing.
|
|
"""
|
|
buckets: List[tuple] = []
|
|
total_count = 0.0
|
|
for line in text.splitlines():
|
|
if f"{name}_bucket" in line and 'le="' in line:
|
|
try:
|
|
le_part = line.split('le="')[1].split('"')[0]
|
|
le = float(le_part) if le_part != "+Inf" else float("inf")
|
|
val = float(line.rsplit(None, 1)[-1])
|
|
buckets.append((le, val))
|
|
except (ValueError, IndexError):
|
|
pass
|
|
elif (f"{name}_count " in line or (name + "_count{") in line):
|
|
try:
|
|
total_count = float(line.rsplit(None, 1)[-1])
|
|
except (ValueError, IndexError):
|
|
pass
|
|
|
|
if not buckets or total_count == 0:
|
|
return None
|
|
|
|
buckets.sort()
|
|
target = quantile * total_count
|
|
prev_le, prev_count = 0.0, 0.0
|
|
for le, count in buckets:
|
|
if count >= target:
|
|
if le == float("inf"):
|
|
return prev_le
|
|
# Linear interpolation
|
|
if count == prev_count:
|
|
return le
|
|
fraction = (target - prev_count) / (count - prev_count)
|
|
return prev_le + fraction * (le - prev_le)
|
|
prev_le, prev_count = le, count
|
|
return prev_le
|
|
|
|
|
|
# ── Soak runner ────────────────────────────────────────────────────────────────
async def _preflight_inject(client: httpx.AsyncClient, url: str, room_id: str) -> str:
    """
    Verify the inject endpoint is reachable and enabled.

    Posts a tiny probe event to /v1/debug/inject_event and inspects the
    response. Returns "" on success, a human-readable error message on failure.
    """
    probe = {
        "room_id": room_id,
        "event": {
            "event_id": "!preflight",
            "sender": "@soak:test",
            "content": {"msgtype": "m.text", "body": "ping"},
        },
    }
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=probe,
            timeout=5.0,
        )
        # 403 means the debug endpoint exists but is switched off.
        if resp.status_code == 403:
            return (
                "❌ DEBUG_INJECT_ENABLED=false on bridge. "
                "Set DEBUG_INJECT_ENABLED=true and restart for soak.\n"
                "    NEVER enable in production!"
            )
        if resp.status_code >= 500:
            return f"❌ Bridge inject endpoint returned HTTP {resp.status_code}"
        body = resp.json()
        # Bridge accepts the call but has no route for this room.
        if not body.get("ok") and "no mapping" in body.get("error", ""):
            return (
                f"❌ No room mapping for room_id={room_id!r}. "
                "Pass --room-id matching a configured BRIDGE_ROOM_MAP entry."
            )
        return ""
    except httpx.ConnectError:
        return f"❌ Cannot connect to bridge at {url}. Is it running?"
    except Exception as exc:  # noqa: BLE001
        return f"❌ Preflight failed: {exc}"
async def _check_wal(db_path: str) -> Dict[str, Any]:
    """
    Run WAL size + checkpoint check on the bridge policy DB.

    Returns a dict with wal_bytes, wal_mb and (when the sqlite3 CLI is on
    PATH) the parsed PASSIVE checkpoint result; gracefully skips the
    checkpoint step if the CLI is unavailable.
    """
    import shutil
    import subprocess

    info: Dict[str, Any] = {"db_path": db_path, "ok": False}

    # SQLite keeps the write-ahead log in a "<db>-wal" sidecar file.
    sidecar = db_path + "-wal"
    try:
        size = os.path.getsize(sidecar) if os.path.exists(sidecar) else 0
        info["wal_bytes"] = size
        info["wal_mb"] = round(size / 1_048_576, 2)
    except OSError:
        info["wal_bytes"] = -1
        info["wal_mb"] = -1

    if not shutil.which("sqlite3"):
        info["sqlite3_missing"] = True
        return info

    try:
        proc = subprocess.run(
            ["sqlite3", db_path, "PRAGMA wal_checkpoint(PASSIVE);"],
            capture_output=True, text=True, timeout=5,
        )
        # Checkpoint output format: "busy|log|checkpointed" (three ints).
        fields = proc.stdout.strip().split("|")
        if len(fields) == 3:
            info["wal_checkpoint"] = {
                "busy": int(fields[0]),
                "log": int(fields[1]),
                "checkpointed": int(fields[2]),
            }
            info["ok"] = True
    except Exception:  # noqa: BLE001
        info["ok"] = False

    return info
async def _send_one(
    client: httpx.AsyncClient,
    url: str,
    agent_id: str,
    message: str,
    room_id: str,
    sender: str,
) -> tuple:
    """
    POST a synthetic Matrix-style event to the bridge debug endpoint.

    Returns (latency_ms: float, status_code: int, error: str|None);
    status_code is 0 when the request never completed (timeout/transport).
    """
    # Monotonic-derived microsecond suffix keeps event ids unique per run.
    synthetic_event = {
        "room_id": room_id,
        "event": {
            "event_id": f"!soak-{int(time.monotonic() * 1e6)}",
            "sender": sender,
            "type": "m.room.message",
            "content": {"msgtype": "m.text", "body": message},
        },
    }
    started = time.monotonic()
    try:
        resp = await client.post(
            f"{url.rstrip('/')}/v1/debug/inject_event",
            json=synthetic_event,
            timeout=30.0,
        )
    except httpx.TimeoutException:
        return (time.monotonic() - started) * 1000, 0, "timeout"
    except Exception as exc:  # noqa: BLE001
        return (time.monotonic() - started) * 1000, 0, str(exc)

    elapsed_ms = (time.monotonic() - started) * 1000
    if resp.status_code >= 500:
        return elapsed_ms, resp.status_code, f"HTTP {resp.status_code}"
    return elapsed_ms, resp.status_code, None
async def _fetch_health(client: httpx.AsyncClient, url: str) -> Dict[str, Any]:
    """Return the bridge /health JSON, or {} on any failure (incl. bad JSON)."""
    try:
        # json() stays inside the try: a malformed body must also yield {}.
        resp = await client.get(f"{url.rstrip('/')}/health", timeout=10.0)
        if resp.status_code == 200:
            return resp.json()
        return {}
    except Exception:  # noqa: BLE001
        return {}
async def _fetch_metrics(client: httpx.AsyncClient, url: str) -> str:
    """Return the raw Prometheus exposition text from /metrics, or "" on failure."""
    try:
        resp = await client.get(f"{url.rstrip('/')}/metrics", timeout=10.0)
        if resp.status_code != 200:
            return ""
        return resp.text
    except Exception:  # noqa: BLE001
        return ""
def _percentile(values: List[float], p: float) -> float:
|
|
if not values:
|
|
return 0.0
|
|
sv = sorted(values)
|
|
idx = int(len(sv) * p / 100)
|
|
return sv[min(idx, len(sv) - 1)]
|
|
|
|
|
|
async def run_soak(
    url: str,
    n_messages: int,
    concurrency: int,
    agent_id: str,
    room_id: str,
    sender: str,
    max_p95_ms: float,
    max_drop_rate: float,
    db_path: str = "",
) -> Dict[str, Any]:
    """
    Drive the soak: preflight, inject n_messages concurrently, then diff
    Prometheus counters and evaluate pass/fail thresholds.

    Args:
        url: bridge base URL.
        n_messages: total synthetic events to inject.
        concurrency: max in-flight requests (semaphore bound).
        agent_id: agent id forwarded to _send_one.
        room_id / sender: identifiers stamped on every synthetic event.
        max_p95_ms: client-side p95 latency threshold in ms.
        max_drop_rate: allowed queue-drop rate (0..1) relative to messages sent.
        db_path: optional policy DB path; enables before/after WAL checks.

    Returns:
        Full report dict (wal / summary / latency_ms / metrics_delta /
        prometheus_invoke_p95_ms / health / pass_criteria / passed /
        failures). On preflight failure, a short dict with
        ok / error / passed / failures only.
    """
    results: List[tuple] = []
    gate = asyncio.Semaphore(concurrency)

    async with httpx.AsyncClient() as client:
        # Fail fast if the inject endpoint is unreachable or disabled.
        preflight_err = await _preflight_inject(client, url, room_id)
        if preflight_err:
            print(preflight_err, file=sys.stderr)
            return {"ok": False, "error": preflight_err, "passed": False, "failures": [preflight_err]}

        # WAL size before soak (optional, needs --db-path).
        wal_before: Dict[str, Any] = {}
        if db_path:
            wal_before = await _check_wal(db_path)
            print(f"[soak] WAL before: {wal_before.get('wal_mb', '?')} MB")

        # Snapshot health + counters so post-run values become true deltas.
        health_before = await _fetch_health(client, url)
        metrics_before = await _fetch_metrics(client, url)

        drops_before = _parse_counter(metrics_before, "matrix_bridge_queue_dropped_total")
        rl_before = _parse_counter(metrics_before, "matrix_bridge_rate_limited_total")
        fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
        # FIX: snapshot sticky too — it was previously reported as an ABSOLUTE
        # counter value inside "metrics_delta", unlike every sibling entry.
        sticky_before = _parse_counter(metrics_before, "matrix_bridge_sticky_node_total")

        print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
        print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")

        t_start = time.monotonic()

        async def worker(i: int):
            # Semaphore caps in-flight requests at --concurrency.
            async with gate:
                msg = f"soak-msg-{i:04d}"
                lat, status, err = await _send_one(
                    client, url, agent_id, msg, room_id, sender
                )
                results.append((lat, status, err))
                # Progress line roughly every 10% of the run.
                if (i + 1) % max(1, n_messages // 10) == 0:
                    print(f"  [{i+1}/{n_messages}] last={lat:.0f}ms status={status}")

        await asyncio.gather(*[worker(i) for i in range(n_messages)])

        elapsed_s = time.monotonic() - t_start
        metrics_after = await _fetch_metrics(client, url)
        health_after = await _fetch_health(client, url)

        # WAL size after soak (optional).
        wal_after: Dict[str, Any] = {}
        if db_path:
            wal_after = await _check_wal(db_path)
            print(f"[soak] WAL after: {wal_after.get('wal_mb', '?')} MB "
                  f"(delta={round(wal_after.get('wal_mb', 0) - wal_before.get('wal_mb', 0), 2)} MB)")

        latencies = [r[0] for r in results]
        errors = [r for r in results if r[2] is not None]
        successes = len(results) - len(errors)
        error_rate = len(errors) / len(results) if results else 0.0

        drops_after = _parse_counter(metrics_after, "matrix_bridge_queue_dropped_total")
        rl_after = _parse_counter(metrics_after, "matrix_bridge_rate_limited_total")
        fo_after = _parse_counter(metrics_after, "matrix_bridge_failover_total")
        sticky_after = _parse_counter(metrics_after, "matrix_bridge_sticky_node_total")

        delta_drops = drops_after - drops_before
        delta_rl = rl_after - rl_before
        delta_fo = fo_after - fo_before
        delta_sticky = sticky_after - sticky_before  # FIX: was absolute, not a delta

        p50 = _percentile(latencies, 50)
        p95 = _percentile(latencies, 95)
        p99 = _percentile(latencies, 99)
        p_max = max(latencies) if latencies else 0.0

        # Server-side p95 estimated from Prometheus histogram buckets.
        hist_p95 = _parse_histogram_quantile(
            metrics_after, "matrix_bridge_invoke_duration_seconds", 0.95
        )
        hist_p95_ms = hist_p95 * 1000 if hist_p95 is not None else None

        drop_rate = delta_drops / len(results) if results else 0.0

        report = {
            "wal": {
                "before_mb": wal_before.get("wal_mb"),
                "after_mb": wal_after.get("wal_mb"),
                "delta_mb": round(
                    (wal_after.get("wal_mb") or 0) - (wal_before.get("wal_mb") or 0), 3
                ) if wal_before and wal_after else None,
                "checkpoint_after": wal_after.get("wal_checkpoint"),
                "threshold_mb": 10,
            },
            "summary": {
                "total_messages": n_messages,
                "concurrency": concurrency,
                "elapsed_s": round(elapsed_s, 2),
                "throughput_rps": round(n_messages / elapsed_s, 1) if elapsed_s > 0 else 0,
                "successes": successes,
                "errors": len(errors),
                "error_rate": round(error_rate, 4),
            },
            "latency_ms": {
                "p50": round(p50, 1),
                "p95": round(p95, 1),
                "p99": round(p99, 1),
                "max": round(p_max, 1),
            },
            "metrics_delta": {
                "queue_drops": int(delta_drops),
                "rate_limited": int(delta_rl),
                "failovers": int(delta_fo),
                "sticky_sets": int(delta_sticky),
                "drop_rate": round(drop_rate, 4),
            },
            # FIX: `if hist_p95_ms` collapsed a legitimate 0.0 to None; only a
            # missing histogram should yield None.
            "prometheus_invoke_p95_ms": round(hist_p95_ms, 1) if hist_p95_ms is not None else None,
            "health_before": health_before.get("ok"),
            "health_after": health_after.get("ok"),
            "pass_criteria": {
                "max_p95_ms": max_p95_ms,
                "max_drop_rate": max_drop_rate,
            },
        }

        # Pass/fail evaluation against the configured thresholds.
        failures: List[str] = []
        if p95 > max_p95_ms:
            failures.append(f"p95={p95:.0f}ms exceeds threshold {max_p95_ms:.0f}ms")
        if drop_rate > max_drop_rate:
            failures.append(
                f"drop_rate={drop_rate:.3%} exceeds threshold {max_drop_rate:.3%}"
            )
        wal_delta = report["wal"]["delta_mb"]
        if wal_delta is not None and wal_delta > report["wal"]["threshold_mb"]:
            failures.append(
                f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
                "— possible SQLite write pressure (Bottleneck #2)"
            )

        report["passed"] = len(failures) == 0
        report["failures"] = failures
        return report
def _print_report(r: Dict[str, Any]) -> None:
|
|
s = r["summary"]
|
|
l = r["latency_ms"]
|
|
m = r["metrics_delta"]
|
|
passed = "✅ PASSED" if r["passed"] else "❌ FAILED"
|
|
|
|
w = r.get("wal", {})
|
|
print()
|
|
print("=" * 60)
|
|
print(f" matrix-bridge-dagi Soak Report {passed}")
|
|
print("=" * 60)
|
|
print(f" Messages: {s['total_messages']} concurrency={s['concurrency']}")
|
|
print(f" Elapsed: {s['elapsed_s']}s ({s['throughput_rps']} rps)")
|
|
print(f" Successes: {s['successes']} errors={s['errors']} ({s['error_rate']:.1%})")
|
|
print()
|
|
print(f" Latency (client-side): p50={l['p50']}ms p95={l['p95']}ms "
|
|
f"p99={l['p99']}ms max={l['max']}ms")
|
|
if r["prometheus_invoke_p95_ms"] is not None:
|
|
print(f" Invoke p95 (Prometheus): {r['prometheus_invoke_p95_ms']}ms")
|
|
print()
|
|
print(f" Queue drops: {m['queue_drops']} (rate {m['drop_rate']:.3%})")
|
|
print(f" Rate-limited: {m['rate_limited']}")
|
|
print(f" Failovers: {m['failovers']}")
|
|
print(f" Sticky sets: {m['sticky_sets']}")
|
|
if w.get("before_mb") is not None:
|
|
wal_delta_str = (
|
|
f"Δ{w['delta_mb']:+.2f}MB" if w.get("delta_mb") is not None else ""
|
|
)
|
|
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
|
|
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
|
|
print()
|
|
if r["failures"]:
|
|
for f in r["failures"]:
|
|
print(f" ❌ {f}")
|
|
else:
|
|
print(" All pass criteria met.")
|
|
print("=" * 60)
|
|
|
|
|
|
def main() -> int:
    """CLI entry point: parse arguments, run the soak, print/save the report.

    Returns 0 when all pass criteria are met, 1 otherwise.
    """
    ap = argparse.ArgumentParser(description="matrix-bridge-dagi soak test (M11)")
    # Target + load shape
    ap.add_argument("--url", default="http://localhost:9400",
                    help="Bridge base URL (default: http://localhost:9400)")
    ap.add_argument("--messages", type=int, default=100,
                    help="Total messages to send (default: 100)")
    ap.add_argument("--concurrency", type=int, default=4,
                    help="Concurrent requests (default: 4)")
    # Synthetic event identity
    ap.add_argument("--agent-id", default="sofiia",
                    help="Agent id for synthetic events (default: sofiia)")
    ap.add_argument("--room-id", default="!soak-room:home.invalid",
                    help="Room id for synthetic events")
    ap.add_argument("--sender", default="@soak-user:home.invalid",
                    help="Sender for synthetic events")
    # Pass/fail thresholds
    ap.add_argument("--max-p95-ms", type=float, default=_DEFAULT_MAX_P95_MS,
                    help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
    ap.add_argument("--max-drop-rate", type=float, default=_DEFAULT_MAX_DROP_RATE,
                    help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
    # Outputs / extras
    ap.add_argument("--report-file", default="",
                    help="Optional path to write JSON report")
    ap.add_argument("--db-path", default="",
                    help="Path to policy_store.db for WAL check "
                         "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
    args = ap.parse_args()

    report = asyncio.run(run_soak(
        url=args.url,
        n_messages=args.messages,
        concurrency=args.concurrency,
        agent_id=args.agent_id,
        room_id=args.room_id,
        sender=args.sender,
        max_p95_ms=args.max_p95_ms,
        max_drop_rate=args.max_drop_rate,
        db_path=args.db_path,
    ))
    _print_report(report)

    if args.report_file:
        with open(args.report_file, "w", encoding="utf-8") as fh:
            json.dump(report, fh, indent=2)
        print(f"\n Report saved: {args.report_file}")

    return 0 if report["passed"] else 1


if __name__ == "__main__":
    sys.exit(main())
|