Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
484 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Burst Load Test - 100 Messages
===============================

Tests end-to-end: Gateway → Router → NATS → Workers → Memory

Publishes a burst of test messages to NATS JetStream, listens for
completion events, samples consumer lag and Redis idempotency keys,
then prints a pass/fail summary against acceptance criteria.

Usage:
    python3 burst_100.py --messages 100 --burst-time 5
    python3 burst_100.py --messages 100 --duplicates 10
    python3 burst_100.py --messages 100 --kill-worker
"""
|
|
|
|
import argparse
import asyncio
import json
import logging
import os
import random
import statistics
import sys
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List

import httpx
import nats
import redis.asyncio as redis
from nats.js.api import StreamInfo
|
|
|
|
# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration — every endpoint is overridable via the environment
# ---------------------------------------------------------------------------
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:9102")
REDIS_URL = os.getenv("REDIS_URL", "redis://dagi-redis:6379")
NATS_MONITOR_URL = os.getenv("NATS_MONITOR_URL", "http://nats:8222")

# Use existing subjects with test metadata (simpler approach)
SUBJECT_AGENT_RUN_REQUESTED = "agent.run.requested"
SUBJECT_AGENT_RUN_COMPLETED = "agent.run.completed.helion"

# Stream names
STREAM_MESSAGES = "MESSAGES"
STREAM_AGENT_RUNS = "AGENT_RUNS"
|
|
|
@dataclass
class TestMessage:
    """Test message with trace correlation.

    One instance per published NATS message. A duplicate shares the
    original's job_id (see generate_messages) so worker-side idempotency
    can be verified.
    """
    message_id: str   # unique per published message; duplicates get a "-dup" suffix
    job_id: str       # idempotency key; shared between an original and its duplicate
    trace_id: str     # correlation id, also propagated via NATS headers
    user_id: str
    agent_id: str
    content: str      # prompt text carried in the payload
    timestamp: float  # publish-side epoch seconds, used for latency calculation
    is_duplicate: bool = False  # True for intentionally re-published job_ids
|
|
|
|
|
|
@dataclass
class TestResult:
    """Test execution results aggregated by BurstLoadTest.run()."""
    messages_sent: int        # messages successfully published to NATS
    messages_acked: int       # set equal to messages_sent in run() (publish = ack assumption)
    messages_completed: int   # unique job_ids that produced a completion event
    messages_failed: int      # expected_unique - messages_completed, floored at 0
    duplicates_detected: int  # duplicate publishes whose job_id completed
    expected_unique: int      # count of distinct job_ids published
    missing_job_ids: List[str]  # sorted job_ids that never completed
    max_consumer_lag: int     # highest total pending observed while sampling
    final_consumer_lag: int   # total pending at end of test
    dlq_depth: int            # dead-letter depth (placeholder measurement, see get_dlq_depth)
    p50_latency_ms: float     # publish -> completion latency percentiles
    p95_latency_ms: float
    p99_latency_ms: float
    error_rate: float         # failed / expected_unique, expressed as a percentage
    duration_seconds: float   # publish start to monitoring end
    redis_in_progress: int    # idemp:* keys whose status is "in_progress"
    redis_completed: int      # idemp:* keys whose status is "completed"
|
|
|
|
|
class BurstLoadTest:
    """Burst load test orchestrator.

    Drives the full pipeline: publishes a burst of JetStream messages,
    subscribes to completion events, samples NATS consumer lag and Redis
    idempotency keys, then aggregates everything into a TestResult.
    """

    def __init__(self, num_messages: int = 100, burst_time: float = 5.0, duplicates: int = 0):
        """
        Args:
            num_messages: number of unique messages to publish.
            burst_time: wall-clock seconds over which to spread the burst.
            duplicates: how many messages to re-publish with the same job_id
                (exercises worker-side idempotency).
        """
        self.num_messages = num_messages
        self.burst_time = burst_time
        self.duplicates = duplicates
        self.nc = None            # NATS connection
        self.js = None            # JetStream context
        self.redis_client = None
        self.http_client = None

        self.messages: List[TestMessage] = []
        self.completed_messages: Dict[str, float] = {}  # job_id -> completion_time
        self.start_time = None
        self.end_time = None

    async def connect(self):
        """Connect to NATS (with JetStream), Redis, and create the HTTP client."""
        self.nc = await nats.connect(NATS_URL)
        self.js = self.nc.jetstream()
        self.redis_client = await redis.from_url(REDIS_URL, decode_responses=True)
        self.http_client = httpx.AsyncClient(timeout=30.0)
        logger.info("Connected to NATS, Redis, HTTP")

    async def disconnect(self):
        """Close every connection opened by connect(); tolerates partial setup."""
        if self.nc:
            await self.nc.close()
        if self.redis_client:
            await self.redis_client.close()
        if self.http_client:
            await self.http_client.aclose()

    def generate_messages(self) -> List[TestMessage]:
        """Generate the test corpus.

        Returns:
            num_messages unique TestMessages followed by up to `duplicates`
            re-publications that reuse an original job_id (is_duplicate=True).
        """
        messages = []
        base_trace_id = str(uuid.uuid4())

        # Unique messages with deterministic ids, so missing jobs are easy to spot.
        for i in range(self.num_messages):
            messages.append(TestMessage(
                message_id=f"test-msg-{i}",
                job_id=f"test-job-{i}",
                trace_id=f"{base_trace_id}-{i}",
                user_id="tg:test_user",
                agent_id="helion",
                content=f"Test message {i}: Load test burst",
                timestamp=time.time()
            ))

        # Duplicates: same job_id as the original so the pipeline's
        # idempotency layer should collapse them.
        if self.duplicates > 0:
            duplicate_indices = random.sample(
                range(self.num_messages), min(self.duplicates, self.num_messages)
            )
            for idx in duplicate_indices:
                original = messages[idx]
                messages.append(TestMessage(
                    message_id=f"{original.message_id}-dup",
                    job_id=original.job_id,  # Same job_id for idempotency test
                    trace_id=f"{original.trace_id}-dup",
                    user_id=original.user_id,
                    agent_id=original.agent_id,
                    content=original.content,
                    timestamp=time.time(),
                    is_duplicate=True
                ))

        return messages

    async def publish_message(self, msg: TestMessage) -> bool:
        """Publish one test message to JetStream.

        Returns:
            True on success, False if the publish raised (error is logged).
        """
        try:
            payload = {
                "task_id": msg.job_id,
                "job_id": msg.job_id,
                "workflow_type": "test",  # Worker will detect "test" and use mock execution
                "agent_id": msg.agent_id,
                "trace_id": msg.trace_id,
                "user_id": msg.user_id,
                "test_mode": True,  # Explicit test flag
                "payload": {
                    "prompt": msg.content,
                    "test": True,
                    "is_duplicate": msg.is_duplicate
                },
                "priority": 1,
                "timeout": 30
            }

            headers = {
                "Nats-Trace-ID": msg.trace_id,
                "Nats-Job-ID": msg.job_id,
                "Nats-User-ID": msg.user_id,
                "Nats-Agent-ID": msg.agent_id,
                # Timezone-aware timestamp; datetime.utcnow() is deprecated and
                # produced a naive value with no offset marker.
                "Nats-Timestamp": datetime.now(timezone.utc).isoformat()
            }

            await self.js.publish(
                SUBJECT_AGENT_RUN_REQUESTED,
                json.dumps(payload).encode(),
                headers=headers
            )
            return True
        except Exception as e:
            logger.error(f"Failed to publish {msg.message_id}: {e}")
            return False

    async def publish_burst(self) -> int:
        """Publish all generated messages, evenly spread over burst_time.

        Returns:
            Number of messages successfully published.
        """
        self.messages = self.generate_messages()
        total = len(self.messages)
        logger.info(f"Publishing {total} messages over {self.burst_time}s...")

        self.start_time = time.time()

        # Even inter-message delay; 0 means publish as fast as possible.
        delay = self.burst_time / total if total > 0 else 0

        published = 0
        for msg in self.messages:
            if await self.publish_message(msg):
                published += 1
            if delay > 0:
                await asyncio.sleep(delay)

        logger.info(f"Published {published}/{total} messages")
        return published

    async def get_consumer_lag(self) -> Dict[str, int]:
        """Sample per-consumer pending counts from the NATS monitoring endpoint.

        Returns:
            Mapping "STREAM:consumer" -> num_pending; empty dict on error.
        """
        try:
            # NOTE(review): /jsz may require ?streams=true&consumers=true for
            # stream_detail/consumer_detail to be populated — confirm against
            # the deployed NATS version.
            resp = await self.http_client.get(f"{NATS_MONITOR_URL}/jsz")
            data = resp.json()

            # Loop-invariant: the stream list is identical for every lookup.
            stream_detail = data.get("account_details", {}).get("stream_detail", [])

            lag = {}
            for stream_name in [STREAM_MESSAGES, STREAM_AGENT_RUNS]:
                for s in stream_detail:
                    if s.get("name") == stream_name:
                        for c in s.get("consumer_detail", []):
                            consumer_name = c.get("name", "unknown")
                            lag[f"{stream_name}:{consumer_name}"] = c.get("num_pending", 0)

            return lag
        except Exception as e:
            logger.warning(f"Failed to get consumer lag: {e}")
            return {}

    async def get_dlq_depth(self) -> int:
        """Get DLQ depth.

        NOTE(review): measuring real per-subject DLQ depth requires stream
        inspection that is not implemented yet; this probes the AUDIT stream
        best-effort and reports 0, matching the original placeholder.
        """
        try:
            dlq_subjects = ["attachment.failed.dlq", "agent.run.failed.dlq"]
            total = 0

            for _subject in dlq_subjects:
                try:
                    # Best-effort probe; result is intentionally unused for now.
                    await self.js.stream_info("AUDIT")
                except Exception:
                    # Stream may not exist in this environment.
                    pass

            return total
        except Exception as e:
            logger.warning(f"Failed to get DLQ depth: {e}")
            return 0

    async def monitor_completions(self, duration: float = 60.0):
        """Subscribe to completion events for `duration` seconds.

        Records completion time per job_id into self.completed_messages.
        """
        logger.info(f"Monitoring completions for {duration}s...")

        async def completion_handler(msg):
            try:
                data = json.loads(msg.data.decode())
                job_id = data.get("job_id") or data.get("task_id")
                if job_id:
                    self.completed_messages[job_id] = time.time()
                    logger.debug(f"Received completion for job_id: {job_id}")
            except Exception as e:
                logger.warning(f"Error processing completion: {e}")
            # Ack even on parse failure so the consumer is not redelivered forever.
            await msg.ack()

        sub = await self.js.subscribe(
            SUBJECT_AGENT_RUN_COMPLETED,
            "burst-test-monitor",
            cb=completion_handler
        )

        await asyncio.sleep(duration)
        await sub.unsubscribe()

    async def get_redis_idempotency_stats(self) -> Dict[str, int]:
        """Count idempotency keys by status.

        Returns:
            {"in_progress": n, "completed": m}; zeros on error.
        """
        try:
            # NOTE(review): KEYS is O(N) and blocks Redis; acceptable for a
            # test environment, but SCAN would be safer on a busy instance.
            keys = await self.redis_client.keys("idemp:*")
            in_progress = 0
            completed = 0

            for key in keys:
                value = await self.redis_client.get(key)
                if not value:
                    continue
                try:
                    status = json.loads(value).get("status", "")
                except (json.JSONDecodeError, TypeError, AttributeError):
                    # Non-JSON or non-dict values are ignored (legacy keys).
                    continue
                if status == "in_progress":
                    in_progress += 1
                elif status == "completed":
                    completed += 1

            return {"in_progress": in_progress, "completed": completed}
        except Exception as e:
            logger.warning(f"Failed to get Redis stats: {e}")
            return {"in_progress": 0, "completed": 0}

    async def calculate_latencies(self) -> Dict[str, float]:
        """Compute p50/p95/p99 publish->completion latency in milliseconds.

        Returns:
            {"p50": ..., "p95": ..., "p99": ...}; zeros if nothing completed.
        """
        if not self.completed_messages or not self.start_time:
            return {"p50": 0, "p95": 0, "p99": 0}

        # O(1) lookup instead of an O(n) scan per completion. setdefault keeps
        # the FIRST message per job_id, matching the original next() semantics
        # when a duplicate shares the job_id.
        by_job_id: Dict[str, TestMessage] = {}
        for m in self.messages:
            by_job_id.setdefault(m.job_id, m)

        latencies = []
        for job_id, completion_time in self.completed_messages.items():
            msg = by_job_id.get(job_id)
            if msg:
                latencies.append((completion_time - msg.timestamp) * 1000)

        if not latencies:
            return {"p50": 0, "p95": 0, "p99": 0}

        latencies.sort()
        n = len(latencies)
        # min() clamps the percentile index for small samples (e.g. n == 1).
        return {
            "p50": statistics.median(latencies),
            "p95": latencies[min(int(n * 0.95), n - 1)],
            "p99": latencies[min(int(n * 0.99), n - 1)]
        }

    async def run(self) -> TestResult:
        """Run the burst load test end-to-end.

        Returns:
            Aggregated TestResult. Connections are always closed, even on error.
        """
        try:
            await self.connect()

            # Pre-test baseline
            baseline_lag = await self.get_consumer_lag()
            logger.info(f"Baseline consumer lag: {baseline_lag}")

            # Publish burst
            published = await self.publish_burst()

            # Monitor for completions (in parallel with lag sampling)
            monitor_task = asyncio.create_task(self.monitor_completions(duration=120.0))

            # Track max lag during the test
            max_lag = 0
            lag_samples = []

            for i in range(12):  # 12 samples over 60 seconds
                await asyncio.sleep(5)
                lag = await self.get_consumer_lag()
                total_lag = sum(lag.values())
                lag_samples.append(total_lag)
                max_lag = max(max_lag, total_lag)
                logger.info(f"Sample {i+1}/12: Consumer lag = {total_lag}")

            await monitor_task

            self.end_time = time.time()
            duration = self.end_time - self.start_time if self.start_time else 0

            # Final measurements
            final_lag = await self.get_consumer_lag()
            final_lag_total = sum(final_lag.values())
            dlq_depth = await self.get_dlq_depth()
            redis_stats = await self.get_redis_idempotency_stats()
            latencies = await self.calculate_latencies()

            # Count duplicates whose shared job_id completed (idempotency working)
            duplicates_detected = sum(
                1 for m in self.messages
                if m.is_duplicate and m.job_id in self.completed_messages
            )

            # Success rate is computed over unique job_ids, not raw publishes.
            unique_job_ids = {m.job_id for m in self.messages}
            expected_unique = len(unique_job_ids)
            completed_count = len(self.completed_messages)
            missing_job_ids = sorted(unique_job_ids - set(self.completed_messages.keys()))
            failed_count = max(expected_unique - completed_count, 0)
            error_rate = (failed_count / expected_unique * 100) if expected_unique > 0 else 0

            return TestResult(
                messages_sent=published,
                messages_acked=published,  # Assuming all published = acked
                messages_completed=completed_count,
                messages_failed=failed_count,
                duplicates_detected=duplicates_detected,
                expected_unique=expected_unique,
                missing_job_ids=missing_job_ids,
                max_consumer_lag=max_lag,
                final_consumer_lag=final_lag_total,
                dlq_depth=dlq_depth,
                p50_latency_ms=latencies["p50"],
                p95_latency_ms=latencies["p95"],
                p99_latency_ms=latencies["p99"],
                error_rate=error_rate,
                duration_seconds=duration,
                redis_in_progress=redis_stats["in_progress"],
                redis_completed=redis_stats["completed"]
            )

        finally:
            await self.disconnect()

    def print_summary(self, result: TestResult):
        """Print a human-readable test summary plus acceptance criteria."""
        print("\n" + "="*70)
        print("BURST LOAD TEST SUMMARY")
        print("="*70)
        print(f"Messages sent: {result.messages_sent}")
        print(f"Expected unique jobs: {result.expected_unique}")
        print(f"Messages completed: {result.messages_completed}")
        print(f"Messages failed: {result.messages_failed}")
        print(f"Duplicates detected: {result.duplicates_detected}")
        if result.missing_job_ids:
            sample = ", ".join(result.missing_job_ids[:10])
            print(f"Missing job_ids: {len(result.missing_job_ids)} (sample: {sample})")
        print(f"Error rate: {result.error_rate:.2f}%")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print()
        print("Consumer Lag:")
        print(f"  Max during test: {result.max_consumer_lag}")
        print(f"  Final: {result.final_consumer_lag}")
        print(f"  DLQ depth: {result.dlq_depth}")
        print()
        print("Latency (ms):")
        print(f"  p50: {result.p50_latency_ms:.2f}")
        print(f"  p95: {result.p95_latency_ms:.2f}")
        print(f"  p99: {result.p99_latency_ms:.2f}")
        print()
        print("Redis Idempotency:")
        print(f"  In progress: {result.redis_in_progress}")
        print(f"  Completed: {result.redis_completed}")
        print()

        # Acceptance criteria
        print("Acceptance Criteria:")
        print(f"  ✅ Consumer lag → 0: {'PASS' if result.final_consumer_lag == 0 else 'FAIL'} (final: {result.final_consumer_lag})")
        print(f"  ✅ Success rate ≥99%: {'PASS' if result.error_rate <= 1.0 else 'FAIL'} ({100-result.error_rate:.2f}%)")
        print(f"  ✅ DLQ ≤ 2: {'PASS' if result.dlq_depth <= 2 else 'FAIL'} (depth: {result.dlq_depth})")
        print(f"  ✅ No stuck keys: {'PASS' if result.redis_in_progress == 0 else 'FAIL'} (in_progress: {result.redis_in_progress})")
        print("="*70)
|
|
|
|
async def main():
    """CLI entry point: parse arguments, run the burst test, print the
    summary, and exit 0/1 based on the acceptance criteria."""
    parser = argparse.ArgumentParser(description="Burst Load Test - 100 Messages")
    parser.add_argument("--messages", type=int, default=100, help="Number of messages")
    parser.add_argument("--burst-time", type=float, default=5.0, help="Burst duration (seconds)")
    parser.add_argument("--duplicates", type=int, default=0, help="Number of duplicate job_ids")
    parser.add_argument("--kill-worker", action="store_true", help="Kill worker during test (advanced)")
    args = parser.parse_args()

    runner = BurstLoadTest(
        num_messages=args.messages,
        burst_time=args.burst_time,
        duplicates=args.duplicates
    )

    result = await runner.run()
    runner.print_summary(result)

    # Exit code mirrors the acceptance criteria printed in the summary.
    acceptance_ok = (
        result.final_consumer_lag == 0
        and result.error_rate <= 1.0
        and result.dlq_depth <= 2
        and result.redis_in_progress == 0
    )
    sys.exit(0 if acceptance_ok else 1)


if __name__ == "__main__":
    asyncio.run(main())
|