Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
484 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Burst Load Test - 100 Messages
===============================

Tests end-to-end: Gateway → Router → NATS → Workers → Memory

Publishes a burst of test messages to NATS JetStream, listens for
completion events, samples consumer lag and Redis idempotency keys,
then prints a pass/fail summary against acceptance criteria.

Usage:
    python3 burst_100.py --messages 100 --burst-time 5
    python3 burst_100.py --messages 100 --duplicates 10
    python3 burst_100.py --messages 100 --kill-worker
"""
|
|
|
|
import argparse
import asyncio
import json
import logging
import os
import random
import statistics
import sys
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List

import httpx
import nats
import redis.asyncio as redis
from nats.js.api import StreamInfo
|
|
|
|
# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration — every endpoint is overridable via the environment
# ---------------------------------------------------------------------------
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:9102")
REDIS_URL = os.getenv("REDIS_URL", "redis://dagi-redis:6379")
NATS_MONITOR_URL = os.getenv("NATS_MONITOR_URL", "http://nats:8222")

# Use existing subjects with test metadata (simpler approach)
SUBJECT_AGENT_RUN_REQUESTED = "agent.run.requested"
SUBJECT_AGENT_RUN_COMPLETED = "agent.run.completed.helion"

# Stream names
STREAM_MESSAGES = "MESSAGES"
STREAM_AGENT_RUNS = "AGENT_RUNS"
|
|
|
@dataclass
class TestMessage:
    """Test message with trace correlation.

    One instance per published NATS message. A duplicate shares the
    original's job_id (see generate_messages) so worker-side idempotency
    can be verified.
    """
    message_id: str   # unique per published message; duplicates get a "-dup" suffix
    job_id: str       # idempotency key; shared between an original and its duplicate
    trace_id: str     # correlation id, also propagated via NATS headers
    user_id: str
    agent_id: str
    content: str      # prompt text carried in the payload
    timestamp: float  # publish-side epoch seconds, used for latency calculation
    is_duplicate: bool = False  # True for intentionally re-published job_ids
|
|
|
|
|
|
@dataclass
class TestResult:
    """Test execution results aggregated by BurstLoadTest.run()."""
    messages_sent: int        # messages successfully published to NATS
    messages_acked: int       # set equal to messages_sent in run() (publish = ack assumption)
    messages_completed: int   # unique job_ids that produced a completion event
    messages_failed: int      # expected_unique - messages_completed, floored at 0
    duplicates_detected: int  # duplicate publishes whose job_id completed
    expected_unique: int      # count of distinct job_ids published
    missing_job_ids: List[str]  # sorted job_ids that never completed
    max_consumer_lag: int     # highest total pending observed while sampling
    final_consumer_lag: int   # total pending at end of test
    dlq_depth: int            # dead-letter depth (placeholder measurement, see get_dlq_depth)
    p50_latency_ms: float     # publish -> completion latency percentiles
    p95_latency_ms: float
    p99_latency_ms: float
    error_rate: float         # failed / expected_unique, expressed as a percentage
    duration_seconds: float   # publish start to monitoring end
    redis_in_progress: int    # idemp:* keys whose status is "in_progress"
    redis_completed: int      # idemp:* keys whose status is "completed"
|
|
|
|
|
class BurstLoadTest:
    """Burst load test orchestrator.

    Drives the full pipeline: publishes a burst of JetStream messages,
    subscribes to completion events, samples NATS consumer lag and Redis
    idempotency keys, then aggregates everything into a TestResult.
    """

    def __init__(self, num_messages: int = 100, burst_time: float = 5.0, duplicates: int = 0):
        """
        Args:
            num_messages: number of unique messages to publish.
            burst_time: wall-clock seconds over which to spread the burst.
            duplicates: how many messages to re-publish with the same job_id
                (exercises worker-side idempotency).
        """
        self.num_messages = num_messages
        self.burst_time = burst_time
        self.duplicates = duplicates
        self.nc = None            # NATS connection
        self.js = None            # JetStream context
        self.redis_client = None
        self.http_client = None

        self.messages: List[TestMessage] = []
        self.completed_messages: Dict[str, float] = {}  # job_id -> completion_time
        self.start_time = None
        self.end_time = None

    async def connect(self):
        """Connect to NATS (with JetStream), Redis, and create the HTTP client."""
        self.nc = await nats.connect(NATS_URL)
        self.js = self.nc.jetstream()
        self.redis_client = await redis.from_url(REDIS_URL, decode_responses=True)
        self.http_client = httpx.AsyncClient(timeout=30.0)
        logger.info("Connected to NATS, Redis, HTTP")

    async def disconnect(self):
        """Close every connection opened by connect(); tolerates partial setup."""
        if self.nc:
            await self.nc.close()
        if self.redis_client:
            await self.redis_client.close()
        if self.http_client:
            await self.http_client.aclose()

    def generate_messages(self) -> List[TestMessage]:
        """Generate the test corpus.

        Returns:
            num_messages unique TestMessages followed by up to `duplicates`
            re-publications that reuse an original job_id (is_duplicate=True).
        """
        messages = []
        base_trace_id = str(uuid.uuid4())

        # Unique messages with deterministic ids, so missing jobs are easy to spot.
        for i in range(self.num_messages):
            messages.append(TestMessage(
                message_id=f"test-msg-{i}",
                job_id=f"test-job-{i}",
                trace_id=f"{base_trace_id}-{i}",
                user_id="tg:test_user",
                agent_id="helion",
                content=f"Test message {i}: Load test burst",
                timestamp=time.time()
            ))

        # Duplicates: same job_id as the original so the pipeline's
        # idempotency layer should collapse them.
        if self.duplicates > 0:
            duplicate_indices = random.sample(
                range(self.num_messages), min(self.duplicates, self.num_messages)
            )
            for idx in duplicate_indices:
                original = messages[idx]
                messages.append(TestMessage(
                    message_id=f"{original.message_id}-dup",
                    job_id=original.job_id,  # Same job_id for idempotency test
                    trace_id=f"{original.trace_id}-dup",
                    user_id=original.user_id,
                    agent_id=original.agent_id,
                    content=original.content,
                    timestamp=time.time(),
                    is_duplicate=True
                ))

        return messages

    async def publish_message(self, msg: TestMessage) -> bool:
        """Publish one test message to JetStream.

        Returns:
            True on success, False if the publish raised (error is logged).
        """
        try:
            payload = {
                "task_id": msg.job_id,
                "job_id": msg.job_id,
                "workflow_type": "test",  # Worker will detect "test" and use mock execution
                "agent_id": msg.agent_id,
                "trace_id": msg.trace_id,
                "user_id": msg.user_id,
                "test_mode": True,  # Explicit test flag
                "payload": {
                    "prompt": msg.content,
                    "test": True,
                    "is_duplicate": msg.is_duplicate
                },
                "priority": 1,
                "timeout": 30
            }

            headers = {
                "Nats-Trace-ID": msg.trace_id,
                "Nats-Job-ID": msg.job_id,
                "Nats-User-ID": msg.user_id,
                "Nats-Agent-ID": msg.agent_id,
                # Timezone-aware timestamp; datetime.utcnow() is deprecated and
                # produced a naive value with no offset marker.
                "Nats-Timestamp": datetime.now(timezone.utc).isoformat()
            }

            await self.js.publish(
                SUBJECT_AGENT_RUN_REQUESTED,
                json.dumps(payload).encode(),
                headers=headers
            )
            return True
        except Exception as e:
            logger.error(f"Failed to publish {msg.message_id}: {e}")
            return False

    async def publish_burst(self) -> int:
        """Publish all generated messages, evenly spread over burst_time.

        Returns:
            Number of messages successfully published.
        """
        self.messages = self.generate_messages()
        total = len(self.messages)
        logger.info(f"Publishing {total} messages over {self.burst_time}s...")

        self.start_time = time.time()

        # Even inter-message delay; 0 means publish as fast as possible.
        delay = self.burst_time / total if total > 0 else 0

        published = 0
        for msg in self.messages:
            if await self.publish_message(msg):
                published += 1
            if delay > 0:
                await asyncio.sleep(delay)

        logger.info(f"Published {published}/{total} messages")
        return published

    async def get_consumer_lag(self) -> Dict[str, int]:
        """Sample per-consumer pending counts from the NATS monitoring endpoint.

        Returns:
            Mapping "STREAM:consumer" -> num_pending; empty dict on error.
        """
        try:
            # NOTE(review): /jsz may require ?streams=true&consumers=true for
            # stream_detail/consumer_detail to be populated — confirm against
            # the deployed NATS version.
            resp = await self.http_client.get(f"{NATS_MONITOR_URL}/jsz")
            data = resp.json()

            # Loop-invariant: the stream list is identical for every lookup.
            stream_detail = data.get("account_details", {}).get("stream_detail", [])

            lag = {}
            for stream_name in [STREAM_MESSAGES, STREAM_AGENT_RUNS]:
                for s in stream_detail:
                    if s.get("name") == stream_name:
                        for c in s.get("consumer_detail", []):
                            consumer_name = c.get("name", "unknown")
                            lag[f"{stream_name}:{consumer_name}"] = c.get("num_pending", 0)

            return lag
        except Exception as e:
            logger.warning(f"Failed to get consumer lag: {e}")
            return {}

    async def get_dlq_depth(self) -> int:
        """Get DLQ depth.

        NOTE(review): measuring real per-subject DLQ depth requires stream
        inspection that is not implemented yet; this probes the AUDIT stream
        best-effort and reports 0, matching the original placeholder.
        """
        try:
            dlq_subjects = ["attachment.failed.dlq", "agent.run.failed.dlq"]
            total = 0

            for _subject in dlq_subjects:
                try:
                    # Best-effort probe; result is intentionally unused for now.
                    await self.js.stream_info("AUDIT")
                except Exception:
                    # Stream may not exist in this environment.
                    pass

            return total
        except Exception as e:
            logger.warning(f"Failed to get DLQ depth: {e}")
            return 0

    async def monitor_completions(self, duration: float = 60.0):
        """Subscribe to completion events for `duration` seconds.

        Records completion time per job_id into self.completed_messages.
        """
        logger.info(f"Monitoring completions for {duration}s...")

        async def completion_handler(msg):
            try:
                data = json.loads(msg.data.decode())
                job_id = data.get("job_id") or data.get("task_id")
                if job_id:
                    self.completed_messages[job_id] = time.time()
                    logger.debug(f"Received completion for job_id: {job_id}")
            except Exception as e:
                logger.warning(f"Error processing completion: {e}")
            # Ack even on parse failure so the consumer is not redelivered forever.
            await msg.ack()

        sub = await self.js.subscribe(
            SUBJECT_AGENT_RUN_COMPLETED,
            "burst-test-monitor",
            cb=completion_handler
        )

        await asyncio.sleep(duration)
        await sub.unsubscribe()

    async def get_redis_idempotency_stats(self) -> Dict[str, int]:
        """Count idempotency keys by status.

        Returns:
            {"in_progress": n, "completed": m}; zeros on error.
        """
        try:
            # NOTE(review): KEYS is O(N) and blocks Redis; acceptable for a
            # test environment, but SCAN would be safer on a busy instance.
            keys = await self.redis_client.keys("idemp:*")
            in_progress = 0
            completed = 0

            for key in keys:
                value = await self.redis_client.get(key)
                if not value:
                    continue
                try:
                    status = json.loads(value).get("status", "")
                except (json.JSONDecodeError, TypeError, AttributeError):
                    # Non-JSON or non-dict values are ignored (legacy keys).
                    continue
                if status == "in_progress":
                    in_progress += 1
                elif status == "completed":
                    completed += 1

            return {"in_progress": in_progress, "completed": completed}
        except Exception as e:
            logger.warning(f"Failed to get Redis stats: {e}")
            return {"in_progress": 0, "completed": 0}

    async def calculate_latencies(self) -> Dict[str, float]:
        """Compute p50/p95/p99 publish->completion latency in milliseconds.

        Returns:
            {"p50": ..., "p95": ..., "p99": ...}; zeros if nothing completed.
        """
        if not self.completed_messages or not self.start_time:
            return {"p50": 0, "p95": 0, "p99": 0}

        # O(1) lookup instead of an O(n) scan per completion. setdefault keeps
        # the FIRST message per job_id, matching the original next() semantics
        # when a duplicate shares the job_id.
        by_job_id: Dict[str, TestMessage] = {}
        for m in self.messages:
            by_job_id.setdefault(m.job_id, m)

        latencies = []
        for job_id, completion_time in self.completed_messages.items():
            msg = by_job_id.get(job_id)
            if msg:
                latencies.append((completion_time - msg.timestamp) * 1000)

        if not latencies:
            return {"p50": 0, "p95": 0, "p99": 0}

        latencies.sort()
        n = len(latencies)
        # min() clamps the percentile index for small samples (e.g. n == 1).
        return {
            "p50": statistics.median(latencies),
            "p95": latencies[min(int(n * 0.95), n - 1)],
            "p99": latencies[min(int(n * 0.99), n - 1)]
        }

    async def run(self) -> TestResult:
        """Run the burst load test end-to-end.

        Returns:
            Aggregated TestResult. Connections are always closed, even on error.
        """
        try:
            await self.connect()

            # Pre-test baseline
            baseline_lag = await self.get_consumer_lag()
            logger.info(f"Baseline consumer lag: {baseline_lag}")

            # Publish burst
            published = await self.publish_burst()

            # Monitor for completions (in parallel with lag sampling)
            monitor_task = asyncio.create_task(self.monitor_completions(duration=120.0))

            # Track max lag during the test
            max_lag = 0
            lag_samples = []

            for i in range(12):  # 12 samples over 60 seconds
                await asyncio.sleep(5)
                lag = await self.get_consumer_lag()
                total_lag = sum(lag.values())
                lag_samples.append(total_lag)
                max_lag = max(max_lag, total_lag)
                logger.info(f"Sample {i+1}/12: Consumer lag = {total_lag}")

            await monitor_task

            self.end_time = time.time()
            duration = self.end_time - self.start_time if self.start_time else 0

            # Final measurements
            final_lag = await self.get_consumer_lag()
            final_lag_total = sum(final_lag.values())
            dlq_depth = await self.get_dlq_depth()
            redis_stats = await self.get_redis_idempotency_stats()
            latencies = await self.calculate_latencies()

            # Count duplicates whose shared job_id completed (idempotency working)
            duplicates_detected = sum(
                1 for m in self.messages
                if m.is_duplicate and m.job_id in self.completed_messages
            )

            # Success rate is computed over unique job_ids, not raw publishes.
            unique_job_ids = {m.job_id for m in self.messages}
            expected_unique = len(unique_job_ids)
            completed_count = len(self.completed_messages)
            missing_job_ids = sorted(unique_job_ids - set(self.completed_messages.keys()))
            failed_count = max(expected_unique - completed_count, 0)
            error_rate = (failed_count / expected_unique * 100) if expected_unique > 0 else 0

            return TestResult(
                messages_sent=published,
                messages_acked=published,  # Assuming all published = acked
                messages_completed=completed_count,
                messages_failed=failed_count,
                duplicates_detected=duplicates_detected,
                expected_unique=expected_unique,
                missing_job_ids=missing_job_ids,
                max_consumer_lag=max_lag,
                final_consumer_lag=final_lag_total,
                dlq_depth=dlq_depth,
                p50_latency_ms=latencies["p50"],
                p95_latency_ms=latencies["p95"],
                p99_latency_ms=latencies["p99"],
                error_rate=error_rate,
                duration_seconds=duration,
                redis_in_progress=redis_stats["in_progress"],
                redis_completed=redis_stats["completed"]
            )

        finally:
            await self.disconnect()

    def print_summary(self, result: TestResult):
        """Print a human-readable test summary plus acceptance criteria."""
        print("\n" + "="*70)
        print("BURST LOAD TEST SUMMARY")
        print("="*70)
        print(f"Messages sent: {result.messages_sent}")
        print(f"Expected unique jobs: {result.expected_unique}")
        print(f"Messages completed: {result.messages_completed}")
        print(f"Messages failed: {result.messages_failed}")
        print(f"Duplicates detected: {result.duplicates_detected}")
        if result.missing_job_ids:
            sample = ", ".join(result.missing_job_ids[:10])
            print(f"Missing job_ids: {len(result.missing_job_ids)} (sample: {sample})")
        print(f"Error rate: {result.error_rate:.2f}%")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print()
        print("Consumer Lag:")
        print(f"  Max during test: {result.max_consumer_lag}")
        print(f"  Final: {result.final_consumer_lag}")
        print(f"  DLQ depth: {result.dlq_depth}")
        print()
        print("Latency (ms):")
        print(f"  p50: {result.p50_latency_ms:.2f}")
        print(f"  p95: {result.p95_latency_ms:.2f}")
        print(f"  p99: {result.p99_latency_ms:.2f}")
        print()
        print("Redis Idempotency:")
        print(f"  In progress: {result.redis_in_progress}")
        print(f"  Completed: {result.redis_completed}")
        print()

        # Acceptance criteria
        print("Acceptance Criteria:")
        print(f"  ✅ Consumer lag → 0: {'PASS' if result.final_consumer_lag == 0 else 'FAIL'} (final: {result.final_consumer_lag})")
        print(f"  ✅ Success rate ≥99%: {'PASS' if result.error_rate <= 1.0 else 'FAIL'} ({100-result.error_rate:.2f}%)")
        print(f"  ✅ DLQ ≤ 2: {'PASS' if result.dlq_depth <= 2 else 'FAIL'} (depth: {result.dlq_depth})")
        print(f"  ✅ No stuck keys: {'PASS' if result.redis_in_progress == 0 else 'FAIL'} (in_progress: {result.redis_in_progress})")
        print("="*70)
|
|
|
|
async def main():
    """CLI entry point: parse arguments, run the burst test, print the
    summary, and exit 0/1 based on the acceptance criteria."""
    parser = argparse.ArgumentParser(description="Burst Load Test - 100 Messages")
    parser.add_argument("--messages", type=int, default=100, help="Number of messages")
    parser.add_argument("--burst-time", type=float, default=5.0, help="Burst duration (seconds)")
    parser.add_argument("--duplicates", type=int, default=0, help="Number of duplicate job_ids")
    parser.add_argument("--kill-worker", action="store_true", help="Kill worker during test (advanced)")
    args = parser.parse_args()

    runner = BurstLoadTest(
        num_messages=args.messages,
        burst_time=args.burst_time,
        duplicates=args.duplicates
    )

    result = await runner.run()
    runner.print_summary(result)

    # Exit code mirrors the acceptance criteria printed in the summary.
    acceptance_ok = (
        result.final_consumer_lag == 0
        and result.error_rate <= 1.0
        and result.dlq_depth <= 2
        and result.redis_in_progress == 0
    )
    sys.exit(0 if acceptance_ok else 1)


if __name__ == "__main__":
    asyncio.run(main())
|