snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit adds one file:
scripts/load/burst_100.py — executable file, 483 lines added (@@ -0,0 +1,483 @@)
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Burst Load Test - 100 Messages
|
||||
===============================
|
||||
Tests end-to-end: Gateway → Router → NATS → Workers → Memory
|
||||
|
||||
Usage:
|
||||
python3 burst_100.py --messages 100 --burst-time 5
|
||||
python3 burst_100.py --messages 100 --duplicates 10
|
||||
python3 burst_100.py --messages 100 --kill-worker
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import argparse
import asyncio
import json
import logging
import os
import statistics
import sys
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List

import httpx
import nats
import redis.asyncio as redis
from nats.js.api import StreamInfo
|
||||
|
||||
# Module-wide logging: timestamped INFO-level records for test progress.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration — every endpoint is overridable via the environment so the
# script can run inside the compose network or from a host shell.
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:9102")
REDIS_URL = os.getenv("REDIS_URL", "redis://dagi-redis:6379")
NATS_MONITOR_URL = os.getenv("NATS_MONITOR_URL", "http://nats:8222")  # NATS HTTP monitoring port

# Use existing subjects with test metadata (simpler approach)
SUBJECT_AGENT_RUN_REQUESTED = "agent.run.requested"          # subject the burst publishes to
SUBJECT_AGENT_RUN_COMPLETED = "agent.run.completed.helion"   # subject monitored for completions

# Stream names — JetStream streams inspected for consumer lag
STREAM_MESSAGES = "MESSAGES"
STREAM_AGENT_RUNS = "AGENT_RUNS"
|
||||
|
||||
|
||||
@dataclass
class TestMessage:
    """Test message with trace correlation"""
    message_id: str   # unique per published message (duplicates get a "-dup" suffix)
    job_id: str       # idempotency key; shared between a message and its duplicate
    trace_id: str     # per-message trace correlation id
    user_id: str
    agent_id: str
    content: str      # prompt text carried in the payload
    timestamp: float  # publish-side epoch seconds; used for latency calculation
    is_duplicate: bool = False  # True for injected duplicate messages
|
||||
|
||||
|
||||
@dataclass
class TestResult:
    """Test execution results"""
    messages_sent: int          # publish attempts that succeeded
    messages_acked: int         # assumed equal to messages_sent (see run())
    messages_completed: int     # unique job_ids observed on the completion subject
    messages_failed: int        # expected_unique - completed, floored at 0
    duplicates_detected: int    # duplicate messages whose job_id completed
    expected_unique: int        # count of distinct job_ids published
    missing_job_ids: List[str]  # job_ids never seen on the completion subject
    max_consumer_lag: int       # highest sampled total pending across consumers
    final_consumer_lag: int     # total pending at end of the test
    dlq_depth: int              # best-effort dead-letter-queue depth
    p50_latency_ms: float       # publish -> completion latency percentiles
    p95_latency_ms: float
    p99_latency_ms: float
    error_rate: float           # failed / expected_unique, in percent
    duration_seconds: float     # burst start -> monitoring end
    redis_in_progress: int      # idempotency keys still marked "in_progress"
    redis_completed: int        # idempotency keys marked "completed"
|
||||
|
||||
|
||||
class BurstLoadTest:
    """Burst load test orchestrator.

    Publishes a timed burst of synthetic agent-run requests to NATS
    JetStream, listens for completion events, and collects consumer-lag,
    DLQ, latency, and Redis-idempotency metrics into a TestResult.
    """

    def __init__(self, num_messages: int = 100, burst_time: float = 5.0, duplicates: int = 0):
        """Create a test run.

        Args:
            num_messages: Number of unique messages to publish.
            burst_time: Window (seconds) over which the burst is spread.
            duplicates: How many messages to re-publish with the same
                job_id, to exercise worker idempotency.
        """
        self.num_messages = num_messages
        self.burst_time = burst_time
        self.duplicates = duplicates
        self.nc = None              # NATS connection (set in connect())
        self.js = None              # JetStream context
        self.redis_client = None
        self.http_client = None

        self.messages: List[TestMessage] = []
        self.completed_messages: Dict[str, float] = {}  # job_id -> completion_time
        self.start_time = None      # set when the burst starts
        self.end_time = None        # set after monitoring finishes

    async def connect(self):
        """Connect to NATS, Redis, HTTP"""
        self.nc = await nats.connect(NATS_URL)
        self.js = self.nc.jetstream()
        self.redis_client = await redis.from_url(REDIS_URL, decode_responses=True)
        self.http_client = httpx.AsyncClient(timeout=30.0)
        logger.info("Connected to NATS, Redis, HTTP")

    async def disconnect(self):
        """Disconnect from all services (safe even if connect() failed midway)."""
        if self.nc:
            await self.nc.close()
        if self.redis_client:
            # NOTE(review): redis-py 5+ prefers aclose(); close() retained for
            # compatibility with the deployed client version — confirm.
            await self.redis_client.close()
        if self.http_client:
            await self.http_client.aclose()

    def generate_messages(self) -> List[TestMessage]:
        """Build the test corpus: unique messages plus optional duplicates.

        Duplicates reuse the original job_id (the idempotency key) but get
        distinct message/trace ids so re-deliveries remain traceable.
        """
        messages = []
        base_trace_id = str(uuid.uuid4())

        # Generate unique messages
        for i in range(self.num_messages):
            msg_id = f"test-msg-{i}"
            job_id = f"test-job-{i}"
            trace_id = f"{base_trace_id}-{i}"

            msg = TestMessage(
                message_id=msg_id,
                job_id=job_id,
                trace_id=trace_id,
                user_id="tg:test_user",
                agent_id="helion",
                content=f"Test message {i}: Load test burst",
                timestamp=time.time()
            )
            messages.append(msg)

        # Add duplicates, sampled without replacement from the unique set
        if self.duplicates > 0:
            import random
            duplicate_indices = random.sample(range(self.num_messages), min(self.duplicates, self.num_messages))
            for idx in duplicate_indices:
                original = messages[idx]
                duplicate = TestMessage(
                    message_id=f"{original.message_id}-dup",
                    job_id=original.job_id,  # Same job_id for idempotency test
                    trace_id=f"{original.trace_id}-dup",
                    user_id=original.user_id,
                    agent_id=original.agent_id,
                    content=original.content,
                    timestamp=time.time(),
                    is_duplicate=True
                )
                messages.append(duplicate)

        return messages

    async def publish_message(self, msg: TestMessage) -> bool:
        """Publish one test message to JetStream; return True on success."""
        try:
            payload = {
                "task_id": msg.job_id,
                "job_id": msg.job_id,
                "workflow_type": "test",  # Worker will detect "test" and use mock execution
                "agent_id": msg.agent_id,
                "trace_id": msg.trace_id,
                "user_id": msg.user_id,
                "test_mode": True,  # Explicit test flag
                "payload": {
                    "prompt": msg.content,
                    "test": True,
                    "is_duplicate": msg.is_duplicate
                },
                "priority": 1,
                "timeout": 30
            }

            headers = {
                "Nats-Trace-ID": msg.trace_id,
                "Nats-Job-ID": msg.job_id,
                "Nats-User-ID": msg.user_id,
                "Nats-Agent-ID": msg.agent_id,
                # timezone-aware now(): datetime.utcnow() is deprecated (3.12+)
                "Nats-Timestamp": datetime.now(timezone.utc).isoformat()
            }

            await self.js.publish(
                SUBJECT_AGENT_RUN_REQUESTED,
                json.dumps(payload).encode(),
                headers=headers
            )

            return True
        except Exception as e:
            logger.error(f"Failed to publish {msg.message_id}: {e}")
            return False

    async def publish_burst(self) -> int:
        """Publish all messages, evenly paced across self.burst_time seconds.

        Returns the number of messages that were successfully published.
        Also records self.start_time, the latency/duration baseline.
        """
        self.messages = self.generate_messages()
        total = len(self.messages)
        logger.info(f"Publishing {total} messages over {self.burst_time}s...")

        self.start_time = time.time()

        # Calculate delay between messages
        delay = self.burst_time / total if total > 0 else 0

        published = 0
        for msg in self.messages:
            if await self.publish_message(msg):
                published += 1
            if delay > 0:
                await asyncio.sleep(delay)

        logger.info(f"Published {published}/{total} messages")
        return published

    async def get_consumer_lag(self) -> Dict[str, int]:
        """Sample num_pending per consumer on the monitored streams.

        Returns a {"STREAM:consumer": pending} map; empty dict on any error.
        """
        try:
            resp = await self.http_client.get(f"{NATS_MONITOR_URL}/jsz")
            data = resp.json()

            # Hoisted out of the loop: the stream list does not change
            # between iterations.
            # NOTE(review): assumes /jsz returns "account_details" as a
            # mapping — verify against the deployed NATS version.
            stream_detail = data.get("account_details", {}).get("stream_detail", [])

            lag = {}
            for stream_name in [STREAM_MESSAGES, STREAM_AGENT_RUNS]:
                for s in stream_detail:
                    if s.get("name") == stream_name:
                        consumers = s.get("consumer_detail", [])
                        for c in consumers:
                            consumer_name = c.get("name", "unknown")
                            num_pending = c.get("num_pending", 0)
                            lag[f"{stream_name}:{consumer_name}"] = num_pending

            return lag
        except Exception as e:
            logger.warning(f"Failed to get consumer lag: {e}")
            return {}

    async def get_dlq_depth(self) -> int:
        """Best-effort DLQ depth measurement.

        NOTE(review): per-subject depth would require filtered stream
        inspection; until that is implemented this reports 0 (matching the
        original behavior, which never accumulated a count). The original
        also re-queried stream_info once per DLQ subject with the result
        unused — collapsed to a single existence probe.
        """
        try:
            total = 0
            try:
                # Probe that the stream holding *.dlq subjects is reachable.
                await self.js.stream_info("AUDIT")
            except Exception:
                pass
            return total
        except Exception as e:
            logger.warning(f"Failed to get DLQ depth: {e}")
            return 0

    async def monitor_completions(self, duration: float = 60.0):
        """Subscribe to the completion subject and record completion times.

        Populates self.completed_messages (job_id -> epoch seconds) for
        `duration` seconds, then unsubscribes.
        """
        logger.info(f"Monitoring completions for {duration}s...")

        async def completion_handler(msg):
            # Record the first time each job_id is seen; ack regardless so
            # the consumer does not redeliver.
            try:
                data = json.loads(msg.data.decode())
                job_id = data.get("job_id") or data.get("task_id")
                if job_id:
                    self.completed_messages[job_id] = time.time()
                    logger.debug(f"Received completion for job_id: {job_id}")
            except Exception as e:
                logger.warning(f"Error processing completion: {e}")
            await msg.ack()

        # Subscribe to completion events
        sub = await self.js.subscribe(
            SUBJECT_AGENT_RUN_COMPLETED,
            "burst-test-monitor",
            cb=completion_handler
        )

        await asyncio.sleep(duration)
        await sub.unsubscribe()

    async def get_redis_idempotency_stats(self) -> Dict[str, int]:
        """Count idempotency keys by status ("in_progress" / "completed").

        Uses SCAN instead of KEYS so a production Redis is not blocked by a
        full keyspace sweep. Unparseable values are skipped.
        """
        try:
            in_progress = 0
            completed = 0

            async for key in self.redis_client.scan_iter("idemp:*"):
                value = await self.redis_client.get(key)
                if value:
                    try:
                        data = json.loads(value)
                        status = data.get("status", "")
                        if status == "in_progress":
                            in_progress += 1
                        elif status == "completed":
                            completed += 1
                    except (ValueError, TypeError):
                        # Not JSON (or not a mapping) — ignore this key.
                        pass

            return {"in_progress": in_progress, "completed": completed}
        except Exception as e:
            logger.warning(f"Failed to get Redis stats: {e}")
            return {"in_progress": 0, "completed": 0}

    async def calculate_latencies(self) -> Dict[str, float]:
        """Compute publish->completion latency percentiles in milliseconds.

        Uses nearest-rank indexing for p95/p99 and the statistics median
        for p50; returns zeros when nothing completed.
        """
        if not self.completed_messages or not self.start_time:
            return {"p50": 0, "p95": 0, "p99": 0}

        latencies = []
        for job_id, completion_time in self.completed_messages.items():
            # Find original message
            msg = next((m for m in self.messages if m.job_id == job_id), None)
            if msg:
                latency_ms = (completion_time - msg.timestamp) * 1000
                latencies.append(latency_ms)

        if not latencies:
            return {"p50": 0, "p95": 0, "p99": 0}

        latencies.sort()
        n = len(latencies)
        return {
            "p50": statistics.median(latencies),
            # Clamp the nearest-rank index so it can never run past the end.
            "p95": latencies[min(int(n * 0.95), n - 1)],
            "p99": latencies[min(int(n * 0.99), n - 1)],
        }

    async def run(self) -> TestResult:
        """Run the burst load test end to end and return a TestResult.

        Sequence: connect -> baseline lag -> publish burst -> monitor
        completions (120s) while sampling lag every 5s for 60s -> collect
        final lag/DLQ/Redis/latency metrics. Connections are always closed.
        """
        try:
            await self.connect()

            # Pre-test baseline
            baseline_lag = await self.get_consumer_lag()
            logger.info(f"Baseline consumer lag: {baseline_lag}")

            # Publish burst
            published = await self.publish_burst()

            # Monitor for completions (in parallel)
            monitor_task = asyncio.create_task(self.monitor_completions(duration=120.0))

            # Track max lag during test
            max_lag = 0
            lag_samples = []

            for i in range(12):  # 12 samples over 60 seconds
                await asyncio.sleep(5)
                lag = await self.get_consumer_lag()
                total_lag = sum(lag.values())
                lag_samples.append(total_lag)
                max_lag = max(max_lag, total_lag)
                logger.info(f"Sample {i+1}/12: Consumer lag = {total_lag}")

            await monitor_task

            self.end_time = time.time()
            duration = self.end_time - self.start_time if self.start_time else 0

            # Final measurements
            final_lag = await self.get_consumer_lag()
            final_lag_total = sum(final_lag.values())
            dlq_depth = await self.get_dlq_depth()
            redis_stats = await self.get_redis_idempotency_stats()
            latencies = await self.calculate_latencies()

            # Count duplicates detected (idempotency working)
            duplicates_detected = sum(1 for m in self.messages if m.is_duplicate and m.job_id in self.completed_messages)

            # Calculate success rate based on unique job_ids
            unique_job_ids = {m.job_id for m in self.messages}
            expected_unique = len(unique_job_ids)
            completed_count = len(self.completed_messages)
            missing_job_ids = sorted(unique_job_ids - set(self.completed_messages))
            failed_count = max(expected_unique - completed_count, 0)
            error_rate = (failed_count / expected_unique * 100) if expected_unique > 0 else 0

            result = TestResult(
                messages_sent=published,
                messages_acked=published,  # Assuming all published = acked
                messages_completed=completed_count,
                messages_failed=failed_count,
                duplicates_detected=duplicates_detected,
                expected_unique=expected_unique,
                missing_job_ids=missing_job_ids,
                max_consumer_lag=max_lag,
                final_consumer_lag=final_lag_total,
                dlq_depth=dlq_depth,
                p50_latency_ms=latencies["p50"],
                p95_latency_ms=latencies["p95"],
                p99_latency_ms=latencies["p99"],
                error_rate=error_rate,
                duration_seconds=duration,
                redis_in_progress=redis_stats["in_progress"],
                redis_completed=redis_stats["completed"]
            )

            return result

        finally:
            await self.disconnect()

    def print_summary(self, result: TestResult):
        """Print a human-readable summary and the pass/fail acceptance gates."""
        print("\n" + "="*70)
        print("BURST LOAD TEST SUMMARY")
        print("="*70)
        print(f"Messages sent: {result.messages_sent}")
        print(f"Expected unique jobs: {result.expected_unique}")
        print(f"Messages completed: {result.messages_completed}")
        print(f"Messages failed: {result.messages_failed}")
        print(f"Duplicates detected: {result.duplicates_detected}")
        if result.missing_job_ids:
            sample = ", ".join(result.missing_job_ids[:10])
            print(f"Missing job_ids: {len(result.missing_job_ids)} (sample: {sample})")
        print(f"Error rate: {result.error_rate:.2f}%")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print()
        print("Consumer Lag:")
        print(f"  Max during test: {result.max_consumer_lag}")
        print(f"  Final: {result.final_consumer_lag}")
        print(f"  DLQ depth: {result.dlq_depth}")
        print()
        print("Latency (ms):")
        print(f"  p50: {result.p50_latency_ms:.2f}")
        print(f"  p95: {result.p95_latency_ms:.2f}")
        print(f"  p99: {result.p99_latency_ms:.2f}")
        print()
        print("Redis Idempotency:")
        print(f"  In progress: {result.redis_in_progress}")
        print(f"  Completed: {result.redis_completed}")
        print()

        # Acceptance criteria — keep thresholds in sync with main()'s exit code
        print("Acceptance Criteria:")
        print(f"  ✅ Consumer lag → 0: {'PASS' if result.final_consumer_lag == 0 else 'FAIL'} (final: {result.final_consumer_lag})")
        print(f"  ✅ Success rate ≥99%: {'PASS' if result.error_rate <= 1.0 else 'FAIL'} ({100-result.error_rate:.2f}%)")
        print(f"  ✅ DLQ ≤ 2: {'PASS' if result.dlq_depth <= 2 else 'FAIL'} (depth: {result.dlq_depth})")
        print(f"  ✅ No stuck keys: {'PASS' if result.redis_in_progress == 0 else 'FAIL'} (in_progress: {result.redis_in_progress})")
        print("="*70)
|
||||
|
||||
|
||||
async def main():
    """CLI entry point.

    Parses the command-line options, executes the burst test, prints the
    summary, and exits with status 0 only if every acceptance criterion
    holds (lag drained, ≥99% success, DLQ ≤ 2, no stuck idempotency keys).
    """
    parser = argparse.ArgumentParser(description="Burst Load Test - 100 Messages")
    parser.add_argument("--messages", type=int, default=100, help="Number of messages")
    parser.add_argument("--burst-time", type=float, default=5.0, help="Burst duration (seconds)")
    parser.add_argument("--duplicates", type=int, default=0, help="Number of duplicate job_ids")
    parser.add_argument("--kill-worker", action="store_true", help="Kill worker during test (advanced)")
    opts = parser.parse_args()

    runner = BurstLoadTest(
        num_messages=opts.messages,
        burst_time=opts.burst_time,
        duplicates=opts.duplicates
    )
    outcome = await runner.run()
    runner.print_summary(outcome)

    # Exit code mirrors the acceptance criteria printed in the summary.
    criteria_met = (
        outcome.final_consumer_lag == 0
        and outcome.error_rate <= 1.0
        and outcome.dlq_depth <= 2
        and outcome.redis_in_progress == 0
    )
    sys.exit(0 if criteria_met else 1)
|
||||
|
||||
|
||||
# Script entry point: drive the async test under asyncio's event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user