snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit adds one file:
scripts/load/burst_100.py — executable file, 483 lines added (@@ -0,0 +1,483 @@)
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Burst Load Test - 100 Messages
|
||||
===============================
|
||||
Tests end-to-end: Gateway → Router → NATS → Workers → Memory
|
||||
|
||||
Usage:
|
||||
python3 burst_100.py --messages 100 --burst-time 5
|
||||
python3 burst_100.py --messages 100 --duplicates 10
|
||||
python3 burst_100.py --messages 100 --kill-worker
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import argparse
import asyncio
import json
import logging
import os
import statistics
import sys
import time
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Dict, List

import httpx
import nats
import redis.asyncio as redis
from nats.js.api import StreamInfo
|
||||
|
||||
# Module-wide logging: timestamped INFO-level records for test progress.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration — every endpoint is overridable via the environment so the
# script can run inside the compose network or from a host shell.
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
ROUTER_URL = os.getenv("ROUTER_URL", "http://router:9102")
REDIS_URL = os.getenv("REDIS_URL", "redis://dagi-redis:6379")
NATS_MONITOR_URL = os.getenv("NATS_MONITOR_URL", "http://nats:8222")  # NATS HTTP monitoring port

# Use existing subjects with test metadata (simpler approach)
SUBJECT_AGENT_RUN_REQUESTED = "agent.run.requested"          # subject the burst publishes to
SUBJECT_AGENT_RUN_COMPLETED = "agent.run.completed.helion"   # subject monitored for completions

# Stream names — JetStream streams inspected for consumer lag
STREAM_MESSAGES = "MESSAGES"
STREAM_AGENT_RUNS = "AGENT_RUNS"
|
||||
|
||||
|
||||
@dataclass
class TestMessage:
    """Test message with trace correlation"""
    message_id: str   # unique per published message (duplicates get a "-dup" suffix)
    job_id: str       # idempotency key; shared between a message and its duplicate
    trace_id: str     # per-message trace correlation id
    user_id: str
    agent_id: str
    content: str      # prompt text carried in the payload
    timestamp: float  # publish-side epoch seconds; used for latency calculation
    is_duplicate: bool = False  # True for injected duplicate messages
|
||||
|
||||
|
||||
@dataclass
class TestResult:
    """Test execution results"""
    messages_sent: int          # publish attempts that succeeded
    messages_acked: int         # assumed equal to messages_sent (see run())
    messages_completed: int     # unique job_ids observed on the completion subject
    messages_failed: int        # expected_unique - completed, floored at 0
    duplicates_detected: int    # duplicate messages whose job_id completed
    expected_unique: int        # count of distinct job_ids published
    missing_job_ids: List[str]  # job_ids never seen on the completion subject
    max_consumer_lag: int       # highest sampled total pending across consumers
    final_consumer_lag: int     # total pending at end of the test
    dlq_depth: int              # best-effort dead-letter-queue depth
    p50_latency_ms: float       # publish -> completion latency percentiles
    p95_latency_ms: float
    p99_latency_ms: float
    error_rate: float           # failed / expected_unique, in percent
    duration_seconds: float     # burst start -> monitoring end
    redis_in_progress: int      # idempotency keys still marked "in_progress"
    redis_completed: int        # idempotency keys marked "completed"
|
||||
|
||||
|
||||
class BurstLoadTest:
    """Burst load test orchestrator.

    Publishes a timed burst of synthetic agent-run requests to NATS
    JetStream, listens for completion events, and collects consumer-lag,
    DLQ, latency, and Redis-idempotency metrics into a TestResult.
    """

    def __init__(self, num_messages: int = 100, burst_time: float = 5.0, duplicates: int = 0):
        """Create a test run.

        Args:
            num_messages: Number of unique messages to publish.
            burst_time: Window (seconds) over which the burst is spread.
            duplicates: How many messages to re-publish with the same
                job_id, to exercise worker idempotency.
        """
        self.num_messages = num_messages
        self.burst_time = burst_time
        self.duplicates = duplicates
        self.nc = None              # NATS connection (set in connect())
        self.js = None              # JetStream context
        self.redis_client = None
        self.http_client = None

        self.messages: List[TestMessage] = []
        self.completed_messages: Dict[str, float] = {}  # job_id -> completion_time
        self.start_time = None      # set when the burst starts
        self.end_time = None        # set after monitoring finishes

    async def connect(self):
        """Connect to NATS, Redis, HTTP"""
        self.nc = await nats.connect(NATS_URL)
        self.js = self.nc.jetstream()
        self.redis_client = await redis.from_url(REDIS_URL, decode_responses=True)
        self.http_client = httpx.AsyncClient(timeout=30.0)
        logger.info("Connected to NATS, Redis, HTTP")

    async def disconnect(self):
        """Disconnect from all services (safe even if connect() failed midway)."""
        if self.nc:
            await self.nc.close()
        if self.redis_client:
            # NOTE(review): redis-py 5+ prefers aclose(); close() retained for
            # compatibility with the deployed client version — confirm.
            await self.redis_client.close()
        if self.http_client:
            await self.http_client.aclose()

    def generate_messages(self) -> List[TestMessage]:
        """Build the test corpus: unique messages plus optional duplicates.

        Duplicates reuse the original job_id (the idempotency key) but get
        distinct message/trace ids so re-deliveries remain traceable.
        """
        messages = []
        base_trace_id = str(uuid.uuid4())

        # Generate unique messages
        for i in range(self.num_messages):
            msg_id = f"test-msg-{i}"
            job_id = f"test-job-{i}"
            trace_id = f"{base_trace_id}-{i}"

            msg = TestMessage(
                message_id=msg_id,
                job_id=job_id,
                trace_id=trace_id,
                user_id="tg:test_user",
                agent_id="helion",
                content=f"Test message {i}: Load test burst",
                timestamp=time.time()
            )
            messages.append(msg)

        # Add duplicates, sampled without replacement from the unique set
        if self.duplicates > 0:
            import random
            duplicate_indices = random.sample(range(self.num_messages), min(self.duplicates, self.num_messages))
            for idx in duplicate_indices:
                original = messages[idx]
                duplicate = TestMessage(
                    message_id=f"{original.message_id}-dup",
                    job_id=original.job_id,  # Same job_id for idempotency test
                    trace_id=f"{original.trace_id}-dup",
                    user_id=original.user_id,
                    agent_id=original.agent_id,
                    content=original.content,
                    timestamp=time.time(),
                    is_duplicate=True
                )
                messages.append(duplicate)

        return messages

    async def publish_message(self, msg: TestMessage) -> bool:
        """Publish one test message to JetStream; return True on success."""
        try:
            payload = {
                "task_id": msg.job_id,
                "job_id": msg.job_id,
                "workflow_type": "test",  # Worker will detect "test" and use mock execution
                "agent_id": msg.agent_id,
                "trace_id": msg.trace_id,
                "user_id": msg.user_id,
                "test_mode": True,  # Explicit test flag
                "payload": {
                    "prompt": msg.content,
                    "test": True,
                    "is_duplicate": msg.is_duplicate
                },
                "priority": 1,
                "timeout": 30
            }

            headers = {
                "Nats-Trace-ID": msg.trace_id,
                "Nats-Job-ID": msg.job_id,
                "Nats-User-ID": msg.user_id,
                "Nats-Agent-ID": msg.agent_id,
                # timezone-aware now(): datetime.utcnow() is deprecated (3.12+)
                "Nats-Timestamp": datetime.now(timezone.utc).isoformat()
            }

            await self.js.publish(
                SUBJECT_AGENT_RUN_REQUESTED,
                json.dumps(payload).encode(),
                headers=headers
            )

            return True
        except Exception as e:
            logger.error(f"Failed to publish {msg.message_id}: {e}")
            return False

    async def publish_burst(self) -> int:
        """Publish all messages, evenly paced across self.burst_time seconds.

        Returns the number of messages that were successfully published.
        Also records self.start_time, the latency/duration baseline.
        """
        self.messages = self.generate_messages()
        total = len(self.messages)
        logger.info(f"Publishing {total} messages over {self.burst_time}s...")

        self.start_time = time.time()

        # Calculate delay between messages
        delay = self.burst_time / total if total > 0 else 0

        published = 0
        for msg in self.messages:
            if await self.publish_message(msg):
                published += 1
            if delay > 0:
                await asyncio.sleep(delay)

        logger.info(f"Published {published}/{total} messages")
        return published

    async def get_consumer_lag(self) -> Dict[str, int]:
        """Sample num_pending per consumer on the monitored streams.

        Returns a {"STREAM:consumer": pending} map; empty dict on any error.
        """
        try:
            resp = await self.http_client.get(f"{NATS_MONITOR_URL}/jsz")
            data = resp.json()

            # Hoisted out of the loop: the stream list does not change
            # between iterations.
            # NOTE(review): assumes /jsz returns "account_details" as a
            # mapping — verify against the deployed NATS version.
            stream_detail = data.get("account_details", {}).get("stream_detail", [])

            lag = {}
            for stream_name in [STREAM_MESSAGES, STREAM_AGENT_RUNS]:
                for s in stream_detail:
                    if s.get("name") == stream_name:
                        consumers = s.get("consumer_detail", [])
                        for c in consumers:
                            consumer_name = c.get("name", "unknown")
                            num_pending = c.get("num_pending", 0)
                            lag[f"{stream_name}:{consumer_name}"] = num_pending

            return lag
        except Exception as e:
            logger.warning(f"Failed to get consumer lag: {e}")
            return {}

    async def get_dlq_depth(self) -> int:
        """Best-effort DLQ depth measurement.

        NOTE(review): per-subject depth would require filtered stream
        inspection; until that is implemented this reports 0 (matching the
        original behavior, which never accumulated a count). The original
        also re-queried stream_info once per DLQ subject with the result
        unused — collapsed to a single existence probe.
        """
        try:
            total = 0
            try:
                # Probe that the stream holding *.dlq subjects is reachable.
                await self.js.stream_info("AUDIT")
            except Exception:
                pass
            return total
        except Exception as e:
            logger.warning(f"Failed to get DLQ depth: {e}")
            return 0

    async def monitor_completions(self, duration: float = 60.0):
        """Subscribe to the completion subject and record completion times.

        Populates self.completed_messages (job_id -> epoch seconds) for
        `duration` seconds, then unsubscribes.
        """
        logger.info(f"Monitoring completions for {duration}s...")

        async def completion_handler(msg):
            # Record the first time each job_id is seen; ack regardless so
            # the consumer does not redeliver.
            try:
                data = json.loads(msg.data.decode())
                job_id = data.get("job_id") or data.get("task_id")
                if job_id:
                    self.completed_messages[job_id] = time.time()
                    logger.debug(f"Received completion for job_id: {job_id}")
            except Exception as e:
                logger.warning(f"Error processing completion: {e}")
            await msg.ack()

        # Subscribe to completion events
        sub = await self.js.subscribe(
            SUBJECT_AGENT_RUN_COMPLETED,
            "burst-test-monitor",
            cb=completion_handler
        )

        await asyncio.sleep(duration)
        await sub.unsubscribe()

    async def get_redis_idempotency_stats(self) -> Dict[str, int]:
        """Count idempotency keys by status ("in_progress" / "completed").

        Uses SCAN instead of KEYS so a production Redis is not blocked by a
        full keyspace sweep. Unparseable values are skipped.
        """
        try:
            in_progress = 0
            completed = 0

            async for key in self.redis_client.scan_iter("idemp:*"):
                value = await self.redis_client.get(key)
                if value:
                    try:
                        data = json.loads(value)
                        status = data.get("status", "")
                        if status == "in_progress":
                            in_progress += 1
                        elif status == "completed":
                            completed += 1
                    except (ValueError, TypeError):
                        # Not JSON (or not a mapping) — ignore this key.
                        pass

            return {"in_progress": in_progress, "completed": completed}
        except Exception as e:
            logger.warning(f"Failed to get Redis stats: {e}")
            return {"in_progress": 0, "completed": 0}

    async def calculate_latencies(self) -> Dict[str, float]:
        """Compute publish->completion latency percentiles in milliseconds.

        Uses nearest-rank indexing for p95/p99 and the statistics median
        for p50; returns zeros when nothing completed.
        """
        if not self.completed_messages or not self.start_time:
            return {"p50": 0, "p95": 0, "p99": 0}

        latencies = []
        for job_id, completion_time in self.completed_messages.items():
            # Find original message
            msg = next((m for m in self.messages if m.job_id == job_id), None)
            if msg:
                latency_ms = (completion_time - msg.timestamp) * 1000
                latencies.append(latency_ms)

        if not latencies:
            return {"p50": 0, "p95": 0, "p99": 0}

        latencies.sort()
        n = len(latencies)
        return {
            "p50": statistics.median(latencies),
            # Clamp the nearest-rank index so it can never run past the end.
            "p95": latencies[min(int(n * 0.95), n - 1)],
            "p99": latencies[min(int(n * 0.99), n - 1)],
        }

    async def run(self) -> TestResult:
        """Run the burst load test end to end and return a TestResult.

        Sequence: connect -> baseline lag -> publish burst -> monitor
        completions (120s) while sampling lag every 5s for 60s -> collect
        final lag/DLQ/Redis/latency metrics. Connections are always closed.
        """
        try:
            await self.connect()

            # Pre-test baseline
            baseline_lag = await self.get_consumer_lag()
            logger.info(f"Baseline consumer lag: {baseline_lag}")

            # Publish burst
            published = await self.publish_burst()

            # Monitor for completions (in parallel)
            monitor_task = asyncio.create_task(self.monitor_completions(duration=120.0))

            # Track max lag during test
            max_lag = 0
            lag_samples = []

            for i in range(12):  # 12 samples over 60 seconds
                await asyncio.sleep(5)
                lag = await self.get_consumer_lag()
                total_lag = sum(lag.values())
                lag_samples.append(total_lag)
                max_lag = max(max_lag, total_lag)
                logger.info(f"Sample {i+1}/12: Consumer lag = {total_lag}")

            await monitor_task

            self.end_time = time.time()
            duration = self.end_time - self.start_time if self.start_time else 0

            # Final measurements
            final_lag = await self.get_consumer_lag()
            final_lag_total = sum(final_lag.values())
            dlq_depth = await self.get_dlq_depth()
            redis_stats = await self.get_redis_idempotency_stats()
            latencies = await self.calculate_latencies()

            # Count duplicates detected (idempotency working)
            duplicates_detected = sum(1 for m in self.messages if m.is_duplicate and m.job_id in self.completed_messages)

            # Calculate success rate based on unique job_ids
            unique_job_ids = {m.job_id for m in self.messages}
            expected_unique = len(unique_job_ids)
            completed_count = len(self.completed_messages)
            missing_job_ids = sorted(unique_job_ids - set(self.completed_messages))
            failed_count = max(expected_unique - completed_count, 0)
            error_rate = (failed_count / expected_unique * 100) if expected_unique > 0 else 0

            result = TestResult(
                messages_sent=published,
                messages_acked=published,  # Assuming all published = acked
                messages_completed=completed_count,
                messages_failed=failed_count,
                duplicates_detected=duplicates_detected,
                expected_unique=expected_unique,
                missing_job_ids=missing_job_ids,
                max_consumer_lag=max_lag,
                final_consumer_lag=final_lag_total,
                dlq_depth=dlq_depth,
                p50_latency_ms=latencies["p50"],
                p95_latency_ms=latencies["p95"],
                p99_latency_ms=latencies["p99"],
                error_rate=error_rate,
                duration_seconds=duration,
                redis_in_progress=redis_stats["in_progress"],
                redis_completed=redis_stats["completed"]
            )

            return result

        finally:
            await self.disconnect()

    def print_summary(self, result: TestResult):
        """Print a human-readable summary and the pass/fail acceptance gates."""
        print("\n" + "="*70)
        print("BURST LOAD TEST SUMMARY")
        print("="*70)
        print(f"Messages sent: {result.messages_sent}")
        print(f"Expected unique jobs: {result.expected_unique}")
        print(f"Messages completed: {result.messages_completed}")
        print(f"Messages failed: {result.messages_failed}")
        print(f"Duplicates detected: {result.duplicates_detected}")
        if result.missing_job_ids:
            sample = ", ".join(result.missing_job_ids[:10])
            print(f"Missing job_ids: {len(result.missing_job_ids)} (sample: {sample})")
        print(f"Error rate: {result.error_rate:.2f}%")
        print(f"Duration: {result.duration_seconds:.2f}s")
        print()
        print("Consumer Lag:")
        print(f"  Max during test: {result.max_consumer_lag}")
        print(f"  Final: {result.final_consumer_lag}")
        print(f"  DLQ depth: {result.dlq_depth}")
        print()
        print("Latency (ms):")
        print(f"  p50: {result.p50_latency_ms:.2f}")
        print(f"  p95: {result.p95_latency_ms:.2f}")
        print(f"  p99: {result.p99_latency_ms:.2f}")
        print()
        print("Redis Idempotency:")
        print(f"  In progress: {result.redis_in_progress}")
        print(f"  Completed: {result.redis_completed}")
        print()

        # Acceptance criteria — keep thresholds in sync with main()'s exit code
        print("Acceptance Criteria:")
        print(f"  ✅ Consumer lag → 0: {'PASS' if result.final_consumer_lag == 0 else 'FAIL'} (final: {result.final_consumer_lag})")
        print(f"  ✅ Success rate ≥99%: {'PASS' if result.error_rate <= 1.0 else 'FAIL'} ({100-result.error_rate:.2f}%)")
        print(f"  ✅ DLQ ≤ 2: {'PASS' if result.dlq_depth <= 2 else 'FAIL'} (depth: {result.dlq_depth})")
        print(f"  ✅ No stuck keys: {'PASS' if result.redis_in_progress == 0 else 'FAIL'} (in_progress: {result.redis_in_progress})")
        print("="*70)
|
||||
|
||||
|
||||
async def main():
    """CLI entry point.

    Parses the command-line options, executes the burst test, prints the
    summary, and exits with status 0 only if every acceptance criterion
    holds (lag drained, ≥99% success, DLQ ≤ 2, no stuck idempotency keys).
    """
    parser = argparse.ArgumentParser(description="Burst Load Test - 100 Messages")
    parser.add_argument("--messages", type=int, default=100, help="Number of messages")
    parser.add_argument("--burst-time", type=float, default=5.0, help="Burst duration (seconds)")
    parser.add_argument("--duplicates", type=int, default=0, help="Number of duplicate job_ids")
    parser.add_argument("--kill-worker", action="store_true", help="Kill worker during test (advanced)")
    opts = parser.parse_args()

    runner = BurstLoadTest(
        num_messages=opts.messages,
        burst_time=opts.burst_time,
        duplicates=opts.duplicates
    )
    outcome = await runner.run()
    runner.print_summary(outcome)

    # Exit code mirrors the acceptance criteria printed in the summary.
    criteria_met = (
        outcome.final_consumer_lag == 0
        and outcome.error_rate <= 1.0
        and outcome.dlq_depth <= 2
        and outcome.redis_in_progress == 0
    )
    sys.exit(0 if criteria_met else 1)
|
||||
|
||||
|
||||
# Script entry point: drive the async test under asyncio's event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user