snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
190
scripts/dlq_replay.py
Executable file
190
scripts/dlq_replay.py
Executable file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DLQ Replay Automation
|
||||
=====================
|
||||
Replays messages from Dead Letter Queue back to original streams.
|
||||
|
||||
Usage:
|
||||
python3 dlq_replay.py --subject attachment.failed.dlq --max-replays 3
|
||||
python3 dlq_replay.py --all --dry-run
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import List, Dict, Any
|
||||
import nats
|
||||
from nats.js.api import StreamInfo
|
||||
|
||||
# Module-wide logging setup; INFO gives operational visibility for CLI runs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NATS server URL; default matches the docker-compose service name "nats".
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
# Maximum times a single message may be replayed before being skipped.
# NOTE(review): also overridable per-run via --max-replays CLI flag.
MAX_REPLAYS = int(os.getenv("MAX_REPLAYS", "3"))
|
||||
|
||||
|
||||
async def get_dlq_messages(js, subject: str, stream: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Drain up to *limit* messages from a DLQ subject.

    Uses an ephemeral pull consumer (no durable state left behind), acks
    each message after capturing it, and stops as soon as a fetch times
    out — i.e. the queue is drained.

    Args:
        js: JetStream context (from ``nc.jetstream()``).
        subject: DLQ subject to read, e.g. ``attachment.failed.dlq``.
        stream: JetStream stream the subject is stored in.
        limit: Upper bound on the number of messages to collect.

    Returns:
        A list of dicts with keys ``data`` (decoded JSON payload),
        ``headers``, ``subject`` and ``sequence``.
    """
    collected: List[Dict[str, Any]] = []
    try:
        # Ephemeral pull consumer scoped to the DLQ subject.
        consumer = await js.pull_subscribe(subject, durable=None, stream=stream)
        for _ in range(limit):
            try:
                fetched = await consumer.fetch(1, timeout=1.0)
                if fetched:
                    msg = fetched[0]
                    entry = {
                        "data": json.loads(msg.data.decode()),
                        "headers": dict(msg.headers) if msg.headers else {},
                        "subject": msg.subject,
                        # Older client versions may lack metadata; tolerate that.
                        "sequence": msg.metadata.sequence if hasattr(msg, "metadata") else None,
                    }
                    collected.append(entry)
                    await msg.ack()
            except asyncio.TimeoutError:
                # Fetch timed out: nothing left in the DLQ right now.
                break
    except Exception as e:
        # Best-effort drain: log and return whatever was collected so far.
        logger.error(f"Error fetching DLQ messages: {e}")

    return collected
|
||||
|
||||
|
||||
async def replay_message(js, message: Dict[str, Any], original_subject: str, dry_run: bool = False) -> bool:
    """Replay a single DLQ message to its original subject.

    Args:
        js: JetStream context used for publishing.
        message: Dict with ``data`` (decoded payload), ``headers`` and
            ``subject`` keys, as produced by ``get_dlq_messages()``.
        original_subject: Subject the message is republished to.
        dry_run: When True, log what would happen but do not publish.

    Returns:
        True if the message was (or, in dry-run, would be) replayed;
        False if it was skipped (replay budget exhausted) or publishing
        failed.
    """
    # Replay budget is tracked in the headers; missing/empty counts as zero.
    replay_count = int(message.get("headers", {}).get("replay_count", "0") or "0")

    if replay_count >= MAX_REPLAYS:
        msg_id = message.get("data", {}).get("event_id") or message.get("data", {}).get("job_id")
        logger.warning(f"Message {msg_id} exceeded max replays ({replay_count})")
        return False

    if dry_run:
        logger.info(f"[DRY RUN] Would replay to {original_subject}")
        return True

    try:
        # Annotate headers so downstream consumers can detect a replay.
        headers = message.get("headers", {}).copy()
        next_replay = replay_count + 1
        headers["replay_count"] = str(next_replay)
        headers["replayed"] = "true"
        headers["original_subject"] = original_subject
        # Optional message id for traceability
        original_msg_id = headers.get("original_msg_id") or headers.get("Nats-Msg-Id")
        if not original_msg_id:
            original_msg_id = message.get("data", {}).get("event_id") or message.get("data", {}).get("job_id")
        if original_msg_id:
            headers["original_msg_id"] = str(original_msg_id)
        # Timezone-aware timestamp: datetime.utcnow() is deprecated (3.12+)
        # and produced a naive datetime; now(timezone.utc) is explicit UTC.
        from datetime import datetime, timezone
        headers["replayed_at"] = datetime.now(timezone.utc).isoformat()

        # Publish to original subject (strip force_fail if present)
        data = message["data"]
        publish_payload = data
        if isinstance(data, dict) and isinstance(data.get("data"), dict):
            # If DLQ wrapper contains original request under data, replay that
            publish_payload = data["data"]
        if isinstance(publish_payload, dict):
            # force_fail is a fault-injection flag; drop it so the replay can succeed.
            publish_payload.pop("force_fail", None)

        await js.publish(
            original_subject,
            json.dumps(publish_payload).encode(),
            headers=headers
        )

        job_id = None
        if isinstance(publish_payload, dict):
            job_id = publish_payload.get("job_id") or publish_payload.get("task_id")
        # Use the already-computed next_replay instead of recomputing it.
        logger.info(f"Replayed message to {original_subject} (replay_count={next_replay})")
        if job_id:
            # Machine-readable stdout line for shell pipelines.
            print(f"replayed_job_id={job_id}")
        return True
    except Exception as e:
        logger.error(f"Error replaying message: {e}")
        return False
|
||||
|
||||
|
||||
|
||||
|
||||
def resolve_stream(subject: str, override: str | None = None) -> str:
|
||||
if override:
|
||||
return override
|
||||
# DLQ events are stored in AUDIT stream
|
||||
if subject.endswith(".dlq"):
|
||||
return "AUDIT"
|
||||
return "AUDIT"
|
||||
|
||||
|
||||
def extract_original_subject(dlq_subject: str) -> str:
    """Map a DLQ subject back to the subject it should be replayed on.

    Known DLQ subjects get an explicit mapping; anything else falls back
    to stripping the ``.dlq`` and ``.failed`` markers from the name.
    """
    # attachment.failed.dlq -> attachment.created.{type}
    if "attachment.failed.dlq" in dlq_subject:
        # Default; ideally the concrete type would come from the message body.
        return "attachment.created"
    # agent.run.failed.dlq -> agent.run.requested
    if "agent.run.failed.dlq" in dlq_subject:
        return "agent.run.requested"
    # Generic fallback: drop the DLQ markers wherever they appear.
    return dlq_subject.replace(".dlq", "").replace(".failed", "")
|
||||
|
||||
|
||||
async def replay_dlq(subject: str, dry_run: bool = False, limit: int = 100, stream: str | None = None):
    """Replay all messages from a DLQ subject.

    Connects to NATS, drains up to *limit* messages from the DLQ, and
    republishes each to its original subject via ``replay_message()``.

    Args:
        subject: DLQ subject to drain.
        dry_run: When True, log intended replays without publishing.
        limit: Max number of messages to fetch from the DLQ.
        stream: Explicit JetStream stream name; inferred when None.
    """
    nc = await nats.connect(NATS_URL)
    # Bug fix: the connection was only closed on the happy path; any raised
    # exception leaked it. try/finally guarantees cleanup.
    try:
        js = nc.jetstream()

        stream_name = resolve_stream(subject, stream)
        logger.info(f"Fetching messages from DLQ: {subject} (stream={stream_name})")
        messages = await get_dlq_messages(js, subject, stream_name, limit)

        if not messages:
            logger.info("No messages found in DLQ")
            return

        logger.info(f"Found {len(messages)} messages in DLQ")

        replayed = 0
        skipped = 0

        for msg in messages:
            # Prefer the original_subject recorded in the DLQ payload;
            # fall back to inferring it from the DLQ subject name.
            original_subject = msg.get("data", {}).get("original_subject") or extract_original_subject(msg["subject"])

            if await replay_message(js, msg, original_subject, dry_run):
                replayed += 1
            else:
                skipped += 1

        logger.info(f"Replay complete: {replayed} replayed, {skipped} skipped")
    finally:
        await nc.close()
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(description="DLQ Replay Automation")
|
||||
parser.add_argument("--subject", help="DLQ subject to replay")
|
||||
parser.add_argument("--all", action="store_true", help="Replay all DLQ subjects")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
|
||||
parser.add_argument("--limit", type=int, default=100, help="Max messages to replay")
|
||||
parser.add_argument("--max-replays", type=int, default=MAX_REPLAYS, help="Max replay count")
|
||||
parser.add_argument("--stream", help="JetStream stream name (default inferred)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.all:
|
||||
dlq_subjects = ["attachment.failed.dlq", "agent.run.failed.dlq", "memory.failed.dlq"]
|
||||
for subject in dlq_subjects:
|
||||
await replay_dlq(subject, args.dry_run, args.limit, args.stream)
|
||||
elif args.subject:
|
||||
await replay_dlq(args.subject, args.dry_run, args.limit, args.stream)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
# Script entry point: run the async CLI under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user