snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
190
scripts/dlq_replay.py
Executable file
190
scripts/dlq_replay.py
Executable file
@@ -0,0 +1,190 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DLQ Replay Automation
|
||||
=====================
|
||||
Replays messages from Dead Letter Queue back to original streams.
|
||||
|
||||
Usage:
|
||||
python3 dlq_replay.py --subject attachment.failed.dlq --max-replays 3
|
||||
python3 dlq_replay.py --all --dry-run
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import List, Dict, Any
|
||||
import nats
|
||||
from nats.js.api import StreamInfo
|
||||
|
||||
# Module-wide logging setup; INFO gives operational visibility for CLI runs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NATS server URL; default matches the docker-compose service name "nats".
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
# Maximum times a single message may be replayed before being skipped.
# NOTE(review): also overridable per-run via --max-replays CLI flag.
MAX_REPLAYS = int(os.getenv("MAX_REPLAYS", "3"))
|
||||
|
||||
|
||||
async def get_dlq_messages(js, subject: str, stream: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Drain up to *limit* messages from a DLQ subject.

    Uses an ephemeral pull consumer (no durable state left behind), acks
    each message after capturing it, and stops as soon as a fetch times
    out — i.e. the queue is drained.

    Args:
        js: JetStream context (from ``nc.jetstream()``).
        subject: DLQ subject to read, e.g. ``attachment.failed.dlq``.
        stream: JetStream stream the subject is stored in.
        limit: Upper bound on the number of messages to collect.

    Returns:
        A list of dicts with keys ``data`` (decoded JSON payload),
        ``headers``, ``subject`` and ``sequence``.
    """
    collected: List[Dict[str, Any]] = []
    try:
        # Ephemeral pull consumer scoped to the DLQ subject.
        consumer = await js.pull_subscribe(subject, durable=None, stream=stream)
        for _ in range(limit):
            try:
                fetched = await consumer.fetch(1, timeout=1.0)
                if fetched:
                    msg = fetched[0]
                    entry = {
                        "data": json.loads(msg.data.decode()),
                        "headers": dict(msg.headers) if msg.headers else {},
                        "subject": msg.subject,
                        # Older client versions may lack metadata; tolerate that.
                        "sequence": msg.metadata.sequence if hasattr(msg, "metadata") else None,
                    }
                    collected.append(entry)
                    await msg.ack()
            except asyncio.TimeoutError:
                # Fetch timed out: nothing left in the DLQ right now.
                break
    except Exception as e:
        # Best-effort drain: log and return whatever was collected so far.
        logger.error(f"Error fetching DLQ messages: {e}")

    return collected
|
||||
|
||||
|
||||
async def replay_message(js, message: Dict[str, Any], original_subject: str, dry_run: bool = False) -> bool:
    """Replay a single DLQ message to its original subject.

    Args:
        js: JetStream context used for publishing.
        message: Dict with ``data`` (decoded payload), ``headers`` and
            ``subject`` keys, as produced by ``get_dlq_messages()``.
        original_subject: Subject the message is republished to.
        dry_run: When True, log what would happen but do not publish.

    Returns:
        True if the message was (or, in dry-run, would be) replayed;
        False if it was skipped (replay budget exhausted) or publishing
        failed.
    """
    # Replay budget is tracked in the headers; missing/empty counts as zero.
    replay_count = int(message.get("headers", {}).get("replay_count", "0") or "0")

    if replay_count >= MAX_REPLAYS:
        msg_id = message.get("data", {}).get("event_id") or message.get("data", {}).get("job_id")
        logger.warning(f"Message {msg_id} exceeded max replays ({replay_count})")
        return False

    if dry_run:
        logger.info(f"[DRY RUN] Would replay to {original_subject}")
        return True

    try:
        # Annotate headers so downstream consumers can detect a replay.
        headers = message.get("headers", {}).copy()
        next_replay = replay_count + 1
        headers["replay_count"] = str(next_replay)
        headers["replayed"] = "true"
        headers["original_subject"] = original_subject
        # Optional message id for traceability
        original_msg_id = headers.get("original_msg_id") or headers.get("Nats-Msg-Id")
        if not original_msg_id:
            original_msg_id = message.get("data", {}).get("event_id") or message.get("data", {}).get("job_id")
        if original_msg_id:
            headers["original_msg_id"] = str(original_msg_id)
        # Timezone-aware timestamp: datetime.utcnow() is deprecated (3.12+)
        # and produced a naive datetime; now(timezone.utc) is explicit UTC.
        from datetime import datetime, timezone
        headers["replayed_at"] = datetime.now(timezone.utc).isoformat()

        # Publish to original subject (strip force_fail if present)
        data = message["data"]
        publish_payload = data
        if isinstance(data, dict) and isinstance(data.get("data"), dict):
            # If DLQ wrapper contains original request under data, replay that
            publish_payload = data["data"]
        if isinstance(publish_payload, dict):
            # force_fail is a fault-injection flag; drop it so the replay can succeed.
            publish_payload.pop("force_fail", None)

        await js.publish(
            original_subject,
            json.dumps(publish_payload).encode(),
            headers=headers
        )

        job_id = None
        if isinstance(publish_payload, dict):
            job_id = publish_payload.get("job_id") or publish_payload.get("task_id")
        # Use the already-computed next_replay instead of recomputing it.
        logger.info(f"Replayed message to {original_subject} (replay_count={next_replay})")
        if job_id:
            # Machine-readable stdout line for shell pipelines.
            print(f"replayed_job_id={job_id}")
        return True
    except Exception as e:
        logger.error(f"Error replaying message: {e}")
        return False
|
||||
|
||||
|
||||
|
||||
|
||||
def resolve_stream(subject: str, override: str | None = None) -> str:
|
||||
if override:
|
||||
return override
|
||||
# DLQ events are stored in AUDIT stream
|
||||
if subject.endswith(".dlq"):
|
||||
return "AUDIT"
|
||||
return "AUDIT"
|
||||
|
||||
|
||||
def extract_original_subject(dlq_subject: str) -> str:
    """Map a DLQ subject back to the subject it should be replayed on.

    Known DLQ subjects get an explicit mapping; anything else falls back
    to stripping the ``.dlq`` and ``.failed`` markers from the name.
    """
    # attachment.failed.dlq -> attachment.created.{type}
    if "attachment.failed.dlq" in dlq_subject:
        # Default; ideally the concrete type would come from the message body.
        return "attachment.created"
    # agent.run.failed.dlq -> agent.run.requested
    if "agent.run.failed.dlq" in dlq_subject:
        return "agent.run.requested"
    # Generic fallback: drop the DLQ markers wherever they appear.
    return dlq_subject.replace(".dlq", "").replace(".failed", "")
|
||||
|
||||
|
||||
async def replay_dlq(subject: str, dry_run: bool = False, limit: int = 100, stream: str | None = None):
    """Replay all messages from a DLQ subject.

    Connects to NATS, drains up to *limit* messages from the DLQ, and
    republishes each to its original subject via ``replay_message()``.

    Args:
        subject: DLQ subject to drain.
        dry_run: When True, log intended replays without publishing.
        limit: Max number of messages to fetch from the DLQ.
        stream: Explicit JetStream stream name; inferred when None.
    """
    nc = await nats.connect(NATS_URL)
    # Bug fix: the connection was only closed on the happy path; any raised
    # exception leaked it. try/finally guarantees cleanup.
    try:
        js = nc.jetstream()

        stream_name = resolve_stream(subject, stream)
        logger.info(f"Fetching messages from DLQ: {subject} (stream={stream_name})")
        messages = await get_dlq_messages(js, subject, stream_name, limit)

        if not messages:
            logger.info("No messages found in DLQ")
            return

        logger.info(f"Found {len(messages)} messages in DLQ")

        replayed = 0
        skipped = 0

        for msg in messages:
            # Prefer the original_subject recorded in the DLQ payload;
            # fall back to inferring it from the DLQ subject name.
            original_subject = msg.get("data", {}).get("original_subject") or extract_original_subject(msg["subject"])

            if await replay_message(js, msg, original_subject, dry_run):
                replayed += 1
            else:
                skipped += 1

        logger.info(f"Replay complete: {replayed} replayed, {skipped} skipped")
    finally:
        await nc.close()
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(description="DLQ Replay Automation")
|
||||
parser.add_argument("--subject", help="DLQ subject to replay")
|
||||
parser.add_argument("--all", action="store_true", help="Replay all DLQ subjects")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
|
||||
parser.add_argument("--limit", type=int, default=100, help="Max messages to replay")
|
||||
parser.add_argument("--max-replays", type=int, default=MAX_REPLAYS, help="Max replay count")
|
||||
parser.add_argument("--stream", help="JetStream stream name (default inferred)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.all:
|
||||
dlq_subjects = ["attachment.failed.dlq", "agent.run.failed.dlq", "memory.failed.dlq"]
|
||||
for subject in dlq_subjects:
|
||||
await replay_dlq(subject, args.dry_run, args.limit, args.stream)
|
||||
elif args.subject:
|
||||
await replay_dlq(args.subject, args.dry_run, args.limit, args.stream)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
# Script entry point: run the async CLI under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user