#!/usr/bin/env python3
"""
DLQ Replay Automation
=====================

Replays messages from Dead Letter Queue back to original streams.

Usage:
    python3 dlq_replay.py --subject attachment.failed.dlq --max-replays 3
    python3 dlq_replay.py --all --dry-run
"""

import os
import sys
import json
import argparse
import asyncio
import logging
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional

import nats
from nats.js.api import StreamInfo  # noqa: F401  -- kept: may be used by importers of this module

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
MAX_REPLAYS = int(os.getenv("MAX_REPLAYS", "3"))


async def get_dlq_messages(js, subject: str, stream: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Fetch up to ``limit`` messages from a DLQ subject.

    Args:
        js: JetStream context.
        subject: DLQ subject to drain (e.g. ``attachment.failed.dlq``).
        stream: Name of the stream that stores the DLQ subject.
        limit: Maximum number of messages to pull.

    Returns:
        A list of dicts with ``data`` (decoded JSON body), ``headers``,
        ``subject`` and ``sequence`` keys. Each fetched message is acked.
    """
    messages: List[Dict[str, Any]] = []
    try:
        # Ephemeral pull consumer scoped to the DLQ subject.
        sub = await js.pull_subscribe(subject, durable=None, stream=stream)
        for _ in range(limit):
            try:
                batch = await sub.fetch(1, timeout=1.0)
            except asyncio.TimeoutError:
                # No more messages available within the timeout — stop draining.
                break
            if not batch:
                continue
            msg = batch[0]
            try:
                data = json.loads(msg.data.decode())
            except (UnicodeDecodeError, json.JSONDecodeError) as e:
                # A single malformed payload must not abort the whole drain;
                # log it, ack it so it is not redelivered, and keep going.
                logger.error(f"Skipping malformed DLQ message on {msg.subject}: {e}")
                await msg.ack()
                continue
            headers = dict(msg.headers) if msg.headers else {}
            messages.append({
                "data": data,
                "headers": headers,
                "subject": msg.subject,
                "sequence": msg.metadata.sequence if hasattr(msg, "metadata") else None,
            })
            await msg.ack()
    except Exception as e:
        logger.error(f"Error fetching DLQ messages: {e}")
    return messages


async def replay_message(
    js,
    message: Dict[str, Any],
    original_subject: str,
    dry_run: bool = False,
    max_replays: Optional[int] = None,
) -> bool:
    """Replay a single DLQ message to its original subject.

    Args:
        js: JetStream context.
        message: Dict as produced by :func:`get_dlq_messages`.
        original_subject: Subject to republish the payload to.
        dry_run: If True, log what would happen but do not publish.
        max_replays: Replay cap; defaults to the module-level ``MAX_REPLAYS``.

    Returns:
        True if the message was replayed (or would be, in dry-run mode);
        False if it was skipped (replay cap exceeded) or publishing failed.
    """
    # Default to the env-configured cap; a CLI override is threaded through here.
    cap = MAX_REPLAYS if max_replays is None else max_replays

    replay_count = int(message.get("headers", {}).get("replay_count", "0") or "0")
    if replay_count >= cap:
        msg_id = message.get("data", {}).get("event_id") or message.get("data", {}).get("job_id")
        logger.warning(f"Message {msg_id} exceeded max replays ({replay_count})")
        return False

    if dry_run:
        logger.info(f"[DRY RUN] Would replay to {original_subject}")
        return True

    try:
        # Add replay-tracking headers so downstream consumers (and this tool
        # on a future pass) can see how many times the message has cycled.
        headers = message.get("headers", {}).copy()
        next_replay = replay_count + 1
        headers["replay_count"] = str(next_replay)
        headers["replayed"] = "true"
        headers["original_subject"] = original_subject

        # Optional message id for traceability — prefer an existing header,
        # fall back to identifiers embedded in the payload.
        original_msg_id = headers.get("original_msg_id") or headers.get("Nats-Msg-Id")
        if not original_msg_id:
            original_msg_id = (
                message.get("data", {}).get("event_id")
                or message.get("data", {}).get("job_id")
            )
        if original_msg_id:
            headers["original_msg_id"] = str(original_msg_id)

        # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated).
        headers["replayed_at"] = datetime.now(timezone.utc).isoformat()

        # Publish to original subject (strip force_fail if present).
        data = message["data"]
        publish_payload = data
        if isinstance(data, dict) and isinstance(data.get("data"), dict):
            # If the DLQ wrapper nests the original request under "data",
            # replay that inner payload rather than the wrapper.
            publish_payload = data["data"]
        if isinstance(publish_payload, dict):
            publish_payload.pop("force_fail", None)

        await js.publish(
            original_subject,
            json.dumps(publish_payload).encode(),
            headers=headers,
        )

        job_id = None
        if isinstance(publish_payload, dict):
            job_id = publish_payload.get("job_id") or publish_payload.get("task_id")

        logger.info(f"Replayed message to {original_subject} (replay_count={replay_count + 1})")
        if job_id:
            print(f"replayed_job_id={job_id}")
        return True
    except Exception as e:
        logger.error(f"Error replaying message: {e}")
        return False


def resolve_stream(subject: str, override: Optional[str] = None) -> str:
    """Resolve the stream name that stores the given DLQ subject.

    All DLQ events are currently stored in the AUDIT stream, so without an
    explicit override this always returns ``"AUDIT"``. (The previous
    ``subject.endswith(".dlq")`` branch was dead code — both arms returned
    the same value.)
    """
    if override:
        return override
    return "AUDIT"


def extract_original_subject(dlq_subject: str) -> str:
    """Derive the original subject from a DLQ subject name.

    Known mappings:
        attachment.failed.dlq -> attachment.created
        agent.run.failed.dlq  -> agent.run.requested

    Anything else falls back to stripping the ``.dlq`` / ``.failed``
    suffixes. NOTE(review): for attachments the real subject carries a type
    suffix (``attachment.created.{type}``) — presumably that should come from
    the message data; confirm with the publisher.
    """
    if "attachment.failed.dlq" in dlq_subject:
        # Default; ideally the concrete type is extracted from the message data.
        return "attachment.created"
    elif "agent.run.failed.dlq" in dlq_subject:
        return "agent.run.requested"
    else:
        return dlq_subject.replace(".dlq", "").replace(".failed", "")


async def replay_dlq(
    subject: str,
    dry_run: bool = False,
    limit: int = 100,
    stream: Optional[str] = None,
    max_replays: Optional[int] = None,
):
    """Replay all messages from a DLQ subject back to their original subjects.

    Connects to NATS, drains up to ``limit`` messages from ``subject``, and
    republishes each one (unless ``dry_run``). The connection is always
    closed, even if draining or replaying raises.
    """
    nc = await nats.connect(NATS_URL)
    try:
        js = nc.jetstream()
        stream_name = resolve_stream(subject, stream)

        logger.info(f"Fetching messages from DLQ: {subject} (stream={stream_name})")
        messages = await get_dlq_messages(js, subject, stream_name, limit)
        if not messages:
            logger.info("No messages found in DLQ")
            return

        logger.info(f"Found {len(messages)} messages in DLQ")
        replayed = 0
        skipped = 0
        for msg in messages:
            # Prefer an original_subject recorded in the payload; otherwise
            # derive it from the DLQ subject naming convention.
            original_subject = (
                msg.get("data", {}).get("original_subject")
                or extract_original_subject(msg["subject"])
            )
            if await replay_message(js, msg, original_subject, dry_run, max_replays):
                replayed += 1
            else:
                skipped += 1

        logger.info(f"Replay complete: {replayed} replayed, {skipped} skipped")
    finally:
        # Always release the connection, even on error paths.
        await nc.close()


async def main():
    """CLI entry point: parse arguments and dispatch the replay."""
    parser = argparse.ArgumentParser(description="DLQ Replay Automation")
    parser.add_argument("--subject", help="DLQ subject to replay")
    parser.add_argument("--all", action="store_true", help="Replay all DLQ subjects")
    parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
    parser.add_argument("--limit", type=int, default=100, help="Max messages to replay")
    parser.add_argument("--max-replays", type=int, default=MAX_REPLAYS, help="Max replay count")
    parser.add_argument("--stream", help="JetStream stream name (default inferred)")
    args = parser.parse_args()

    if args.all:
        dlq_subjects = ["attachment.failed.dlq", "agent.run.failed.dlq", "memory.failed.dlq"]
        for subject in dlq_subjects:
            # --max-replays was previously parsed but ignored; it is now honored.
            await replay_dlq(subject, args.dry_run, args.limit, args.stream, args.max_replays)
    elif args.subject:
        await replay_dlq(args.subject, args.dry_run, args.limit, args.stream, args.max_replays)
    else:
        parser.print_help()


if __name__ == "__main__":
    asyncio.run(main())