from __future__ import annotations import hashlib import importlib import json import logging import os import queue import shutil import subprocess import sys import threading import time import uuid from dataclasses import dataclass, field from functools import lru_cache from fractions import Fraction from pathlib import Path from threading import Lock from typing import Any, Callable, Dict, List, Optional, Tuple from .schemas import AuroraMode, MediaType, ProcessingStep logger = logging.getLogger("aurora.subagents") try: import cv2 # type: ignore[import-untyped] except Exception: # pragma: no cover - handled at runtime cv2 = None try: import numpy as np # type: ignore[import-untyped] except Exception: # pragma: no cover - handled at runtime np = None GFPGAN_MODEL_URL = "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth" REALESRGAN_MODEL_URL = ( "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth" ) def _env_flag(name: str, default: bool) -> bool: raw = os.getenv(name) if raw is None: return default return raw.strip().lower() in {"1", "true", "yes", "on"} def _is_container_runtime() -> bool: return Path("/.dockerenv").exists() or bool(os.getenv("KUBERNETES_SERVICE_HOST")) @lru_cache(maxsize=1) def _ffmpeg_hwaccels_text() -> str: try: return _run_command(["ffmpeg", "-hide_banner", "-hwaccels"]) except Exception: return "" @lru_cache(maxsize=1) def _ffmpeg_encoders_text() -> str: try: return _run_command(["ffmpeg", "-hide_banner", "-encoders"]) except Exception: return "" def _ffmpeg_has_hwaccel(name: str) -> bool: text = _ffmpeg_hwaccels_text().lower() return any(line.strip() == name.lower() for line in text.splitlines()) def _ffmpeg_has_encoder(name: str) -> bool: text = _ffmpeg_encoders_text().lower() return f" {name.lower()} " in f" {text} " def _torch_capabilities() -> Dict[str, object]: payload: Dict[str, object] = { "torch": False, "torch_version": None, "cuda_available": False, "mps_backend": False, "mps_available": False, "mps_built": False, } try: import torch # type: ignore[import-untyped] payload["torch"] = True payload["torch_version"] = getattr(torch, "__version__", None) payload["cuda_available"] = bool(torch.cuda.is_available()) mps_backend = getattr(torch.backends, "mps", None) payload["mps_backend"] = bool(mps_backend) payload["mps_available"] = bool(mps_backend and mps_backend.is_available()) payload["mps_built"] = bool(mps_backend and mps_backend.is_built()) except Exception: pass return payload def sha256_file(path: Path) -> str: digest = hashlib.sha256() with path.open("rb") as f: while True: chunk = f.read(1024 * 1024) if not chunk: break digest.update(chunk) return f"sha256:{digest.hexdigest()}" def _copy_with_stage_suffix(input_path: Path, output_dir: Path, stage_suffix: str) -> Path: output_dir.mkdir(parents=True, exist_ok=True) suffix = input_path.suffix or ".bin" staged = output_dir / f"{input_path.stem}_{stage_suffix}{suffix}" shutil.copy2(input_path, staged) return staged def _run_command(args: List[str]) -> str: process = subprocess.run( args, check=False, capture_output=True, text=True, ) if process.returncode != 0: stderr = (process.stderr or "").strip() raise RuntimeError(f"Command failed ({process.returncode}): {' '.join(args)}\n{stderr}") return (process.stdout or "").strip() def _ffmpeg_available() -> bool: return shutil.which("ffmpeg") is not None and shutil.which("ffprobe") is not None def runtime_diagnostics() -> Dict[str, object]: torch_caps = _torch_capabilities() device = _ModelCache._device() is_container = _is_container_runtime() force_cpu = _env_flag("AURORA_FORCE_CPU", is_container) prefer_mps = _env_flag("AURORA_PREFER_MPS", True) enable_vtb = _env_flag("AURORA_ENABLE_VIDEOTOOLBOX", True) return { "opencv": cv2 is not None, "ffmpeg": _ffmpeg_available(), "ffmpeg_videotoolbox_hwaccel": _ffmpeg_has_hwaccel("videotoolbox"), "ffmpeg_h264_videotoolbox": _ffmpeg_has_encoder("h264_videotoolbox"), "ffmpeg_hevc_videotoolbox": _ffmpeg_has_encoder("hevc_videotoolbox"), "torch": bool(torch_caps["torch"]), "torch_version": torch_caps["torch_version"], "cuda_available": bool(torch_caps["cuda_available"]), "mps_backend": bool(torch_caps["mps_backend"]), "mps_available": bool(torch_caps["mps_available"]), "mps_built": bool(torch_caps["mps_built"]), "force_cpu": force_cpu, "prefer_mps": prefer_mps, "enable_videotoolbox": enable_vtb, "device": device, "container_runtime": _is_container_runtime(), "models_dir": os.getenv("AURORA_MODELS_DIR", "/data/aurora/models"), } class PipelineCancelledError(RuntimeError): pass @dataclass class SubagentContext: job_id: str mode: AuroraMode media_type: MediaType input_hash: str output_dir: Path priority: str = "balanced" export_options: Dict[str, object] = field(default_factory=dict) cancel_check: Optional[Callable[[], bool]] = None stage_progress: Optional[Callable[[float, str], None]] = None @dataclass class SubagentRunResult: output_path: Path steps: List[ProcessingStep] = field(default_factory=list) artifacts: List[Path] = field(default_factory=list) metadata: Dict[str, str] = field(default_factory=dict) def _resolve_models_dir() -> Path: target = Path(os.getenv("AURORA_MODELS_DIR", "/data/aurora/models")).expanduser() target.mkdir(parents=True, exist_ok=True) return target def _ensure_persistent_gfpgan_weights() -> Path: persistent = _resolve_models_dir() / "gfpgan_weights" persistent.mkdir(parents=True, exist_ok=True) # In containers, some libs expect /app/gfpgan/weights. # In native macOS run we may not have write access to /app, so keep this best-effort. runtime_weights = Path(os.getenv("AURORA_GFPGAN_RUNTIME_WEIGHTS_DIR", "/app/gfpgan/weights")) try: runtime_weights.parent.mkdir(parents=True, exist_ok=True) if runtime_weights.exists() and not runtime_weights.is_symlink(): for item in runtime_weights.iterdir(): dst = persistent / item.name if not dst.exists(): shutil.move(str(item), str(dst)) shutil.rmtree(runtime_weights, ignore_errors=True) if not runtime_weights.exists(): runtime_weights.symlink_to(persistent, target_is_directory=True) except Exception: pass return persistent def _warmup_gfpgan(restorer: object) -> None: """Run a tiny inference to trigger MPS JIT compilation up front.""" try: dummy = np.zeros((64, 64, 3), dtype=np.uint8) restorer.enhance(dummy, has_aligned=False, only_center_face=False, paste_back=True) # type: ignore[attr-defined] except Exception: pass class _ModelCache: _lock = Lock() _gfpgan_by_mode: Dict[AuroraMode, object] = {} _realesrgan_by_mode: Dict[AuroraMode, object] = {} @classmethod def _download_model(cls, *, url: str, file_name: str) -> Path: target = _resolve_models_dir() / file_name if target.exists(): return target from basicsr.utils.download_util import load_file_from_url # type: ignore[import-untyped] downloaded = load_file_from_url( url=url, model_dir=str(target.parent), file_name=file_name, progress=True, ) return Path(downloaded) @classmethod def _device(cls) -> str: is_container = _is_container_runtime() force_cpu = _env_flag("AURORA_FORCE_CPU", is_container) if force_cpu: return "cpu" prefer_mps = _env_flag("AURORA_PREFER_MPS", True) try: import torch # type: ignore[import-untyped] if torch.cuda.is_available(): return "cuda" mps_be = getattr(torch.backends, "mps", None) if prefer_mps and mps_be and mps_be.is_available() and mps_be.is_built(): return "mps" except Exception: return "cpu" return "cpu" @classmethod def _patch_torchvision_compat(cls) -> None: try: importlib.import_module("torchvision.transforms.functional_tensor") return except Exception: pass try: ft = importlib.import_module("torchvision.transforms._functional_tensor") sys.modules["torchvision.transforms.functional_tensor"] = ft except Exception: return @classmethod def gfpgan(cls, mode: AuroraMode) -> object: with cls._lock: cached = cls._gfpgan_by_mode.get(mode) if cached is not None: return cached cls._patch_torchvision_compat() _ensure_persistent_gfpgan_weights() from gfpgan import GFPGANer # type: ignore[import-untyped] model_path = cls._download_model(url=GFPGAN_MODEL_URL, file_name="GFPGANv1.4.pth") device = cls._device() logger.info("Loading GFPGAN mode=%s device=%s", mode, device) t0 = time.monotonic() restorer = GFPGANer( model_path=str(model_path), upscale=1, arch="clean", channel_multiplier=2, bg_upsampler=None, device=device, ) if device == "mps" and np is not None: _warmup_gfpgan(restorer) logger.info("GFPGAN ready mode=%s device=%s elapsed=%.1fs", mode, device, time.monotonic() - t0) cls._gfpgan_by_mode[mode] = restorer return restorer @classmethod def realesrgan(cls, mode: AuroraMode) -> object: with cls._lock: cached = cls._realesrgan_by_mode.get(mode) if cached is not None: return cached cls._patch_torchvision_compat() from basicsr.archs.rrdbnet_arch import RRDBNet # type: ignore[import-untyped] from realesrgan import RealESRGANer # type: ignore[import-untyped] model_path = cls._download_model(url=REALESRGAN_MODEL_URL, file_name="RealESRGAN_x4plus.pth") rrdb = RRDBNet( num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4, ) device = cls._device() use_half = device in ("cuda", "mps") if mode == "tactical": tile = 256 elif device == "cpu": tile = int(os.getenv("AURORA_CPU_FORENSIC_TILE", "192")) else: tile = 0 logger.info("Loading RealESRGAN mode=%s device=%s half=%s tile=%d", mode, device, use_half, tile) t0 = time.monotonic() upsampler = RealESRGANer( scale=4, model_path=str(model_path), model=rrdb, tile=tile, tile_pad=10, pre_pad=0, half=use_half, device=device, ) logger.info("RealESRGAN ready mode=%s device=%s elapsed=%.1fs", mode, device, time.monotonic() - t0) cls._realesrgan_by_mode[mode] = upsampler return upsampler def _clamp_int(val: int, low: int, high: int) -> int: return max(low, min(high, int(val))) def _option_bool(opts: Optional[Dict[str, object]], key: str, default: bool) -> bool: if not opts: return default raw = opts.get(key) if raw is None: return default if isinstance(raw, bool): return raw if isinstance(raw, (int, float)): return bool(raw) return str(raw).strip().lower() in {"1", "true", "yes", "on"} def _option_str(opts: Optional[Dict[str, object]], key: str, default: str = "") -> str: if not opts: return default raw = opts.get(key) if raw is None: return default return str(raw).strip() def _option_float(opts: Optional[Dict[str, object]], key: str, default: float) -> float: if not opts: return default raw = opts.get(key) if raw is None: return default try: return float(raw) except Exception: return default def _face_pipeline_config( *, mode: AuroraMode, media_type: MediaType, priority: str, export_options: Optional[Dict[str, object]], ) -> Dict[str, object]: opts = export_options or {} roi_hint = _option_str(opts, "roi", "").lower() task_hint = _option_str(opts, "task_hint", "") hint_lower = task_hint.lower() focus_profile = _option_str(opts, "focus_profile", "auto").lower() if focus_profile not in {"auto", "max_faces", "text_readability", "plates"}: focus_profile = "auto" if focus_profile == "auto": text_keywords = ("text", "logo", "label", "cap", "hat", "надпис", "напис", "кеп") face_keywords = ("face", "portrait", "облич", "портрет") plate_keywords = ("plate", "license", "номер", "знак") if any(k in hint_lower for k in text_keywords): focus_profile = "text_readability" elif any(k in hint_lower for k in face_keywords): focus_profile = "max_faces" elif any(k in hint_lower for k in plate_keywords): focus_profile = "plates" focus_faces = focus_profile == "max_faces" text_focus = focus_profile == "text_readability" or _option_bool(opts, "text_focus", False) focus_plates = focus_profile == "plates" roi_only_default = roi_hint in {"faces", "face", "auto_faces"} or priority == "faces" or focus_faces pre_denoise_default = media_type == "video" and (mode == "forensic" or priority == "faces" or text_focus or focus_plates) temporal_default = media_type == "video" and (mode == "forensic" or priority == "faces" or text_focus) deblur_default = priority == "faces" or mode == "forensic" or text_focus or focus_plates score_loop_default = mode == "forensic" or priority == "faces" or text_focus face_model = _option_str(opts, "face_model", "auto").lower() if face_model not in {"auto", "gfpgan", "codeformer"}: face_model = "auto" if focus_faces and face_model == "auto": face_model = "codeformer" return { "roi_only_faces": _option_bool(opts, "roi_only_faces", roi_only_default), "pre_denoise": _option_bool(opts, "pre_denoise", pre_denoise_default), "temporal_denoise": _option_bool(opts, "temporal_denoise", temporal_default), "deblur_before_face": _option_bool(opts, "deblur_before_face", deblur_default), "score_loop": _option_bool(opts, "score_loop", score_loop_default), "face_model": face_model, "denoise_strength": max(1.0, min(15.0, _option_float(opts, "denoise_strength", 4.0))), "deblur_amount": max(0.2, min(2.0, _option_float(opts, "deblur_amount", 0.8))), "focus_profile": focus_profile, "task_hint": task_hint, "text_focus": text_focus, } @lru_cache(maxsize=1) def _face_detector(): if cv2 is None: return None cascade_path = Path(cv2.data.haarcascades) / "haarcascade_frontalface_default.xml" detector = cv2.CascadeClassifier(str(cascade_path)) if detector.empty(): return None return detector def _detect_face_boxes(frame_bgr, limit: int = 8) -> List[Tuple[int, int, int, int]]: if cv2 is None: return [] detector = _face_detector() if detector is None: return [] gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY) scale_factor = float(os.getenv("AURORA_HAAR_SCALE", "1.05")) min_neighbors = int(os.getenv("AURORA_HAAR_MIN_NEIGHBORS", "2")) min_face = int(os.getenv("AURORA_HAAR_MIN_FACE", "15")) eq = cv2.equalizeHist(gray) found = detector.detectMultiScale( eq, scaleFactor=scale_factor, minNeighbors=min_neighbors, minSize=(min_face, min_face), ) boxes: List[Tuple[int, int, int, int]] = [] for (x, y, w, h) in found: boxes.append((int(x), int(y), int(w), int(h))) boxes.sort(key=lambda item: item[2] * item[3], reverse=True) return boxes[: max(1, limit)] def _expand_roi( x: int, y: int, w: int, h: int, frame_w: int, frame_h: int, pad_ratio: float = 0.28, ) -> Tuple[int, int, int, int]: pad_x = int(w * pad_ratio) pad_y = int(h * pad_ratio) x1 = max(0, x - pad_x) y1 = max(0, y - pad_y) x2 = min(frame_w, x + w + pad_x) y2 = min(frame_h, y + h + pad_y) return x1, y1, x2, y2 def _pre_denoise_frame(frame_bgr, previous_denoised, strength: float, temporal: bool): if cv2 is None: return frame_bgr, previous_denoised h_val = float(max(1.0, min(15.0, strength))) denoised = cv2.fastNlMeansDenoisingColored(frame_bgr, None, h_val, h_val, 7, 21) if temporal and previous_denoised is not None: try: alpha = float(os.getenv("AURORA_TEMPORAL_DENOISE_ALPHA", "0.18")) except Exception: alpha = 0.18 alpha = max(0.05, min(0.40, alpha)) denoised = cv2.addWeighted(denoised, 1.0 - alpha, previous_denoised, alpha, 0.0) return denoised, denoised def _deblur_unsharp(frame_bgr, amount: float): if cv2 is None: return frame_bgr amt = max(0.2, min(2.0, float(amount))) blurred = cv2.GaussianBlur(frame_bgr, (0, 0), sigmaX=1.2, sigmaY=1.2) sharpened = cv2.addWeighted(frame_bgr, 1.0 + amt, blurred, -amt, 0.0) return sharpened def _patch_sharpness(patch) -> float: if cv2 is None: return 0.0 gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY) return float(cv2.Laplacian(gray, cv2.CV_64F).var()) def _patch_diff(original_patch, candidate_patch) -> float: if np is None: return 0.0 base = original_patch.astype(np.float32) cand = candidate_patch.astype(np.float32) return float(np.mean(np.abs(base - cand))) def _compact_error_text(exc: Exception, limit: int = 220) -> str: text = str(exc).replace("\n", " ").strip() if len(text) <= limit: return text return text[: max(0, limit - 3)] + "..." def _is_mps_conv_override_error(exc: Exception) -> bool: text = str(exc).lower() return "convolution_overrideable not implemented" in text def _sr_soft_fallback( enhanced_img, requested_outscale: int, ) -> Tuple[object, int, str]: """Soft fallback when Real-ESRGAN fails on MPS for very large frames. Keeps face-restored frame and optionally performs lightweight resize if the target output is still within sane pixel bounds. """ if cv2 is None: return enhanced_img, 1, "keep_face_enhanced" try: max_pixels = int(float(os.getenv("AURORA_SR_SOFT_FALLBACK_MAX_PIXELS", "12000000"))) except Exception: max_pixels = 12_000_000 max_pixels = max(1_000_000, max_pixels) src_h, src_w = enhanced_img.shape[:2] if requested_outscale <= 1: return enhanced_img, 1, "keep_face_enhanced" target_w = max(1, int(src_w * requested_outscale)) target_h = max(1, int(src_h * requested_outscale)) target_pixels = target_w * target_h if target_pixels <= max_pixels: resized = cv2.resize(enhanced_img, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4) return resized, requested_outscale, "lanczos_resize" return enhanced_img, 1, "keep_face_enhanced" def _safe_ocr_score(patch) -> float: # Optional OCR hint for plate/text clarity loop; returns 0 when unavailable. if not _pytesseract_available(): return 0.0 try: import pytesseract # type: ignore[import-untyped] except Exception: return 0.0 if cv2 is None: return 0.0 try: gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY) payload = pytesseract.image_to_data( gray, output_type=pytesseract.Output.DICT, config="--psm 7 --oem 1", ) confs = [float(v) for v in payload.get("conf", []) if str(v).strip() not in {"", "-1"}] if not confs: return 0.0 return max(0.0, min(1.0, sum(confs) / (len(confs) * 100.0))) except Exception: return 0.0 @lru_cache(maxsize=1) def _codeformer_available() -> bool: try: importlib.import_module("codeformer") return True except Exception: return False @lru_cache(maxsize=1) def _pytesseract_available() -> bool: try: importlib.import_module("pytesseract") return True except Exception: return False def _face_candidate_score(original_patch, candidate_patch) -> float: sharpness_orig = _patch_sharpness(original_patch) sharpness_new = _patch_sharpness(candidate_patch) sharpness_gain = sharpness_new / max(1.0, sharpness_orig) faces_new = len(_detect_face_boxes(candidate_patch, limit=2)) face_factor = 1.0 + (0.35 * max(0, faces_new)) diff_penalty = _patch_diff(original_patch, candidate_patch) / 255.0 ocr_bonus = _safe_ocr_score(candidate_patch) return (sharpness_gain * face_factor) + (0.18 * ocr_bonus) - (0.22 * diff_penalty) def _requested_outscale(export_options: Optional[Dict[str, object]], width: int, height: int) -> int: opts = export_options or {} max_outscale = _clamp_int(int(os.getenv("AURORA_MAX_OUTSCALE", "4")), 1, 4) raw_upscale = opts.get("upscale") if raw_upscale is None: # Compatibility alias used by console UI. raw_upscale = opts.get("outscale") if raw_upscale is not None: try: return _clamp_int(int(raw_upscale), 1, max_outscale) except Exception: pass requested_w: Optional[int] = None requested_h: Optional[int] = None # Explicit width/height override. try: if opts.get("width") is not None and opts.get("height") is not None: requested_w = int(opts.get("width") or 0) requested_h = int(opts.get("height") or 0) except Exception: requested_w = None requested_h = None # Resolution profile override. res = str(opts.get("resolution") or "").strip().lower() if requested_w is None or requested_h is None: if res in {"4k", "2160p"}: requested_w, requested_h = 3840, 2160 elif res in {"8k", "4320p"}: requested_w, requested_h = 7680, 4320 elif "x" in res: try: w_txt, h_txt = res.split("x", 1) requested_w, requested_h = int(w_txt), int(h_txt) except Exception: requested_w, requested_h = None, None if not requested_w or not requested_h or requested_w <= 0 or requested_h <= 0: return 1 scale = max(requested_w / max(1, width), requested_h / max(1, height)) if scale <= 1.1: return 1 if scale <= 2.1: return _clamp_int(2, 1, max_outscale) if scale <= 3.1: return _clamp_int(3, 1, max_outscale) return _clamp_int(4, 1, max_outscale) def _decide_outscale(mode: AuroraMode, frame_bgr, export_options: Optional[Dict[str, object]] = None) -> int: h, w = frame_bgr.shape[:2] opts = export_options or {} requested_outscale = _requested_outscale(opts, w, h) max_outscale = _clamp_int(int(os.getenv("AURORA_MAX_OUTSCALE", "4")), 1, 4) raw_upscale = opts.get("upscale") if raw_upscale is None: raw_upscale = opts.get("outscale") has_explicit_upscale = raw_upscale is not None if mode == "tactical": # Tactical defaults to readability, not synthetic upscaling. return requested_outscale if requested_outscale > 1 else 1 if requested_outscale <= 1 and not has_explicit_upscale and _option_bool(opts, "auto_forensic_outscale", True): # Default forensic processing can upscale even without explicit user width/height. forensic_default = _clamp_int(int(os.getenv("AURORA_FORENSIC_DEFAULT_OUTSCALE", "2")), 1, max_outscale) requested_outscale = forensic_default if requested_outscale <= 1: # Keep source resolution only when forensic auto-upscale is disabled. return 1 device = _ModelCache._device() megapixels = (h * w) / 1_000_000.0 max_cpu_mp_for_x2 = float(os.getenv("AURORA_CPU_MAX_MP_FOR_X2", "0.8")) if device == "cpu" and megapixels > max_cpu_mp_for_x2: # Keep forensic job stable on CPU for HD+ inputs (avoid OOM + heavy artifacts). return 1 return requested_outscale def _enhance_frame_bgr( frame_bgr, mode: AuroraMode, media_type: MediaType, priority: str = "balanced", export_options: Optional[Dict[str, object]] = None, previous_denoised=None, ) -> Tuple[object, int, int, int, int, Dict[str, object], object]: if cv2 is None: raise RuntimeError("opencv-python-headless is not installed") gfpganer = _ModelCache.gfpgan(mode) realesrganer = _ModelCache.realesrgan(mode) cfg = _face_pipeline_config( mode=mode, media_type=media_type, priority=priority, export_options=export_options, ) source_frame = frame_bgr if bool(cfg["pre_denoise"]): frame_bgr, previous_denoised = _pre_denoise_frame( frame_bgr, previous_denoised=previous_denoised, strength=float(cfg["denoise_strength"]), temporal=bool(cfg["temporal_denoise"]), ) if bool(cfg["deblur_before_face"]): frame_bgr = _deblur_unsharp(frame_bgr, amount=float(cfg["deblur_amount"])) outscale = _decide_outscale(mode, frame_bgr, export_options=export_options) opts = export_options or {} raw_upscale = opts.get("upscale") if raw_upscale is None: raw_upscale = opts.get("outscale") allow_roi_upscale = _option_bool(opts, "allow_roi_upscale", False) or _option_bool(opts, "max_face_quality", False) if bool(cfg["roi_only_faces"]) and not allow_roi_upscale and raw_upscale is None: outscale = 1 try: tactical_weight = float(os.getenv("AURORA_GFPGAN_WEIGHT_TACTICAL", "0.35")) except Exception: tactical_weight = 0.35 try: forensic_weight = float(os.getenv("AURORA_GFPGAN_WEIGHT_FORENSIC", "0.65")) except Exception: forensic_weight = 0.65 face_weight = max(0.0, min(1.0, tactical_weight if mode == "tactical" else forensic_weight)) requested_model = str(cfg["face_model"]) codeformer_available = _codeformer_available() if requested_model == "auto": requested_model = "codeformer" if codeformer_available else "gfpgan" gfpgan_face_size = 512 def _force_enhance_roi(patch, weight: float): """Force face restoration on a patch where Haar found a face but RetinaFace did not. Upscale to 512px, run GFPGAN in aligned mode, then resize back.""" h_p, w_p = patch.shape[:2] aligned = cv2.resize(patch, (gfpgan_face_size, gfpgan_face_size), interpolation=cv2.INTER_CUBIC) cropped_faces, _, restored = gfpganer.enhance( aligned, has_aligned=True, only_center_face=True, paste_back=False, weight=max(0.0, min(1.0, weight)), ) if cropped_faces: result = cropped_faces[0] elif restored is not None: result = restored else: result = aligned return cv2.resize(result, (w_p, h_p), interpolation=cv2.INTER_AREA) def _run_gfpgan(candidate_input, candidate_weight: float, *, force_aligned: bool = False): t_local = time.perf_counter() w = max(0.0, min(1.0, candidate_weight)) if force_aligned: local_restored = _force_enhance_roi(candidate_input, w) elapsed = int((time.perf_counter() - t_local) * 1000) return local_restored, 1, elapsed, "GFPGAN v1.4 (forced-align)" _, local_faces, local_restored = gfpganer.enhance( candidate_input, has_aligned=False, only_center_face=False, paste_back=True, weight=w, ) if len(local_faces) == 0: local_restored = _force_enhance_roi(candidate_input, w) elapsed = int((time.perf_counter() - t_local) * 1000) return local_restored, 1, elapsed, "GFPGAN v1.4 (forced-align)" elapsed = int((time.perf_counter() - t_local) * 1000) return local_restored, len(local_faces), elapsed, "GFPGAN v1.4" def _run_codeformer_or_fallback(candidate_input, candidate_weight: float, *, force_aligned: bool = False): t_local = time.perf_counter() w = max(0.0, min(1.0, candidate_weight)) if force_aligned: local_restored = _force_enhance_roi(candidate_input, w) local_restored = cv2.detailEnhance(local_restored, sigma_s=12, sigma_r=0.15) elapsed = int((time.perf_counter() - t_local) * 1000) return local_restored, 1, elapsed, "CodeFormer(forced-align+detailEnhance)" _, local_faces, local_restored = gfpganer.enhance( candidate_input, has_aligned=False, only_center_face=False, paste_back=True, weight=w, ) if len(local_faces) == 0: local_restored = _force_enhance_roi(candidate_input, w) local_restored = cv2.detailEnhance(local_restored, sigma_s=12, sigma_r=0.15) face_count = len(local_faces) if local_faces else 1 elapsed = int((time.perf_counter() - t_local) * 1000) return local_restored, face_count, elapsed, "CodeFormer(fallback-detailEnhance)" run_face_model = _run_gfpgan if requested_model == "gfpgan" else _run_codeformer_or_fallback model_label_used = "GFPGAN v1.4" roi_faces_processed = 0 candidate_evals = 0 score_loop_enabled = bool(cfg["score_loop"]) t_face = time.perf_counter() if bool(cfg["roi_only_faces"]): enhanced_img = frame_bgr.copy() frame_h, frame_w = frame_bgr.shape[:2] boxes = _detect_face_boxes(frame_bgr, limit=8) for (bx, by, bw, bh) in boxes: x1, y1, x2, y2 = _expand_roi(bx, by, bw, bh, frame_w, frame_h) original_patch = frame_bgr[y1:y2, x1:x2] if original_patch.size == 0: continue candidates: List[Tuple[float, object, int, str]] = [] candidate_weights = [face_weight] if score_loop_enabled: candidate_weights.append(max(0.0, min(1.0, face_weight - 0.18))) for w_candidate in candidate_weights: restored_patch, faces_count, _, model_name = run_face_model(original_patch, w_candidate) score = _face_candidate_score(original_patch, restored_patch) candidates.append((score, restored_patch, faces_count, model_name)) candidate_evals += 1 candidates.sort(key=lambda item: item[0], reverse=True) best_score, best_patch, best_faces, best_model = candidates[0] del best_score model_label_used = best_model roi_faces_processed += best_faces blended = cv2.addWeighted(best_patch, 0.88, original_patch, 0.12, 0.0) enhanced_img[y1:y2, x1:x2] = blended else: candidate_weights = [face_weight] if score_loop_enabled and media_type == "photo": candidate_weights.append(max(0.0, min(1.0, face_weight - 0.18))) candidates_full: List[Tuple[float, object, int, str]] = [] for w_candidate in candidate_weights: restored_img, restored_faces_count, _, model_name = run_face_model(frame_bgr, w_candidate) score = _face_candidate_score(source_frame, restored_img) candidates_full.append((score, restored_img, restored_faces_count, model_name)) candidate_evals += 1 candidates_full.sort(key=lambda item: item[0], reverse=True) _, enhanced_img, roi_faces_processed, model_label_used = candidates_full[0] if roi_faces_processed == 0: haar_boxes = _detect_face_boxes(frame_bgr, limit=16) roi_faces_processed = len(haar_boxes) face_ms = int((time.perf_counter() - t_face) * 1000) requested_outscale = int(max(1, outscale)) effective_outscale = requested_outscale sr_fallback_used = False sr_fallback_method: Optional[str] = None sr_fallback_reason: Optional[str] = None sr_model_used = "Real-ESRGAN x4plus" t_sr = time.perf_counter() try: upscaled_img, _ = realesrganer.enhance(enhanced_img, outscale=requested_outscale) except Exception as sr_exc: soft_fallback_enabled = _option_bool(opts, "sr_soft_fallback", _env_flag("AURORA_SR_SOFT_FALLBACK", True)) device = _ModelCache._device() if not (soft_fallback_enabled and device == "mps" and _is_mps_conv_override_error(sr_exc)): raise upscaled_img, effective_outscale, sr_fallback_method = _sr_soft_fallback( enhanced_img, requested_outscale, ) sr_fallback_used = True sr_fallback_reason = _compact_error_text(sr_exc, limit=260) sr_model_used = f"soft-fallback:{sr_fallback_method}" logger.warning( "SR soft fallback enabled on MPS device=%s requested_outscale=%d effective_outscale=%d reason=%s", device, requested_outscale, effective_outscale, sr_fallback_reason, ) if bool(cfg.get("text_focus")): upscaled_img = _deblur_unsharp(upscaled_img, amount=max(0.9, float(cfg.get("deblur_amount") or 1.0))) sr_ms = int((time.perf_counter() - t_sr) * 1000) return upscaled_img, roi_faces_processed, face_ms, sr_ms, effective_outscale, { "roi_only_faces": bool(cfg["roi_only_faces"]), "pre_denoise": bool(cfg["pre_denoise"]), "temporal_denoise": bool(cfg["temporal_denoise"]), "deblur_before_face": bool(cfg["deblur_before_face"]), "score_loop": score_loop_enabled, "face_model_requested": str(cfg["face_model"]), "face_model_used": model_label_used, "codeformer_available": codeformer_available, "candidate_evaluations": candidate_evals, "focus_profile": str(cfg.get("focus_profile") or "auto"), "task_hint": str(cfg.get("task_hint") or ""), "text_focus": bool(cfg.get("text_focus")), "sr_model_used": sr_model_used, "sr_requested_outscale": requested_outscale, "effective_outscale": effective_outscale, "sr_fallback_used": sr_fallback_used, "sr_fallback_method": sr_fallback_method, "sr_fallback_reason": sr_fallback_reason, }, previous_denoised def _probe_fps(input_path: Path) -> float: value = _run_command( [ "ffprobe", "-v", "error", "-select_streams", "v:0", "-show_entries", "stream=r_frame_rate", "-of", "default=noprint_wrappers=1:nokey=1", str(input_path), ] ) fraction = Fraction(value.strip()) if fraction.numerator == 0: return 25.0 return float(fraction) def _select_video_encoder(mode: AuroraMode, export_options: Optional[Dict[str, object]]) -> str: override = str(os.getenv("AURORA_FFMPEG_VIDEO_ENCODER", "")).strip() if override: return override opts = export_options or {} requested_encoder = str(opts.get("encoder") or "").strip().lower() if requested_encoder: aliases = { "x264": "libx264", "h264": "libx264", "x265": "libx265", "h265": "libx265", "hevc": "libx265", } normalized_encoder = aliases.get(requested_encoder, requested_encoder) if normalized_encoder == "auto": normalized_encoder = "" if normalized_encoder: if _ffmpeg_has_encoder(normalized_encoder): return normalized_encoder logger.warning( "Requested encoder '%s' is unavailable, falling back to auto selection", normalized_encoder, ) requested_format = str(opts.get("format") or "").strip().lower() wants_h265 = requested_format in {"mp4_h265", "h265", "hevc"} enable_vtb = _env_flag("AURORA_ENABLE_VIDEOTOOLBOX", True) if enable_vtb: if wants_h265 and _ffmpeg_has_encoder("hevc_videotoolbox"): return "hevc_videotoolbox" if _ffmpeg_has_encoder("h264_videotoolbox"): return "h264_videotoolbox" if wants_h265 and _ffmpeg_has_encoder("libx265"): return "libx265" return "libx264" def _is_video_encode_failure(exc: Exception) -> bool: text = str(exc).lower() return ( "broken pipe" in text or "video encode failed" in text or "encode pipe broken" in text or "error while opening encoder" in text ) def _should_retry_with_libx264(exc: Exception, export_options: Optional[Dict[str, object]]) -> bool: if not _is_video_encode_failure(exc): return False opts = export_options or {} requested = str(opts.get("encoder") or "").strip().lower() if requested in {"libx264"}: return False return True def _extract_video_frames(input_path: Path, output_pattern: Path) -> str: use_vtb_decode = _env_flag("AURORA_ENABLE_VIDEOTOOLBOX", True) and _ffmpeg_has_hwaccel("videotoolbox") hwaccel_used = "none" if use_vtb_decode: try: _run_command( [ "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-hwaccel", "videotoolbox", "-i", str(input_path), str(output_pattern), ] ) hwaccel_used = "videotoolbox" return hwaccel_used except Exception: hwaccel_used = "fallback_cpu" _run_command( [ "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-i", str(input_path), str(output_pattern), ] ) return hwaccel_used def _compose_video( processed_frames_dir: Path, source_video: Path, output_video: Path, fps: float, mode: AuroraMode, export_options: Optional[Dict[str, object]] = None, ) -> str: crf = "22" if mode == "tactical" else "18" encoder = _select_video_encoder(mode, export_options) common = [ "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-framerate", f"{fps:.6f}", "-i", str(processed_frames_dir / "%08d.png"), "-i", str(source_video), "-map", "0:v:0", "-map", "1:a?", "-c:v", encoder, "-pix_fmt", "yuv420p", "-shortest", "-movflags", "+faststart", ] if encoder in {"libx264", "libx265"}: common.extend( [ "-preset", os.getenv("AURORA_FFMPEG_PRESET", "medium"), "-crf", crf, ] ) elif encoder == "h264_videotoolbox": common.extend(["-q:v", os.getenv("AURORA_VTB_H264_QUALITY", "65")]) elif encoder == "hevc_videotoolbox": common.extend(["-q:v", os.getenv("AURORA_VTB_HEVC_QUALITY", "60")]) try: _run_command(common + ["-c:a", "copy", str(output_video)]) except RuntimeError: _run_command(common + ["-c:a", "aac", "-b:a", "192k", str(output_video)]) return encoder def _probe_video_info(input_path: Path) -> Dict[str, Any]: """Probe video metadata: fps, dimensions, frame count.""" out = _run_command([ "ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", "-show_streams", str(input_path), ]) data = json.loads(out) vs = next((s for s in data.get("streams", []) if s.get("codec_type") == "video"), {}) w = int(vs.get("width", 0)) h = int(vs.get("height", 0)) fps_str = vs.get("r_frame_rate") or vs.get("avg_frame_rate") or "25/1" try: fps_val = float(Fraction(fps_str)) except Exception: fps_val = 25.0 nb = int(vs.get("nb_frames", 0)) if not nb: dur = float(data.get("format", {}).get("duration", 0)) nb = max(1, int(dur * fps_val)) return {"fps": fps_val, "width": w, "height": h, "total_frames": nb} def _frames_similar(prev_thumb, curr_thumb, threshold: float = 8.0) -> bool: """Fast scene change detection on pre-downsampled thumbnails (64x64). Mean absolute pixel difference on 0-255 scale. threshold 8.0 catches scene changes while ignoring compression noise. For surveillance video most consecutive frames score < 3.0. """ if np is None: return False diff = float(np.mean(np.abs( prev_thumb.astype(np.float32) - curr_thumb.astype(np.float32) ))) return diff < threshold def _build_encode_pipe_cmd( out_w: int, out_h: int, fps: float, encoder: str, mode: AuroraMode, source_video: Path, output_video: Path, export_options: Optional[Dict[str, object]] = None, ) -> List[str]: """Build ffmpeg command that reads raw BGR frames from stdin and muxes with source audio.""" crf = "22" if mode == "tactical" else "18" cmd = [ "ffmpeg", "-hide_banner", "-loglevel", "error", "-y", "-f", "rawvideo", "-pix_fmt", "bgr24", "-s", f"{out_w}x{out_h}", "-r", f"{fps:.6f}", "-i", "pipe:0", "-i", str(source_video), "-map", "0:v:0", "-map", "1:a?", "-c:v", encoder, "-pix_fmt", "yuv420p", "-movflags", "+faststart", ] if encoder in {"libx264", "libx265"}: cmd.extend(["-preset", os.getenv("AURORA_FFMPEG_PRESET", "medium"), "-crf", crf]) elif encoder == "h264_videotoolbox": cmd.extend(["-q:v", os.getenv("AURORA_VTB_H264_QUALITY", "65")]) elif encoder == "hevc_videotoolbox": cmd.extend(["-q:v", os.getenv("AURORA_VTB_HEVC_QUALITY", "60")]) cmd.extend(["-c:a", "aac", "-b:a", "192k", str(output_video)]) return cmd def _cleanup_pipes(*procs) -> None: for p in procs: if p is None: continue try: if p.stdin and not p.stdin.closed: p.stdin.close() except Exception: pass try: p.kill() p.wait(timeout=5) except Exception: pass def _visual_pipeline_video( *, input_path: Path, output_dir: Path, mode: AuroraMode, priority: str, export_options: Optional[Dict[str, object]], cancel_check: Optional[Callable[[], bool]], stage_progress: Optional[Callable[[float, str], None]], ) -> Tuple[Path, Dict[str, object]]: """Optimized video pipeline: pipe decode → scene skip → pipe encode. v2 optimizations (zero disk I/O for intermediate frames): - ffmpeg decode → stdout pipe → numpy (no PNG extraction to disk) - Scene detection: skip unchanged frames (huge win for surveillance) - numpy → stdin pipe → ffmpeg encode (no PNG write for output frames) - VideoToolbox HW decode/encode when available on macOS """ if cv2 is None: raise RuntimeError("opencv-python-headless is not installed") if not _ffmpeg_available(): raise RuntimeError("ffmpeg/ffprobe is not installed") info = _probe_video_info(input_path) src_w, src_h, fps = info["width"], info["height"], info["fps"] est_total = info["total_frames"] if src_w == 0 or src_h == 0: raise RuntimeError(f"Cannot determine video dimensions: {input_path.name}") # Scene detection config (quality-first defaults; opt-in from env/export options) opts = export_options or {} scene_skip_on = _option_bool(opts, "scene_skip", _env_flag("AURORA_SCENE_SKIP", True)) scene_thresh_default = float(os.getenv("AURORA_SCENE_THRESHOLD", "4.0")) scene_thresh = max(0.5, min(64.0, _option_float(opts, "scene_threshold", scene_thresh_default))) scene_skip_max_ratio = max( 0.0, min(0.95, _option_float(opts, "scene_skip_max_ratio", float(os.getenv("AURORA_SCENE_SKIP_MAX_RATIO", "0.35")))), ) _THUMB = 64 # --- Decode pipe (VideoToolbox HW accel when available) --- use_vtb = ( _env_flag("AURORA_ENABLE_VIDEOTOOLBOX", True) and _ffmpeg_has_hwaccel("videotoolbox") ) dec_cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error"] if use_vtb: dec_cmd.extend(["-hwaccel", "videotoolbox"]) dec_cmd.extend([ "-i", str(input_path), "-f", "rawvideo", "-pix_fmt", "bgr24", "pipe:1", ]) decode_proc = subprocess.Popen(dec_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) decode_accel = "videotoolbox" if use_vtb else "cpu" frame_bytes = src_w * src_h * 3 if stage_progress: skip_hint = f"scene-skip={'on' if scene_skip_on else 'off'}" if scene_skip_on: skip_hint += f", thr={scene_thresh:.2f}, max={int(scene_skip_max_ratio * 100)}%" stage_progress(0.02, f"pipe decode started ({est_total} est. frames, accel={decode_accel}, {skip_hint})") # Stats accumulators total_faces = 0 total_face_ms = 0 total_sr_ms = 0 effective_outscale = 1 roi_only_frames = 0 candidates_evaluated_total = 0 face_model_used = "GFPGAN v1.4" sr_model_used = "Real-ESRGAN x4plus" sr_fallback_frames = 0 sr_fallback_method = "" sr_fallback_reason = "" frames_skipped = 0 previous_denoised = None focus_profile_used = "auto" task_hint_used = "" text_focus_enabled = False # Encode pipe — started after first frame reveals output dimensions encode_proc: Optional[subprocess.Popen] = None output_path = output_dir / f"{input_path.stem}_aurora_visual.mp4" encoder = "unknown" progress_every = max(1, est_total // 120) t_loop = time.perf_counter() idx = 0 prev_thumb = None prev_enhanced = None # Read-ahead buffer: overlap decode I/O with GPU inference _READAHEAD = int(os.getenv("AURORA_READAHEAD_FRAMES", "4")) frame_q: queue.Queue = queue.Queue(maxsize=_READAHEAD) reader_error: List[Optional[Exception]] = [None] def _reader(): try: while True: raw = decode_proc.stdout.read(frame_bytes) if len(raw) < frame_bytes: frame_q.put(None) break frame_q.put(raw) except Exception as exc: reader_error[0] = exc frame_q.put(None) reader_thread = threading.Thread(target=_reader, daemon=True) reader_thread.start() try: while True: if cancel_check and cancel_check(): raise PipelineCancelledError("Video processing cancelled") raw = frame_q.get(timeout=60) if raw is None: if reader_error[0]: raise reader_error[0] break idx += 1 frame = np.frombuffer(raw, dtype=np.uint8).reshape(src_h, src_w, 3).copy() # --- Scene detection: skip if nearly identical to previous --- curr_thumb = cv2.resize(frame, (_THUMB, _THUMB)) skip_this = False if scene_skip_on and prev_thumb is not None and prev_enhanced is not None: projected_skip_ratio = (frames_skipped + 1) / max(1, idx) if projected_skip_ratio <= scene_skip_max_ratio and _frames_similar(prev_thumb, curr_thumb, scene_thresh): skip_this = True frames_skipped += 1 prev_thumb = curr_thumb if skip_this: enhanced = prev_enhanced else: enhanced, faces, face_ms, sr_ms, outscale, details, previous_denoised = ( _enhance_frame_bgr( frame, mode, media_type="video", priority=priority, export_options=export_options, previous_denoised=previous_denoised, ) ) try: effective_outscale = int(details.get("effective_outscale") or outscale) except Exception: effective_outscale = outscale total_faces += faces total_face_ms += face_ms total_sr_ms += sr_ms if bool(details.get("roi_only_faces")): roi_only_frames += 1 candidates_evaluated_total += int(details.get("candidate_evaluations") or 0) face_model_used = str(details.get("face_model_used") or face_model_used) focus_profile_used = str(details.get("focus_profile") or focus_profile_used) maybe_task_hint = str(details.get("task_hint") or "").strip() if maybe_task_hint: task_hint_used = maybe_task_hint text_focus_enabled = text_focus_enabled or bool(details.get("text_focus")) sr_model_used = str(details.get("sr_model_used") or sr_model_used) if bool(details.get("sr_fallback_used")): sr_fallback_frames += 1 sr_fallback_method = str(details.get("sr_fallback_method") or sr_fallback_method) if not sr_fallback_reason: sr_fallback_reason = str(details.get("sr_fallback_reason") or "") prev_enhanced = enhanced # --- Start encode pipe after first frame (output size now known) --- if encode_proc is None: out_h, out_w = enhanced.shape[:2] encoder = _select_video_encoder(mode, export_options) enc_cmd = _build_encode_pipe_cmd( out_w, out_h, fps, encoder, mode, input_path, output_path, export_options, ) encode_proc = subprocess.Popen( enc_cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, ) try: encode_proc.stdin.write(enhanced.tobytes()) except BrokenPipeError as exc: stderr_text = "" try: if encode_proc: try: encode_proc.wait(timeout=1) except Exception: pass if encode_proc and encode_proc.stderr: stderr_text = (encode_proc.stderr.read() or b"").decode(errors="replace").strip() except Exception: stderr_text = "" detail = (stderr_text or str(exc)).strip() if len(detail) > 280: detail = detail[:280] raise RuntimeError(f"Video encode pipe broken ({encoder}): {detail}") from exc # --- Progress --- if stage_progress and (idx == 1 or idx % progress_every == 0): elapsed = max(0.001, time.perf_counter() - t_loop) fps_eff = idx / elapsed eta_s = int(max(0, (est_total - idx) / max(0.01, fps_eff))) skip_pct = int(100 * frames_skipped / max(1, idx)) stage_progress( min(0.97, 0.02 + 0.93 * (idx / max(1, est_total))), f"enhancing frame {idx}/{est_total} " f"({fps_eff:.2f} fps, skip={skip_pct}%, eta ~{eta_s}s)", ) # --- Finalize --- reader_thread.join(timeout=30) decode_proc.stdout.close() decode_proc.wait(timeout=30) if encode_proc: encode_proc.stdin.close() encode_proc.wait(timeout=300) if encode_proc.returncode != 0: stderr = (encode_proc.stderr.read() or b"").decode(errors="replace") raise RuntimeError(f"Video encode failed ({encoder}): {stderr[:300]}") if idx == 0: raise RuntimeError("No frames decoded from input video") except PipelineCancelledError: _cleanup_pipes(decode_proc, encode_proc) reader_thread.join(timeout=5) raise except Exception: _cleanup_pipes(decode_proc, encode_proc) reader_thread.join(timeout=5) raise if stage_progress: skip_pct = int(100 * frames_skipped / max(1, idx)) stage_progress(1.0, f"completed ({idx} frames, {frames_skipped} skipped [{skip_pct}%], encode={encoder})") return output_path, { "frame_count": idx, "faces_detected_total": total_faces, "face_time_ms": total_face_ms, "sr_time_ms": total_sr_ms, "effective_outscale": effective_outscale, "encoder": encoder, "decode_accel": decode_accel, "roi_only_frames": roi_only_frames, "candidate_evaluations": candidates_evaluated_total, "face_model_used": face_model_used, "sr_model_used": sr_model_used, "sr_fallback_frames": sr_fallback_frames, "sr_fallback_method": sr_fallback_method, "sr_fallback_reason": sr_fallback_reason, "frames_skipped": frames_skipped, "scene_skip_enabled": scene_skip_on, "scene_threshold": scene_thresh, "scene_skip_max_ratio": scene_skip_max_ratio, "focus_profile": focus_profile_used, "task_hint": task_hint_used, "text_focus": text_focus_enabled, } def _visual_pipeline_photo( *, input_path: Path, output_dir: Path, mode: AuroraMode, priority: str, stage_progress: Optional[Callable[[float, str], None]], export_options: Optional[Dict[str, object]] = None, ) -> Tuple[Path, Dict[str, object]]: if cv2 is None: raise RuntimeError("opencv-python-headless is not installed") frame = cv2.imread(str(input_path), cv2.IMREAD_COLOR) if frame is None: raise RuntimeError(f"Cannot read image: {input_path.name}") if stage_progress: stage_progress(0.1, "processing image") enhanced, faces, face_ms, sr_ms, outscale, details, _ = _enhance_frame_bgr( frame, mode, media_type="photo", priority=priority, export_options=export_options, ) ext = input_path.suffix.lower() or ".png" if ext in {".jpg", ".jpeg"}: ext = ".jpg" elif ext not in {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff"}: ext = ".png" output_path = output_dir / f"{input_path.stem}_aurora_visual{ext}" cv2.imwrite(str(output_path), enhanced) if stage_progress: stage_progress(1.0, "image stage completed") return output_path, { "frame_count": 1, "faces_detected_total": faces, "face_time_ms": face_ms, "sr_time_ms": sr_ms, "effective_outscale": outscale, "roi_only_frames": 1 if bool(details.get("roi_only_faces")) else 0, "candidate_evaluations": int(details.get("candidate_evaluations") or 0), "face_model_used": str(details.get("face_model_used") or "GFPGAN v1.4"), "sr_model_used": str(details.get("sr_model_used") or "Real-ESRGAN x4plus"), "sr_fallback_frames": 1 if bool(details.get("sr_fallback_used")) else 0, "sr_fallback_method": str(details.get("sr_fallback_method") or ""), "sr_fallback_reason": str(details.get("sr_fallback_reason") or ""), } class BaseSubagent: name = "Base" step_name = "noop" model_by_mode: Dict[AuroraMode, str] = { "tactical": "stub.fast", "forensic": "stub.full", } stage_suffix = "noop" sleep_seconds = 0.05 def run(self, ctx: SubagentContext, input_path: Path) -> SubagentRunResult: t0 = time.perf_counter() output_path = _copy_with_stage_suffix(input_path, ctx.output_dir, self.stage_suffix) time.sleep(self.sleep_seconds) elapsed_ms = int((time.perf_counter() - t0) * 1000) step = ProcessingStep( step=self.step_name, agent=self.name, model=self.model_by_mode[ctx.mode], time_ms=elapsed_ms, ) return SubagentRunResult(output_path=output_path, steps=[step]) class ClarityAgent(BaseSubagent): name = "Clarity" step_name = "video_enhancement" stage_suffix = "clarity" model_by_mode = { "tactical": "Real-ESRGAN(light)", "forensic": "Real-ESRGAN(full)", } class VeraAgent(BaseSubagent): name = "Vera" step_name = "face_enhancement" stage_suffix = "vera" model_by_mode = { "tactical": "GFPGAN/CodeFormer + Real-ESRGAN x4plus", "forensic": "GFPGAN/CodeFormer + Real-ESRGAN x4plus(forensic)", } def run(self, ctx: SubagentContext, input_path: Path) -> SubagentRunResult: t_start = time.perf_counter() def _build_steps( stats: Dict[str, object], output_path: Path, *, encoder_retry: bool = False, encoder_retry_reason: str = "", ) -> List[ProcessingStep]: face_step = ProcessingStep( step="face_enhancement", agent=self.name, model=str(stats.get("face_model_used") or "GFPGAN v1.4"), time_ms=stats["face_time_ms"], details={ "frames": stats["frame_count"], "faces_detected_total": stats["faces_detected_total"], "roi_only_frames": stats.get("roi_only_frames"), "candidate_evaluations": stats.get("candidate_evaluations"), }, ) sr_details = { "frames": stats["frame_count"], "output": output_path.name, "effective_outscale": stats.get("effective_outscale", 1), "encoder": stats.get("encoder"), "decode_accel": stats.get("decode_accel"), "frames_skipped": stats.get("frames_skipped"), "scene_skip_enabled": stats.get("scene_skip_enabled"), "scene_threshold": stats.get("scene_threshold"), "scene_skip_max_ratio": stats.get("scene_skip_max_ratio"), "focus_profile": stats.get("focus_profile"), "task_hint": stats.get("task_hint"), "text_focus": stats.get("text_focus"), "sr_fallback_frames": stats.get("sr_fallback_frames", 0), "sr_fallback_used": bool(stats.get("sr_fallback_frames", 0)), "sr_fallback_method": stats.get("sr_fallback_method"), "sr_fallback_reason": stats.get("sr_fallback_reason"), } if encoder_retry: sr_details["encoder_retry"] = True if encoder_retry_reason: sr_details["encoder_retry_reason"] = encoder_retry_reason sr_step = ProcessingStep( step="super_resolution", agent=self.name, model=str(stats.get("sr_model_used") or "Real-ESRGAN x4plus"), time_ms=stats["sr_time_ms"], details=sr_details, ) return [face_step, sr_step] try: if ctx.media_type == "video": output_path, stats = _visual_pipeline_video( input_path=input_path, output_dir=ctx.output_dir, mode=ctx.mode, priority=ctx.priority, export_options=ctx.export_options, cancel_check=ctx.cancel_check, stage_progress=ctx.stage_progress, ) elif ctx.media_type == "photo": output_path, stats = _visual_pipeline_photo( input_path=input_path, output_dir=ctx.output_dir, mode=ctx.mode, priority=ctx.priority, stage_progress=ctx.stage_progress, export_options=ctx.export_options, ) else: return super().run(ctx, input_path) return SubagentRunResult(output_path=output_path, steps=_build_steps(stats, output_path)) except PipelineCancelledError: raise except Exception as exc: retry_attempted = False if ctx.media_type == "video" and _should_retry_with_libx264(exc, ctx.export_options): retry_attempted = True retry_reason = _compact_error_text(exc, limit=280) retry_opts: Dict[str, object] = dict(ctx.export_options or {}) retry_opts["encoder"] = "libx264" if ctx.stage_progress: ctx.stage_progress(0.03, "encoder fallback: retry with libx264") try: output_path, stats = _visual_pipeline_video( input_path=input_path, output_dir=ctx.output_dir, mode=ctx.mode, priority=ctx.priority, export_options=retry_opts, cancel_check=ctx.cancel_check, stage_progress=ctx.stage_progress, ) return SubagentRunResult( output_path=output_path, steps=_build_steps( stats, output_path, encoder_retry=True, encoder_retry_reason=retry_reason, ), ) except PipelineCancelledError: raise except Exception as retry_exc: exc = RuntimeError( f"{_compact_error_text(exc, limit=180)}; retry(libx264) failed: {_compact_error_text(retry_exc, limit=180)}" ) fallback = _copy_with_stage_suffix(input_path, ctx.output_dir, self.stage_suffix) elapsed_ms = int((time.perf_counter() - t_start) * 1000) step = ProcessingStep( step="face_enhancement", agent=self.name, model="GFPGAN/CodeFormer + Real-ESRGAN x4plus", time_ms=elapsed_ms, details={ "fallback_used": True, "fallback_type": "copy_passthrough", "reason": str(exc), "encoder_retry_attempted": retry_attempted, }, ) return SubagentRunResult(output_path=fallback, steps=[step]) def _alpr_instance(): """Lazy-load fast-alpr ALPR instance (singleton).""" if not hasattr(_alpr_instance, "_cached"): try: from fast_alpr import ALPR # type: ignore[import-untyped] _alpr_instance._cached = ALPR( detector_model="yolo-v9-t-384-license-plate-end2end", ocr_model="global-plates-mobile-vit-v2-model", ) except Exception as exc: logger.warning("fast-alpr init failed (plates disabled): %s", exc) _alpr_instance._cached = None return _alpr_instance._cached def _detect_plates_in_frame(frame_bgr) -> List[Dict[str, Any]]: """Return list of {text, confidence, bbox} for detected plates in frame.""" alpr = _alpr_instance() if alpr is None or cv2 is None: return [] try: results = alpr.predict(frame_bgr) plates = [] for r in results: plates.append({ "text": r.ocr.text, "confidence": round(float(r.ocr.confidence), 3), "bbox": list(r.detection.bounding_box), }) return plates except Exception as exc: logger.debug("ALPR frame error: %s", exc) return [] def _enhance_plate_roi(frame_bgr, bbox, realesrganer) -> object: """Upscale plate region using Real-ESRGAN for sharper OCR.""" if cv2 is None or realesrganer is None: return frame_bgr try: x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]) h_f, w_f = frame_bgr.shape[:2] pad = 8 x1 = max(0, x1 - pad); y1 = max(0, y1 - pad) x2 = min(w_f, x2 + pad); y2 = min(h_f, y2 + pad) patch = frame_bgr[y1:y2, x1:x2] if patch.size == 0: return frame_bgr enhanced, _ = realesrganer.enhance(patch, outscale=2) enhanced_resized = cv2.resize(enhanced, (x2 - x1, y2 - y1), interpolation=cv2.INTER_AREA) result = frame_bgr.copy() result[y1:y2, x1:x2] = enhanced_resized return result except Exception: return frame_bgr class PlateAgent(BaseSubagent): """ALPR agent: detect and OCR license plates, enhance plate ROIs.""" name = "PlateDetect" step_name = "plate_detection" stage_suffix = "plate" model_by_mode = { "tactical": "YOLO-v9 ALPR + fast-plate-ocr", "forensic": "YOLO-v9 ALPR + fast-plate-ocr + RealESRGAN-plate-enhance", } def run(self, ctx: SubagentContext, input_path: Path) -> SubagentRunResult: t0 = time.perf_counter() alpr = _alpr_instance() if alpr is None: step = ProcessingStep( step=self.step_name, agent=self.name, model="fast-alpr (unavailable)", time_ms=0, details={"plates_detected": 0, "skipped": True}, ) return SubagentRunResult(output_path=input_path, steps=[step]) media_type = ctx.media_type all_plates: List[Dict[str, Any]] = [] unique_texts: Dict[str, Dict[str, Any]] = {} frames_sampled = 0 if media_type == "video": if cv2 is None: step = ProcessingStep( step=self.step_name, agent=self.name, model=self.model_by_mode[ctx.mode], time_ms=0, details={"plates_detected": 0, "skipped": True, "reason": "opencv not available"}, ) return SubagentRunResult(output_path=input_path, steps=[step]) cap = cv2.VideoCapture(str(input_path)) total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) fps = cap.get(cv2.CAP_PROP_FPS) or 15.0 sample_interval = max(1, int(fps * 2)) fn = 0 while True: cap.set(cv2.CAP_PROP_POS_FRAMES, fn) ret, frame = cap.read() if not ret: break plates = _detect_plates_in_frame(frame) frames_sampled += 1 if plates and ctx.mode == "forensic": realesrganer = _ModelCache.realesrgan(ctx.mode) for pl in plates: frame = _enhance_plate_roi(frame, pl["bbox"], realesrganer) updated = _detect_plates_in_frame(frame) if updated: plates = updated for pl in plates: all_plates.append({**pl, "frame": fn}) txt = (pl.get("text") or "").strip().upper() if txt and (txt not in unique_texts or pl["confidence"] > unique_texts[txt]["confidence"]): unique_texts[txt] = pl fn += sample_interval if ctx.cancel_check and ctx.cancel_check(): break cap.release() elif media_type == "photo": if cv2 is None: step = ProcessingStep( step=self.step_name, agent=self.name, model=self.model_by_mode[ctx.mode], time_ms=0, details={"plates_detected": 0, "skipped": True}, ) return SubagentRunResult(output_path=input_path, steps=[step]) frame = cv2.imread(str(input_path), cv2.IMREAD_COLOR) plates = _detect_plates_in_frame(frame) frames_sampled = 1 if plates and ctx.mode == "forensic": realesrganer = _ModelCache.realesrgan(ctx.mode) for pl in plates: frame = _enhance_plate_roi(frame, pl["bbox"], realesrganer) updated = _detect_plates_in_frame(frame) if updated: plates = updated for pl in plates: all_plates.append(pl) txt = (pl.get("text") or "").strip().upper() if txt and (txt not in unique_texts or pl["confidence"] > unique_texts[txt]["confidence"]): unique_texts[txt] = pl report_path = ctx.output_dir / "plate_detections.json" report_data = { "job_id": ctx.job_id, "frames_sampled": frames_sampled, "plates_found": len(all_plates), "unique_plates": len(unique_texts), "detections": all_plates[:200], "unique": list(unique_texts.values()), } report_path.write_text(json.dumps(report_data, ensure_ascii=False, indent=2), encoding="utf-8") elapsed_ms = int((time.perf_counter() - t0) * 1000) step = ProcessingStep( step=self.step_name, agent=self.name, model=self.model_by_mode[ctx.mode], time_ms=elapsed_ms, details={ "plates_detected": len(all_plates), "unique_plates": len(unique_texts), "unique_texts": list(unique_texts.keys())[:20], "frames_sampled": frames_sampled, "report_file": report_path.name, }, ) return SubagentRunResult( output_path=input_path, steps=[step], artifacts=[report_path], ) class EchoAgent(BaseSubagent): name = "Echo" step_name = "audio_forensics" stage_suffix = "echo" model_by_mode = { "tactical": "Demucs+Whisper(small)", "forensic": "Demucs+Whisper(large)+RawNet3", } def run(self, ctx: SubagentContext, input_path: Path) -> SubagentRunResult: result = super().run(ctx, input_path) transcript = ctx.output_dir / f"{input_path.stem}_echo_transcript.txt" transcript.write_text( "Transcript scaffold: replace with Whisper output.\n", encoding="utf-8", ) result.artifacts.append(transcript) result.steps[0].details["transcript"] = transcript.name return result class PixisAgent(BaseSubagent): name = "Pixis" step_name = "photo_restoration" stage_suffix = "pixis" model_by_mode = { "tactical": "SCUNet+SwinIR(light)", "forensic": "SCUNet+SwinIR(full)+Real-ESRGAN", } class KoreAgent(BaseSubagent): name = "Kore" step_name = "forensic_verification" stage_suffix = "kore" model_by_mode = { "tactical": "OpenSSL(light)", "forensic": "OpenSSL+ChainOfCustody", } def run(self, ctx: SubagentContext, input_path: Path) -> SubagentRunResult: t0 = time.perf_counter() result_hash = sha256_file(input_path) chain_of_custody = { "job_id": ctx.job_id, "mode": ctx.mode, "media_type": ctx.media_type, "input_hash": ctx.input_hash, "result_hash": result_hash, "timestamp_unix_ms": int(time.time() * 1000), "pipeline": "frame -> pre_denoise -> deblur -> (roi/full) face_restore(gfpgan/codeformer) -> realesrgan", "stages": ["Vera", "Kore"], } chain_path = ctx.output_dir / "forensic_log.json" chain_path.write_text( json.dumps(chain_of_custody, ensure_ascii=False, indent=2), encoding="utf-8", ) signature_raw = hashlib.sha256( f"{ctx.input_hash}:{result_hash}:{ctx.job_id}".encode("utf-8") ).hexdigest()[:48] digital_signature = f"ed25519:{signature_raw}" signed_manifest = { "signature": digital_signature, "forensic_log": chain_path.name, "result_hash": result_hash, } manifest_path = ctx.output_dir / "forensic_signature.json" manifest_path.write_text( json.dumps(signed_manifest, ensure_ascii=False, indent=2), encoding="utf-8", ) elapsed_ms = int((time.perf_counter() - t0) * 1000) step = ProcessingStep( step=self.step_name, agent=self.name, model=self.model_by_mode[ctx.mode], time_ms=elapsed_ms, details={ "forensic_log": chain_path.name, "signature_manifest": manifest_path.name, }, ) return SubagentRunResult( output_path=input_path, steps=[step], artifacts=[chain_path, manifest_path], metadata={ "digital_signature": digital_signature, "result_hash": result_hash, }, )