New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
681 lines · 26 KiB · Python
"""
|
|
DAARION Memory Service — Voice Endpoints
|
|
STT: faster-whisper (Docker/Linux) → mlx-audio (macOS) → whisper-cli
|
|
TTS: edge-tts Python API (primary, pure Python, no ffmpeg needed)
|
|
→ piper (fallback, if model present)
|
|
→ espeak-ng (offline Linux fallback)
|
|
→ macOS say (fallback, macOS-only)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import io
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
|
|
from fastapi.responses import StreamingResponse
|
|
from pydantic import BaseModel
|
|
|
|
logger = logging.getLogger(__name__)

# All endpoints below are mounted under /voice on the parent FastAPI app.
router = APIRouter(prefix="/voice", tags=["voice"])

# Process-wide cache of loaded STT models (faster-whisper, mlx-audio),
# keyed per engine/size — loading a model is expensive, so do it once.
MODELS_CACHE: dict = {}
|
|
|
|
# ── Prometheus metrics (optional — skip if not installed) ─────────────────────
|
|
# prometheus_client is an optional dependency: when it is missing, every
# metric handle is set to None and _PROM_OK gates the _prom_* helpers below.
try:
    from prometheus_client import Counter, Histogram

    # Synthesis latency per engine/voice; buckets span 50 ms – 5 s.
    _tts_compute_hist = Histogram(
        "voice_tts_compute_ms",
        "TTS synthesis compute time in ms",
        ["engine", "voice"],
        buckets=[50, 100, 250, 500, 1000, 2000, 5000],
    )
    # Audio payload size per engine; buckets span 5 KB – 120 KB.
    _tts_bytes_hist = Histogram(
        "voice_tts_audio_bytes",
        "TTS audio output size in bytes",
        ["engine"],
        buckets=[5000, 15000, 30000, 60000, 120000],
    )
    # Error counter labelled by engine and exception class name.
    _tts_errors_total = Counter(
        "voice_tts_errors_total",
        "TTS engine errors",
        ["engine", "error_type"],
    )
    # Transcription latency per engine; buckets span 200 ms – 10 s.
    _stt_compute_hist = Histogram(
        "voice_stt_compute_ms",
        "STT transcription time in ms",
        ["engine"],
        buckets=[200, 500, 1000, 2000, 5000, 10000],
    )
    _PROM_OK = True
except ImportError:
    # prometheus_client not installed — metric helpers become no-ops.
    _PROM_OK = False
    _tts_compute_hist = None
    _tts_bytes_hist = None
    _tts_errors_total = None
    _stt_compute_hist = None
|
|
|
|
|
|
def _prom_tts_observe(engine: str, voice: str, ms: float, audio_bytes: int) -> None:
    """Record TTS latency and payload-size metrics; no-op when Prometheus is absent."""
    if _PROM_OK:
        try:
            _tts_compute_hist.labels(engine=engine, voice=voice).observe(ms)
            _tts_bytes_hist.labels(engine=engine).observe(audio_bytes)
        except Exception:
            # Metrics must never break the request path.
            pass
|
|
|
|
|
|
def _prom_tts_error(engine: str, error_type: str) -> None:
    """Increment the TTS error counter; no-op when Prometheus is absent."""
    if _PROM_OK:
        try:
            _tts_errors_total.labels(engine=engine, error_type=error_type).inc()
        except Exception:
            # Metrics must never break the request path.
            pass
|
|
|
|
|
|
def _prom_stt_observe(engine: str, ms: float) -> None:
    """Record STT transcription latency; no-op when Prometheus is absent."""
    if _PROM_OK:
        try:
            _stt_compute_hist.labels(engine=engine).observe(ms)
        except Exception:
            # Metrics must never break the request path.
            pass
|
|
|
|
# ── Voice mapping ─────────────────────────────────────────────────────────────
|
|
# Maps UI voice id → edge-tts voice name
|
|
_EDGE_VOICES: dict[str, str] = {
|
|
"default": "uk-UA-PolinaNeural",
|
|
"Polina": "uk-UA-PolinaNeural",
|
|
"uk-UA-Polina": "uk-UA-PolinaNeural",
|
|
"uk-UA-PolinaNeural": "uk-UA-PolinaNeural",
|
|
"Ostap": "uk-UA-OstapNeural",
|
|
"uk-UA-Ostap": "uk-UA-OstapNeural",
|
|
"uk-UA-OstapNeural": "uk-UA-OstapNeural",
|
|
# English voices — used for English-language segments
|
|
"en-US-GuyNeural": "en-US-GuyNeural",
|
|
"en-US-JennyNeural": "en-US-JennyNeural",
|
|
"en": "en-US-GuyNeural",
|
|
# macOS-only names: map to closest Ukrainian voice
|
|
"Milena": "uk-UA-PolinaNeural",
|
|
"Yuri": "uk-UA-OstapNeural",
|
|
"af_heart": "uk-UA-PolinaNeural",
|
|
}
|
|
|
|
def _edge_voice(name: str | None) -> str:
|
|
"""Allow any valid edge-tts voice name to pass through directly."""
|
|
n = name or "default"
|
|
# If already a valid neural voice name (contains "Neural"), pass through
|
|
if "Neural" in n or n == "en":
|
|
return _EDGE_VOICES.get(n, n)
|
|
return _EDGE_VOICES.get(n, "uk-UA-PolinaNeural")
|
|
|
|
|
|
def _ffmpeg_available() -> bool:
|
|
try:
|
|
result = subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=3)
|
|
return result.returncode == 0
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _espeak_available() -> bool:
|
|
try:
|
|
result = subprocess.run(["espeak-ng", "--version"], capture_output=True, timeout=3)
|
|
return result.returncode == 0
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
class TTSRequest(BaseModel):
    """Request body for POST /voice/tts."""

    text: str  # text to synthesize (handler truncates to 700 chars)
    voice: Optional[str] = "default"  # UI voice id, resolved via _edge_voice()
    speed: Optional[float] = 1.0  # playback-rate multiplier (1.0 = normal)
    model: Optional[str] = "auto"  # engine selector; not read by the /tts handler in this module
|
|
|
|
|
|
# ── Status & Live Health ───────────────────────────────────────────────────────
|
|
|
|
@router.get("/health")
async def voice_health():
    """Live health check — actually synthesizes a short test phrase via edge-tts.

    Returns edge_tts=ok/error with details; used by preflight to detect 403/blocked.
    Also emits a "repro" pack (host/image/env identifiers) for incident diagnosis.
    """
    # Imported lazily so the module loads even on trimmed environments.
    import importlib.metadata
    import time

    result: dict = {}

    # edge-tts version (best effort — "unknown" when package metadata is absent)
    try:
        ver = importlib.metadata.version("edge-tts")
    except Exception:
        ver = "unknown"
    result["edge_tts_version"] = ver

    # Live synthesis test for each required Neural voice — a real network call,
    # so a 403/blocked upstream shows up here rather than on a user request.
    live_voices: list[dict] = []
    test_text = "Test"  # Minimal — just enough to trigger actual API call
    for voice_id in ("uk-UA-PolinaNeural", "uk-UA-OstapNeural"):
        t0 = time.monotonic()
        try:
            import edge_tts

            comm = edge_tts.Communicate(test_text, voice_id)
            byte_count = 0
            # Count audio bytes only; the stream also yields metadata chunks.
            async for chunk in comm.stream():
                if chunk["type"] == "audio":
                    byte_count += len(chunk["data"])
            elapsed_ms = int((time.monotonic() - t0) * 1000)
            live_voices.append({"voice": voice_id, "status": "ok",
                                "bytes": byte_count, "ms": elapsed_ms})
        except Exception as e:
            elapsed_ms = int((time.monotonic() - t0) * 1000)
            # Error text truncated to keep the health payload small.
            live_voices.append({"voice": voice_id, "status": "error",
                                "error": str(e)[:150], "ms": elapsed_ms})

    all_ok = all(v["status"] == "ok" for v in live_voices)
    result["edge_tts"] = "ok" if all_ok else "error"
    result["voices"] = live_voices

    # STT check (import only — no actual transcription in health)
    try:
        import faster_whisper  # noqa: F401
        result["faster_whisper"] = "ok"
    except ImportError:
        result["faster_whisper"] = "unavailable"

    # Overall flag reflects TTS only; STT unavailability does not fail health.
    result["ok"] = all_ok

    # ── Repro pack (incident diagnosis) ──────────────────────────────────────
    import os as _os
    import socket as _socket
    result["repro"] = {
        "node_id": _os.getenv("NODE_ID", _socket.gethostname()),
        "service_name": _os.getenv("MEMORY_SERVICE_NAME", "memory-service"),
        "image_digest": _os.getenv("IMAGE_DIGEST", "unknown"),  # set via docker label
        "memory_service_url": _os.getenv("MEMORY_SERVICE_URL", "http://localhost:8000"),
        # Mirrors the 700-char truncation applied in text_to_speech().
        "tts_max_chars": 700,
        "canary_test_text": test_text,
        "canary_audio_bytes": {
            v["voice"]: v.get("bytes", 0) for v in live_voices
        },
    }
    return result
|
|
|
|
|
|
@router.get("/status")
async def voice_status():
    """Report which TTS/STT engines are importable/installed on this host.

    Cheap capability probe (imports and binary checks only) — unlike
    /voice/health it performs no live synthesis.
    """
    edge_ok = False
    try:
        import edge_tts  # noqa: F401
        edge_ok = True
    except ImportError:
        pass

    espeak_ok = _espeak_available()

    fw_ok = False
    try:
        import faster_whisper  # noqa: F401
        fw_ok = True
    except ImportError:
        pass

    mlx_ok = False
    try:
        import mlx_audio  # noqa: F401
        mlx_ok = True
    except ImportError:
        pass

    # BUG FIX: the engine names used to be concatenated with no separator,
    # producing "faster-whispermlx-audio" when both engines are present.
    stt_engines = [
        name for name, ok in (("faster-whisper", fw_ok), ("mlx-audio", mlx_ok)) if ok
    ]

    return {
        "available": True,
        "tts_engine": "edge-tts" if edge_ok else ("espeak-ng" if espeak_ok else "piper/say"),
        "stt_engine": "+".join(stt_engines),
        "edge_tts": edge_ok,
        "espeak_ng": espeak_ok,
        "faster_whisper": fw_ok,
        "mlx_audio": mlx_ok,
        "ffmpeg": _ffmpeg_available(),
        "voices": list(_EDGE_VOICES.keys()),
    }
|
|
|
|
|
|
# ── TTS ───────────────────────────────────────────────────────────────────────
|
|
|
|
async def _tts_edge(text: str, voice_name: str, speed: float = 1.0) -> bytes:
    """Synthesize *text* to MP3 bytes with the edge-tts pure-Python API.

    No subprocess and no ffmpeg involved; browsers play the returned MP3
    natively.

    Raises:
        RuntimeError: when the stream yields no audio data.
    """
    import edge_tts

    # Map the speed multiplier onto edge-tts's signed percentage rate string.
    # BUG FIX: the old f"+{...}%" template produced "+-25%" for speed < 1.0;
    # the ":+d" format emits a single correct sign ("-25%" / "+25%" / "+0%").
    rate_str = f"{int((speed - 1.0) * 50):+d}%"
    communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
    buf = io.BytesIO()
    async for chunk in communicate.stream():
        # The stream interleaves audio chunks with word-boundary metadata.
        if chunk["type"] == "audio":
            buf.write(chunk["data"])
    data = buf.getvalue()
    if not data:
        raise RuntimeError("edge-tts returned empty audio")
    return data
|
|
|
|
|
|
async def _tts_piper(text: str) -> bytes | None:
    """Synthesize with a locally-installed Piper model.

    Returns WAV bytes, or None when the model file or the piper package is
    unavailable (caller falls through to the next engine).
    """
    model_path = os.path.expanduser(
        "~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx"
    )
    if not Path(model_path).exists():
        return None
    try:
        import piper as piper_mod

        engine = piper_mod.PiperVoice.load(model_path)
        out = io.BytesIO()
        engine.synthesize(text, out)
        audio = out.getvalue()
        return audio or None
    except Exception as exc:
        logger.debug("Piper TTS failed: %s", exc)
        return None
|
|
|
|
|
|
async def _tts_macos_say(text: str, voice: str = "Milena") -> bytes | None:
    """macOS ``say`` fallback — returns WAV bytes (AIFF when ffmpeg is missing).

    Only works on macOS hosts outside Docker. Returns None on any failure so
    the caller can fall through to the next engine.
    """
    tmp_id = uuid.uuid4().hex[:8]
    aiff_path = Path(f"/tmp/tts_{tmp_id}.aiff")
    wav_path = Path(f"/tmp/tts_{tmp_id}.wav")
    try:
        proc = await asyncio.create_subprocess_exec(
            "say", "-v", voice, "-o", str(aiff_path), text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await asyncio.wait_for(proc.wait(), timeout=15)
        # BUG FIX: the exit status of `say` was never checked before.
        if proc.returncode != 0 or not aiff_path.exists() or aiff_path.stat().st_size == 0:
            return None
        if _ffmpeg_available():
            # Re-encode to 22.05 kHz mono WAV so browsers can play it.
            # Blocking run() is acceptable here: bounded by timeout=10 on a
            # short clip; matches the style of the other ffmpeg call sites.
            subprocess.run(
                ["ffmpeg", "-y", "-i", str(aiff_path), "-ar", "22050", "-ac", "1", str(wav_path)],
                capture_output=True, timeout=10,
            )
            if wav_path.exists() and wav_path.stat().st_size > 0:
                return wav_path.read_bytes()
            # BUG FIX: the AIFF used to be deleted before this point, so a
            # failed conversion raised FileNotFoundError (swallowed as None).
            # Now we explicitly fall back to the original AIFF bytes.
        # No ffmpeg (or conversion failed) — return AIFF; most browsers won't
        # play it, but it is the best we have.
        data = aiff_path.read_bytes()
        return data if data else None
    except Exception as e:
        logger.debug("macOS say failed: %s", e)
        return None
    finally:
        # Always clean up temp files, even on timeout or exception.
        aiff_path.unlink(missing_ok=True)
        wav_path.unlink(missing_ok=True)
|
|
|
|
|
|
async def _tts_espeak(text: str, voice: str = "uk", speed: float = 1.0) -> bytes | None:
    """Offline Linux fallback via espeak-ng.

    Returns WAV bytes, or None when the binary is missing or synthesis fails.
    """
    if not _espeak_available():
        return None
    try:
        out_path = Path(f"/tmp/tts_espeak_{uuid.uuid4().hex[:8]}.wav")
        # Clamp speaking rate into espeak-ng's 120..240 wpm window
        # (170 wpm corresponds to speed == 1.0).
        wpm = max(120, min(240, int((speed or 1.0) * 170)))
        proc = await asyncio.create_subprocess_exec(
            "espeak-ng", "-v", voice, "-s", str(wpm), "-w", str(out_path), text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        _out, err = await asyncio.wait_for(proc.communicate(), timeout=10)
        if proc.returncode != 0:
            logger.debug("espeak-ng failed rc=%s stderr=%s", proc.returncode, (err or b"")[:200])
            return None
        if not out_path.exists() or out_path.stat().st_size == 0:
            return None
        audio = out_path.read_bytes()
        out_path.unlink(missing_ok=True)
        return audio or None
    except Exception as exc:
        logger.debug("espeak-ng TTS failed: %s", exc)
        return None
|
|
|
|
|
|
@router.post("/tts")
async def text_to_speech(request: TTSRequest):
    """
    TTS pipeline:
    1. edge-tts (primary — pure Python, returns MP3, works anywhere)
    2. piper (if model file present)
    3. espeak-ng (offline Linux fallback)
    4. macOS say (macOS-only fallback)

    Returns a StreamingResponse whose X-TTS-Engine / X-TTS-Voice /
    X-TTS-Compute-MS headers identify which engine produced the audio.
    Raises HTTPException 400 on empty text and 503 when every engine fails.
    """
    import time as _time

    # Hard cap at 700 chars — mirrored in /voice/health's repro pack.
    text = (request.text or "").strip()[:700]
    if not text:
        raise HTTPException(400, "Empty text")

    edge_voice = _edge_voice(request.voice)
    errors: list[str] = []  # accumulated per-engine failures for the 503 body

    # ── 1. edge-tts (MP3, no ffmpeg needed) ──────────────────────────────
    _t0 = _time.monotonic()
    try:
        data = await asyncio.wait_for(
            _tts_edge(text, edge_voice, speed=request.speed or 1.0),
            timeout=20.0,
        )
        _compute_ms = int((_time.monotonic() - _t0) * 1000)
        logger.info("TTS edge-tts OK: voice=%s len=%d ms=%d", edge_voice, len(data), _compute_ms)
        _prom_tts_observe("edge-tts", edge_voice, _compute_ms, len(data))
        return StreamingResponse(
            io.BytesIO(data),
            media_type="audio/mpeg",
            headers={"Content-Disposition": "inline; filename=speech.mp3",
                     "X-TTS-Engine": "edge-tts",
                     "X-TTS-Voice": edge_voice,
                     "X-TTS-Compute-MS": str(_compute_ms),
                     "Cache-Control": "no-store"},
        )
    except Exception as e:
        # Record and fall through to the next engine.
        _prom_tts_error("edge-tts", type(e).__name__)
        errors.append(f"edge-tts: {e}")
        logger.warning("edge-tts failed: %s", e)

    # ── 2. piper ──────────────────────────────────────────────────────────
    _t0 = _time.monotonic()
    try:
        # _tts_piper returns None (not an exception) when the model is absent.
        data = await asyncio.wait_for(_tts_piper(text), timeout=15.0)
        if data:
            _compute_ms = int((_time.monotonic() - _t0) * 1000)
            logger.info("TTS piper OK len=%d ms=%d", len(data), _compute_ms)
            _prom_tts_observe("piper", "uk-UA", _compute_ms, len(data))
            return StreamingResponse(
                io.BytesIO(data),
                media_type="audio/wav",
                headers={"Content-Disposition": "inline; filename=speech.wav",
                         "X-TTS-Engine": "piper",
                         "X-TTS-Compute-MS": str(_compute_ms),
                         "Cache-Control": "no-store"},
            )
    except Exception as e:
        _prom_tts_error("piper", type(e).__name__)
        errors.append(f"piper: {e}")
        logger.debug("piper failed: %s", e)

    # ── 3. espeak-ng (offline Linux) ─────────────────────────────────────
    # espeak uses its own voice ids; derive from the requested voice prefix.
    espeak_voice = "en-us" if str(request.voice or "").startswith("en") else "uk"
    _t0 = _time.monotonic()
    try:
        data = await asyncio.wait_for(_tts_espeak(text, espeak_voice, request.speed or 1.0), timeout=12.0)
        if data:
            _compute_ms = int((_time.monotonic() - _t0) * 1000)
            logger.info("TTS espeak-ng OK voice=%s len=%d ms=%d", espeak_voice, len(data), _compute_ms)
            _prom_tts_observe("espeak-ng", espeak_voice, _compute_ms, len(data))
            return StreamingResponse(
                io.BytesIO(data),
                media_type="audio/wav",
                headers={"Content-Disposition": "inline; filename=speech.wav",
                         "X-TTS-Engine": "espeak-ng",
                         "X-TTS-Voice": espeak_voice,
                         "X-TTS-Compute-MS": str(_compute_ms),
                         "Cache-Control": "no-store"},
            )
    except Exception as e:
        _prom_tts_error("espeak-ng", type(e).__name__)
        errors.append(f"espeak-ng: {e}")
        logger.debug("espeak-ng failed: %s", e)

    # ── 4. macOS say ──────────────────────────────────────────────────────
    # Female-sounding ids map to Milena, everything else to Yuri.
    say_voice = "Milena" if request.voice in (None, "default", "Polina", "Milena") else "Yuri"
    _t0 = _time.monotonic()
    try:
        data = await asyncio.wait_for(_tts_macos_say(text, say_voice), timeout=20.0)
        if data:
            _compute_ms = int((_time.monotonic() - _t0) * 1000)
            # Sniff the container: RIFF magic means WAV, otherwise raw AIFF
            # (the no-ffmpeg path inside _tts_macos_say).
            mime = "audio/wav" if data[:4] == b"RIFF" else "audio/aiff"
            logger.info("TTS macOS say OK voice=%s len=%d ms=%d", say_voice, len(data), _compute_ms)
            _prom_tts_observe("macos-say", say_voice, _compute_ms, len(data))
            return StreamingResponse(
                io.BytesIO(data),
                media_type=mime,
                headers={"Content-Disposition": f"inline; filename=speech.{'wav' if mime=='audio/wav' else 'aiff'}",
                         "X-TTS-Engine": "macos-say",
                         "X-TTS-Compute-MS": str(_compute_ms),
                         "Cache-Control": "no-store"},
            )
    except Exception as e:
        _prom_tts_error("macos-say", type(e).__name__)
        errors.append(f"say: {e}")
        logger.debug("macOS say failed: %s", e)

    logger.error("All TTS engines failed: %s", errors)
    raise HTTPException(503, f"All TTS engines failed: {'; '.join(errors)}")
|
|
|
|
|
|
# ── STT ───────────────────────────────────────────────────────────────────────
|
|
|
|
async def _convert_audio_to_wav(input_path: str, output_path: str) -> bool:
    """Convert *input_path* to 16 kHz mono WAV at *output_path* via ffmpeg.

    Returns True on success, False when ffmpeg is missing or conversion fails.
    """
    if not _ffmpeg_available():
        return False
    try:
        # FIX: use an async subprocess instead of blocking subprocess.run(),
        # which could stall the event loop for up to 30 s per request.
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", output_path,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        try:
            await asyncio.wait_for(proc.wait(), timeout=30)
        except asyncio.TimeoutError:
            proc.kill()  # match subprocess.run(timeout=...) kill semantics
            return False
        return proc.returncode == 0 and Path(output_path).exists()
    except Exception:
        return False
|
|
|
|
|
|
def _stt_faster_whisper_sync(wav_path: str, language: str | None) -> str:
    """Blocking faster-whisper transcription — run via executor; Docker/Linux path."""
    from faster_whisper import WhisperModel

    # 'small' balances Ukrainian accuracy and CPU speed; override via env var.
    size = os.getenv("WHISPER_MODEL", "small")
    key = f"faster_whisper_{size}"
    model = MODELS_CACHE.get(key)
    if model is None:
        logger.info("Loading faster-whisper model=%s (first call)...", size)
        model = WhisperModel(size, device="cpu", compute_type="int8")
        MODELS_CACHE[key] = model
    segments, info = model.transcribe(wav_path, language=language or "uk", beam_size=5)
    transcript = " ".join(part.text for part in segments).strip()
    logger.info("faster-whisper OK: lang=%s text_len=%d", info.language, len(transcript))
    return transcript
|
|
|
|
|
|
def _stt_mlx_audio_sync(wav_path: str, language: str | None) -> str:
    """Blocking mlx-audio transcription — run via executor; Apple Silicon only."""
    from mlx_audio.stt.utils import load_model

    model = MODELS_CACHE.get("mlx_whisper")
    if model is None:
        logger.info("Loading mlx-audio whisper model (first call)...")
        model = load_model("mlx-community/whisper-large-v3-turbo-asr-fp16")
        MODELS_CACHE["mlx_whisper"] = model
    output = model.generate(wav_path, language=language)
    # Some model versions return a result object, others a bare string.
    return output.text if hasattr(output, "text") else str(output)
|
|
|
|
|
|
async def _stt_whisper_cli(wav_path: str, language: str | None) -> str:
    """Last-resort transcription via the ``whisper`` command-line tool.

    Raises RuntimeError when the CLI produces no transcript file.
    """
    cli_args = [
        "whisper", wav_path,
        "--language", language or "uk",
        "--model", "base",
        "--output_format", "txt",
        "--output_dir", "/tmp",
    ]
    proc = await asyncio.create_subprocess_exec(
        *cli_args,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    await asyncio.wait_for(proc.wait(), timeout=90)
    # The CLI writes <input-stem>.txt into --output_dir (/tmp here, which is
    # also where the input WAV lives).
    transcript_file = Path(wav_path).with_suffix(".txt")
    if not transcript_file.exists():
        raise RuntimeError("whisper CLI produced no output")
    return transcript_file.read_text().strip()
|
|
|
|
|
|
@router.post("/stt")
async def speech_to_text(
    audio: UploadFile = File(...),
    model: str = Query("auto", description="STT model: auto|faster-whisper|mlx-audio|whisper-cli"),
    language: Optional[str] = Query(None, description="Language code (auto-detect if None)"),
):
    """
    STT pipeline:
    1. Convert audio to WAV via ffmpeg (if available; skip if already WAV)
    2. faster-whisper (primary — Docker/Linux)
    3. mlx-audio (macOS Apple Silicon)
    4. whisper CLI (last resort)

    Returns {"text", "model", "language"} (plus "compute_ms" on the
    faster-whisper path). Raises 400 for an empty upload, 503 when every
    selected engine fails, 500 on unexpected errors.
    """
    # Both temp files are cleaned up in the finally block below.
    tmp_path: str | None = None
    wav_path: str | None = None

    try:
        content = await audio.read()
        if not content:
            raise HTTPException(400, "Empty audio file")

        # Pick a file suffix from the upload's name, corrected by content-type
        # when it clearly indicates wav/ogg (browsers often send generic names).
        fname = audio.filename or "audio.webm"
        suffix = Path(fname).suffix or ".webm"
        if audio.content_type and "wav" in audio.content_type:
            suffix = ".wav"
        elif audio.content_type and "ogg" in audio.content_type:
            suffix = ".ogg"

        tmp_id = uuid.uuid4().hex[:8]
        tmp_path = f"/tmp/stt_in_{tmp_id}{suffix}"
        wav_path = f"/tmp/stt_wav_{tmp_id}.wav"

        with open(tmp_path, "wb") as f:
            f.write(content)

        # Convert to WAV (required by whisper models)
        converted = False
        if suffix == ".wav":
            import shutil
            shutil.copy(tmp_path, wav_path)
            converted = True
        else:
            converted = await _convert_audio_to_wav(tmp_path, wav_path)
            if not converted:
                # No ffmpeg — try to use input directly (faster-whisper accepts many formats)
                import shutil
                shutil.copy(tmp_path, wav_path)
                converted = True

        if not Path(wav_path).exists():
            raise HTTPException(500, "Audio conversion failed — ffmpeg missing and no WAV input")

        errors: list[str] = []  # per-engine failures for the 503 body

        # NOTE(review): asyncio.get_event_loop() inside a coroutine is
        # deprecated since Python 3.10 — get_running_loop() is the modern call.
        loop = asyncio.get_event_loop()

        # ── 1. faster-whisper ─────────────────────────────────────────────
        if model in ("auto", "faster-whisper"):
            _t0_stt = asyncio.get_event_loop().time()
            try:
                _wpath = wav_path  # capture for lambda
                _lang = language
                # Sync model runs in the default executor to keep the loop free.
                text = await asyncio.wait_for(
                    loop.run_in_executor(None, _stt_faster_whisper_sync, _wpath, _lang),
                    timeout=60.0,
                )
                _stt_ms = int((asyncio.get_event_loop().time() - _t0_stt) * 1000)
                _prom_stt_observe("faster-whisper", _stt_ms)
                return {"text": text, "model": "faster-whisper", "language": language,
                        "compute_ms": _stt_ms}
            except Exception as e:
                errors.append(f"faster-whisper: {e}")
                logger.warning("faster-whisper failed: %s", e)

        # ── 2. mlx-audio (macOS) ─────────────────────────────────────────
        if model in ("auto", "mlx-audio"):
            try:
                _wpath = wav_path
                _lang = language
                text = await asyncio.wait_for(
                    loop.run_in_executor(None, _stt_mlx_audio_sync, _wpath, _lang),
                    timeout=60.0,
                )
                return {"text": text, "model": "mlx-audio", "language": language}
            except Exception as e:
                errors.append(f"mlx-audio: {e}")
                logger.warning("mlx-audio failed: %s", e)

        # ── 3. whisper CLI ────────────────────────────────────────────────
        if model in ("auto", "whisper-cli"):
            try:
                text = await asyncio.wait_for(
                    _stt_whisper_cli(wav_path, language), timeout=90.0
                )
                return {"text": text, "model": "whisper-cli", "language": language}
            except Exception as e:
                errors.append(f"whisper-cli: {e}")
                logger.warning("whisper-cli failed: %s", e)

        raise HTTPException(503, f"All STT engines failed: {'; '.join(str(e)[:80] for e in errors)}")

    except HTTPException:
        raise
    except Exception as e:
        logger.error("STT error: %s", e)
        raise HTTPException(500, str(e)[:200])
    finally:
        # Always remove both temp files, whatever path we exited through.
        for p in [tmp_path, wav_path]:
            if p:
                Path(p).unlink(missing_ok=True)
|
|
|
|
|
|
# ── Voices list ───────────────────────────────────────────────────────────────
|
|
|
|
@router.get("/voices")
async def list_voices():
    """Enumerate the voices offered by each locally-available TTS engine."""
    catalog: dict[str, list[dict]] = {"edge": [], "piper": [], "macos": [], "espeak": []}

    try:
        import edge_tts  # noqa: F401
        catalog["edge"] = [
            {"id": "default", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
            {"id": "Polina", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
            {"id": "Ostap", "name": "Ostap Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
        ]
    except ImportError:
        pass

    piper_model = Path(os.path.expanduser("~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx"))
    if piper_model.exists():
        catalog["piper"] = [
            {"id": "uk-UA-low", "name": "Ukrainian Low (uk-UA)", "lang": "uk-UA", "engine": "piper"}
        ]

    if os.path.exists("/usr/bin/say") or os.path.exists("/usr/local/bin/say"):
        catalog["macos"] = [
            {"id": "Milena", "name": "Milena (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
            {"id": "Yuri", "name": "Yuri (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
        ]

    if _espeak_available():
        catalog["espeak"] = [
            {"id": "uk", "name": "Ukrainian (espeak-ng)", "lang": "uk-UA", "engine": "espeak-ng"},
            {"id": "en-us", "name": "English US (espeak-ng)", "lang": "en-US", "engine": "espeak-ng"},
        ]

    return catalog
|