feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
680
services/memory-service/app/voice_endpoints.py
Normal file
680
services/memory-service/app/voice_endpoints.py
Normal file
@@ -0,0 +1,680 @@
|
||||
"""
|
||||
DAARION Memory Service — Voice Endpoints
|
||||
STT: faster-whisper (Docker/Linux) → mlx-audio (macOS) → whisper-cli
|
||||
TTS: edge-tts Python API (primary, pure Python, no ffmpeg needed)
|
||||
→ piper (fallback, if model present)
|
||||
→ espeak-ng (offline Linux fallback)
|
||||
→ macOS say (fallback, macOS-only)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/voice", tags=["voice"])
|
||||
|
||||
MODELS_CACHE: dict = {}
|
||||
|
||||
# ── Prometheus metrics (optional — skip if not installed) ─────────────────────
# prometheus_client is an optional dependency: when it is missing the metric
# handles are set to None and _PROM_OK gates every observation helper below.
try:
    from prometheus_client import Counter, Histogram

    # TTS synthesis latency, labelled by engine and voice.
    _tts_compute_hist = Histogram(
        "voice_tts_compute_ms",
        "TTS synthesis compute time in ms",
        ["engine", "voice"],
        buckets=[50, 100, 250, 500, 1000, 2000, 5000],
    )
    # Size of the synthesized audio payload, labelled by engine.
    _tts_bytes_hist = Histogram(
        "voice_tts_audio_bytes",
        "TTS audio output size in bytes",
        ["engine"],
        buckets=[5000, 15000, 30000, 60000, 120000],
    )
    # TTS failure counter, labelled by engine and exception class name.
    _tts_errors_total = Counter(
        "voice_tts_errors_total",
        "TTS engine errors",
        ["engine", "error_type"],
    )
    # STT transcription latency, labelled by engine.
    _stt_compute_hist = Histogram(
        "voice_stt_compute_ms",
        "STT transcription time in ms",
        ["engine"],
        buckets=[200, 500, 1000, 2000, 5000, 10000],
    )
    _PROM_OK = True
except ImportError:
    _PROM_OK = False
    _tts_compute_hist = None
    _tts_bytes_hist = None
    _tts_errors_total = None
    _stt_compute_hist = None
|
||||
|
||||
|
||||
def _prom_tts_observe(engine: str, voice: str, ms: float, audio_bytes: int) -> None:
    """Record TTS latency and payload-size metrics; no-op without prometheus_client."""
    if _PROM_OK:
        try:
            _tts_compute_hist.labels(engine=engine, voice=voice).observe(ms)
            _tts_bytes_hist.labels(engine=engine).observe(audio_bytes)
        except Exception:
            # Metrics must never break the request path.
            pass
|
||||
|
||||
|
||||
def _prom_tts_error(engine: str, error_type: str) -> None:
    """Increment the TTS error counter; no-op without prometheus_client."""
    if _PROM_OK:
        try:
            _tts_errors_total.labels(engine=engine, error_type=error_type).inc()
        except Exception:
            # Metrics must never break the request path.
            pass
|
||||
|
||||
|
||||
def _prom_stt_observe(engine: str, ms: float) -> None:
    """Record STT transcription latency; no-op without prometheus_client."""
    if _PROM_OK:
        try:
            _stt_compute_hist.labels(engine=engine).observe(ms)
        except Exception:
            # Metrics must never break the request path.
            pass
|
||||
|
||||
# ── Voice mapping ─────────────────────────────────────────────────────────────
|
||||
# Maps UI voice id → edge-tts voice name
|
||||
_EDGE_VOICES: dict[str, str] = {
|
||||
"default": "uk-UA-PolinaNeural",
|
||||
"Polina": "uk-UA-PolinaNeural",
|
||||
"uk-UA-Polina": "uk-UA-PolinaNeural",
|
||||
"uk-UA-PolinaNeural": "uk-UA-PolinaNeural",
|
||||
"Ostap": "uk-UA-OstapNeural",
|
||||
"uk-UA-Ostap": "uk-UA-OstapNeural",
|
||||
"uk-UA-OstapNeural": "uk-UA-OstapNeural",
|
||||
# English voices — used for English-language segments
|
||||
"en-US-GuyNeural": "en-US-GuyNeural",
|
||||
"en-US-JennyNeural": "en-US-JennyNeural",
|
||||
"en": "en-US-GuyNeural",
|
||||
# macOS-only names: map to closest Ukrainian voice
|
||||
"Milena": "uk-UA-PolinaNeural",
|
||||
"Yuri": "uk-UA-OstapNeural",
|
||||
"af_heart": "uk-UA-PolinaNeural",
|
||||
}
|
||||
|
||||
def _edge_voice(name: str | None) -> str:
|
||||
"""Allow any valid edge-tts voice name to pass through directly."""
|
||||
n = name or "default"
|
||||
# If already a valid neural voice name (contains "Neural"), pass through
|
||||
if "Neural" in n or n == "en":
|
||||
return _EDGE_VOICES.get(n, n)
|
||||
return _EDGE_VOICES.get(n, "uk-UA-PolinaNeural")
|
||||
|
||||
|
||||
def _ffmpeg_available() -> bool:
|
||||
try:
|
||||
result = subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=3)
|
||||
return result.returncode == 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _espeak_available() -> bool:
|
||||
try:
|
||||
result = subprocess.run(["espeak-ng", "--version"], capture_output=True, timeout=3)
|
||||
return result.returncode == 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
class TTSRequest(BaseModel):
    """Request body for POST /voice/tts."""

    # Text to synthesize; the endpoint truncates it to 700 characters.
    text: str
    # UI voice id, resolved via _EDGE_VOICES (unknown ids fall back to Ukrainian).
    voice: Optional[str] = "default"
    # Speech rate multiplier (1.0 = normal speed).
    speed: Optional[float] = 1.0
    # Engine selector; not read by the /tts handler in this module — presumably
    # reserved for future use or consumed by a caller. TODO confirm.
    model: Optional[str] = "auto"
|
||||
|
||||
|
||||
# ── Status & Live Health ───────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/health")
|
||||
async def voice_health():
|
||||
"""Live health check — actually synthesizes a short test phrase via edge-tts.
|
||||
Returns edge_tts=ok/error with details; used by preflight to detect 403/blocked.
|
||||
"""
|
||||
import importlib.metadata
|
||||
import time
|
||||
|
||||
result: dict = {}
|
||||
|
||||
# edge-tts version
|
||||
try:
|
||||
ver = importlib.metadata.version("edge-tts")
|
||||
except Exception:
|
||||
ver = "unknown"
|
||||
result["edge_tts_version"] = ver
|
||||
|
||||
# Live synthesis test for each required Neural voice
|
||||
live_voices: list[dict] = []
|
||||
test_text = "Test" # Minimal — just enough to trigger actual API call
|
||||
for voice_id in ("uk-UA-PolinaNeural", "uk-UA-OstapNeural"):
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
import edge_tts
|
||||
comm = edge_tts.Communicate(test_text, voice_id)
|
||||
byte_count = 0
|
||||
async for chunk in comm.stream():
|
||||
if chunk["type"] == "audio":
|
||||
byte_count += len(chunk["data"])
|
||||
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
||||
live_voices.append({"voice": voice_id, "status": "ok",
|
||||
"bytes": byte_count, "ms": elapsed_ms})
|
||||
except Exception as e:
|
||||
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
||||
live_voices.append({"voice": voice_id, "status": "error",
|
||||
"error": str(e)[:150], "ms": elapsed_ms})
|
||||
|
||||
all_ok = all(v["status"] == "ok" for v in live_voices)
|
||||
result["edge_tts"] = "ok" if all_ok else "error"
|
||||
result["voices"] = live_voices
|
||||
|
||||
# STT check (import only — no actual transcription in health)
|
||||
try:
|
||||
import faster_whisper # noqa: F401
|
||||
result["faster_whisper"] = "ok"
|
||||
except ImportError:
|
||||
result["faster_whisper"] = "unavailable"
|
||||
|
||||
result["ok"] = all_ok
|
||||
|
||||
# ── Repro pack (incident diagnosis) ──────────────────────────────────────
|
||||
import os as _os
|
||||
import socket as _socket
|
||||
result["repro"] = {
|
||||
"node_id": _os.getenv("NODE_ID", _socket.gethostname()),
|
||||
"service_name": _os.getenv("MEMORY_SERVICE_NAME", "memory-service"),
|
||||
"image_digest": _os.getenv("IMAGE_DIGEST", "unknown"), # set via docker label
|
||||
"memory_service_url": _os.getenv("MEMORY_SERVICE_URL", "http://localhost:8000"),
|
||||
"tts_max_chars": 700,
|
||||
"canary_test_text": test_text,
|
||||
"canary_audio_bytes": {
|
||||
v["voice"]: v.get("bytes", 0) for v in live_voices
|
||||
},
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
@router.get("/status")
|
||||
async def voice_status():
|
||||
edge_ok = False
|
||||
try:
|
||||
import edge_tts # noqa: F401
|
||||
edge_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
espeak_ok = _espeak_available()
|
||||
|
||||
fw_ok = False
|
||||
try:
|
||||
import faster_whisper # noqa: F401
|
||||
fw_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
mlx_ok = False
|
||||
try:
|
||||
import mlx_audio # noqa: F401
|
||||
mlx_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
return {
|
||||
"available": True,
|
||||
"tts_engine": "edge-tts" if edge_ok else ("espeak-ng" if espeak_ok else "piper/say"),
|
||||
"stt_engine": ("faster-whisper" if fw_ok else "") + ("mlx-audio" if mlx_ok else ""),
|
||||
"edge_tts": edge_ok,
|
||||
"espeak_ng": espeak_ok,
|
||||
"faster_whisper": fw_ok,
|
||||
"mlx_audio": mlx_ok,
|
||||
"ffmpeg": _ffmpeg_available(),
|
||||
"voices": list(_EDGE_VOICES.keys()),
|
||||
}
|
||||
|
||||
|
||||
# ── TTS ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _tts_edge(text: str, voice_name: str, speed: float = 1.0) -> bytes:
    """
    edge-tts pure Python API — no subprocess, no ffmpeg.
    Returns MP3 bytes directly (browsers play MP3 natively).

    Raises RuntimeError when the stream yields no audio (e.g. blocked/empty
    response), so the caller's fallback chain can proceed.
    """
    import edge_tts
    # FIX: edge-tts expects a signed rate like "-10%" or "+10%". The previous
    # f"+{int(...)}%" produced the invalid string "+-10%" for speed < 1.0.
    # The ":+d" format emits the sign itself ("+0%" for speed == 1.0, matching
    # the old behavior for speed >= 1.0).
    rate_str = f"{int((speed - 1.0) * 50):+d}%"
    communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
    buf = io.BytesIO()
    # The stream interleaves audio and metadata events; keep only audio bytes.
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            buf.write(chunk["data"])
    data = buf.getvalue()
    if not data:
        raise RuntimeError("edge-tts returned empty audio")
    return data
|
||||
|
||||
|
||||
async def _tts_piper(text: str) -> bytes | None:
|
||||
"""Piper TTS — returns WAV bytes or None if unavailable."""
|
||||
model_path = os.path.expanduser("~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx")
|
||||
if not Path(model_path).exists():
|
||||
return None
|
||||
try:
|
||||
import piper as piper_mod
|
||||
voice = piper_mod.PiperVoice.load(model_path)
|
||||
buf = io.BytesIO()
|
||||
voice.synthesize(text, buf)
|
||||
buf.seek(0)
|
||||
data = buf.read()
|
||||
return data if data else None
|
||||
except Exception as e:
|
||||
logger.debug("Piper TTS failed: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
async def _tts_macos_say(text: str, voice: str = "Milena") -> bytes | None:
    """macOS say — only works outside Docker. Returns WAV bytes or None.

    Synthesizes to a temp AIFF, converts to 22.05 kHz mono WAV when ffmpeg is
    present, and cleans up its temp files on the success paths. Any failure
    (including `say` not existing on Linux) returns None so the caller's
    fallback chain continues.
    """
    try:
        tmp_id = uuid.uuid4().hex[:8]
        aiff_path = f"/tmp/tts_{tmp_id}.aiff"
        wav_path = f"/tmp/tts_{tmp_id}.wav"
        proc = await asyncio.create_subprocess_exec(
            "say", "-v", voice, "-o", aiff_path, text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await asyncio.wait_for(proc.wait(), timeout=15)
        if not Path(aiff_path).exists() or Path(aiff_path).stat().st_size == 0:
            return None
        # Convert to WAV only if ffmpeg available
        if _ffmpeg_available():
            subprocess.run(["ffmpeg", "-y", "-i", aiff_path, "-ar", "22050", "-ac", "1", wav_path],
                           capture_output=True, timeout=10)
            Path(aiff_path).unlink(missing_ok=True)
            if Path(wav_path).exists() and Path(wav_path).stat().st_size > 0:
                data = Path(wav_path).read_bytes()
                Path(wav_path).unlink(missing_ok=True)
                return data
            # NOTE(review): if the ffmpeg conversion fails, the AIFF has already
            # been unlinked above, so the read_bytes below raises and we return
            # None via the except — intended behavior, but worth confirming.
        # Return AIFF if no ffmpeg — most browsers won't play it but at least we tried
        data = Path(aiff_path).read_bytes()
        Path(aiff_path).unlink(missing_ok=True)
        return data if data else None
    except Exception as e:
        logger.debug("macOS say failed: %s", e)
        return None
|
||||
|
||||
|
||||
async def _tts_espeak(text: str, voice: str = "uk", speed: float = 1.0) -> bytes | None:
    """Offline espeak-ng fallback for Linux.

    Returns WAV bytes, or None when espeak-ng is absent, exits non-zero, or
    produces an empty file. The speed multiplier is mapped onto espeak's
    words-per-minute scale and clamped to a sane range.
    """
    if not _espeak_available():
        return None
    try:
        out_path = Path(f"/tmp/tts_espeak_{uuid.uuid4().hex[:8]}.wav")
        # 170 wpm is the baseline; clamp to [120, 240] regardless of speed.
        words_per_min = min(240, max(120, int((speed or 1.0) * 170)))
        proc = await asyncio.create_subprocess_exec(
            "espeak-ng", "-v", voice, "-s", str(words_per_min), "-w", str(out_path), text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        _stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=10)
        if proc.returncode != 0:
            logger.debug("espeak-ng failed rc=%s stderr=%s", proc.returncode, (stderr or b"")[:200])
            return None
        if not out_path.exists() or out_path.stat().st_size == 0:
            return None
        audio = out_path.read_bytes()
        out_path.unlink(missing_ok=True)
        return audio if audio else None
    except Exception as e:
        logger.debug("espeak-ng TTS failed: %s", e)
        return None
|
||||
|
||||
|
||||
@router.post("/tts")
|
||||
async def text_to_speech(request: TTSRequest):
|
||||
"""
|
||||
TTS pipeline:
|
||||
1. edge-tts (primary — pure Python, returns MP3, works anywhere)
|
||||
2. piper (if model file present)
|
||||
3. espeak-ng (offline Linux fallback)
|
||||
4. macOS say (macOS-only fallback)
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
text = (request.text or "").strip()[:700]
|
||||
if not text:
|
||||
raise HTTPException(400, "Empty text")
|
||||
|
||||
edge_voice = _edge_voice(request.voice)
|
||||
errors: list[str] = []
|
||||
|
||||
# ── 1. edge-tts (MP3, no ffmpeg needed) ──────────────────────────────
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(
|
||||
_tts_edge(text, edge_voice, speed=request.speed or 1.0),
|
||||
timeout=20.0,
|
||||
)
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
logger.info("TTS edge-tts OK: voice=%s len=%d ms=%d", edge_voice, len(data), _compute_ms)
|
||||
_prom_tts_observe("edge-tts", edge_voice, _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type="audio/mpeg",
|
||||
headers={"Content-Disposition": "inline; filename=speech.mp3",
|
||||
"X-TTS-Engine": "edge-tts",
|
||||
"X-TTS-Voice": edge_voice,
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("edge-tts", type(e).__name__)
|
||||
errors.append(f"edge-tts: {e}")
|
||||
logger.warning("edge-tts failed: %s", e)
|
||||
|
||||
# ── 2. piper ──────────────────────────────────────────────────────────
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(_tts_piper(text), timeout=15.0)
|
||||
if data:
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
logger.info("TTS piper OK len=%d ms=%d", len(data), _compute_ms)
|
||||
_prom_tts_observe("piper", "uk-UA", _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type="audio/wav",
|
||||
headers={"Content-Disposition": "inline; filename=speech.wav",
|
||||
"X-TTS-Engine": "piper",
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("piper", type(e).__name__)
|
||||
errors.append(f"piper: {e}")
|
||||
logger.debug("piper failed: %s", e)
|
||||
|
||||
# ── 3. espeak-ng (offline Linux) ─────────────────────────────────────
|
||||
espeak_voice = "en-us" if str(request.voice or "").startswith("en") else "uk"
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(_tts_espeak(text, espeak_voice, request.speed or 1.0), timeout=12.0)
|
||||
if data:
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
logger.info("TTS espeak-ng OK voice=%s len=%d ms=%d", espeak_voice, len(data), _compute_ms)
|
||||
_prom_tts_observe("espeak-ng", espeak_voice, _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type="audio/wav",
|
||||
headers={"Content-Disposition": "inline; filename=speech.wav",
|
||||
"X-TTS-Engine": "espeak-ng",
|
||||
"X-TTS-Voice": espeak_voice,
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("espeak-ng", type(e).__name__)
|
||||
errors.append(f"espeak-ng: {e}")
|
||||
logger.debug("espeak-ng failed: %s", e)
|
||||
|
||||
# ── 4. macOS say ──────────────────────────────────────────────────────
|
||||
say_voice = "Milena" if request.voice in (None, "default", "Polina", "Milena") else "Yuri"
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(_tts_macos_say(text, say_voice), timeout=20.0)
|
||||
if data:
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
mime = "audio/wav" if data[:4] == b"RIFF" else "audio/aiff"
|
||||
logger.info("TTS macOS say OK voice=%s len=%d ms=%d", say_voice, len(data), _compute_ms)
|
||||
_prom_tts_observe("macos-say", say_voice, _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type=mime,
|
||||
headers={"Content-Disposition": f"inline; filename=speech.{'wav' if mime=='audio/wav' else 'aiff'}",
|
||||
"X-TTS-Engine": "macos-say",
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("macos-say", type(e).__name__)
|
||||
errors.append(f"say: {e}")
|
||||
logger.debug("macOS say failed: %s", e)
|
||||
|
||||
logger.error("All TTS engines failed: %s", errors)
|
||||
raise HTTPException(503, f"All TTS engines failed: {'; '.join(errors)}")
|
||||
|
||||
|
||||
# ── STT ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _convert_audio_to_wav(input_path: str, output_path: str) -> bool:
    """Convert audio to 16 kHz mono WAV using ffmpeg if available.

    Returns True on success (ffmpeg exited 0 and the output file exists),
    False otherwise.

    FIX: the previous implementation called blocking subprocess.run inside an
    async def, stalling the event loop for up to 30 s per conversion. This
    version uses asyncio's subprocess API; the interface and results are
    unchanged.
    """
    if not _ffmpeg_available():
        return False
    try:
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", output_path,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        try:
            await asyncio.wait_for(proc.wait(), timeout=30)
        except asyncio.TimeoutError:
            # Mirror subprocess.run(timeout=...) semantics: kill the child.
            proc.kill()
            return False
        return proc.returncode == 0 and Path(output_path).exists()
    except Exception:
        return False
|
||||
|
||||
|
||||
def _stt_faster_whisper_sync(wav_path: str, language: str | None) -> str:
    """faster-whisper STT — synchronous; run via an executor. Works in Docker/Linux.

    The model is loaded once per (model size) and memoized in MODELS_CACHE.
    Language defaults to Ukrainian when not supplied.
    """
    from faster_whisper import WhisperModel

    # 'small' gives better Ukrainian accuracy while staying fast on CPU.
    model_size = os.getenv("WHISPER_MODEL", "small")
    cache_key = f"faster_whisper_{model_size}"
    model = MODELS_CACHE.get(cache_key)
    if model is None:
        logger.info("Loading faster-whisper model=%s (first call)...", model_size)
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
        MODELS_CACHE[cache_key] = model
    segments, info = model.transcribe(wav_path, language=language or "uk", beam_size=5)
    text = " ".join(seg.text for seg in segments).strip()
    logger.info("faster-whisper OK: lang=%s text_len=%d", info.language, len(text))
    return text
|
||||
|
||||
|
||||
def _stt_mlx_audio_sync(wav_path: str, language: str | None) -> str:
    """mlx-audio STT — synchronous; run via an executor. macOS Apple Silicon only.

    The whisper model is loaded once and memoized in MODELS_CACHE.
    """
    from mlx_audio.stt.utils import load_model

    model = MODELS_CACHE.get("mlx_whisper")
    if model is None:
        logger.info("Loading mlx-audio whisper model (first call)...")
        model = load_model("mlx-community/whisper-large-v3-turbo-asr-fp16")
        MODELS_CACHE["mlx_whisper"] = model
    result = model.generate(wav_path, language=language)
    # generate() may return a result object with .text or a plain value.
    return result.text if hasattr(result, "text") else str(result)
|
||||
|
||||
|
||||
async def _stt_whisper_cli(wav_path: str, language: str | None) -> str:
    """Last-resort STT via the `whisper` CLI.

    The CLI writes its transcript as <stem>.txt into /tmp; callers pass
    /tmp/stt_wav_*.wav inputs, so with_suffix(".txt") resolves to that file —
    TODO confirm if inputs outside /tmp are ever passed.

    Raises RuntimeError when no transcript file is produced.

    FIX: the transcript file is now deleted after reading — previously every
    call leaked one .txt file into /tmp.
    """
    proc = await asyncio.create_subprocess_exec(
        "whisper", wav_path,
        "--language", language or "uk",
        "--model", "base",
        "--output_format", "txt",
        "--output_dir", "/tmp",
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    await asyncio.wait_for(proc.wait(), timeout=90)
    txt_path = Path(wav_path).with_suffix(".txt")
    if txt_path.exists():
        try:
            return txt_path.read_text().strip()
        finally:
            txt_path.unlink(missing_ok=True)
    raise RuntimeError("whisper CLI produced no output")
|
||||
|
||||
|
||||
@router.post("/stt")
|
||||
async def speech_to_text(
|
||||
audio: UploadFile = File(...),
|
||||
model: str = Query("auto", description="STT model: auto|faster-whisper|mlx-audio|whisper-cli"),
|
||||
language: Optional[str] = Query(None, description="Language code (auto-detect if None)"),
|
||||
):
|
||||
"""
|
||||
STT pipeline:
|
||||
1. Convert audio to WAV via ffmpeg (if available; skip if already WAV)
|
||||
2. faster-whisper (primary — Docker/Linux)
|
||||
3. mlx-audio (macOS Apple Silicon)
|
||||
4. whisper CLI (last resort)
|
||||
"""
|
||||
tmp_path: str | None = None
|
||||
wav_path: str | None = None
|
||||
|
||||
try:
|
||||
content = await audio.read()
|
||||
if not content:
|
||||
raise HTTPException(400, "Empty audio file")
|
||||
|
||||
# Detect MIME type
|
||||
fname = audio.filename or "audio.webm"
|
||||
suffix = Path(fname).suffix or ".webm"
|
||||
if audio.content_type and "wav" in audio.content_type:
|
||||
suffix = ".wav"
|
||||
elif audio.content_type and "ogg" in audio.content_type:
|
||||
suffix = ".ogg"
|
||||
|
||||
tmp_id = uuid.uuid4().hex[:8]
|
||||
tmp_path = f"/tmp/stt_in_{tmp_id}{suffix}"
|
||||
wav_path = f"/tmp/stt_wav_{tmp_id}.wav"
|
||||
|
||||
with open(tmp_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
# Convert to WAV (required by whisper models)
|
||||
converted = False
|
||||
if suffix == ".wav":
|
||||
import shutil
|
||||
shutil.copy(tmp_path, wav_path)
|
||||
converted = True
|
||||
else:
|
||||
converted = await _convert_audio_to_wav(tmp_path, wav_path)
|
||||
if not converted:
|
||||
# No ffmpeg — try to use input directly (faster-whisper accepts many formats)
|
||||
import shutil
|
||||
shutil.copy(tmp_path, wav_path)
|
||||
converted = True
|
||||
|
||||
if not Path(wav_path).exists():
|
||||
raise HTTPException(500, "Audio conversion failed — ffmpeg missing and no WAV input")
|
||||
|
||||
errors: list[str] = []
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
# ── 1. faster-whisper ─────────────────────────────────────────────
|
||||
if model in ("auto", "faster-whisper"):
|
||||
_t0_stt = asyncio.get_event_loop().time()
|
||||
try:
|
||||
_wpath = wav_path # capture for lambda
|
||||
_lang = language
|
||||
text = await asyncio.wait_for(
|
||||
loop.run_in_executor(None, _stt_faster_whisper_sync, _wpath, _lang),
|
||||
timeout=60.0,
|
||||
)
|
||||
_stt_ms = int((asyncio.get_event_loop().time() - _t0_stt) * 1000)
|
||||
_prom_stt_observe("faster-whisper", _stt_ms)
|
||||
return {"text": text, "model": "faster-whisper", "language": language,
|
||||
"compute_ms": _stt_ms}
|
||||
except Exception as e:
|
||||
errors.append(f"faster-whisper: {e}")
|
||||
logger.warning("faster-whisper failed: %s", e)
|
||||
|
||||
# ── 2. mlx-audio (macOS) ─────────────────────────────────────────
|
||||
if model in ("auto", "mlx-audio"):
|
||||
try:
|
||||
_wpath = wav_path
|
||||
_lang = language
|
||||
text = await asyncio.wait_for(
|
||||
loop.run_in_executor(None, _stt_mlx_audio_sync, _wpath, _lang),
|
||||
timeout=60.0,
|
||||
)
|
||||
return {"text": text, "model": "mlx-audio", "language": language}
|
||||
except Exception as e:
|
||||
errors.append(f"mlx-audio: {e}")
|
||||
logger.warning("mlx-audio failed: %s", e)
|
||||
|
||||
# ── 3. whisper CLI ────────────────────────────────────────────────
|
||||
if model in ("auto", "whisper-cli"):
|
||||
try:
|
||||
text = await asyncio.wait_for(
|
||||
_stt_whisper_cli(wav_path, language), timeout=90.0
|
||||
)
|
||||
return {"text": text, "model": "whisper-cli", "language": language}
|
||||
except Exception as e:
|
||||
errors.append(f"whisper-cli: {e}")
|
||||
logger.warning("whisper-cli failed: %s", e)
|
||||
|
||||
raise HTTPException(503, f"All STT engines failed: {'; '.join(str(e)[:80] for e in errors)}")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("STT error: %s", e)
|
||||
raise HTTPException(500, str(e)[:200])
|
||||
finally:
|
||||
for p in [tmp_path, wav_path]:
|
||||
if p:
|
||||
Path(p).unlink(missing_ok=True)
|
||||
|
||||
|
||||
# ── Voices list ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/voices")
|
||||
async def list_voices():
|
||||
edge_voices = []
|
||||
try:
|
||||
import edge_tts # noqa: F401
|
||||
edge_voices = [
|
||||
{"id": "default", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
|
||||
{"id": "Polina", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
|
||||
{"id": "Ostap", "name": "Ostap Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
|
||||
]
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
piper_voices = []
|
||||
if Path(os.path.expanduser("~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx")).exists():
|
||||
piper_voices = [{"id": "uk-UA-low", "name": "Ukrainian Low (uk-UA)", "lang": "uk-UA", "engine": "piper"}]
|
||||
|
||||
macos_voices = []
|
||||
if os.path.exists("/usr/bin/say") or os.path.exists("/usr/local/bin/say"):
|
||||
macos_voices = [
|
||||
{"id": "Milena", "name": "Milena (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
|
||||
{"id": "Yuri", "name": "Yuri (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
|
||||
]
|
||||
|
||||
espeak_voices = []
|
||||
if _espeak_available():
|
||||
espeak_voices = [
|
||||
{"id": "uk", "name": "Ukrainian (espeak-ng)", "lang": "uk-UA", "engine": "espeak-ng"},
|
||||
{"id": "en-us", "name": "English US (espeak-ng)", "lang": "en-US", "engine": "espeak-ng"},
|
||||
]
|
||||
|
||||
return {
|
||||
"edge": edge_voices,
|
||||
"piper": piper_voices,
|
||||
"macos": macos_voices,
|
||||
"espeak": espeak_voices,
|
||||
}
|
||||
Reference in New Issue
Block a user