"""MLX Kokoro TTS provider — generates speech via kokoro on host.

Runs inside Docker; delegates to the Kokoro HTTP API on the host when
``MLX_KOKORO_URL`` is set. When it is unset, falls back to in-process
synthesis with the local ``kokoro`` package (running natively on Apple
Silicon).
"""

import asyncio
import base64
import io
import logging
import os
import tempfile
from typing import Any, Dict

import httpx

logger = logging.getLogger("provider.tts_mlx_kokoro")

# Base URL of the host-side Kokoro HTTP service; empty selects local synthesis.
MLX_KOKORO_URL = os.getenv("MLX_KOKORO_URL", "")
MLX_KOKORO_MODEL = os.getenv("MLX_KOKORO_MODEL", "kokoro-v1.0")
DEFAULT_VOICE = os.getenv("TTS_DEFAULT_VOICE", "af_heart")
MAX_TEXT_CHARS = int(os.getenv("TTS_MAX_TEXT_CHARS", "5000"))
DEFAULT_SAMPLE_RATE = int(os.getenv("TTS_SAMPLE_RATE", "24000"))

# Seconds to wait for the remote service before httpx gives up.
_REMOTE_TIMEOUT_S = 120

# Lazily created singletons for the local (in-process) path.
_local_pipeline = None
_local_lock = None


def _lazy_init_local():
    """Create the lock that guards local pipeline construction, once."""
    global _local_lock
    if _local_lock is None:
        # Created on first use rather than at import time, so it is built
        # from within the coroutine that needs it.
        _local_lock = asyncio.Lock()


async def _synthesize_local(text: str, voice: str, sample_rate: int) -> bytes:
    """Synthesize via local kokoro (Apple Silicon).

    Args:
        text: Text to speak.
        voice: Kokoro voice identifier passed to the pipeline.
        sample_rate: Sample rate written into the WAV container.

    Returns:
        The complete WAV file as bytes.

    Raises:
        RuntimeError: If the pipeline yields no audio chunks.
    """
    global _local_pipeline
    _lazy_init_local()

    # Only pipeline construction is serialized; the lock makes sure
    # concurrent first calls build the pipeline exactly once.
    async with _local_lock:
        if _local_pipeline is None:
            # Imported lazily: the package is only needed on the local path.
            from kokoro import KPipeline

            _local_pipeline = KPipeline(lang_code="a")
            logger.info("Kokoro pipeline initialized: voice=%s", voice)

    # Optional heavy dependencies, likewise only needed on the local path.
    import numpy as np
    import soundfile as sf

    # NOTE(review): pipeline construction and inference run synchronously on
    # the event loop and block other coroutines while they run. Offloading to
    # a worker thread would need confirmation that the pipeline is safe to
    # call off the main thread.
    chunks = [audio for _, _, audio in _local_pipeline(text, voice=voice)]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio")

    # NOTE(review): no resampling happens here; sample_rate is only recorded
    # in the WAV header. Presumably it must match the pipeline's native output
    # rate for correct playback speed — confirm before allowing other values.
    buf = io.BytesIO()
    sf.write(buf, np.concatenate(chunks), sample_rate, format="WAV")
    return buf.getvalue()


async def _synthesize_remote(
    text: str, voice: str, fmt: str, sample_rate: int
) -> Dict[str, Any]:
    """Synthesize via Kokoro HTTP service on host.

    Args:
        text: Text to speak.
        voice: Voice identifier forwarded to the service.
        fmt: Requested audio format, forwarded to the service.
        sample_rate: Requested sample rate, forwarded to the service.

    Returns:
        The decoded JSON body of the service response.

    Raises:
        httpx.HTTPStatusError: If the service answers with an error status.
    """
    payload = {
        "text": text,
        "voice": voice,
        "format": fmt,
        "sample_rate": sample_rate,
    }
    # Tolerate a trailing slash in the configured base URL so the request
    # does not go to ".../​/synthesize".
    endpoint = f"{MLX_KOKORO_URL.rstrip('/')}/synthesize"
    async with httpx.AsyncClient(timeout=_REMOTE_TIMEOUT_S) as client:
        resp = await client.post(endpoint, json=payload)
        resp.raise_for_status()
        return resp.json()


async def synthesize(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Canonical TTS entry point.

    Payload:
        text: str (required)
        voice: str (optional, default "af_heart")
        format: "wav" | "mp3" (default "wav")
        sample_rate: int (default 24000)

    Optional keys that are missing, ``None`` or empty fall back to their
    defaults. The remote service is used when ``MLX_KOKORO_URL`` is set;
    otherwise audio is produced in-process and is always WAV.

    Returns:
        A dict with ``audio_b64``, ``format``, ``meta``, ``provider`` and
        ``model``; the remote path additionally includes ``audio_url``.

    Raises:
        ValueError: If ``text`` is missing, not a string, blank, or longer
            than ``MAX_TEXT_CHARS``, or if ``sample_rate`` is not a positive
            integer.
        RuntimeError: If local synthesis produces no audio.
    """
    text = payload.get("text", "")
    # Whitespace-only text has nothing to speak, so treat it as missing.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("text is required")
    if len(text) > MAX_TEXT_CHARS:
        raise ValueError(f"Text exceeds {MAX_TEXT_CHARS} chars")

    # `or` (rather than a .get default) also covers explicit None / "".
    voice = payload.get("voice") or DEFAULT_VOICE
    fmt = payload.get("format") or "wav"

    raw_rate = payload.get("sample_rate")
    if raw_rate is None:
        sample_rate = DEFAULT_SAMPLE_RATE
    else:
        try:
            sample_rate = int(raw_rate)
        except (TypeError, ValueError) as err:
            raise ValueError(
                f"sample_rate must be an integer, got {raw_rate!r}"
            ) from err
        if sample_rate <= 0:
            raise ValueError(
                f"sample_rate must be positive, got {sample_rate}"
            )

    meta = {
        "model": MLX_KOKORO_MODEL,
        "provider": "mlx_kokoro",
        "voice": voice,
        "device": "apple_silicon",
    }

    if MLX_KOKORO_URL:
        result = await _synthesize_remote(text, voice, fmt, sample_rate)
        return {
            "audio_b64": result.get("audio_b64", ""),
            "audio_url": result.get("audio_url", ""),
            "format": fmt,
            "meta": meta,
            "provider": "mlx_kokoro",
            "model": MLX_KOKORO_MODEL,
        }

    # The local path ignores the requested format and always emits WAV.
    wav_bytes = await _synthesize_local(text, voice, sample_rate)
    return {
        "audio_b64": base64.b64encode(wav_bytes).decode(),
        "format": "wav",
        "meta": meta,
        "provider": "mlx_kokoro",
        "model": MLX_KOKORO_MODEL,
    }