"""Kokoro TTS Service — lightweight HTTP wrapper for kokoro on Apple Silicon. Runs natively on host (not in Docker) to access Metal/MPS acceleration. Port: 8201 """ import asyncio import base64 import io import logging import os import time from typing import Optional from fastapi import FastAPI, HTTPException from pydantic import BaseModel, Field import uvicorn logging.basicConfig(level=logging.INFO) logger = logging.getLogger("mlx-tts") app = FastAPI(title="Kokoro TTS", version="1.0.0") DEFAULT_VOICE = os.getenv("TTS_DEFAULT_VOICE", "af_heart") MAX_TEXT_CHARS = int(os.getenv("TTS_MAX_TEXT_CHARS", "5000")) DEFAULT_SAMPLE_RATE = int(os.getenv("TTS_SAMPLE_RATE", "24000")) _pipeline = None _lock = asyncio.Lock() def _load_pipeline(): global _pipeline if _pipeline is not None: return logger.info("Loading Kokoro pipeline...") t0 = time.time() from kokoro import KPipeline _pipeline = KPipeline(lang_code="a") logger.info(f"Kokoro ready in {time.time()-t0:.1f}s") class SynthesizeRequest(BaseModel): text: str voice: str = Field(default="af_heart") format: str = Field(default="wav", description="wav|mp3") sample_rate: int = Field(default=24000) class SynthesizeResponse(BaseModel): audio_b64: str = "" format: str = "wav" meta: dict = Field(default_factory=dict) @app.on_event("startup") async def startup(): _load_pipeline() @app.get("/health") async def health(): return {"status": "ok", "model": "kokoro-v1.0", "ready": _pipeline is not None} @app.post("/synthesize", response_model=SynthesizeResponse) async def synthesize(req: SynthesizeRequest): if not req.text: raise HTTPException(400, "text is required") if len(req.text) > MAX_TEXT_CHARS: raise HTTPException(413, f"Text exceeds {MAX_TEXT_CHARS} chars") voice = req.voice or DEFAULT_VOICE sample_rate = req.sample_rate or DEFAULT_SAMPLE_RATE async with _lock: t0 = time.time() import numpy as np import soundfile as sf all_audio = [] for _, _, audio in _pipeline(req.text, voice=voice): all_audio.append(audio) if not all_audio: raise HTTPException(500, "Kokoro produced no audio") combined = np.concatenate(all_audio) buf = io.BytesIO() sf.write(buf, combined, sample_rate, format="WAV") wav_bytes = buf.getvalue() duration_ms = int((time.time() - t0) * 1000) audio_b64 = base64.b64encode(wav_bytes).decode() return SynthesizeResponse( audio_b64=audio_b64, format="wav", meta={ "model": "kokoro-v1.0", "voice": voice, "duration_ms": duration_ms, "audio_bytes": len(wav_bytes), "device": "apple_silicon", }, ) if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "8201")))