# microdao-daarion/services/mlx-tts-service/main.py

"""Kokoro TTS Service — lightweight HTTP wrapper for kokoro on Apple Silicon.

Runs natively on host (not in Docker) to access Metal/MPS acceleration.
Port: 8201
"""
import asyncio
import base64
import io
import logging
import os
import time
from typing import Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import uvicorn

# Configure root logging at INFO and create this service's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("mlx-tts")

app = FastAPI(title="Kokoro TTS", version="1.0.0")

# Service defaults; each can be overridden through the named environment variable.
DEFAULT_VOICE = os.getenv("TTS_DEFAULT_VOICE", "af_heart")
MAX_TEXT_CHARS = int(os.getenv("TTS_MAX_TEXT_CHARS", "5000"))
DEFAULT_SAMPLE_RATE = int(os.getenv("TTS_SAMPLE_RATE", "24000"))

# Kokoro pipeline instance; stays None until _load_pipeline() has populated it.
_pipeline = None
# Held by the /synthesize handler so only one request drives the pipeline at a time.
_lock = asyncio.Lock()


def _load_pipeline() -> None:
    """Load the Kokoro pipeline into the module-level ``_pipeline`` once.

    Does nothing when the pipeline is already loaded. The ``kokoro`` import
    happens inside this function, so its cost is included in the logged
    load time.
    """
    global _pipeline
    if _pipeline is not None:
        return
    logger.info("Loading Kokoro pipeline...")
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    from kokoro import KPipeline

    # NOTE(review): lang_code="a" is presumably Kokoro's American English
    # code -- confirm against the kokoro documentation.
    _pipeline = KPipeline(lang_code="a")
    logger.info("Kokoro ready in %.1fs", time.perf_counter() - started)


class SynthesizeRequest(BaseModel):
    """Request body for ``POST /synthesize``.

    ``voice`` and ``sample_rate`` default to the service-level
    ``DEFAULT_VOICE`` / ``DEFAULT_SAMPLE_RATE`` constants, so the
    ``TTS_DEFAULT_VOICE`` and ``TTS_SAMPLE_RATE`` environment overrides also
    apply when a client omits these fields (not only when it sends an empty
    or zero value).
    """

    # Text to synthesize; the handler rejects empty text and text longer
    # than MAX_TEXT_CHARS.
    text: str
    voice: str = Field(default=DEFAULT_VOICE)
    # NOTE(review): the /synthesize handler in this file never reads this
    # field and always returns WAV, despite the "wav|mp3" description.
    format: str = Field(default="wav", description="wav|mp3")
    sample_rate: int = Field(default=DEFAULT_SAMPLE_RATE)


class SynthesizeResponse(BaseModel):
    """Response body of ``POST /synthesize``.

    Carries the base64-encoded audio, the audio container format, and a
    free-form metadata dict about the synthesis run.
    """

    audio_b64: str = Field(default="")
    format: str = Field(default="wav")
    meta: dict = Field(default_factory=dict)


@app.on_event("startup")
async def startup() -> None:
    """Load the Kokoro pipeline when the application starts."""
    # _load_pipeline() is a plain synchronous call, so this handler does not
    # return until loading has finished.
    _load_pipeline()


@app.get("/health")
async def health():
    """Report service status and whether the Kokoro pipeline is loaded."""
    pipeline_loaded = _pipeline is not None
    return {
        "status": "ok",
        "model": "kokoro-v1.0",
        "ready": pipeline_loaded,
    }


def _render_wav(pipeline, text: str, voice: str, sample_rate: int) -> Optional[bytes]:
    """Run *pipeline* over *text* and encode the joined audio as WAV bytes.

    This is blocking work and is meant to be called from a worker thread.

    Returns:
        The WAV file contents, or ``None`` when the pipeline yields no audio.
    """
    import numpy as np
    import soundfile as sf

    chunks = [audio for _, _, audio in pipeline(text, voice=voice)]
    if not chunks:
        return None

    buf = io.BytesIO()
    # NOTE(review): sample_rate is only written into the WAV header; the
    # samples are not resampled. A rate other than the model's native output
    # rate would change playback speed/pitch -- confirm whether resampling
    # is wanted.
    sf.write(buf, np.concatenate(chunks), sample_rate, format="WAV")
    return buf.getvalue()


@app.post("/synthesize", response_model=SynthesizeResponse)
async def synthesize(req: SynthesizeRequest):
    """Synthesize speech for ``req.text`` and return it as base64 WAV.

    Empty ``voice`` / zero ``sample_rate`` fall back to the service defaults.
    ``req.format`` is not consulted; the response is always WAV.

    Raises:
        HTTPException 400: text is empty or whitespace-only, or the sample
            rate is negative.
        HTTPException 413: text is longer than ``MAX_TEXT_CHARS``.
        HTTPException 503: the Kokoro pipeline has not been loaded.
        HTTPException 500: the pipeline produced no audio.
    """
    # Whitespace-only text has nothing to synthesize; treat it like empty text
    # instead of letting it reach the model.
    if not req.text or not req.text.strip():
        raise HTTPException(400, "text is required")
    if len(req.text) > MAX_TEXT_CHARS:
        raise HTTPException(413, f"Text exceeds {MAX_TEXT_CHARS} chars")

    voice = req.voice or DEFAULT_VOICE
    sample_rate = req.sample_rate or DEFAULT_SAMPLE_RATE
    if sample_rate <= 0:
        raise HTTPException(400, "sample_rate must be positive")

    pipeline = _pipeline
    if pipeline is None:
        # Matches the "ready" flag reported by /health.
        raise HTTPException(503, "Kokoro pipeline is not loaded")

    loop = asyncio.get_running_loop()
    async with _lock:
        started = time.perf_counter()
        # Inference and encoding are blocking; running them in the default
        # executor keeps the event loop (and /health) responsive while the
        # lock still admits one synthesis at a time.
        # NOTE(review): if this request task is cancelled mid-synthesis the
        # lock is released while the worker thread may still be running.
        wav_bytes = await loop.run_in_executor(
            None, _render_wav, pipeline, req.text, voice, sample_rate
        )
        # Processing time spent under the lock (not the audio's length).
        duration_ms = int((time.perf_counter() - started) * 1000)

    if wav_bytes is None:
        raise HTTPException(500, "Kokoro produced no audio")

    audio_b64 = base64.b64encode(wav_bytes).decode()

    return SynthesizeResponse(
        audio_b64=audio_b64,
        format="wav",
        meta={
            "model": "kokoro-v1.0",
            "voice": voice,
            "duration_ms": duration_ms,
            "audio_bytes": len(wav_bytes),
            "device": "apple_silicon",
        },
    )


if __name__ == "__main__":
    # Serve on all interfaces; the port comes from the PORT environment
    # variable and falls back to 8201.
    listen_port = int(os.getenv("PORT", "8201"))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)