NCS:
- _collect_worker_caps() fetches capability flags from node-worker /caps
- _derive_capabilities() merges served model types + worker provider flags
- installed_artifacts replaces inventory_only (disk scan with DISK_SCAN_PATHS env)
- New endpoints: /capabilities/caps, /capabilities/installed
Node Worker:
- STT_PROVIDER, TTS_PROVIDER, OCR_PROVIDER, IMAGE_PROVIDER env flags
- /caps endpoint returns capabilities + providers for NCS aggregation
- STT adapter (providers/stt_mlx_whisper.py) — remote + local mode
- TTS adapter (providers/tts_mlx_kokoro.py) — remote + local mode
- OCR handler via vision_prompted (ollama_vision with OCR prompt)
- NATS subjects: node.{id}.stt/tts/ocr/image.request
Router:
- POST /v1/capability/{stt,tts,ocr,image} — capability-based offload routing
- GET /v1/capabilities — global view with capabilities_by_node
- require_fresh_caps(ttl) preflight guard
- find_nodes_with_capability(cap) + load-based node selection
Ops:
- ops/fabric_snapshot.py — full runtime snapshot collector
- ops/fabric_preflight.sh — quick check + snapshot save + diff
- docs/fabric_contract.md — Dev Contract v0.1 (preflight-first)
- tests/test_fabric_contract.py — CI enforcement (6 tests)
Made-with: Cursor
124 lines
3.6 KiB
Python
124 lines
3.6 KiB
Python
"""MLX Kokoro TTS provider — generates speech via kokoro on host.
|
|
|
|
Runs inside Docker; delegates to the Kokoro HTTP API on the host when MLX_KOKORO_URL is set.
Falls back to the local kokoro pipeline (KPipeline) if running natively on Apple Silicon.
|
|
"""
|
|
import base64
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
from typing import Any, Dict
|
|
|
|
import httpx
|
|
|
|
# Logger dedicated to this provider.
logger = logging.getLogger("provider.tts_mlx_kokoro")

# Configuration is read from the environment once, at import time.
# An empty MLX_KOKORO_URL selects local synthesis instead of the HTTP service.
MLX_KOKORO_URL = os.environ.get("MLX_KOKORO_URL", "")
MLX_KOKORO_MODEL = os.environ.get("MLX_KOKORO_MODEL", "kokoro-v1.0")
DEFAULT_VOICE = os.environ.get("TTS_DEFAULT_VOICE", "af_heart")
MAX_TEXT_CHARS = int(os.environ.get("TTS_MAX_TEXT_CHARS", "5000"))
DEFAULT_SAMPLE_RATE = int(os.environ.get("TTS_SAMPLE_RATE", "24000"))

# Both are populated lazily on the first local synthesis request.
_local_pipeline = _local_lock = None
|
|
|
|
|
|
def _lazy_init_local():
    """Create the module-level ``_local_lock`` on first call; later calls do nothing."""
    global _local_lock
    if _local_lock is None:
        # asyncio is only imported once a lock is actually needed.
        import asyncio

        _local_lock = asyncio.Lock()
|
|
|
|
|
|
async def _synthesize_local(text: str, voice: str, sample_rate: int) -> bytes:
    """Synthesize via local kokoro (Apple Silicon).

    The Kokoro pipeline is created on first use and cached in the module-level
    ``_local_pipeline``. Pipeline construction and inference are blocking
    calls, so they run in the event loop's default executor instead of on the
    loop itself; ``_local_lock`` is held for the whole render so that only one
    request uses the shared pipeline at a time.

    Args:
        text: Text to speak.
        voice: Voice identifier forwarded to the pipeline.
        sample_rate: Sample rate passed to ``soundfile.write`` for the output.

    Returns:
        The rendered audio as WAV-encoded bytes.

    Raises:
        RuntimeError: If the pipeline yields no audio chunks.
    """
    import asyncio
    import io

    _lazy_init_local()

    def _render_wav() -> bytes:
        # Runs in a worker thread: everything in here may block.
        global _local_pipeline
        import numpy as np
        import soundfile as sf

        if _local_pipeline is None:
            from kokoro import KPipeline

            _local_pipeline = KPipeline(lang_code="a")
            logger.info(f"Kokoro pipeline initialized: voice={voice}")

        # The pipeline yields 3-tuples; only the last element (audio) is kept.
        chunks = [audio for _, _, audio in _local_pipeline(text, voice=voice)]
        if not chunks:
            raise RuntimeError("Kokoro produced no audio")

        # NOTE(review): sample_rate is only handed to soundfile as the output
        # rate; no resampling happens here, so it presumably has to match the
        # pipeline's native output rate - confirm against the kokoro docs.
        buf = io.BytesIO()
        sf.write(buf, np.concatenate(chunks), sample_rate, format="WAV")
        return buf.getvalue()

    # Hold the lock across the executor call so renders stay serialized.
    async with _local_lock:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _render_wav)
|
|
|
|
|
|
async def _synthesize_remote(text: str, voice: str, fmt: str, sample_rate: int) -> Dict[str, Any]:
    """Synthesize via Kokoro HTTP service on host.

    POSTs the request as JSON to ``<MLX_KOKORO_URL>/synthesize`` with a
    120-second client timeout and returns the decoded JSON response body.

    Args:
        text: Text to speak.
        voice: Voice identifier sent as ``voice``.
        fmt: Requested audio format, sent as ``format``.
        sample_rate: Requested sample rate, sent as ``sample_rate``.

    Returns:
        The service's JSON response, decoded.

    Raises:
        httpx.HTTPStatusError: If the service answers with an error status
            (raised by ``raise_for_status``).
    """
    # Tolerate a trailing slash in the configured base URL so the request path
    # is always "/synthesize" and never "//synthesize".
    base_url = MLX_KOKORO_URL.rstrip("/")
    body = {
        "text": text,
        "voice": voice,
        "format": fmt,
        "sample_rate": sample_rate,
    }
    async with httpx.AsyncClient(timeout=120) as client:
        resp = await client.post(f"{base_url}/synthesize", json=body)
        resp.raise_for_status()
        return resp.json()
|
|
|
|
|
|
async def synthesize(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Canonical TTS entry point.

    Payload:
        text: str (required, non-blank, at most MAX_TEXT_CHARS characters)
        voice: str (optional, defaults to DEFAULT_VOICE)
        format: "wav" | "mp3" (default "wav")
        sample_rate: int (optional, defaults to DEFAULT_SAMPLE_RATE)

    Optional fields that are missing, ``None`` or empty fall back to their
    defaults, so callers that serialize unset fields as ``null`` behave the
    same as callers that omit them.

    Returns:
        A dict with ``audio_b64``, ``format``, ``meta``, ``provider`` and
        ``model``; the remote path additionally includes ``audio_url``.

    Raises:
        ValueError: If ``text`` is missing, blank, not a string or too long,
            or if ``sample_rate`` is not a positive integer.
    """
    text = payload.get("text") or ""
    if not isinstance(text, str):
        raise ValueError("text must be a string")
    if not text.strip():
        raise ValueError("text is required")
    if len(text) > MAX_TEXT_CHARS:
        raise ValueError(f"Text exceeds {MAX_TEXT_CHARS} chars")

    # `or` (rather than a .get default) also covers explicit None / "" values.
    voice = payload.get("voice") or DEFAULT_VOICE
    fmt = payload.get("format") or "wav"

    raw_rate = payload.get("sample_rate")
    if raw_rate is None or raw_rate == "":
        sample_rate = DEFAULT_SAMPLE_RATE
    else:
        try:
            sample_rate = int(raw_rate)
        except (TypeError, ValueError) as exc:
            raise ValueError("sample_rate must be an integer") from exc
        if sample_rate <= 0:
            raise ValueError("sample_rate must be a positive integer")

    meta = {
        "model": MLX_KOKORO_MODEL,
        "provider": "mlx_kokoro",
        "voice": voice,
        "device": "apple_silicon",
    }

    # A configured service URL selects remote synthesis.
    if MLX_KOKORO_URL:
        result = await _synthesize_remote(text, voice, fmt, sample_rate)
        return {
            "audio_b64": result.get("audio_b64", ""),
            "audio_url": result.get("audio_url", ""),
            "format": fmt,
            "meta": meta,
            "provider": "mlx_kokoro",
            "model": MLX_KOKORO_MODEL,
        }

    wav_bytes = await _synthesize_local(text, voice, sample_rate)
    return {
        "audio_b64": base64.b64encode(wav_bytes).decode(),
        # Local synthesis always produces WAV, so report that rather than
        # the requested format.
        "format": "wav",
        "meta": meta,
        "provider": "mlx_kokoro",
        "model": MLX_KOKORO_MODEL,
    }
|