New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
681 lines · 26 KiB · Python
"""
|
|
DAARION Memory Service — Voice Endpoints
|
|
STT: faster-whisper (Docker/Linux) → mlx-audio (macOS) → whisper-cli
|
|
TTS: edge-tts Python API (primary, pure Python, no ffmpeg needed)
|
|
→ piper (fallback, if model present)
|
|
→ espeak-ng (offline Linux fallback)
|
|
→ macOS say (fallback, macOS-only)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import io
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
|
|
from fastapi.responses import StreamingResponse
|
|
from pydantic import BaseModel
|
|
|
|
logger = logging.getLogger(__name__)

# All endpoints below are mounted under /voice on the parent FastAPI app.
router = APIRouter(prefix="/voice", tags=["voice"])

# Process-wide cache of loaded STT models (faster-whisper, mlx-audio),
# keyed per engine/size — loading a model is expensive, so do it once.
MODELS_CACHE: dict = {}
|
|
|
|
# ── Prometheus metrics (optional — skip if not installed) ─────────────────────
|
|
# prometheus_client is an optional dependency: when it is missing, every
# metric handle is set to None and _PROM_OK gates the _prom_* helpers below.
try:
    from prometheus_client import Counter, Histogram

    # Synthesis latency per engine/voice; buckets span 50 ms – 5 s.
    _tts_compute_hist = Histogram(
        "voice_tts_compute_ms",
        "TTS synthesis compute time in ms",
        ["engine", "voice"],
        buckets=[50, 100, 250, 500, 1000, 2000, 5000],
    )
    # Audio payload size per engine; buckets span 5 KB – 120 KB.
    _tts_bytes_hist = Histogram(
        "voice_tts_audio_bytes",
        "TTS audio output size in bytes",
        ["engine"],
        buckets=[5000, 15000, 30000, 60000, 120000],
    )
    # Error counter labelled by engine and exception class name.
    _tts_errors_total = Counter(
        "voice_tts_errors_total",
        "TTS engine errors",
        ["engine", "error_type"],
    )
    # Transcription latency per engine; buckets span 200 ms – 10 s.
    _stt_compute_hist = Histogram(
        "voice_stt_compute_ms",
        "STT transcription time in ms",
        ["engine"],
        buckets=[200, 500, 1000, 2000, 5000, 10000],
    )
    _PROM_OK = True
except ImportError:
    # prometheus_client not installed — metric helpers become no-ops.
    _PROM_OK = False
    _tts_compute_hist = None
    _tts_bytes_hist = None
    _tts_errors_total = None
    _stt_compute_hist = None
|
|
|
|
|
|
def _prom_tts_observe(engine: str, voice: str, ms: float, audio_bytes: int) -> None:
    """Record TTS latency and payload-size metrics; no-op when Prometheus is absent."""
    if _PROM_OK:
        try:
            _tts_compute_hist.labels(engine=engine, voice=voice).observe(ms)
            _tts_bytes_hist.labels(engine=engine).observe(audio_bytes)
        except Exception:
            # Metrics must never break the request path.
            pass
|
|
|
|
|
|
def _prom_tts_error(engine: str, error_type: str) -> None:
    """Increment the TTS error counter; no-op when Prometheus is absent."""
    if _PROM_OK:
        try:
            _tts_errors_total.labels(engine=engine, error_type=error_type).inc()
        except Exception:
            # Metrics must never break the request path.
            pass
|
|
|
|
|
|
def _prom_stt_observe(engine: str, ms: float) -> None:
    """Record STT transcription latency; no-op when Prometheus is absent."""
    if _PROM_OK:
        try:
            _stt_compute_hist.labels(engine=engine).observe(ms)
        except Exception:
            # Metrics must never break the request path.
            pass
|
|
|
|
# ── Voice mapping ─────────────────────────────────────────────────────────────
|
|
# Maps UI voice id → edge-tts voice name
|
|
_EDGE_VOICES: dict[str, str] = {
|
|
"default": "uk-UA-PolinaNeural",
|
|
"Polina": "uk-UA-PolinaNeural",
|
|
"uk-UA-Polina": "uk-UA-PolinaNeural",
|
|
"uk-UA-PolinaNeural": "uk-UA-PolinaNeural",
|
|
"Ostap": "uk-UA-OstapNeural",
|
|
"uk-UA-Ostap": "uk-UA-OstapNeural",
|
|
"uk-UA-OstapNeural": "uk-UA-OstapNeural",
|
|
# English voices — used for English-language segments
|
|
"en-US-GuyNeural": "en-US-GuyNeural",
|
|
"en-US-JennyNeural": "en-US-JennyNeural",
|
|
"en": "en-US-GuyNeural",
|
|
# macOS-only names: map to closest Ukrainian voice
|
|
"Milena": "uk-UA-PolinaNeural",
|
|
"Yuri": "uk-UA-OstapNeural",
|
|
"af_heart": "uk-UA-PolinaNeural",
|
|
}
|
|
|
|
def _edge_voice(name: str | None) -> str:
|
|
"""Allow any valid edge-tts voice name to pass through directly."""
|
|
n = name or "default"
|
|
# If already a valid neural voice name (contains "Neural"), pass through
|
|
if "Neural" in n or n == "en":
|
|
return _EDGE_VOICES.get(n, n)
|
|
return _EDGE_VOICES.get(n, "uk-UA-PolinaNeural")
|
|
|
|
|
|
def _ffmpeg_available() -> bool:
|
|
try:
|
|
result = subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=3)
|
|
return result.returncode == 0
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _espeak_available() -> bool:
|
|
try:
|
|
result = subprocess.run(["espeak-ng", "--version"], capture_output=True, timeout=3)
|
|
return result.returncode == 0
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
class TTSRequest(BaseModel):
    """Request body for POST /voice/tts."""

    text: str  # text to synthesize (handler truncates to 700 chars)
    voice: Optional[str] = "default"  # UI voice id, resolved via _edge_voice()
    speed: Optional[float] = 1.0  # playback-rate multiplier (1.0 = normal)
    model: Optional[str] = "auto"  # engine selector; not read by the /tts handler in this module
|
|
|
|
|
|
# ── Status & Live Health ───────────────────────────────────────────────────────
|
|
|
|
@router.get("/health")
async def voice_health():
    """Live health check — actually synthesizes a short test phrase via edge-tts.

    Returns edge_tts=ok/error with details; used by preflight to detect 403/blocked.
    Also emits a "repro" pack (host/image/env identifiers) for incident diagnosis.
    """
    # Imported lazily so the module loads even on trimmed environments.
    import importlib.metadata
    import time

    result: dict = {}

    # edge-tts version (best effort — "unknown" when package metadata is absent)
    try:
        ver = importlib.metadata.version("edge-tts")
    except Exception:
        ver = "unknown"
    result["edge_tts_version"] = ver

    # Live synthesis test for each required Neural voice — a real network call,
    # so a 403/blocked upstream shows up here rather than on a user request.
    live_voices: list[dict] = []
    test_text = "Test"  # Minimal — just enough to trigger actual API call
    for voice_id in ("uk-UA-PolinaNeural", "uk-UA-OstapNeural"):
        t0 = time.monotonic()
        try:
            import edge_tts

            comm = edge_tts.Communicate(test_text, voice_id)
            byte_count = 0
            # Count audio bytes only; the stream also yields metadata chunks.
            async for chunk in comm.stream():
                if chunk["type"] == "audio":
                    byte_count += len(chunk["data"])
            elapsed_ms = int((time.monotonic() - t0) * 1000)
            live_voices.append({"voice": voice_id, "status": "ok",
                                "bytes": byte_count, "ms": elapsed_ms})
        except Exception as e:
            elapsed_ms = int((time.monotonic() - t0) * 1000)
            # Error text truncated to keep the health payload small.
            live_voices.append({"voice": voice_id, "status": "error",
                                "error": str(e)[:150], "ms": elapsed_ms})

    all_ok = all(v["status"] == "ok" for v in live_voices)
    result["edge_tts"] = "ok" if all_ok else "error"
    result["voices"] = live_voices

    # STT check (import only — no actual transcription in health)
    try:
        import faster_whisper  # noqa: F401
        result["faster_whisper"] = "ok"
    except ImportError:
        result["faster_whisper"] = "unavailable"

    # Overall flag reflects TTS only; STT unavailability does not fail health.
    result["ok"] = all_ok

    # ── Repro pack (incident diagnosis) ──────────────────────────────────────
    import os as _os
    import socket as _socket
    result["repro"] = {
        "node_id": _os.getenv("NODE_ID", _socket.gethostname()),
        "service_name": _os.getenv("MEMORY_SERVICE_NAME", "memory-service"),
        "image_digest": _os.getenv("IMAGE_DIGEST", "unknown"),  # set via docker label
        "memory_service_url": _os.getenv("MEMORY_SERVICE_URL", "http://localhost:8000"),
        # Mirrors the 700-char truncation applied in text_to_speech().
        "tts_max_chars": 700,
        "canary_test_text": test_text,
        "canary_audio_bytes": {
            v["voice"]: v.get("bytes", 0) for v in live_voices
        },
    }
    return result
|
|
|
|
|
|
@router.get("/status")
async def voice_status():
    """Report which TTS/STT engines are importable/installed on this host.

    Cheap capability probe (imports and binary checks only) — unlike
    /voice/health it performs no live synthesis.
    """
    edge_ok = False
    try:
        import edge_tts  # noqa: F401
        edge_ok = True
    except ImportError:
        pass

    espeak_ok = _espeak_available()

    fw_ok = False
    try:
        import faster_whisper  # noqa: F401
        fw_ok = True
    except ImportError:
        pass

    mlx_ok = False
    try:
        import mlx_audio  # noqa: F401
        mlx_ok = True
    except ImportError:
        pass

    # BUG FIX: the engine names used to be concatenated with no separator,
    # producing "faster-whispermlx-audio" when both engines are present.
    stt_engines = [
        name for name, ok in (("faster-whisper", fw_ok), ("mlx-audio", mlx_ok)) if ok
    ]

    return {
        "available": True,
        "tts_engine": "edge-tts" if edge_ok else ("espeak-ng" if espeak_ok else "piper/say"),
        "stt_engine": "+".join(stt_engines),
        "edge_tts": edge_ok,
        "espeak_ng": espeak_ok,
        "faster_whisper": fw_ok,
        "mlx_audio": mlx_ok,
        "ffmpeg": _ffmpeg_available(),
        "voices": list(_EDGE_VOICES.keys()),
    }
|
|
|
|
|
|
# ── TTS ───────────────────────────────────────────────────────────────────────
|
|
|
|
async def _tts_edge(text: str, voice_name: str, speed: float = 1.0) -> bytes:
    """Synthesize *text* to MP3 bytes with the edge-tts pure-Python API.

    No subprocess and no ffmpeg involved; browsers play the returned MP3
    natively.

    Raises:
        RuntimeError: when the stream yields no audio data.
    """
    import edge_tts

    # Map the speed multiplier onto edge-tts's signed percentage rate string.
    # BUG FIX: the old f"+{...}%" template produced "+-25%" for speed < 1.0;
    # the ":+d" format emits a single correct sign ("-25%" / "+25%" / "+0%").
    rate_str = f"{int((speed - 1.0) * 50):+d}%"
    communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
    buf = io.BytesIO()
    async for chunk in communicate.stream():
        # The stream interleaves audio chunks with word-boundary metadata.
        if chunk["type"] == "audio":
            buf.write(chunk["data"])
    data = buf.getvalue()
    if not data:
        raise RuntimeError("edge-tts returned empty audio")
    return data
|
|
|
|
|
|
async def _tts_piper(text: str) -> bytes | None:
    """Synthesize with a locally-installed Piper model.

    Returns WAV bytes, or None when the model file or the piper package is
    unavailable (caller falls through to the next engine).
    """
    model_path = os.path.expanduser(
        "~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx"
    )
    if not Path(model_path).exists():
        return None
    try:
        import piper as piper_mod

        engine = piper_mod.PiperVoice.load(model_path)
        out = io.BytesIO()
        engine.synthesize(text, out)
        audio = out.getvalue()
        return audio or None
    except Exception as exc:
        logger.debug("Piper TTS failed: %s", exc)
        return None
|
|
|
|
|
|
async def _tts_macos_say(text: str, voice: str = "Milena") -> bytes | None:
    """macOS ``say`` fallback — returns WAV bytes (AIFF when ffmpeg is missing).

    Only works on macOS hosts outside Docker. Returns None on any failure so
    the caller can fall through to the next engine.
    """
    tmp_id = uuid.uuid4().hex[:8]
    aiff_path = Path(f"/tmp/tts_{tmp_id}.aiff")
    wav_path = Path(f"/tmp/tts_{tmp_id}.wav")
    try:
        proc = await asyncio.create_subprocess_exec(
            "say", "-v", voice, "-o", str(aiff_path), text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await asyncio.wait_for(proc.wait(), timeout=15)
        # BUG FIX: the exit status of `say` was never checked before.
        if proc.returncode != 0 or not aiff_path.exists() or aiff_path.stat().st_size == 0:
            return None
        if _ffmpeg_available():
            # Re-encode to 22.05 kHz mono WAV so browsers can play it.
            # Blocking run() is acceptable here: bounded by timeout=10 on a
            # short clip; matches the style of the other ffmpeg call sites.
            subprocess.run(
                ["ffmpeg", "-y", "-i", str(aiff_path), "-ar", "22050", "-ac", "1", str(wav_path)],
                capture_output=True, timeout=10,
            )
            if wav_path.exists() and wav_path.stat().st_size > 0:
                return wav_path.read_bytes()
            # BUG FIX: the AIFF used to be deleted before this point, so a
            # failed conversion raised FileNotFoundError (swallowed as None).
            # Now we explicitly fall back to the original AIFF bytes.
        # No ffmpeg (or conversion failed) — return AIFF; most browsers won't
        # play it, but it is the best we have.
        data = aiff_path.read_bytes()
        return data if data else None
    except Exception as e:
        logger.debug("macOS say failed: %s", e)
        return None
    finally:
        # Always clean up temp files, even on timeout or exception.
        aiff_path.unlink(missing_ok=True)
        wav_path.unlink(missing_ok=True)
|
|
|
|
|
|
async def _tts_espeak(text: str, voice: str = "uk", speed: float = 1.0) -> bytes | None:
    """Offline Linux fallback via espeak-ng.

    Returns WAV bytes, or None when the binary is missing or synthesis fails.
    """
    if not _espeak_available():
        return None
    try:
        out_path = Path(f"/tmp/tts_espeak_{uuid.uuid4().hex[:8]}.wav")
        # Clamp speaking rate into espeak-ng's 120..240 wpm window
        # (170 wpm corresponds to speed == 1.0).
        wpm = max(120, min(240, int((speed or 1.0) * 170)))
        proc = await asyncio.create_subprocess_exec(
            "espeak-ng", "-v", voice, "-s", str(wpm), "-w", str(out_path), text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        _out, err = await asyncio.wait_for(proc.communicate(), timeout=10)
        if proc.returncode != 0:
            logger.debug("espeak-ng failed rc=%s stderr=%s", proc.returncode, (err or b"")[:200])
            return None
        if not out_path.exists() or out_path.stat().st_size == 0:
            return None
        audio = out_path.read_bytes()
        out_path.unlink(missing_ok=True)
        return audio or None
    except Exception as exc:
        logger.debug("espeak-ng TTS failed: %s", exc)
        return None
|
|
|
|
|
|
@router.post("/tts")
async def text_to_speech(request: TTSRequest):
    """
    TTS pipeline:
    1. edge-tts (primary — pure Python, returns MP3, works anywhere)
    2. piper (if model file present)
    3. espeak-ng (offline Linux fallback)
    4. macOS say (macOS-only fallback)

    Returns a StreamingResponse whose X-TTS-Engine / X-TTS-Voice /
    X-TTS-Compute-MS headers identify which engine produced the audio.
    Raises HTTPException 400 on empty text and 503 when every engine fails.
    """
    import time as _time

    # Hard cap at 700 chars — mirrored in /voice/health's repro pack.
    text = (request.text or "").strip()[:700]
    if not text:
        raise HTTPException(400, "Empty text")

    edge_voice = _edge_voice(request.voice)
    errors: list[str] = []  # accumulated per-engine failures for the 503 body

    # ── 1. edge-tts (MP3, no ffmpeg needed) ──────────────────────────────
    _t0 = _time.monotonic()
    try:
        data = await asyncio.wait_for(
            _tts_edge(text, edge_voice, speed=request.speed or 1.0),
            timeout=20.0,
        )
        _compute_ms = int((_time.monotonic() - _t0) * 1000)
        logger.info("TTS edge-tts OK: voice=%s len=%d ms=%d", edge_voice, len(data), _compute_ms)
        _prom_tts_observe("edge-tts", edge_voice, _compute_ms, len(data))
        return StreamingResponse(
            io.BytesIO(data),
            media_type="audio/mpeg",
            headers={"Content-Disposition": "inline; filename=speech.mp3",
                     "X-TTS-Engine": "edge-tts",
                     "X-TTS-Voice": edge_voice,
                     "X-TTS-Compute-MS": str(_compute_ms),
                     "Cache-Control": "no-store"},
        )
    except Exception as e:
        # Record and fall through to the next engine.
        _prom_tts_error("edge-tts", type(e).__name__)
        errors.append(f"edge-tts: {e}")
        logger.warning("edge-tts failed: %s", e)

    # ── 2. piper ──────────────────────────────────────────────────────────
    _t0 = _time.monotonic()
    try:
        # _tts_piper returns None (not an exception) when the model is absent.
        data = await asyncio.wait_for(_tts_piper(text), timeout=15.0)
        if data:
            _compute_ms = int((_time.monotonic() - _t0) * 1000)
            logger.info("TTS piper OK len=%d ms=%d", len(data), _compute_ms)
            _prom_tts_observe("piper", "uk-UA", _compute_ms, len(data))
            return StreamingResponse(
                io.BytesIO(data),
                media_type="audio/wav",
                headers={"Content-Disposition": "inline; filename=speech.wav",
                         "X-TTS-Engine": "piper",
                         "X-TTS-Compute-MS": str(_compute_ms),
                         "Cache-Control": "no-store"},
            )
    except Exception as e:
        _prom_tts_error("piper", type(e).__name__)
        errors.append(f"piper: {e}")
        logger.debug("piper failed: %s", e)

    # ── 3. espeak-ng (offline Linux) ─────────────────────────────────────
    # espeak uses its own voice ids; derive from the requested voice prefix.
    espeak_voice = "en-us" if str(request.voice or "").startswith("en") else "uk"
    _t0 = _time.monotonic()
    try:
        data = await asyncio.wait_for(_tts_espeak(text, espeak_voice, request.speed or 1.0), timeout=12.0)
        if data:
            _compute_ms = int((_time.monotonic() - _t0) * 1000)
            logger.info("TTS espeak-ng OK voice=%s len=%d ms=%d", espeak_voice, len(data), _compute_ms)
            _prom_tts_observe("espeak-ng", espeak_voice, _compute_ms, len(data))
            return StreamingResponse(
                io.BytesIO(data),
                media_type="audio/wav",
                headers={"Content-Disposition": "inline; filename=speech.wav",
                         "X-TTS-Engine": "espeak-ng",
                         "X-TTS-Voice": espeak_voice,
                         "X-TTS-Compute-MS": str(_compute_ms),
                         "Cache-Control": "no-store"},
            )
    except Exception as e:
        _prom_tts_error("espeak-ng", type(e).__name__)
        errors.append(f"espeak-ng: {e}")
        logger.debug("espeak-ng failed: %s", e)

    # ── 4. macOS say ──────────────────────────────────────────────────────
    # Female-sounding ids map to Milena, everything else to Yuri.
    say_voice = "Milena" if request.voice in (None, "default", "Polina", "Milena") else "Yuri"
    _t0 = _time.monotonic()
    try:
        data = await asyncio.wait_for(_tts_macos_say(text, say_voice), timeout=20.0)
        if data:
            _compute_ms = int((_time.monotonic() - _t0) * 1000)
            # Sniff the container: RIFF magic means WAV, otherwise raw AIFF
            # (the no-ffmpeg path inside _tts_macos_say).
            mime = "audio/wav" if data[:4] == b"RIFF" else "audio/aiff"
            logger.info("TTS macOS say OK voice=%s len=%d ms=%d", say_voice, len(data), _compute_ms)
            _prom_tts_observe("macos-say", say_voice, _compute_ms, len(data))
            return StreamingResponse(
                io.BytesIO(data),
                media_type=mime,
                headers={"Content-Disposition": f"inline; filename=speech.{'wav' if mime=='audio/wav' else 'aiff'}",
                         "X-TTS-Engine": "macos-say",
                         "X-TTS-Compute-MS": str(_compute_ms),
                         "Cache-Control": "no-store"},
            )
    except Exception as e:
        _prom_tts_error("macos-say", type(e).__name__)
        errors.append(f"say: {e}")
        logger.debug("macOS say failed: %s", e)

    logger.error("All TTS engines failed: %s", errors)
    raise HTTPException(503, f"All TTS engines failed: {'; '.join(errors)}")
|
|
|
|
|
|
# ── STT ───────────────────────────────────────────────────────────────────────
|
|
|
|
async def _convert_audio_to_wav(input_path: str, output_path: str) -> bool:
    """Convert *input_path* to 16 kHz mono WAV at *output_path* via ffmpeg.

    Returns True on success, False when ffmpeg is missing or conversion fails.
    """
    if not _ffmpeg_available():
        return False
    try:
        # FIX: use an async subprocess instead of blocking subprocess.run(),
        # which could stall the event loop for up to 30 s per request.
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", output_path,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        try:
            await asyncio.wait_for(proc.wait(), timeout=30)
        except asyncio.TimeoutError:
            proc.kill()  # match subprocess.run(timeout=...) kill semantics
            return False
        return proc.returncode == 0 and Path(output_path).exists()
    except Exception:
        return False
|
|
|
|
|
|
def _stt_faster_whisper_sync(wav_path: str, language: str | None) -> str:
    """Blocking faster-whisper transcription — run via executor; Docker/Linux path."""
    from faster_whisper import WhisperModel

    # 'small' balances Ukrainian accuracy and CPU speed; override via env var.
    size = os.getenv("WHISPER_MODEL", "small")
    key = f"faster_whisper_{size}"
    model = MODELS_CACHE.get(key)
    if model is None:
        logger.info("Loading faster-whisper model=%s (first call)...", size)
        model = WhisperModel(size, device="cpu", compute_type="int8")
        MODELS_CACHE[key] = model
    segments, info = model.transcribe(wav_path, language=language or "uk", beam_size=5)
    transcript = " ".join(part.text for part in segments).strip()
    logger.info("faster-whisper OK: lang=%s text_len=%d", info.language, len(transcript))
    return transcript
|
|
|
|
|
|
def _stt_mlx_audio_sync(wav_path: str, language: str | None) -> str:
    """Blocking mlx-audio transcription — run via executor; Apple Silicon only."""
    from mlx_audio.stt.utils import load_model

    model = MODELS_CACHE.get("mlx_whisper")
    if model is None:
        logger.info("Loading mlx-audio whisper model (first call)...")
        model = load_model("mlx-community/whisper-large-v3-turbo-asr-fp16")
        MODELS_CACHE["mlx_whisper"] = model
    output = model.generate(wav_path, language=language)
    # Some model versions return a result object, others a bare string.
    return output.text if hasattr(output, "text") else str(output)
|
|
|
|
|
|
async def _stt_whisper_cli(wav_path: str, language: str | None) -> str:
    """Last-resort transcription via the ``whisper`` command-line tool.

    Raises RuntimeError when the CLI produces no transcript file.
    """
    cli_args = [
        "whisper", wav_path,
        "--language", language or "uk",
        "--model", "base",
        "--output_format", "txt",
        "--output_dir", "/tmp",
    ]
    proc = await asyncio.create_subprocess_exec(
        *cli_args,
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    await asyncio.wait_for(proc.wait(), timeout=90)
    # The CLI writes <input-stem>.txt into --output_dir (/tmp here, which is
    # also where the input WAV lives).
    transcript_file = Path(wav_path).with_suffix(".txt")
    if not transcript_file.exists():
        raise RuntimeError("whisper CLI produced no output")
    return transcript_file.read_text().strip()
|
|
|
|
|
|
@router.post("/stt")
async def speech_to_text(
    audio: UploadFile = File(...),
    model: str = Query("auto", description="STT model: auto|faster-whisper|mlx-audio|whisper-cli"),
    language: Optional[str] = Query(None, description="Language code (auto-detect if None)"),
):
    """
    STT pipeline:
    1. Convert audio to WAV via ffmpeg (if available; skip if already WAV)
    2. faster-whisper (primary — Docker/Linux)
    3. mlx-audio (macOS Apple Silicon)
    4. whisper CLI (last resort)

    Returns {"text", "model", "language"} (plus "compute_ms" on the
    faster-whisper path). Raises 400 for an empty upload, 503 when every
    selected engine fails, 500 on unexpected errors.
    """
    # Both temp files are cleaned up in the finally block below.
    tmp_path: str | None = None
    wav_path: str | None = None

    try:
        content = await audio.read()
        if not content:
            raise HTTPException(400, "Empty audio file")

        # Pick a file suffix from the upload's name, corrected by content-type
        # when it clearly indicates wav/ogg (browsers often send generic names).
        fname = audio.filename or "audio.webm"
        suffix = Path(fname).suffix or ".webm"
        if audio.content_type and "wav" in audio.content_type:
            suffix = ".wav"
        elif audio.content_type and "ogg" in audio.content_type:
            suffix = ".ogg"

        tmp_id = uuid.uuid4().hex[:8]
        tmp_path = f"/tmp/stt_in_{tmp_id}{suffix}"
        wav_path = f"/tmp/stt_wav_{tmp_id}.wav"

        with open(tmp_path, "wb") as f:
            f.write(content)

        # Convert to WAV (required by whisper models)
        converted = False
        if suffix == ".wav":
            import shutil
            shutil.copy(tmp_path, wav_path)
            converted = True
        else:
            converted = await _convert_audio_to_wav(tmp_path, wav_path)
            if not converted:
                # No ffmpeg — try to use input directly (faster-whisper accepts many formats)
                import shutil
                shutil.copy(tmp_path, wav_path)
                converted = True

        if not Path(wav_path).exists():
            raise HTTPException(500, "Audio conversion failed — ffmpeg missing and no WAV input")

        errors: list[str] = []  # per-engine failures for the 503 body

        # NOTE(review): asyncio.get_event_loop() inside a coroutine is
        # deprecated since Python 3.10 — get_running_loop() is the modern call.
        loop = asyncio.get_event_loop()

        # ── 1. faster-whisper ─────────────────────────────────────────────
        if model in ("auto", "faster-whisper"):
            _t0_stt = asyncio.get_event_loop().time()
            try:
                _wpath = wav_path  # capture for lambda
                _lang = language
                # Sync model runs in the default executor to keep the loop free.
                text = await asyncio.wait_for(
                    loop.run_in_executor(None, _stt_faster_whisper_sync, _wpath, _lang),
                    timeout=60.0,
                )
                _stt_ms = int((asyncio.get_event_loop().time() - _t0_stt) * 1000)
                _prom_stt_observe("faster-whisper", _stt_ms)
                return {"text": text, "model": "faster-whisper", "language": language,
                        "compute_ms": _stt_ms}
            except Exception as e:
                errors.append(f"faster-whisper: {e}")
                logger.warning("faster-whisper failed: %s", e)

        # ── 2. mlx-audio (macOS) ─────────────────────────────────────────
        if model in ("auto", "mlx-audio"):
            try:
                _wpath = wav_path
                _lang = language
                text = await asyncio.wait_for(
                    loop.run_in_executor(None, _stt_mlx_audio_sync, _wpath, _lang),
                    timeout=60.0,
                )
                return {"text": text, "model": "mlx-audio", "language": language}
            except Exception as e:
                errors.append(f"mlx-audio: {e}")
                logger.warning("mlx-audio failed: %s", e)

        # ── 3. whisper CLI ────────────────────────────────────────────────
        if model in ("auto", "whisper-cli"):
            try:
                text = await asyncio.wait_for(
                    _stt_whisper_cli(wav_path, language), timeout=90.0
                )
                return {"text": text, "model": "whisper-cli", "language": language}
            except Exception as e:
                errors.append(f"whisper-cli: {e}")
                logger.warning("whisper-cli failed: %s", e)

        raise HTTPException(503, f"All STT engines failed: {'; '.join(str(e)[:80] for e in errors)}")

    except HTTPException:
        raise
    except Exception as e:
        logger.error("STT error: %s", e)
        raise HTTPException(500, str(e)[:200])
    finally:
        # Always remove both temp files, whatever path we exited through.
        for p in [tmp_path, wav_path]:
            if p:
                Path(p).unlink(missing_ok=True)
|
|
|
|
|
|
# ── Voices list ───────────────────────────────────────────────────────────────
|
|
|
|
@router.get("/voices")
async def list_voices():
    """Enumerate the voices offered by each locally-available TTS engine."""
    catalog: dict[str, list[dict]] = {"edge": [], "piper": [], "macos": [], "espeak": []}

    try:
        import edge_tts  # noqa: F401
        catalog["edge"] = [
            {"id": "default", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
            {"id": "Polina", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
            {"id": "Ostap", "name": "Ostap Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
        ]
    except ImportError:
        pass

    piper_model = Path(os.path.expanduser("~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx"))
    if piper_model.exists():
        catalog["piper"] = [
            {"id": "uk-UA-low", "name": "Ukrainian Low (uk-UA)", "lang": "uk-UA", "engine": "piper"}
        ]

    if os.path.exists("/usr/bin/say") or os.path.exists("/usr/local/bin/say"):
        catalog["macos"] = [
            {"id": "Milena", "name": "Milena (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
            {"id": "Yuri", "name": "Yuri (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
        ]

    if _espeak_available():
        catalog["espeak"] = [
            {"id": "uk", "name": "Ukrainian (espeak-ng)", "lang": "uk-UA", "engine": "espeak-ng"},
            {"id": "en-us", "name": "English US (espeak-ng)", "lang": "en-US", "engine": "espeak-ng"},
        ]

    return catalog
|