feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
680
services/memory-service/app/voice_endpoints.py
Normal file
680
services/memory-service/app/voice_endpoints.py
Normal file
@@ -0,0 +1,680 @@
|
||||
"""
|
||||
DAARION Memory Service — Voice Endpoints
|
||||
STT: faster-whisper (Docker/Linux) → mlx-audio (macOS) → whisper-cli
|
||||
TTS: edge-tts Python API (primary, pure Python, no ffmpeg needed)
|
||||
→ piper (fallback, if model present)
|
||||
→ espeak-ng (offline Linux fallback)
|
||||
→ macOS say (fallback, macOS-only)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(prefix="/voice", tags=["voice"])
|
||||
|
||||
MODELS_CACHE: dict = {}
|
||||
|
||||
# ── Prometheus metrics (optional — skip if not installed) ─────────────────────
# prometheus_client is an optional dependency: when it is missing the metric
# handles are set to None and _PROM_OK gates every observation helper below.
try:
    from prometheus_client import Counter, Histogram

    # TTS synthesis latency, labelled by engine and voice.
    _tts_compute_hist = Histogram(
        "voice_tts_compute_ms",
        "TTS synthesis compute time in ms",
        ["engine", "voice"],
        buckets=[50, 100, 250, 500, 1000, 2000, 5000],
    )
    # Size of the synthesized audio payload, labelled by engine.
    _tts_bytes_hist = Histogram(
        "voice_tts_audio_bytes",
        "TTS audio output size in bytes",
        ["engine"],
        buckets=[5000, 15000, 30000, 60000, 120000],
    )
    # TTS failure counter, labelled by engine and exception class name.
    _tts_errors_total = Counter(
        "voice_tts_errors_total",
        "TTS engine errors",
        ["engine", "error_type"],
    )
    # STT transcription latency, labelled by engine.
    _stt_compute_hist = Histogram(
        "voice_stt_compute_ms",
        "STT transcription time in ms",
        ["engine"],
        buckets=[200, 500, 1000, 2000, 5000, 10000],
    )
    _PROM_OK = True
except ImportError:
    _PROM_OK = False
    _tts_compute_hist = None
    _tts_bytes_hist = None
    _tts_errors_total = None
    _stt_compute_hist = None
|
||||
|
||||
|
||||
def _prom_tts_observe(engine: str, voice: str, ms: float, audio_bytes: int) -> None:
    """Record TTS latency and payload-size metrics; no-op without prometheus_client."""
    if _PROM_OK:
        try:
            _tts_compute_hist.labels(engine=engine, voice=voice).observe(ms)
            _tts_bytes_hist.labels(engine=engine).observe(audio_bytes)
        except Exception:
            # Metrics must never break the request path.
            pass
|
||||
|
||||
|
||||
def _prom_tts_error(engine: str, error_type: str) -> None:
    """Increment the TTS error counter; no-op without prometheus_client."""
    if _PROM_OK:
        try:
            _tts_errors_total.labels(engine=engine, error_type=error_type).inc()
        except Exception:
            # Metrics must never break the request path.
            pass
|
||||
|
||||
|
||||
def _prom_stt_observe(engine: str, ms: float) -> None:
    """Record STT transcription latency; no-op without prometheus_client."""
    if _PROM_OK:
        try:
            _stt_compute_hist.labels(engine=engine).observe(ms)
        except Exception:
            # Metrics must never break the request path.
            pass
|
||||
|
||||
# ── Voice mapping ─────────────────────────────────────────────────────────────
|
||||
# Maps UI voice id → edge-tts voice name
|
||||
_EDGE_VOICES: dict[str, str] = {
|
||||
"default": "uk-UA-PolinaNeural",
|
||||
"Polina": "uk-UA-PolinaNeural",
|
||||
"uk-UA-Polina": "uk-UA-PolinaNeural",
|
||||
"uk-UA-PolinaNeural": "uk-UA-PolinaNeural",
|
||||
"Ostap": "uk-UA-OstapNeural",
|
||||
"uk-UA-Ostap": "uk-UA-OstapNeural",
|
||||
"uk-UA-OstapNeural": "uk-UA-OstapNeural",
|
||||
# English voices — used for English-language segments
|
||||
"en-US-GuyNeural": "en-US-GuyNeural",
|
||||
"en-US-JennyNeural": "en-US-JennyNeural",
|
||||
"en": "en-US-GuyNeural",
|
||||
# macOS-only names: map to closest Ukrainian voice
|
||||
"Milena": "uk-UA-PolinaNeural",
|
||||
"Yuri": "uk-UA-OstapNeural",
|
||||
"af_heart": "uk-UA-PolinaNeural",
|
||||
}
|
||||
|
||||
def _edge_voice(name: str | None) -> str:
|
||||
"""Allow any valid edge-tts voice name to pass through directly."""
|
||||
n = name or "default"
|
||||
# If already a valid neural voice name (contains "Neural"), pass through
|
||||
if "Neural" in n or n == "en":
|
||||
return _EDGE_VOICES.get(n, n)
|
||||
return _EDGE_VOICES.get(n, "uk-UA-PolinaNeural")
|
||||
|
||||
|
||||
def _ffmpeg_available() -> bool:
|
||||
try:
|
||||
result = subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=3)
|
||||
return result.returncode == 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _espeak_available() -> bool:
|
||||
try:
|
||||
result = subprocess.run(["espeak-ng", "--version"], capture_output=True, timeout=3)
|
||||
return result.returncode == 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
class TTSRequest(BaseModel):
    """Request body for POST /voice/tts."""

    # Text to synthesize; the endpoint truncates it to 700 characters.
    text: str
    # UI voice id, resolved via _EDGE_VOICES (unknown ids fall back to Ukrainian).
    voice: Optional[str] = "default"
    # Speech rate multiplier (1.0 = normal speed).
    speed: Optional[float] = 1.0
    # Engine selector; not read by the /tts handler in this module — presumably
    # reserved for future use or consumed by a caller. TODO confirm.
    model: Optional[str] = "auto"
|
||||
|
||||
|
||||
# ── Status & Live Health ───────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/health")
|
||||
async def voice_health():
|
||||
"""Live health check — actually synthesizes a short test phrase via edge-tts.
|
||||
Returns edge_tts=ok/error with details; used by preflight to detect 403/blocked.
|
||||
"""
|
||||
import importlib.metadata
|
||||
import time
|
||||
|
||||
result: dict = {}
|
||||
|
||||
# edge-tts version
|
||||
try:
|
||||
ver = importlib.metadata.version("edge-tts")
|
||||
except Exception:
|
||||
ver = "unknown"
|
||||
result["edge_tts_version"] = ver
|
||||
|
||||
# Live synthesis test for each required Neural voice
|
||||
live_voices: list[dict] = []
|
||||
test_text = "Test" # Minimal — just enough to trigger actual API call
|
||||
for voice_id in ("uk-UA-PolinaNeural", "uk-UA-OstapNeural"):
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
import edge_tts
|
||||
comm = edge_tts.Communicate(test_text, voice_id)
|
||||
byte_count = 0
|
||||
async for chunk in comm.stream():
|
||||
if chunk["type"] == "audio":
|
||||
byte_count += len(chunk["data"])
|
||||
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
||||
live_voices.append({"voice": voice_id, "status": "ok",
|
||||
"bytes": byte_count, "ms": elapsed_ms})
|
||||
except Exception as e:
|
||||
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
||||
live_voices.append({"voice": voice_id, "status": "error",
|
||||
"error": str(e)[:150], "ms": elapsed_ms})
|
||||
|
||||
all_ok = all(v["status"] == "ok" for v in live_voices)
|
||||
result["edge_tts"] = "ok" if all_ok else "error"
|
||||
result["voices"] = live_voices
|
||||
|
||||
# STT check (import only — no actual transcription in health)
|
||||
try:
|
||||
import faster_whisper # noqa: F401
|
||||
result["faster_whisper"] = "ok"
|
||||
except ImportError:
|
||||
result["faster_whisper"] = "unavailable"
|
||||
|
||||
result["ok"] = all_ok
|
||||
|
||||
# ── Repro pack (incident diagnosis) ──────────────────────────────────────
|
||||
import os as _os
|
||||
import socket as _socket
|
||||
result["repro"] = {
|
||||
"node_id": _os.getenv("NODE_ID", _socket.gethostname()),
|
||||
"service_name": _os.getenv("MEMORY_SERVICE_NAME", "memory-service"),
|
||||
"image_digest": _os.getenv("IMAGE_DIGEST", "unknown"), # set via docker label
|
||||
"memory_service_url": _os.getenv("MEMORY_SERVICE_URL", "http://localhost:8000"),
|
||||
"tts_max_chars": 700,
|
||||
"canary_test_text": test_text,
|
||||
"canary_audio_bytes": {
|
||||
v["voice"]: v.get("bytes", 0) for v in live_voices
|
||||
},
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
@router.get("/status")
|
||||
async def voice_status():
|
||||
edge_ok = False
|
||||
try:
|
||||
import edge_tts # noqa: F401
|
||||
edge_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
espeak_ok = _espeak_available()
|
||||
|
||||
fw_ok = False
|
||||
try:
|
||||
import faster_whisper # noqa: F401
|
||||
fw_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
mlx_ok = False
|
||||
try:
|
||||
import mlx_audio # noqa: F401
|
||||
mlx_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
return {
|
||||
"available": True,
|
||||
"tts_engine": "edge-tts" if edge_ok else ("espeak-ng" if espeak_ok else "piper/say"),
|
||||
"stt_engine": ("faster-whisper" if fw_ok else "") + ("mlx-audio" if mlx_ok else ""),
|
||||
"edge_tts": edge_ok,
|
||||
"espeak_ng": espeak_ok,
|
||||
"faster_whisper": fw_ok,
|
||||
"mlx_audio": mlx_ok,
|
||||
"ffmpeg": _ffmpeg_available(),
|
||||
"voices": list(_EDGE_VOICES.keys()),
|
||||
}
|
||||
|
||||
|
||||
# ── TTS ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _tts_edge(text: str, voice_name: str, speed: float = 1.0) -> bytes:
    """
    edge-tts pure Python API — no subprocess, no ffmpeg.
    Returns MP3 bytes directly (browsers play MP3 natively).

    Raises RuntimeError when the stream yields no audio (e.g. blocked/empty
    response), so the caller's fallback chain can proceed.
    """
    import edge_tts
    # FIX: edge-tts expects a signed rate like "-10%" or "+10%". The previous
    # f"+{int(...)}%" produced the invalid string "+-10%" for speed < 1.0.
    # The ":+d" format emits the sign itself ("+0%" for speed == 1.0, matching
    # the old behavior for speed >= 1.0).
    rate_str = f"{int((speed - 1.0) * 50):+d}%"
    communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
    buf = io.BytesIO()
    # The stream interleaves audio and metadata events; keep only audio bytes.
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            buf.write(chunk["data"])
    data = buf.getvalue()
    if not data:
        raise RuntimeError("edge-tts returned empty audio")
    return data
|
||||
|
||||
|
||||
async def _tts_piper(text: str) -> bytes | None:
|
||||
"""Piper TTS — returns WAV bytes or None if unavailable."""
|
||||
model_path = os.path.expanduser("~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx")
|
||||
if not Path(model_path).exists():
|
||||
return None
|
||||
try:
|
||||
import piper as piper_mod
|
||||
voice = piper_mod.PiperVoice.load(model_path)
|
||||
buf = io.BytesIO()
|
||||
voice.synthesize(text, buf)
|
||||
buf.seek(0)
|
||||
data = buf.read()
|
||||
return data if data else None
|
||||
except Exception as e:
|
||||
logger.debug("Piper TTS failed: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
async def _tts_macos_say(text: str, voice: str = "Milena") -> bytes | None:
    """macOS say — only works outside Docker. Returns WAV bytes or None.

    Synthesizes to a temp AIFF, converts to 22.05 kHz mono WAV when ffmpeg is
    present, and cleans up its temp files on the success paths. Any failure
    (including `say` not existing on Linux) returns None so the caller's
    fallback chain continues.
    """
    try:
        tmp_id = uuid.uuid4().hex[:8]
        aiff_path = f"/tmp/tts_{tmp_id}.aiff"
        wav_path = f"/tmp/tts_{tmp_id}.wav"
        proc = await asyncio.create_subprocess_exec(
            "say", "-v", voice, "-o", aiff_path, text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await asyncio.wait_for(proc.wait(), timeout=15)
        if not Path(aiff_path).exists() or Path(aiff_path).stat().st_size == 0:
            return None
        # Convert to WAV only if ffmpeg available
        if _ffmpeg_available():
            subprocess.run(["ffmpeg", "-y", "-i", aiff_path, "-ar", "22050", "-ac", "1", wav_path],
                           capture_output=True, timeout=10)
            Path(aiff_path).unlink(missing_ok=True)
            if Path(wav_path).exists() and Path(wav_path).stat().st_size > 0:
                data = Path(wav_path).read_bytes()
                Path(wav_path).unlink(missing_ok=True)
                return data
            # NOTE(review): if the ffmpeg conversion fails, the AIFF has already
            # been unlinked above, so the read_bytes below raises and we return
            # None via the except — intended behavior, but worth confirming.
        # Return AIFF if no ffmpeg — most browsers won't play it but at least we tried
        data = Path(aiff_path).read_bytes()
        Path(aiff_path).unlink(missing_ok=True)
        return data if data else None
    except Exception as e:
        logger.debug("macOS say failed: %s", e)
        return None
|
||||
|
||||
|
||||
async def _tts_espeak(text: str, voice: str = "uk", speed: float = 1.0) -> bytes | None:
    """Offline espeak-ng fallback for Linux.

    Returns WAV bytes, or None when espeak-ng is absent, exits non-zero, or
    produces an empty file. The speed multiplier is mapped onto espeak's
    words-per-minute scale and clamped to a sane range.
    """
    if not _espeak_available():
        return None
    try:
        out_path = Path(f"/tmp/tts_espeak_{uuid.uuid4().hex[:8]}.wav")
        # 170 wpm is the baseline; clamp to [120, 240] regardless of speed.
        words_per_min = min(240, max(120, int((speed or 1.0) * 170)))
        proc = await asyncio.create_subprocess_exec(
            "espeak-ng", "-v", voice, "-s", str(words_per_min), "-w", str(out_path), text,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        _stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=10)
        if proc.returncode != 0:
            logger.debug("espeak-ng failed rc=%s stderr=%s", proc.returncode, (stderr or b"")[:200])
            return None
        if not out_path.exists() or out_path.stat().st_size == 0:
            return None
        audio = out_path.read_bytes()
        out_path.unlink(missing_ok=True)
        return audio if audio else None
    except Exception as e:
        logger.debug("espeak-ng TTS failed: %s", e)
        return None
|
||||
|
||||
|
||||
@router.post("/tts")
|
||||
async def text_to_speech(request: TTSRequest):
|
||||
"""
|
||||
TTS pipeline:
|
||||
1. edge-tts (primary — pure Python, returns MP3, works anywhere)
|
||||
2. piper (if model file present)
|
||||
3. espeak-ng (offline Linux fallback)
|
||||
4. macOS say (macOS-only fallback)
|
||||
"""
|
||||
import time as _time
|
||||
|
||||
text = (request.text or "").strip()[:700]
|
||||
if not text:
|
||||
raise HTTPException(400, "Empty text")
|
||||
|
||||
edge_voice = _edge_voice(request.voice)
|
||||
errors: list[str] = []
|
||||
|
||||
# ── 1. edge-tts (MP3, no ffmpeg needed) ──────────────────────────────
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(
|
||||
_tts_edge(text, edge_voice, speed=request.speed or 1.0),
|
||||
timeout=20.0,
|
||||
)
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
logger.info("TTS edge-tts OK: voice=%s len=%d ms=%d", edge_voice, len(data), _compute_ms)
|
||||
_prom_tts_observe("edge-tts", edge_voice, _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type="audio/mpeg",
|
||||
headers={"Content-Disposition": "inline; filename=speech.mp3",
|
||||
"X-TTS-Engine": "edge-tts",
|
||||
"X-TTS-Voice": edge_voice,
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("edge-tts", type(e).__name__)
|
||||
errors.append(f"edge-tts: {e}")
|
||||
logger.warning("edge-tts failed: %s", e)
|
||||
|
||||
# ── 2. piper ──────────────────────────────────────────────────────────
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(_tts_piper(text), timeout=15.0)
|
||||
if data:
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
logger.info("TTS piper OK len=%d ms=%d", len(data), _compute_ms)
|
||||
_prom_tts_observe("piper", "uk-UA", _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type="audio/wav",
|
||||
headers={"Content-Disposition": "inline; filename=speech.wav",
|
||||
"X-TTS-Engine": "piper",
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("piper", type(e).__name__)
|
||||
errors.append(f"piper: {e}")
|
||||
logger.debug("piper failed: %s", e)
|
||||
|
||||
# ── 3. espeak-ng (offline Linux) ─────────────────────────────────────
|
||||
espeak_voice = "en-us" if str(request.voice or "").startswith("en") else "uk"
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(_tts_espeak(text, espeak_voice, request.speed or 1.0), timeout=12.0)
|
||||
if data:
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
logger.info("TTS espeak-ng OK voice=%s len=%d ms=%d", espeak_voice, len(data), _compute_ms)
|
||||
_prom_tts_observe("espeak-ng", espeak_voice, _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type="audio/wav",
|
||||
headers={"Content-Disposition": "inline; filename=speech.wav",
|
||||
"X-TTS-Engine": "espeak-ng",
|
||||
"X-TTS-Voice": espeak_voice,
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("espeak-ng", type(e).__name__)
|
||||
errors.append(f"espeak-ng: {e}")
|
||||
logger.debug("espeak-ng failed: %s", e)
|
||||
|
||||
# ── 4. macOS say ──────────────────────────────────────────────────────
|
||||
say_voice = "Milena" if request.voice in (None, "default", "Polina", "Milena") else "Yuri"
|
||||
_t0 = _time.monotonic()
|
||||
try:
|
||||
data = await asyncio.wait_for(_tts_macos_say(text, say_voice), timeout=20.0)
|
||||
if data:
|
||||
_compute_ms = int((_time.monotonic() - _t0) * 1000)
|
||||
mime = "audio/wav" if data[:4] == b"RIFF" else "audio/aiff"
|
||||
logger.info("TTS macOS say OK voice=%s len=%d ms=%d", say_voice, len(data), _compute_ms)
|
||||
_prom_tts_observe("macos-say", say_voice, _compute_ms, len(data))
|
||||
return StreamingResponse(
|
||||
io.BytesIO(data),
|
||||
media_type=mime,
|
||||
headers={"Content-Disposition": f"inline; filename=speech.{'wav' if mime=='audio/wav' else 'aiff'}",
|
||||
"X-TTS-Engine": "macos-say",
|
||||
"X-TTS-Compute-MS": str(_compute_ms),
|
||||
"Cache-Control": "no-store"},
|
||||
)
|
||||
except Exception as e:
|
||||
_prom_tts_error("macos-say", type(e).__name__)
|
||||
errors.append(f"say: {e}")
|
||||
logger.debug("macOS say failed: %s", e)
|
||||
|
||||
logger.error("All TTS engines failed: %s", errors)
|
||||
raise HTTPException(503, f"All TTS engines failed: {'; '.join(errors)}")
|
||||
|
||||
|
||||
# ── STT ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def _convert_audio_to_wav(input_path: str, output_path: str) -> bool:
    """Convert audio to 16 kHz mono WAV using ffmpeg if available.

    Returns True on success (ffmpeg exited 0 and the output file exists),
    False otherwise.

    FIX: the previous implementation called blocking subprocess.run inside an
    async def, stalling the event loop for up to 30 s per conversion. This
    version uses asyncio's subprocess API; the interface and results are
    unchanged.
    """
    if not _ffmpeg_available():
        return False
    try:
        proc = await asyncio.create_subprocess_exec(
            "ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", output_path,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        try:
            await asyncio.wait_for(proc.wait(), timeout=30)
        except asyncio.TimeoutError:
            # Mirror subprocess.run(timeout=...) semantics: kill the child.
            proc.kill()
            return False
        return proc.returncode == 0 and Path(output_path).exists()
    except Exception:
        return False
|
||||
|
||||
|
||||
def _stt_faster_whisper_sync(wav_path: str, language: str | None) -> str:
    """faster-whisper STT — synchronous; run via an executor. Works in Docker/Linux.

    The model is loaded once per (model size) and memoized in MODELS_CACHE.
    Language defaults to Ukrainian when not supplied.
    """
    from faster_whisper import WhisperModel

    # 'small' gives better Ukrainian accuracy while staying fast on CPU.
    model_size = os.getenv("WHISPER_MODEL", "small")
    cache_key = f"faster_whisper_{model_size}"
    model = MODELS_CACHE.get(cache_key)
    if model is None:
        logger.info("Loading faster-whisper model=%s (first call)...", model_size)
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
        MODELS_CACHE[cache_key] = model
    segments, info = model.transcribe(wav_path, language=language or "uk", beam_size=5)
    text = " ".join(seg.text for seg in segments).strip()
    logger.info("faster-whisper OK: lang=%s text_len=%d", info.language, len(text))
    return text
|
||||
|
||||
|
||||
def _stt_mlx_audio_sync(wav_path: str, language: str | None) -> str:
    """mlx-audio STT — synchronous; run via an executor. macOS Apple Silicon only.

    The whisper model is loaded once and memoized in MODELS_CACHE.
    """
    from mlx_audio.stt.utils import load_model

    model = MODELS_CACHE.get("mlx_whisper")
    if model is None:
        logger.info("Loading mlx-audio whisper model (first call)...")
        model = load_model("mlx-community/whisper-large-v3-turbo-asr-fp16")
        MODELS_CACHE["mlx_whisper"] = model
    result = model.generate(wav_path, language=language)
    # generate() may return a result object with .text or a plain value.
    return result.text if hasattr(result, "text") else str(result)
|
||||
|
||||
|
||||
async def _stt_whisper_cli(wav_path: str, language: str | None) -> str:
    """Last-resort STT via the `whisper` CLI.

    The CLI writes its transcript as <stem>.txt into /tmp; callers pass
    /tmp/stt_wav_*.wav inputs, so with_suffix(".txt") resolves to that file —
    TODO confirm if inputs outside /tmp are ever passed.

    Raises RuntimeError when no transcript file is produced.

    FIX: the transcript file is now deleted after reading — previously every
    call leaked one .txt file into /tmp.
    """
    proc = await asyncio.create_subprocess_exec(
        "whisper", wav_path,
        "--language", language or "uk",
        "--model", "base",
        "--output_format", "txt",
        "--output_dir", "/tmp",
        stdout=asyncio.subprocess.DEVNULL,
        stderr=asyncio.subprocess.DEVNULL,
    )
    await asyncio.wait_for(proc.wait(), timeout=90)
    txt_path = Path(wav_path).with_suffix(".txt")
    if txt_path.exists():
        try:
            return txt_path.read_text().strip()
        finally:
            txt_path.unlink(missing_ok=True)
    raise RuntimeError("whisper CLI produced no output")
|
||||
|
||||
|
||||
@router.post("/stt")
|
||||
async def speech_to_text(
|
||||
audio: UploadFile = File(...),
|
||||
model: str = Query("auto", description="STT model: auto|faster-whisper|mlx-audio|whisper-cli"),
|
||||
language: Optional[str] = Query(None, description="Language code (auto-detect if None)"),
|
||||
):
|
||||
"""
|
||||
STT pipeline:
|
||||
1. Convert audio to WAV via ffmpeg (if available; skip if already WAV)
|
||||
2. faster-whisper (primary — Docker/Linux)
|
||||
3. mlx-audio (macOS Apple Silicon)
|
||||
4. whisper CLI (last resort)
|
||||
"""
|
||||
tmp_path: str | None = None
|
||||
wav_path: str | None = None
|
||||
|
||||
try:
|
||||
content = await audio.read()
|
||||
if not content:
|
||||
raise HTTPException(400, "Empty audio file")
|
||||
|
||||
# Detect MIME type
|
||||
fname = audio.filename or "audio.webm"
|
||||
suffix = Path(fname).suffix or ".webm"
|
||||
if audio.content_type and "wav" in audio.content_type:
|
||||
suffix = ".wav"
|
||||
elif audio.content_type and "ogg" in audio.content_type:
|
||||
suffix = ".ogg"
|
||||
|
||||
tmp_id = uuid.uuid4().hex[:8]
|
||||
tmp_path = f"/tmp/stt_in_{tmp_id}{suffix}"
|
||||
wav_path = f"/tmp/stt_wav_{tmp_id}.wav"
|
||||
|
||||
with open(tmp_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
# Convert to WAV (required by whisper models)
|
||||
converted = False
|
||||
if suffix == ".wav":
|
||||
import shutil
|
||||
shutil.copy(tmp_path, wav_path)
|
||||
converted = True
|
||||
else:
|
||||
converted = await _convert_audio_to_wav(tmp_path, wav_path)
|
||||
if not converted:
|
||||
# No ffmpeg — try to use input directly (faster-whisper accepts many formats)
|
||||
import shutil
|
||||
shutil.copy(tmp_path, wav_path)
|
||||
converted = True
|
||||
|
||||
if not Path(wav_path).exists():
|
||||
raise HTTPException(500, "Audio conversion failed — ffmpeg missing and no WAV input")
|
||||
|
||||
errors: list[str] = []
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
# ── 1. faster-whisper ─────────────────────────────────────────────
|
||||
if model in ("auto", "faster-whisper"):
|
||||
_t0_stt = asyncio.get_event_loop().time()
|
||||
try:
|
||||
_wpath = wav_path # capture for lambda
|
||||
_lang = language
|
||||
text = await asyncio.wait_for(
|
||||
loop.run_in_executor(None, _stt_faster_whisper_sync, _wpath, _lang),
|
||||
timeout=60.0,
|
||||
)
|
||||
_stt_ms = int((asyncio.get_event_loop().time() - _t0_stt) * 1000)
|
||||
_prom_stt_observe("faster-whisper", _stt_ms)
|
||||
return {"text": text, "model": "faster-whisper", "language": language,
|
||||
"compute_ms": _stt_ms}
|
||||
except Exception as e:
|
||||
errors.append(f"faster-whisper: {e}")
|
||||
logger.warning("faster-whisper failed: %s", e)
|
||||
|
||||
# ── 2. mlx-audio (macOS) ─────────────────────────────────────────
|
||||
if model in ("auto", "mlx-audio"):
|
||||
try:
|
||||
_wpath = wav_path
|
||||
_lang = language
|
||||
text = await asyncio.wait_for(
|
||||
loop.run_in_executor(None, _stt_mlx_audio_sync, _wpath, _lang),
|
||||
timeout=60.0,
|
||||
)
|
||||
return {"text": text, "model": "mlx-audio", "language": language}
|
||||
except Exception as e:
|
||||
errors.append(f"mlx-audio: {e}")
|
||||
logger.warning("mlx-audio failed: %s", e)
|
||||
|
||||
# ── 3. whisper CLI ────────────────────────────────────────────────
|
||||
if model in ("auto", "whisper-cli"):
|
||||
try:
|
||||
text = await asyncio.wait_for(
|
||||
_stt_whisper_cli(wav_path, language), timeout=90.0
|
||||
)
|
||||
return {"text": text, "model": "whisper-cli", "language": language}
|
||||
except Exception as e:
|
||||
errors.append(f"whisper-cli: {e}")
|
||||
logger.warning("whisper-cli failed: %s", e)
|
||||
|
||||
raise HTTPException(503, f"All STT engines failed: {'; '.join(str(e)[:80] for e in errors)}")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("STT error: %s", e)
|
||||
raise HTTPException(500, str(e)[:200])
|
||||
finally:
|
||||
for p in [tmp_path, wav_path]:
|
||||
if p:
|
||||
Path(p).unlink(missing_ok=True)
|
||||
|
||||
|
||||
# ── Voices list ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/voices")
|
||||
async def list_voices():
|
||||
edge_voices = []
|
||||
try:
|
||||
import edge_tts # noqa: F401
|
||||
edge_voices = [
|
||||
{"id": "default", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
|
||||
{"id": "Polina", "name": "Polina Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
|
||||
{"id": "Ostap", "name": "Ostap Neural (uk-UA)", "lang": "uk-UA", "engine": "edge-tts"},
|
||||
]
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
piper_voices = []
|
||||
if Path(os.path.expanduser("~/.local/share/piper-voices/uk-UA-low/uk-UA-low.onnx")).exists():
|
||||
piper_voices = [{"id": "uk-UA-low", "name": "Ukrainian Low (uk-UA)", "lang": "uk-UA", "engine": "piper"}]
|
||||
|
||||
macos_voices = []
|
||||
if os.path.exists("/usr/bin/say") or os.path.exists("/usr/local/bin/say"):
|
||||
macos_voices = [
|
||||
{"id": "Milena", "name": "Milena (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
|
||||
{"id": "Yuri", "name": "Yuri (uk-UA, macOS)", "lang": "uk-UA", "engine": "say"},
|
||||
]
|
||||
|
||||
espeak_voices = []
|
||||
if _espeak_available():
|
||||
espeak_voices = [
|
||||
{"id": "uk", "name": "Ukrainian (espeak-ng)", "lang": "uk-UA", "engine": "espeak-ng"},
|
||||
{"id": "en-us", "name": "English US (espeak-ng)", "lang": "en-US", "engine": "espeak-ng"},
|
||||
]
|
||||
|
||||
return {
|
||||
"edge": edge_voices,
|
||||
"piper": piper_voices,
|
||||
"macos": macos_voices,
|
||||
"espeak": espeak_voices,
|
||||
}
|
||||
Reference in New Issue
Block a user