# microdao-daarion/services/stt-service/app/main.py

"""
STT Service - Speech-to-Text для DAARION
Конвертує аудіо файли в текст використовуючи Whisper AI
"""

import asyncio
import base64
import functools
import json
import logging
import os
import subprocess
import tempfile
from typing import Optional

from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# Logging: INFO-level root configuration plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object; the route decorators in this module register on it.
app = FastAPI(
    title="STT Service",
    description="Speech-to-Text Service для DAARION (Whisper AI)",
    version="1.0.0"
)

# CORS: every origin, method and header is allowed, with credentials enabled.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Configuration, read once from the environment at import time.
WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")  # tiny, base, small, medium, large
LANGUAGE = os.getenv("WHISPER_LANGUAGE", "uk")  # Ukrainian

class STTRequest(BaseModel):
    """Request body for ``POST /api/stt``.

    ``language`` and ``model`` default to ``None`` so that an omitted field
    falls through to the service-wide ``WHISPER_LANGUAGE`` / ``WHISPER_MODEL``
    configuration in the handler (``request.model or WHISPER_MODEL``).
    Hard-coded defaults here would silently override that configuration.
    """

    # Base64-encoded audio; may carry a data-URL prefix
    # such as "data:audio/webm;base64,".
    audio: str
    # Language code passed to Whisper; None means "use the configured default".
    language: Optional[str] = None
    # Whisper model name; None means "use the configured default".
    model: Optional[str] = None

class STTResponse(BaseModel):
    """Response body of ``POST /api/stt`` (declared as that route's ``response_model``)."""

    text: str  # transcribed text
    language: str  # language code that was passed to Whisper
    duration: float  # the /api/stt handler currently always sends 0.0 (see its TODO)
    model: str  # Whisper model name that was used
    confidence: Optional[float] = None  # the /api/stt handler currently always sends None

@app.get("/")
async def root():
    """Health check"""
    return {
        "service": "STT Service",
        "status": "running",
        "model": WHISPER_MODEL,
        "language": LANGUAGE,
        "version": "1.0.0"
    }

@app.get("/health")
async def health():
    """Health check endpoint"""
    try:
        # Перевірити чи Whisper доступний
        result = subprocess.run(
            ["whisper", "--help"],
            capture_output=True,
            text=True,
            timeout=5
        )
        whisper_available = result.returncode == 0

        return {
            "status": "healthy" if whisper_available else "degraded",
            "whisper": "available" if whisper_available else "unavailable",
            "model": WHISPER_MODEL
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        return {
            "status": "unhealthy",
            "error": str(e)
        }

# Upper bound, in seconds, for a single Whisper CLI run.
_WHISPER_TIMEOUT_SECONDS = 60


def _remove_file(path: str) -> None:
    """Best-effort removal of a temporary file; a missing file is not an error."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        # Cleanup problems must never mask the real outcome of the request.
        logger.warning("Could not remove temporary file %s: %s", path, exc)


async def _transcribe_audio(audio_bytes: bytes, suffix: str, model: str, language: str) -> str:
    """Write audio to a temp file, run the Whisper CLI on it and return the text.

    Args:
        audio_bytes: Raw audio content.
        suffix: File extension (including the dot, may be empty) for the temp file.
        model: Value passed to ``whisper --model``.
        language: Value passed to ``whisper --language``.

    Returns:
        The stripped ``text`` field of Whisper's JSON output.

    Raises:
        HTTPException: 408 when the CLI exceeds the timeout, 500 when it exits
            with a non-zero status.
    """
    output_dir = tempfile.gettempdir()
    temp_audio = tempfile.NamedTemporaryFile(suffix=suffix, dir=output_dir, delete=False)
    audio_path = temp_audio.name
    # The JSON result is expected at <output_dir>/<input stem>.json. Derive it
    # with splitext: str.replace() breaks when the extension is empty.
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    json_path = os.path.join(output_dir, stem + ".json")

    try:
        with temp_audio:
            temp_audio.write(audio_bytes)

        logger.info(f"🎤 Running Whisper (model={model}, language={language})")

        # NOTE(review): on /api/stt, model and language are client-supplied.
        # They are passed as list arguments (no shell), but consider
        # validating them against an allow-list.
        cmd = [
            "whisper",
            audio_path,
            "--model", model,
            "--language", language,
            "--output_format", "json",
            "--output_dir", output_dir,
        ]
        run_cli = functools.partial(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=_WHISPER_TIMEOUT_SECONDS,
        )
        try:
            # subprocess.run blocks; run it in the executor so the event loop
            # keeps serving other requests while Whisper works.
            result = await asyncio.get_running_loop().run_in_executor(None, run_cli)
        except subprocess.TimeoutExpired as exc:
            raise HTTPException(status_code=408, detail="Whisper timeout") from exc

        if result.returncode != 0:
            error_msg = result.stderr or "Whisper failed"
            logger.error(f"❌ Whisper error: {error_msg}")
            raise HTTPException(status_code=500, detail=f"Whisper error: {error_msg}")

        with open(json_path, "r", encoding="utf-8") as f:
            whisper_result = json.load(f)
    finally:
        # Always remove both temp files, whichever step failed.
        _remove_file(audio_path)
        _remove_file(json_path)

    return whisper_result.get("text", "").strip()


@app.post("/api/stt", response_model=STTResponse)
async def speech_to_text(request: STTRequest):
    """
    Convert base64-encoded audio to text.

    Body:
    {
        "audio": "data:audio/webm;base64,...",
        "language": "uk",
        "model": "base"
    }

    Raises:
        HTTPException: 400 for invalid or empty audio, 408 on Whisper timeout,
            500 for any other failure.
    """
    try:
        logger.info("📥 Received STT request")

        # Drop an optional data-URL prefix. Base64 never contains a comma, so
        # the payload is whatever follows the last one.
        audio_data = request.audio.rpartition(",")[2]

        try:
            audio_bytes = base64.b64decode(audio_data)
        except ValueError as exc:  # binascii.Error is a ValueError subclass
            raise HTTPException(status_code=400, detail="Invalid base64 audio data") from exc
        if not audio_bytes:
            raise HTTPException(status_code=400, detail="Empty audio data")
        logger.info(f"📊 Audio size: {len(audio_bytes)} bytes")

        # Fall back to the service-wide configuration for unset fields.
        model = request.model or WHISPER_MODEL
        language = request.language or LANGUAGE

        text = await _transcribe_audio(audio_bytes, ".webm", model, language)

        logger.info(f"✅ Transcribed: '{text[:50]}...'")

        return STTResponse(
            text=text,
            language=language,
            duration=0.0,  # TODO: obtain from Whisper
            model=model,
            confidence=None
        )

    except HTTPException:
        # Keep deliberate status codes (400/408/500) instead of rewrapping as 500.
        raise
    except Exception as e:
        logger.error(f"❌ STT error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.post("/api/stt/upload")
async def stt_upload(file: UploadFile = File(...)):
    """
    Convert an uploaded audio file to text.

    Form-data:
    - file: audio file (webm, mp3, wav, m4a)

    Uses the service-wide Whisper model and language.

    Raises:
        HTTPException: 400 for an empty file, 408 on Whisper timeout,
            500 for any other failure.
    """
    try:
        logger.info(f"📥 Received file upload: {file.filename}")

        content = await file.read()
        logger.info(f"📊 File size: {len(content)} bytes")
        if not content:
            raise HTTPException(status_code=400, detail="Empty audio file")

        # The filename may be missing; then the temp file gets no extension.
        suffix = os.path.splitext(file.filename or "")[1]

        text = await _transcribe_audio(content, suffix, WHISPER_MODEL, LANGUAGE)

        logger.info(f"✅ Transcribed: '{text[:50]}...'")

        return {
            "text": text,
            "filename": file.filename,
            "language": LANGUAGE,
            "model": WHISPER_MODEL
        }

    except HTTPException:
        # Keep deliberate status codes (400/408/500) instead of rewrapping as 500.
        raise
    except Exception as e:
        logger.error(f"❌ Upload STT error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e

if __name__ == "__main__":
    # Direct execution: serve the app on all interfaces, port 8895.
    from uvicorn import run as run_server

    run_server(app, host="0.0.0.0", port=8895)