feat: add STT service for voice message recognition

- Add STT service with Whisper support (faster-whisper, whisper CLI, OpenAI API)
- Update Gateway to handle Telegram voice/audio/video_note messages
- Add STT service to docker-compose.yml
- Gateway now converts voice → text → DAGI Router → text response
2025-11-15 12:43:41 -08:00
parent c78542c5ef
commit 65e33add81
6 changed files with 459 additions and 3 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -80,6 +80,8 @@ services:
      - "9300:9300"
    environment:
      - ROUTER_URL=http://router:9102
+      - MEMORY_SERVICE_URL=http://memory-service:8000
+      - STT_SERVICE_URL=http://stt-service:9000
      - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN:-}
      - DISCORD_BOT_TOKEN=${DISCORD_BOT_TOKEN:-}
      - DAARWIZZ_NAME=DAARWIZZ
@@ -88,6 +90,8 @@ services:
      - ./logs:/app/logs
    depends_on:
      - router
+      - memory-service
+      - stt-service
    networks:
      - dagi-network
    restart: unless-stopped
@@ -168,6 +172,28 @@ services:
      timeout: 10s
      retries: 3

+  # STT Service (Speech-to-Text using Whisper)
+  stt-service:
+    build:
+      context: ./services/stt-service
+      dockerfile: Dockerfile
+    container_name: dagi-stt-service
+    ports:
+      - "9000:9000"
+    environment:
+      - WHISPER_MODEL=${WHISPER_MODEL:-base}
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
+    volumes:
+      - ./logs:/app/logs
+    networks:
+      - dagi-network
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+
 networks:
  dagi-network:
    driver: bridge
--- a/gateway-bot/http_api.py
+++ b/gateway-bot/http_api.py
@@ -4,6 +4,7 @@ Handles incoming webhooks from Telegram, Discord, etc.
 """
 import logging
 import os
+import httpx
 from pathlib import Path
 from typing import Dict, Any, Optional
 from datetime import datetime
@@ -110,7 +111,6 @@ async def telegram_webhook(update: TelegramUpdate):
            raise HTTPException(status_code=400, detail="No message in update")
        
        # Extract message details
-        text = update.message.get("text", "")
        from_user = update.message.get("from", {})
        chat = update.message.get("chat", {})
        
@@ -121,6 +121,58 @@ async def telegram_webhook(update: TelegramUpdate):
        # Get DAO ID for this chat
        dao_id = get_dao_id(chat_id, "telegram")
        
+        # Check if it's a voice message
+        voice = update.message.get("voice")
+        audio = update.message.get("audio")
+        video_note = update.message.get("video_note")
+        
+        text = ""
+        
+        if voice or audio or video_note:
+            # Голосове повідомлення - розпізнаємо через STT
+            media_obj = voice or audio or video_note
+            file_id = media_obj.get("file_id") if media_obj else None
+            
+            if not file_id:
+                raise HTTPException(status_code=400, detail="No file_id in voice/audio/video_note")
+            
+            logger.info(f"Voice message from {username} (tg:{user_id}), file_id: {file_id}")
+            
+            try:
+                # Отримуємо файл з Telegram
+                file_path = await get_telegram_file_path(file_id)
+                if not file_path:
+                    raise HTTPException(status_code=400, detail="Failed to get file from Telegram")
+                
+                # Завантажуємо файл
+                file_url = f"https://api.telegram.org/file/bot{os.getenv('TELEGRAM_BOT_TOKEN')}/{file_path}"
+                async with httpx.AsyncClient(timeout=30.0) as client:
+                    file_resp = await client.get(file_url)
+                    file_resp.raise_for_status()
+                    audio_bytes = file_resp.content
+                
+                # Відправляємо в STT-сервіс
+                stt_service_url = os.getenv("STT_SERVICE_URL", "http://stt-service:9000")
+                files = {"file": ("voice.ogg", audio_bytes, "audio/ogg")}
+                
+                async with httpx.AsyncClient(timeout=60.0) as client:
+                    stt_resp = await client.post(f"{stt_service_url}/stt", files=files)
+                    stt_resp.raise_for_status()
+                    stt_data = stt_resp.json()
+                    text = stt_data.get("text", "")
+                
+                logger.info(f"STT result: {text[:100]}...")
+                
+            except Exception as e:
+                logger.error(f"STT processing failed: {e}", exc_info=True)
+                await send_telegram_message(chat_id, "Вибач, не вдалося розпізнати голосове повідомлення. Спробуй надіслати текстом.")
+                return {"ok": False, "error": "STT failed"}
+        else:
+            # Текстове повідомлення
+            text = update.message.get("text", "")
+            if not text:
+                raise HTTPException(status_code=400, detail="No text or voice in message")
+        
        logger.info(f"Telegram message from {username} (tg:{user_id}) in chat {chat_id}: {text[:50]}")
        
        # Fetch memory context
@@ -283,10 +335,30 @@ async def discord_webhook(message: DiscordMessage):
 # Helper Functions
 # ========================================

+async def get_telegram_file_path(file_id: str) -> Optional[str]:
+    """Отримати шлях до файлу з Telegram API"""
+    telegram_token = os.getenv("TELEGRAM_BOT_TOKEN")
+    if not telegram_token:
+        logger.error("TELEGRAM_BOT_TOKEN not set")
+        return None
+    
+    url = f"https://api.telegram.org/bot{telegram_token}/getFile"
+    params = {"file_id": file_id}
+    
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            response = await client.get(url, params=params)
+            response.raise_for_status()
+            data = response.json()
+            if data.get("ok"):
+                return data.get("result", {}).get("file_path")
+    except Exception as e:
+        logger.error(f"Error getting Telegram file: {e}")
+    return None
+
+
 async def send_telegram_message(chat_id: str, text: str):
    """Send message to Telegram chat"""
-    import httpx
-    
    telegram_token = os.getenv("TELEGRAM_BOT_TOKEN")
    if not telegram_token:
        logger.error("TELEGRAM_BOT_TOKEN not set")
--- a/services/stt-service/Dockerfile
+++ b/services/stt-service/Dockerfile
@@ -0,0 +1,25 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# System dependencies:
+#   ffmpeg - audio conversion before transcription
+#   curl   - used by the docker-compose healthcheck (not shipped in the slim image)
+RUN apt-get update && apt-get install -y \
+    curl \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first so the dependency layer is cached between code changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the application code
+COPY . .
+
+# Create the temporary working directory used by the service
+RUN mkdir -p /tmp/stt
+
+# Port the service listens on
+EXPOSE 9000
+
+# Start the application
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "9000"]
+
--- a/services/stt-service/README.md
+++ b/services/stt-service/README.md
@@ -0,0 +1,131 @@
+# STT Service (Speech-to-Text)
+
+Сервіс для розпізнавання мови з аудіо файлів за допомогою Whisper.
+
+## Можливості
+
+- Розпізнавання мови з голосових повідомлень (Telegram voice, audio, video_note)
+- Підтримка форматів: ogg, mp3, wav, m4a, webm
+- Автоматична конвертація в WAV 16kHz mono через ffmpeg
+- Підтримка кількох Whisper-реалізацій:
+  - `faster-whisper` (рекомендовано, локально)
+  - `whisper` CLI (fallback)
+  - OpenAI Whisper API (якщо є API key)
+
+## Запуск
+
+### Локально (development)
+
+```bash
+cd services/stt-service
+pip install -r requirements.txt
+uvicorn main:app --reload --host 0.0.0.0 --port 9000
+```
+
+### Docker
+
+```bash
+docker-compose up stt-service
+```
+
+## API
+
+### POST /stt
+
+Розпізнати мову з аудіо файлу.
+
+**Request:**
+- `file`: аудіо файл (multipart/form-data)
+
+**Response:**
+```json
+{
+  "text": "розпізнаний текст",
+  "language": "uk",
+  "duration": 5.2
+}
+```
+
+**Приклад:**
+```bash
+curl -X POST http://localhost:9000/stt \
+  -F "file=@voice.ogg"
+```
+
+### GET /health
+
+Health check endpoint.
+
+## Конфігурація
+
+### Environment Variables
+
+- `WHISPER_MODEL`: модель Whisper (`base`, `small`, `medium`, `large`) - за замовчуванням `base`
+- `OPENAI_API_KEY`: API ключ OpenAI (опційно, для використання OpenAI Whisper API)
+
+### Моделі Whisper
+
+- `base`: найшвидша, менша точність (~74M параметрів)
+- `small`: баланс швидкості та якості (~244M)
+- `medium`: краща якість (~769M)
+- `large`: найкраща якість (~1550M)
+
+Для української мови рекомендовано використовувати `small` або `medium`.
+
+## Інтеграція з Gateway
+
+Gateway автоматично використовує STT-сервіс для обробки голосових повідомлень з Telegram:
+
+1. Користувач надсилає voice/audio/video_note
+2. Gateway завантажує файл з Telegram
+3. Gateway відправляє файл в STT-сервіс
+4. STT повертає розпізнаний текст
+5. Текст відправляється в DAGI Router як звичайне текстове повідомлення
+
+## Встановлення залежностей
+
+### faster-whisper (рекомендовано)
+
+```bash
+pip install faster-whisper
+```
+
+Моделі завантажуються автоматично при першому використанні.
+
+### whisper CLI (fallback)
+
+```bash
+pip install openai-whisper
+```
+
+### ffmpeg (обов'язково)
+
+```bash
+# Ubuntu/Debian
+sudo apt-get install ffmpeg
+
+# macOS
+brew install ffmpeg
+
+# Docker: ffmpeg вже включено в Dockerfile, окремо встановлювати не потрібно
+```
+
+## Troubleshooting
+
+### Помилка: "No Whisper implementation available"
+
+Встановіть одну з реалізацій:
+- `pip install faster-whisper` (рекомендовано)
+- або `pip install openai-whisper`
+- або встановіть `OPENAI_API_KEY`
+
+### Помилка: "ffmpeg not found"
+
+Встановіть ffmpeg (див. вище).
+
+### Повільна обробка
+
+- Використовуйте меншу модель (`base` замість `medium`)
+- Або використовуйте GPU (додайте `device="cuda"` в коді)
+
--- a/services/stt-service/main.py
+++ b/services/stt-service/main.py
@@ -0,0 +1,196 @@
+"""
+STT Service (Speech-to-Text) для DAGI Router
+Використовує Whisper для розпізнавання голосу
+"""
+
+import os
+import uuid
+import subprocess
+import logging
+from pathlib import Path
+from typing import Optional
+
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+
+# Configure root logging at INFO level and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# FastAPI application object; the routes below are registered on it.
+app = FastAPI(
+    title="STT Service",
+    description="Speech-to-Text service using Whisper",
+    version="1.0.0"
+)
+
+# NOTE(review): CORS is fully open here (any origin, method and header, with
+# credentials allowed) - confirm this is intended for an internal service.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Configuration
+# WHISPER_MODEL is read once at import time; TEMP_DIR holds per-request
+# scratch files and is created here if it does not exist yet.
+WHISPER_MODEL = os.getenv("WHISPER_MODEL", "base")  # base, small, medium
+TEMP_DIR = Path("/tmp/stt")
+TEMP_DIR.mkdir(exist_ok=True)
+
+
+class STTResponse(BaseModel):
+    # Response body of POST /stt.
+    # text: the recognized text.
+    # language / duration: optional; either may be None when the
+    # transcription backend does not report it.
+    text: str
+    language: Optional[str] = None
+    duration: Optional[float] = None
+
+
+def convert_audio_to_wav(input_path: str, output_path: str) -> bool:
+    """Конвертувати аудіо в WAV 16kHz mono"""
+    try:
+        cmd = [
+            "ffmpeg", "-y", "-i", input_path,
+            "-ar", "16000",  # Sample rate
+            "-ac", "1",      # Mono
+            "-f", "wav",
+            output_path
+        ]
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=30
+        )
+        if result.returncode != 0:
+            logger.error(f"ffmpeg error: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        logger.error(f"Audio conversion failed: {e}")
+        return False
+
+
+def transcribe_with_whisper(audio_path: str) -> tuple[str, Optional[str], Optional[float]]:
+    """
+    Розпізнати мову з аудіо файлу
+    Повертає (text, language, duration)
+    """
+    try:
+        # Варіант 1: faster-whisper (рекомендовано)
+        try:
+            from faster_whisper import WhisperModel
+            model = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")
+            segments, info = model.transcribe(audio_path, language="uk", beam_size=5)
+            
+            text_parts = []
+            for segment in segments:
+                text_parts.append(segment.text)
+            
+            text = " ".join(text_parts).strip()
+            language = info.language
+            duration = sum(segment.end - segment.start for segment in segments)
+            
+            return text, language, duration
+        except ImportError:
+            logger.warning("faster-whisper not installed, trying whisper CLI")
+        
+        # Варіант 2: whisper CLI (fallback)
+        try:
+            cmd = ["whisper", audio_path, "--model", WHISPER_MODEL, "--language", "uk", "--output_format", "txt"]
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=60
+            )
+            if result.returncode == 0:
+                # Whisper CLI створює .txt файл з тим самим ім'ям
+                txt_path = audio_path.replace(".wav", ".txt")
+                if Path(txt_path).exists():
+                    text = Path(txt_path).read_text(encoding="utf-8").strip()
+                    return text, "uk", None
+        except FileNotFoundError:
+            logger.warning("whisper CLI not found")
+        
+        # Варіант 3: OpenAI Whisper API (якщо є API key)
+        openai_api_key = os.getenv("OPENAI_API_KEY")
+        if openai_api_key:
+            try:
+                import openai
+                client = openai.OpenAI(api_key=openai_api_key)
+                with open(audio_path, "rb") as audio_file:
+                    transcript = client.audio.transcriptions.create(
+                        model="whisper-1",
+                        file=audio_file,
+                        language="uk"
+                    )
+                return transcript.text, transcript.language, None
+            except Exception as e:
+                logger.warning(f"OpenAI Whisper API failed: {e}")
+        
+        raise Exception("No Whisper implementation available")
+        
+    except Exception as e:
+        logger.error(f"Transcription failed: {e}")
+        raise
+
+
+@app.post("/stt", response_model=STTResponse)
+async def stt(file: UploadFile = File(...)):
+    """
+    Розпізнати мову з аудіо файлу
+    
+    Підтримує формати: ogg, mp3, wav, m4a, webm
+    """
+    tmp_id = str(uuid.uuid4())
+    tmp_input = TEMP_DIR / f"{tmp_id}_input.{file.filename.split('.')[-1] if '.' in file.filename else 'ogg'}"
+    tmp_wav = TEMP_DIR / f"{tmp_id}.wav"
+    
+    try:
+        # Зберігаємо вхідний файл
+        content = await file.read()
+        tmp_input.write_bytes(content)
+        logger.info(f"Received audio file: {file.filename}, size: {len(content)} bytes")
+        
+        # Конвертуємо в WAV 16kHz
+        if not convert_audio_to_wav(str(tmp_input), str(tmp_wav)):
+            raise HTTPException(status_code=400, detail="Audio conversion failed")
+        
+        # Розпізнаємо мову
+        text, language, duration = transcribe_with_whisper(str(tmp_wav))
+        
+        logger.info(f"Transcribed: {text[:100]}... (lang: {language})")
+        
+        return STTResponse(
+            text=text,
+            language=language,
+            duration=duration
+        )
+        
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"STT error: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail=f"STT failed: {str(e)}")
+    finally:
+        # Очищаємо тимчасові файли
+        for path in [tmp_input, tmp_wav]:
+            if path.exists():
+                try:
+                    path.unlink()
+                except:
+                    pass
+
+
+@app.get("/health")
+async def health():
+    """Health check"""
+    return {
+        "status": "ok",
+        "service": "stt-service",
+        "model": WHISPER_MODEL
+    }
+
+
+# Allow running the service directly with `python main.py`; uvicorn is
+# imported lazily because it is only needed on this path.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=9000)
+
--- a/services/stt-service/requirements.txt
+++ b/services/stt-service/requirements.txt
@@ -0,0 +1,6 @@
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+python-multipart==0.0.6
+faster-whisper==1.0.0
+openai>=1.0.0
+