- TTS: xtts-v2 integration with voice cloning support
- Document: docling integration for PDF/DOCX/PPTX processing
- Memory Service: added /facts/upsert, /facts/{key}, /facts endpoints
- Added required dependencies (TTS, docling)
221 lines
6.2 KiB
YAML
# Swapper Configuration for Node #1 (Production Server)
# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
#
# IMPORTANT: embeddings are served by external APIs:
#   - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
#   - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
# Local embedding models are NOT used!

# Swapper runtime settings: how many models may be resident at once, which
# model is the default, and when to start unloading under VRAM pressure.
swapper:
  mode: multi-active
  # LLM + OCR + STT + TTS (up to 15GB)
  # NOTE(review): using the size_gb values declared under `models`,
  # qwen3-8b (5.2) + got-ocr2 (7.0) + faster-whisper-large (3.0) +
  # xtts-v2 (2.0) add up to 17.2 GB - confirm which combination the
  # 15GB estimate assumes.
  max_concurrent_models: 4
  model_swap_timeout: 300
  gpu_enabled: true
  # The file header describes an NVIDIA GPU host, so Metal stays disabled.
  metal_acceleration: false
  # Must match a key under `models`.
  default_model: qwen3-8b
  lazy_load_ocr: true
  lazy_load_audio: true
  # Automatic unloading when VRAM runs short
  auto_unload_on_oom: true
  # Start unloading at 18GB (the header lists 20GB of VRAM on this node)
  vram_threshold_gb: 18

# Model registry. Each key is a model id; every entry declares the same
# fields: path (backend-prefixed locator), type, size_gb, priority,
# description, and a capabilities list. `swapper.default_model` refers to
# one of these ids.
models:
  # ============================================
  # LLM MODELS (Ollama) - qwen3 only
  # ============================================

  # Primary LLM - Qwen3 8B (includes math, coding, reasoning)
  qwen3-8b:
    path: ollama:qwen3:8b
    type: llm
    size_gb: 5.2
    priority: high
    description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
    capabilities:
      - chat
      - math
      - coding
      - reasoning
      - multilingual

  # ============================================
  # VISION MODELS (Ollama)
  # ============================================

  # Vision Model - Qwen3-VL 8B
  qwen3-vl-8b:
    path: ollama:qwen3-vl:8b
    type: vision
    size_gb: 6.1
    priority: high
    description: "Qwen3-VL 8B for image understanding and visual reasoning"
    capabilities:
      - image_understanding
      - visual_qa
      - diagram_analysis
      - ocr_basic

  # ============================================
  # OCR/DOCUMENT MODELS (HuggingFace)
  # ============================================

  # GOT-OCR2.0 - Best for documents, tables, formulas
  got-ocr2:
    path: huggingface:stepfun-ai/GOT-OCR2_0
    type: ocr
    size_gb: 7.0
    priority: high
    description: "Best OCR for documents, tables, formulas, handwriting"
    capabilities:
      - documents
      - tables
      - formulas
      - handwriting
      - multilingual

  # Donut - Document Understanding (no external OCR, 91% CORD)
  donut-base:
    path: huggingface:naver-clova-ix/donut-base
    type: ocr
    size_gb: 3.0
    priority: high
    description: "Document parsing without OCR engine (91% CORD accuracy)"
    capabilities:
      - document_parsing
      - receipts
      - forms
      - invoices

  # Donut fine-tuned for receipts/invoices (CORD dataset)
  donut-cord:
    path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
    type: ocr
    size_gb: 3.0
    priority: medium
    description: "Donut fine-tuned for receipts extraction"
    capabilities:
      - receipts
      - invoices
      - structured_extraction

  # IBM Granite Docling - Document conversion with structure preservation
  granite-docling:
    path: huggingface:ds4sd/docling-ibm-granite-vision-1b
    type: document
    size_gb: 2.5
    priority: high
    description: "IBM Granite Docling for PDF/document structure extraction"
    capabilities:
      - pdf_conversion
      - table_extraction
      - formula_extraction
      - layout_preservation
      - doctags_format

  # ============================================
  # AUDIO MODELS - STT (Speech-to-Text)
  # ============================================

  # Faster Whisper Large-v3 - Best STT quality
  faster-whisper-large:
    path: huggingface:Systran/faster-whisper-large-v3
    type: stt
    size_gb: 3.0
    priority: high
    description: "Faster Whisper Large-v3 - best quality, 99 languages"
    capabilities:
      - speech_recognition
      - transcription
      - multilingual
      - timestamps
      - ukrainian

  # Whisper Small - Fast/lightweight for quick transcription
  whisper-small:
    path: huggingface:openai/whisper-small
    type: stt
    size_gb: 0.5
    priority: medium
    description: "Whisper Small for fast transcription"
    capabilities:
      - speech_recognition
      - transcription

  # ============================================
  # AUDIO MODELS - TTS (Text-to-Speech)
  # ============================================

  # Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
  # NOTE(review): the published XTTS-v2 language list (17 languages) does not
  # appear to include Ukrainian - confirm before routing on the `ukrainian`
  # capability below.
  xtts-v2:
    path: huggingface:coqui/XTTS-v2
    type: tts
    size_gb: 2.0
    priority: high
    description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
    capabilities:
      - text_to_speech
      - voice_cloning
      - multilingual
      - ukrainian
      # Quoted so the digit-leading tag is always read as a string.
      - '17_languages'

  # ============================================
  # IMAGE GENERATION MODELS (HuggingFace/Diffusers)
  # ============================================

  # FLUX.2 Klein 4B - High quality image generation with lazy loading
  # NOTE(review): at 15.4 GB this entry leaves about 2.6 GB below the
  # `swapper.vram_threshold_gb` value of 18, so it presumably cannot share
  # the GPU with the other high-priority models - confirm the unload policy.
  flux-klein-4b:
    path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
    type: image_generation
    size_gb: 15.4
    priority: medium
    description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
    capabilities:
      - text_to_image
      - high_quality
      # Quoted so the digit-leading tag is always read as a string.
      - '1024x1024'
      - artistic
    # Generation defaults; this is the only entry that declares them.
    default_params:
      num_inference_steps: 50
      guidance_scale: 4.0
      width: 1024
      height: 1024

# Filesystem locations used for model files, caches and swap data.
# NOTE(review): the /app and /root prefixes look like in-container paths -
# confirm they are backed by persistent volumes.
storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
  huggingface_cache: /root/.cache/huggingface

# Connection settings for the Ollama backend that serves the `ollama:` paths
# declared under `models`.
# NOTE(review): 172.18.0.1 is presumably a Docker bridge gateway address -
# verify it matches the network this service actually runs on.
ollama:
  url: http://172.18.0.1:11434
  timeout: 300

# Loader options applied to the `huggingface:` paths declared under `models`.
huggingface:
  device: cuda
  torch_dtype: float16
  # SECURITY: with trust_remote_code enabled, Python code shipped inside a
  # model repository is executed at load time. Keep the model list limited to
  # trusted publishers, and consider pinning revisions.
  trust_remote_code: true
  low_cpu_mem_usage: true

# ============================================
# EMBEDDING SERVICES (External APIs)
# NOT served through Swapper - these are separate services!
# ============================================
#
# Text Embeddings:
#   Service: Memory Service → Cohere API
#   Model: embed-multilingual-v3.0
#   Dimension: 1024
#   Endpoint: Memory Service handles internally
#
# Image/Multimodal Embeddings:
#   Service: Vision Encoder (port 8001)
#   Model: OpenCLIP ViT-L/14
#   Dimension: 768
#   Endpoint: http://vision-encoder:8001/embed
#
# Vector Storage:
#   Qdrant (port 6333) - separate collections for text vs image embeddings
#   IMPORTANT: do NOT mix embedding spaces in one collection!