---
# Swapper Configuration for Node #1 (Production Server)
# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
#
# IMPORTANT: Embeddings are served by external APIs:
#   - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
#   - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
# Local embedding models are NOT used!

swapper:
  mode: multi-active
  # Intended concurrent set: LLM + OCR + STT + TTS (up to 15GB).
  # NOTE(review): by the size_gb values below, qwen3-8b + got-ocr2 +
  # faster-whisper-large + xtts-v2 add up to 17.2GB, not 15GB - confirm which
  # model combination the 15GB figure refers to.
  max_concurrent_models: 4
  model_swap_timeout: 300
  gpu_enabled: true
  metal_acceleration: false
  # Must match a key under `models`.
  default_model: qwen3-8b
  lazy_load_ocr: true
  lazy_load_audio: true
  # Automatic unloading when VRAM runs short
  auto_unload_on_oom: true
  # Start unloading at 18GB (the card has 20GB)
  vram_threshold_gb: 18

models:
  # ============================================
  # LLM MODELS (Ollama) - qwen3 only
  # ============================================

  # Primary LLM - Qwen3 8B (includes math, coding, reasoning)
  qwen3-8b:
    path: ollama:qwen3:8b
    type: llm
    size_gb: 5.2
    priority: high
    description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
    capabilities:
      - chat
      - math
      - coding
      - reasoning
      - multilingual

  # ============================================
  # VISION MODELS (Ollama)
  # ============================================

  # Vision Model - Qwen3-VL 8B
  qwen3-vl-8b:
    path: ollama:qwen3-vl:8b
    type: vision
    size_gb: 6.1
    priority: high
    description: "Qwen3-VL 8B for image understanding and visual reasoning"
    capabilities:
      - image_understanding
      - visual_qa
      - diagram_analysis
      - ocr_basic

  # ============================================
  # OCR/DOCUMENT MODELS (HuggingFace)
  # ============================================

  # GOT-OCR2.0 - Best for documents, tables, formulas
  got-ocr2:
    path: huggingface:stepfun-ai/GOT-OCR2_0
    type: ocr
    size_gb: 7.0
    priority: high
    description: "Best OCR for documents, tables, formulas, handwriting"
    capabilities:
      - documents
      - tables
      - formulas
      - handwriting
      - multilingual

  # Donut - Document Understanding (no external OCR, 91% CORD)
  donut-base:
    path: huggingface:naver-clova-ix/donut-base
    type: ocr
    size_gb: 3.0
    priority: high
    description: "Document parsing without OCR engine (91% CORD accuracy)"
    capabilities:
      - document_parsing
      - receipts
      - forms
      - invoices

  # Donut fine-tuned for receipts/invoices (CORD dataset)
  donut-cord:
    path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
    type: ocr
    size_gb: 3.0
    priority: medium
    description: "Donut fine-tuned for receipts extraction"
    capabilities:
      - receipts
      - invoices
      - structured_extraction

  # IBM Granite Docling - Document conversion with structure preservation
  granite-docling:
    path: huggingface:ds4sd/docling-ibm-granite-vision-1b
    type: document
    size_gb: 2.5
    priority: high
    description: "IBM Granite Docling for PDF/document structure extraction"
    capabilities:
      - pdf_conversion
      - table_extraction
      - formula_extraction
      - layout_preservation
      - doctags_format

  # ============================================
  # AUDIO MODELS - STT (Speech-to-Text)
  # ============================================

  # Faster Whisper Large-v3 - Best STT quality
  faster-whisper-large:
    path: huggingface:Systran/faster-whisper-large-v3
    type: stt
    size_gb: 3.0
    priority: high
    description: "Faster Whisper Large-v3 - best quality, 99 languages"
    capabilities:
      - speech_recognition
      - transcription
      - multilingual
      - timestamps
      - ukrainian

  # Whisper Small - Fast/lightweight for quick transcription
  whisper-small:
    path: huggingface:openai/whisper-small
    type: stt
    size_gb: 0.5
    priority: medium
    description: "Whisper Small for fast transcription"
    capabilities:
      - speech_recognition
      - transcription

  # ============================================
  # AUDIO MODELS - TTS (Text-to-Speech)
  # ============================================

  # Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
  xtts-v2:
    path: huggingface:coqui/XTTS-v2
    type: tts
    size_gb: 2.0
    priority: high
    description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
    capabilities:
      - text_to_speech
      - voice_cloning
      - multilingual
      - ukrainian
      # Quoted: starts with digits, keep it an unambiguous string.
      - "17_languages"

  # ============================================
  # IMAGE GENERATION MODELS (HuggingFace/Diffusers)
  # ============================================

  # FLUX.2 Klein 4B - High quality image generation with lazy loading
  # NOTE(review): 15.4GB plus the default qwen3-8b (5.2GB) is 20.6GB, above the
  # 20GB card and the 18GB vram_threshold_gb; presumably this relies on
  # auto_unload_on_oom evicting other models first - confirm.
  flux-klein-4b:
    path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
    type: image_generation
    size_gb: 15.4
    priority: medium
    description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
    capabilities:
      - text_to_image
      - high_quality
      # Quoted: starts with digits, keep it an unambiguous string.
      - "1024x1024"
      - artistic
    default_params:
      num_inference_steps: 50
      guidance_scale: 4.0
      width: 1024
      height: 1024

storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
  huggingface_cache: /root/.cache/huggingface

ollama:
  url: http://172.18.0.1:11434
  timeout: 300

huggingface:
  device: cuda
  torch_dtype: float16
  # NOTE(review): security-sensitive. If this flag is forwarded to the
  # Hugging Face loaders, it lets code shipped inside a model repository run
  # on this host at load time. Confirm which models actually need it and
  # consider pinning their revisions.
  trust_remote_code: true
  low_cpu_mem_usage: true

# ============================================
# EMBEDDING SERVICES (External APIs)
# NOT routed through Swapper - these are separate services!
# ============================================
#
# Text Embeddings:
#   Service: Memory Service → Cohere API
#   Model: embed-multilingual-v3.0
#   Dimension: 1024
#   Endpoint: Memory Service handles internally
#
# Image/Multimodal Embeddings:
#   Service: Vision Encoder (port 8001)
#   Model: OpenCLIP ViT-L/14
#   Dimension: 768
#   Endpoint: http://vision-encoder:8001/embed
#
# Vector Storage:
#   Qdrant (port 6333) - separate collections for text vs image embeddings
#   IMPORTANT: do NOT mix embedding spaces in a single collection!