# microdao-daarion/services/swapper-service/config/swapper_config_node1.yaml

# Swapper Configuration for Node #1 (Production Server)
# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
#
# IMPORTANT: embeddings are served by external APIs:
#   - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
#   - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
#   Local embedding models are NOT used!

# Global Swapper settings for this node.
swapper:
  # --- Scheduling ---
  mode: 'multi-active'
  # Same name as the `qwen3-8b` entry under `models:`.
  default_model: 'qwen3-8b'
  # Slot budget: LLM + OCR + STT + TTS (up to 15GB).
  # NOTE(review): with got-ocr2 in the OCR slot the size_gb values listed under
  # `models:` add up to 17.2 (5.2 + 7.0 + 3.0 + 2.0) — confirm the 15GB figure.
  max_concurrent_models: 4
  model_swap_timeout: 300  # presumably seconds — TODO confirm

  # --- Hardware ---
  gpu_enabled: true
  metal_acceleration: false

  # --- Lazy loading switches for the OCR and audio model families ---
  lazy_load_ocr: true
  lazy_load_audio: true

  # --- Automatic unloading when VRAM runs short ---
  auto_unload_on_oom: true
  vram_threshold_gb: 18  # start unloading at 18GB

# Model catalogue: one entry per model name.
models:
  # ============================================
  # LLM MODELS (Ollama) - qwen3 only
  # ============================================

  # Primary LLM: Qwen3 8B (covers math, coding, reasoning).
  # This is the name referenced by swapper.default_model.
  qwen3-8b:
    # Quoted because the value itself contains colons.
    path: 'ollama:qwen3:8b'
    type: llm
    size_gb: 5.2
    priority: high
    description: 'Qwen3 8B - primary LLM with math, coding, reasoning capabilities'
    capabilities:
      - chat
      - math
      - coding
      - reasoning
      - multilingual

  # ============================================
  # VISION MODELS (Ollama)
  # ============================================

  # Vision model: Qwen3-VL 8B, pulled through Ollama like the primary LLM.
  qwen3-vl-8b:
    # Quoted because the value itself contains colons.
    path: 'ollama:qwen3-vl:8b'
    type: vision
    size_gb: 6.1
    priority: high
    description: 'Qwen3-VL 8B for image understanding and visual reasoning'
    capabilities:
      - image_understanding
      - visual_qa
      - diagram_analysis
      - ocr_basic

  # ============================================
  # OCR/DOCUMENT MODELS (HuggingFace)
  # ============================================

  # GOT-OCR2.0: documents, tables, formulas.
  # Largest OCR entry in this file by size_gb (7.0).
  got-ocr2:
    path: 'huggingface:stepfun-ai/GOT-OCR2_0'
    type: ocr
    size_gb: 7.0
    priority: high
    description: 'Best OCR for documents, tables, formulas, handwriting'
    capabilities:
      - documents
      - tables
      - formulas
      - handwriting
      - multilingual

  # Donut (base checkpoint): document understanding without an external OCR
  # engine; the 91% CORD figure is the one quoted in the description below.
  donut-base:
    path: 'huggingface:naver-clova-ix/donut-base'
    type: ocr
    size_gb: 3.0
    priority: high
    description: 'Document parsing without OCR engine (91% CORD accuracy)'
    capabilities:
      - document_parsing
      - receipts
      - forms
      - invoices

  # Donut checkpoint fine-tuned on the CORD dataset (receipts / invoices).
  # Same size_gb as donut-base, but medium priority.
  donut-cord:
    path: 'huggingface:naver-clova-ix/donut-base-finetuned-cord-v2'
    type: ocr
    size_gb: 3.0
    priority: medium
    description: 'Donut fine-tuned for receipts extraction'
    capabilities:
      - receipts
      - invoices
      - structured_extraction

  # IBM Granite Docling: document conversion that preserves structure.
  # The only entry in this file with type `document` (the others above are `ocr`).
  granite-docling:
    # NOTE(review): confirm this repo id resolves on the Hugging Face Hub.
    path: 'huggingface:ds4sd/docling-ibm-granite-vision-1b'
    type: document
    size_gb: 2.5
    priority: high
    description: 'IBM Granite Docling for PDF/document structure extraction'
    capabilities:
      - pdf_conversion
      - table_extraction
      - formula_extraction
      - layout_preservation
      - doctags_format

  # ============================================
  # AUDIO MODELS - STT (Speech-to-Text)
  # ============================================

  # Faster Whisper Large-v3: the high-priority STT entry.
  faster-whisper-large:
    path: 'huggingface:Systran/faster-whisper-large-v3'
    type: stt
    size_gb: 3.0
    priority: high
    description: 'Faster Whisper Large-v3 - best quality, 99 languages'
    capabilities:
      - speech_recognition
      - transcription
      - multilingual
      - timestamps
      - ukrainian

  # Whisper Small: lightweight STT entry (0.5 GB) for quick transcription.
  whisper-small:
    path: 'huggingface:openai/whisper-small'
    type: stt
    size_gb: 0.5
    priority: medium
    description: 'Whisper Small for fast transcription'
    capabilities:
      - speech_recognition
      - transcription

  # ============================================
  # AUDIO MODELS - TTS (Text-to-Speech)
  # ============================================

  # Coqui XTTS-v2: multilingual TTS entry, including Ukrainian.
  xtts-v2:
    path: 'huggingface:coqui/XTTS-v2'
    type: tts
    size_gb: 2.0
    priority: high
    description: 'XTTS-v2 multilingual TTS with voice cloning, Ukrainian support'
    capabilities:
      - text_to_speech
      - voice_cloning
      - multilingual
      - ukrainian
      # Quoted: starts with digits, keep it unambiguously a string.
      - '17_languages'

  # ============================================
  # IMAGE GENERATION MODELS (HuggingFace/Diffusers)
  # ============================================

  # FLUX.2 Klein 4B: image generation, described as lazy loaded on demand.
  # NOTE(review): at 15.4 GB this entry alone sits close to
  # swapper.vram_threshold_gb (18) — confirm other models are unloaded first.
  flux-klein-4b:
    path: 'huggingface:black-forest-labs/FLUX.2-klein-base-4B'
    type: image_generation
    size_gb: 15.4
    priority: medium
    description: 'FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand'
    capabilities:
      - text_to_image
      - high_quality
      # Quoted: starts with digits, keep it unambiguously a string.
      - '1024x1024'
      - artistic
    # Generation defaults; width/height match the 1024x1024 capability above.
    default_params:
      num_inference_steps: 50
      guidance_scale: 4.0
      width: 1024
      height: 1024

# Filesystem locations for model files, caches and swap data.
storage:
  models_dir: '/app/models'
  cache_dir: '/app/cache'
  swap_dir: '/app/swap'
  # Lives under /root, unlike the /app paths above.
  huggingface_cache: '/root/.cache/huggingface'

# Connection settings for the Ollama endpoint.
ollama:
  # NOTE(review): hard-coded IP; looks like a Docker bridge gateway address —
  # confirm it is stable on this host.
  url: 'http://172.18.0.1:11434'
  timeout: 300  # presumably seconds — TODO confirm

# Loader settings; presumably applied to the `huggingface:`-prefixed model
# paths under `models:` — confirm against the loader code.
huggingface:
  device: 'cuda'
  torch_dtype: 'float16'
  # NOTE(review): if this maps to the Transformers `trust_remote_code` flag, it
  # lets code shipped inside a model repository run at load time — confirm
  # every repo listed under `models:` is trusted (ideally pinned to a revision).
  trust_remote_code: true
  low_cpu_mem_usage: true

# ============================================
# EMBEDDING SERVICES (External APIs)
# NOT served through Swapper - these are separate services!
# ============================================
#
# Text Embeddings:
#   Service: Memory Service → Cohere API
#   Model: embed-multilingual-v3.0
#   Dimension: 1024
#   Endpoint: Memory Service handles internally
#
# Image/Multimodal Embeddings:
#   Service: Vision Encoder (port 8001)
#   Model: OpenCLIP ViT-L/14
#   Dimension: 768
#   Endpoint: http://vision-encoder:8001/embed
#
# Vector Storage:
#   Qdrant (port 6333) - separate collections for text vs image embeddings
#   IMPORTANT: do NOT mix embedding spaces within a single collection!