feat: implement TTS, Document processing, and Memory Service /facts API

- TTS: xtts-v2 integration with voice cloning support
- Document: docling integration for PDF/DOCX/PPTX processing
- Memory Service: added /facts/upsert, /facts/{key}, /facts endpoints
- Added required dependencies (TTS, docling)
2026-01-17 08:16:37 -08:00
parent a9fcadc6e2
commit 5290287058
121 changed files with 17071 additions and 436 deletions
--- a/services/swapper-service/config/swapper_config_node1.yaml
+++ b/services/swapper-service/config/swapper_config_node1.yaml
@@ -1,64 +1,220 @@
 # Swapper Configuration for Node #1 (Production Server)
-# Single-active LLM scheduler
+# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
 # Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
+#
+# IMPORTANT: Embeddings are served via external APIs:
+#   - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
+#   - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
+#   Local embedding models are NOT used!

 swapper:
-  mode: single-active
-  max_concurrent_models: 1
+  mode: multi-active
+  max_concurrent_models: 4  # LLM + OCR + STT + TTS (up to 15 GB; TODO confirm - largest listed combo sums to 17.2 GB)
  model_swap_timeout: 300
  gpu_enabled: true
-  metal_acceleration: false  # NVIDIA GPU, not Apple Silicon
-  # Модель для автоматичного завантаження при старті
-  # qwen3-8b - основна модель (4.87 GB), швидка відповідь на перший запит
+  metal_acceleration: false  # NVIDIA GPU, not Apple Silicon
  default_model: qwen3-8b
+  lazy_load_ocr: true
+  lazy_load_audio: true
+  # Automatically unload models when VRAM runs short
+  auto_unload_on_oom: true
+  vram_threshold_gb: 18  # Start unloading at 18 GB (the card has 20 GB VRAM)

 models:
-  # Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
+  # ============================================
+  # LLM MODELS (Ollama) - qwen3 only
+  # ============================================
+  
+  # Primary LLM - Qwen3 8B (includes math, coding, reasoning); referenced by swapper.default_model
  qwen3-8b:
    path: ollama:qwen3:8b
    type: llm
-    size_gb: 4.87
+    size_gb: 5.2
    priority: high
-    description: "Primary LLM for general tasks and conversations"
-    
-  # Vision Model - Qwen3-VL 8B (High Priority) - For image processing
+    description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
+    capabilities:
+      - chat
+      - math
+      - coding
+      - reasoning
+      - multilingual
+
+  # ============================================
+  # VISION MODELS (Ollama)
+  # ============================================
+  
+  # Vision Model - Qwen3-VL 8B. NOTE(review): vision is not in the "LLM + OCR + STT + TTS" note on max_concurrent_models - confirm its slot
  qwen3-vl-8b:
    path: ollama:qwen3-vl:8b
    type: vision
-    size_gb: 5.72
+    size_gb: 6.1
    priority: high
-    description: "Vision model for image understanding and processing"
-    
-  # Qwen2.5 7B Instruct (High Priority)
-  qwen2.5-7b-instruct:
-    path: ollama:qwen2.5:7b-instruct-q4_K_M
-    type: llm
-    size_gb: 4.36
+    description: "Qwen3-VL 8B for image understanding and visual reasoning"
+    capabilities:
+      - image_understanding
+      - visual_qa
+      - diagram_analysis
+      - ocr_basic
+
+  # ============================================
+  # OCR/DOCUMENT MODELS (HuggingFace)
+  # ============================================
+  
+  # GOT-OCR2.0 - Best for documents, tables, formulas
+  got-ocr2:
+    path: huggingface:stepfun-ai/GOT-OCR2_0
+    type: ocr
+    size_gb: 7.0  # largest OCR entry; together with qwen3-8b (5.2) this is 12.2 of the 18 GB vram_threshold_gb
    priority: high
-    description: "Qwen2.5 7B Instruct model"
+    description: "Best OCR for documents, tables, formulas, handwriting"
+    capabilities:
+      - documents
+      - tables
+      - formulas
+      - handwriting
+      - multilingual
    
-  # Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
-  qwen2.5-3b-instruct:
-    path: ollama:qwen2.5:3b-instruct-q4_K_M
-    type: llm
-    size_gb: 1.80
+  # Donut - Document Understanding (no external OCR, 91% CORD)
+  donut-base:
+    path: huggingface:naver-clova-ix/donut-base  # NOTE(review): base checkpoint; the 91% CORD figure presumably belongs to the fine-tuned donut-cord - confirm
+    type: ocr
+    size_gb: 3.0
+    priority: high
+    description: "Document parsing without OCR engine (91% CORD accuracy)"
+    capabilities:
+      - document_parsing
+      - receipts
+      - forms
+      - invoices
+    
+  # Donut fine-tuned for receipts/invoices (CORD v2 checkpoint); lower priority than donut-base
+  donut-cord:
+    path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
+    type: ocr
+    size_gb: 3.0
    priority: medium
-    description: "Lightweight LLM for faster responses"
-    
-  # Math Specialist - Qwen2 Math 7B (High Priority)
-  qwen2-math-7b:
-    path: ollama:qwen2-math:7b
-    type: math
-    size_gb: 4.13
+    description: "Donut fine-tuned for receipts extraction"
+    capabilities:
+      - receipts
+      - invoices
+      - structured_extraction
+
+  # IBM Granite Docling - Document conversion with structure preservation
+  granite-docling:
+    path: huggingface:ds4sd/docling-ibm-granite-vision-1b  # NOTE(review): verify this repo id resolves on the Hugging Face Hub
+    type: document  # only entry in this file with type "document"
+    size_gb: 2.5
    priority: high
-    description: "Specialized model for mathematical tasks"
+    description: "IBM Granite Docling for PDF/document structure extraction"
+    capabilities:
+      - pdf_conversion
+      - table_extraction
+      - formula_extraction
+      - layout_preservation
+      - doctags_format
+
+  # ============================================
+  # AUDIO MODELS - STT (Speech-to-Text)
+  # ============================================
+  
+  # Faster Whisper Large-v3 - Best STT quality
+  faster-whisper-large:
+    path: huggingface:Systran/faster-whisper-large-v3  # NOTE(review): presumably CTranslate2-format weights; confirm whether the huggingface: loader settings apply
+    type: stt
+    size_gb: 3.0
+    priority: high
+    description: "Faster Whisper Large-v3 - best quality, 99 languages"
+    capabilities:
+      - speech_recognition
+      - transcription
+      - multilingual
+      - timestamps
+      - ukrainian
+    
+  # Whisper Small - fast/lightweight STT alternative (0.5 GB, medium priority; faster-whisper-large is 3.0 GB, high priority)
+  whisper-small:
+    path: huggingface:openai/whisper-small
+    type: stt
+    size_gb: 0.5
+    priority: medium
+    description: "Whisper Small for fast transcription"
+    capabilities:
+      - speech_recognition
+      - transcription
+
+  # ============================================
+  # AUDIO MODELS - TTS (Text-to-Speech)
+  # ============================================
+  
+  # Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
+  xtts-v2:
+    path: huggingface:coqui/XTTS-v2
+    type: tts
+    size_gb: 2.0
+    priority: high
+    description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
+    capabilities:
+      - text_to_speech
+      - voice_cloning
+      - multilingual
+      - ukrainian  # NOTE(review): the upstream XTTS-v2 language list does not appear to include Ukrainian - verify before routing on this
+      - 17_languages
+
+  # ============================================
+  # IMAGE GENERATION MODELS (HuggingFace/Diffusers)
+  # ============================================
+  
+  # FLUX.2 Klein 4B - High quality image generation with lazy loading
+  flux-klein-4b:
+    path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
+    type: image_generation  # NOTE(review): swapper defines lazy_load_ocr/lazy_load_audio only - confirm what makes this type lazy
+    size_gb: 15.4  # NOTE(review): 15.4 + 5.2 (default qwen3-8b) = 20.6 GB > 20 GB card; presumably other models must be unloaded first - confirm
+    priority: medium
+    description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
+    capabilities:
+      - text_to_image
+      - high_quality
+      - 1024x1024
+      - artistic
+    default_params:
+      num_inference_steps: 50
+      guidance_scale: 4.0
+      width: 1024
+      height: 1024

 storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
+  huggingface_cache: /root/.cache/huggingface  # under /root, unlike the /app/* dirs above; presumably needs its own volume to persist - confirm

 ollama:
-  url: http://ollama:11434  # From Docker container to Ollama service
+  url: http://172.18.0.1:11434  # NOTE(review): hard-coded IP replaces the "ollama" service hostname; presumably a Docker bridge gateway - confirm it is stable
  timeout: 300

+huggingface:
+  device: cuda
+  torch_dtype: float16
+  trust_remote_code: true  # SECURITY: lets model repos run their own Python code at load time; presumably applies to every huggingface: model - consider pinning revisions
+  low_cpu_mem_usage: true
+
+# ============================================
+# EMBEDDING SERVICES (External APIs)
+# NOT served through Swapper - these are separate services!
+# ============================================
+# 
+# Text Embeddings:
+#   Service: Memory Service → Cohere API
+#   Model: embed-multilingual-v3.0
+#   Dimension: 1024
+#   Endpoint: Memory Service handles internally
+#
+# Image/Multimodal Embeddings:
+#   Service: Vision Encoder (port 8001)
+#   Model: OpenCLIP ViT-L/14
+#   Dimension: 768
+#   Endpoint: http://vision-encoder:8001/embed
+#
+# Vector Storage:
+#   Qdrant (port 6333) - separate collections for text vs image embeddings
+#   IMPORTANT: do NOT mix embedding spaces in a single collection!