feat(fabric): decommission Swapper from critical path, NCS = source of truth

- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API)
- Node Worker: add NATS subjects for stt/tts/image (stubs ready)
- Node Worker: remove SWAPPER_URL dependency from config
- Router: vision calls go directly to Ollama /api/generate with images
- Router: local LLM calls go directly to Ollama /api/generate
- Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag
- Router: /v1/models now uses NCS global capabilities pool
- NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled)
- Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits
- docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on
- docker-compose.node2-sofiia.yml: same changes for NODA2

Swapper service still runs but is NOT in the critical inference path. Source of truth for models is now NCS -> Ollama /api/tags.

Made-with: Cursor
2026-02-27 04:16:16 -08:00
parent 90080c632a
commit 194c87f53c
11 changed files with 347 additions and 614 deletions
--- a/services/swapper-service/config/swapper_config.yaml
+++ b/services/swapper-service/config/swapper_config.yaml
@@ -1,90 +1,35 @@
-# Swapper Configuration for Node #2 (Development Node)
-# Single-active LLM scheduler
-# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
-# Auto-generated configuration with available Ollama models
+# Swapper Configuration — Default / Fallback
+#
+# NOTE: Swapper is now a runtime gateway / executor only.
+# Source of truth for models is NCS (Node Capabilities Service).
+# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
+#
+# Per-node overrides: swapper_config_node1.yaml, swapper_config_node2.yaml, swapper_config_node3.yaml

-swapper:
-  mode: single-active
-  max_concurrent_models: 1
+node_id: default
+
+runtimes:
+  ollama:
+    url: http://localhost:11434
+    timeout: 300
+
+limits:
+  llm_concurrency: 2
+  vision_concurrency: 1
+  max_concurrent_models: 2
  model_swap_timeout: 300
-  gpu_enabled: true
-  metal_acceleration: true  # Apple Silicon GPU acceleration
-  # Модель для автоматичного завантаження при старті (опціонально)
-  # Якщо не вказано - моделі завантажуються тільки за запитом
-  # Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
-  default_model: gpt-oss:latest  # Модель активується автоматично при старті

-models:
-  # Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks
-  gpt-oss-latest:
-    path: ollama:gpt-oss:latest
-    type: llm
-    size_gb: 13.0
-    priority: high
-    description: "Fast LLM for general tasks and conversations (20.9B params)"
-    
-  # Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
-  phi3-latest:
-    path: ollama:phi3:latest
-    type: llm
-    size_gb: 2.2
-    priority: high
-    description: "Lightweight LLM for fast responses (3.8B params)"
-    
-  # Code Specialist - StarCoder2 3B (Medium Priority) - Code engineering
-  starcoder2-3b:
-    path: ollama:starcoder2:3b
-    type: code
-    size_gb: 1.7
-    priority: medium
-    description: "Code specialist model for code engineering (3B params)"
-    
-  # Reasoning Model - Mistral Nemo 12.2B (High Priority) - Advanced reasoning
-  mistral-nemo-12b:
-    path: ollama:mistral-nemo:12b
-    type: llm
-    size_gb: 7.1
-    priority: high
-    description: "Advanced reasoning model for complex tasks (12.2B params)"
-    
-  # Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
-  gemma2-27b:
-    path: ollama:gemma2:27b
-    type: llm
-    size_gb: 15.0
-    priority: medium
-    description: "Reasoning model for strategic tasks (27.2B params)"
-    
-  # Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
-  deepseek-coder-33b:
-    path: ollama:deepseek-coder:33b
-    type: code
-    size_gb: 18.0
-    priority: high
-    description: "Advanced code specialist model (33B params)"
-    
-  # Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
-  qwen2.5-coder-32b:
-    path: ollama:qwen2.5-coder:32b
-    type: code
-    size_gb: 19.0
-    priority: high
-    description: "Advanced code specialist model (32.8B params)"
-    
-  # Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
-  deepseek-r1-70b:
-    path: ollama:deepseek-r1:70b
-    type: llm
-    size_gb: 42.0
-    priority: high
-    description: "Strategic reasoning model (70.6B params, quantized)"
+timeouts:
+  llm_ms: 120000
+  vision_ms: 180000
+  stt_ms: 60000
+  tts_ms: 60000
+
+gpu:
+  enabled: false
+  metal_acceleration: false

 storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
-
-ollama:
-  url: http://localhost:11434  # Native Ollama on MacBook (via Pieces OS or brew)
-  timeout: 300
-
--- a/services/swapper-service/config/swapper_config_node1.yaml
+++ b/services/swapper-service/config/swapper_config_node1.yaml
@@ -1,186 +1,37 @@
 # Swapper Configuration for Node #1 (Production Server)
-# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
 # Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
 #
-# ВАЖЛИВО: Ембедінги через зовнішні API:
-#   - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
-#   - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
-#   НЕ використовуємо локальні embedding моделі!
+# NOTE: Swapper is now a runtime gateway / executor only.
+# Source of truth for models is NCS (Node Capabilities Service).
+# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.

-swapper:
-  mode: multi-active
-  max_concurrent_models: 4  # LLM + OCR + STT + TTS (до 15GB)
+node_id: noda1
+
+runtimes:
+  ollama:
+    url: http://172.18.0.1:11434
+    timeout: 300
+  # comfyui:
+  #   url: http://127.0.0.1:8188
+
+limits:
+  llm_concurrency: 2
+  vision_concurrency: 1
+  max_concurrent_models: 4
  model_swap_timeout: 300
-  gpu_enabled: true
+
+timeouts:
+  llm_ms: 120000
+  vision_ms: 180000
+  stt_ms: 60000
+  tts_ms: 60000
+  image_gen_ms: 300000
+
+gpu:
+  enabled: true
  metal_acceleration: false
-  default_model: qwen3-8b
-  lazy_load_ocr: true
-  lazy_load_audio: true
-  # Автоматичне вивантаження при нестачі VRAM
  auto_unload_on_oom: true
-  vram_threshold_gb: 18  # Починати вивантажувати при 18GB
-
-models:
-  # ============================================
-  # LLM MODELS (Ollama) - тільки qwen3
-  # ============================================
-  
-  # Primary LLM - Qwen3 8B (includes math, coding, reasoning)
-  qwen3-8b:
-    path: ollama:qwen3:8b
-    type: llm
-    size_gb: 5.2
-    priority: high
-    description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
-    capabilities:
-      - chat
-      - math
-      - coding
-      - reasoning
-      - multilingual
-
-  # ============================================
-  # VISION MODELS (Ollama)
-  # ============================================
-  
-  # Vision Model - Qwen3-VL 8B
-  qwen3-vl-8b:
-    path: ollama:qwen3-vl:8b
-    type: vision
-    size_gb: 6.1
-    priority: high
-    description: "Qwen3-VL 8B for image understanding and visual reasoning"
-    capabilities:
-      - image_understanding
-      - visual_qa
-      - diagram_analysis
-      - ocr_basic
-
-  # ============================================
-  # OCR/DOCUMENT MODELS (HuggingFace)
-  # ============================================
-  
-  # GOT-OCR2.0 - Best for documents, tables, formulas
-  got-ocr2:
-    path: huggingface:stepfun-ai/GOT-OCR2_0
-    type: ocr
-    size_gb: 7.0
-    priority: high
-    description: "Best OCR for documents, tables, formulas, handwriting"
-    capabilities:
-      - documents
-      - tables
-      - formulas
-      - handwriting
-      - multilingual
-    
-  # Donut - Document Understanding (no external OCR, 91% CORD)
-  donut-base:
-    path: huggingface:naver-clova-ix/donut-base
-    type: ocr
-    size_gb: 3.0
-    priority: high
-    description: "Document parsing without OCR engine (91% CORD accuracy)"
-    capabilities:
-      - document_parsing
-      - receipts
-      - forms
-      - invoices
-    
-  # Donut fine-tuned for receipts/invoices (CORD dataset)
-  donut-cord:
-    path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
-    type: ocr
-    size_gb: 3.0
-    priority: medium
-    description: "Donut fine-tuned for receipts extraction"
-    capabilities:
-      - receipts
-      - invoices
-      - structured_extraction
-
-  # IBM Granite Docling - Document conversion with structure preservation
-  granite-docling:
-    path: huggingface:ds4sd/docling-ibm-granite-vision-1b
-    type: document
-    size_gb: 2.5
-    priority: high
-    description: "IBM Granite Docling for PDF/document structure extraction"
-    capabilities:
-      - pdf_conversion
-      - table_extraction
-      - formula_extraction
-      - layout_preservation
-      - doctags_format
-
-  # ============================================
-  # AUDIO MODELS - STT (Speech-to-Text)
-  # ============================================
-  
-  # Faster Whisper Large-v3 - Best STT quality
-  faster-whisper-large:
-    path: huggingface:Systran/faster-whisper-large-v3
-    type: stt
-    size_gb: 3.0
-    priority: high
-    description: "Faster Whisper Large-v3 - best quality, 99 languages"
-    capabilities:
-      - speech_recognition
-      - transcription
-      - multilingual
-      - timestamps
-      - ukrainian
-    
-  # Whisper Small - Fast/lightweight for quick transcription
-  whisper-small:
-    path: huggingface:openai/whisper-small
-    type: stt
-    size_gb: 0.5
-    priority: medium
-    description: "Whisper Small for fast transcription"
-    capabilities:
-      - speech_recognition
-      - transcription
-
-  # ============================================
-  # AUDIO MODELS - TTS (Text-to-Speech)
-  # ============================================
-  
-  # Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
-  xtts-v2:
-    path: huggingface:coqui/XTTS-v2
-    type: tts
-    size_gb: 2.0
-    priority: high
-    description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
-    capabilities:
-      - text_to_speech
-      - voice_cloning
-      - multilingual
-      - ukrainian
-      - 17_languages
-
-  # ============================================
-  # IMAGE GENERATION MODELS (HuggingFace/Diffusers)
-  # ============================================
-  
-  # FLUX.2 Klein 4B - High quality image generation with lazy loading
-  flux-klein-4b:
-    path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
-    type: image_generation
-    size_gb: 15.4
-    priority: medium
-    description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
-    capabilities:
-      - text_to_image
-      - high_quality
-      - 1024x1024
-      - artistic
-    default_params:
-      num_inference_steps: 50
-      guidance_scale: 4.0
-      width: 1024
-      height: 1024
+  vram_threshold_gb: 18

 storage:
  models_dir: /app/models
@@ -188,33 +39,8 @@ storage:
  swap_dir: /app/swap
  huggingface_cache: /root/.cache/huggingface

-ollama:
-  url: http://172.18.0.1:11434
-  timeout: 300
-
 huggingface:
  device: cuda
  torch_dtype: float16
  trust_remote_code: true
  low_cpu_mem_usage: true
-
-# ============================================
-# EMBEDDING SERVICES (External APIs)
-# НЕ через Swapper - окремі сервіси!
-# ============================================
-# 
-# Text Embeddings:
-#   Service: Memory Service → Cohere API
-#   Model: embed-multilingual-v3.0
-#   Dimension: 1024
-#   Endpoint: Memory Service handles internally
-#
-# Image/Multimodal Embeddings:
-#   Service: Vision Encoder (port 8001)
-#   Model: OpenCLIP ViT-L/14
-#   Dimension: 768
-#   Endpoint: http://vision-encoder:8001/embed
-#
-# Vector Storage:
-#   Qdrant (port 6333) - separate collections for text vs image embeddings
-#   ВАЖЛИВО: НЕ змішувати embedding spaces в одній колекції!
--- a/services/swapper-service/config/swapper_config_node2.yaml
+++ b/services/swapper-service/config/swapper_config_node2.yaml
@@ -1,126 +1,40 @@
 # Swapper Configuration for Node #2 (Development Node)
-# Single-active LLM scheduler
 # MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
-# Auto-generated configuration with available Ollama models
+#
+# NOTE: Swapper is now a runtime gateway / executor only.
+# Source of truth for models is NCS (Node Capabilities Service).
+# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.

-swapper:
-  mode: single-active
+node_id: noda2
+
+runtimes:
+  ollama:
+    url: http://host.docker.internal:11434
+    timeout: 300
+  # mlx:
+  #   stt_model: whisper-large-v3-turbo
+  #   tts_model: kokoro-82m
+  # comfyui:
+  #   url: http://127.0.0.1:8188
+
+limits:
+  llm_concurrency: 1
+  vision_concurrency: 1
  max_concurrent_models: 1
  model_swap_timeout: 300
-  gpu_enabled: true
-  metal_acceleration: true  # Apple Silicon GPU acceleration
-  # Модель для автоматичного завантаження при старті (опціонально)
-  # Якщо не вказано - моделі завантажуються тільки за запитом
-  # Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
-  # Стартова модель має бути реально встановлена в Ollama на NODA2
-  default_model: qwen3:14b  # Модель активується автоматично при старті

-models:
-  # Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks
-  gpt-oss-latest:
-    path: ollama:gpt-oss:latest
-    type: llm
-    size_gb: 13.0
-    priority: high
-    description: "Fast LLM for general tasks and conversations (20.9B params)"
-    
-  # Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
-  phi3-latest:
-    path: ollama:phi3:latest
-    type: llm
-    size_gb: 2.2
-    priority: high
-    description: "Lightweight LLM for fast responses (3.8B params)"
-    
-  # General Reasoning - Qwen3 14B (High Priority)
-  qwen3-14b:
-    path: ollama:qwen3:14b
-    type: llm
-    size_gb: 9.3
-    priority: high
-    description: "Balanced local model for Sofiia and router fallback"
+timeouts:
+  llm_ms: 120000
+  vision_ms: 180000
+  stt_ms: 60000
+  tts_ms: 60000
+  image_gen_ms: 300000

-  # Reasoning Model - Qwen3.5 35B A3B (High Priority)
-  qwen3.5-35b-a3b:
-    path: ollama:qwen3.5:35b-a3b
-    type: llm
-    size_gb: 22.0
-    priority: high
-    description: "Large reasoning model for complex Sofiia requests"
-    
-  # Reasoning Model - GLM 4.7 Flash (High Priority) - Fast general model
-  glm-4.7-flash:
-    path: ollama:glm-4.7-flash:32k
-    type: llm
-    size_gb: 19.0
-    priority: high
-    description: "Multi-purpose reasoning model (fast context)"
-    
-  # Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
-  gemma2-27b:
-    path: ollama:gemma2:27b
-    type: llm
-    size_gb: 15.0
-    priority: medium
-    description: "Reasoning model for strategic tasks (27.2B params)"
-    
-  # Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
-  deepseek-coder-33b:
-    path: ollama:deepseek-coder:33b
-    type: code
-    size_gb: 18.0
-    priority: high
-    description: "Advanced code specialist model (33B params)"
-    
-  # Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
-  qwen2.5-coder-32b:
-    path: ollama:qwen2.5-coder:32b
-    type: code
-    size_gb: 19.0
-    priority: high
-    description: "Advanced code specialist model (32.8B params)"
-    
-  # Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
-  deepseek-r1-70b:
-    path: ollama:deepseek-r1:70b
-    type: llm
-    size_gb: 42.0
-    priority: high
-    description: "Strategic reasoning model (70.6B params, quantized)"
-
-  # Vision Model - LLaVA 13B (P0 Fix: NODA2 fallback vision)
-  # Available in Ollama on NODA2 — used until qwen3-vl:8b is installed
-  llava-13b:
-    path: ollama:llava:13b
-    type: vision
-    size_gb: 8.0
-    priority: high
-    description: "LLaVA 13B vision model (multimodal CLIP+LLM). P0 fallback until qwen3-vl:8b."
-    vision: true
-    ollama_model: "llava:13b"
-
-  # Vision Model - Qwen3-VL 8B (RECOMMENDED: install with: ollama pull qwen3-vl:8b)
-  # Better quality than llava:13b. Enable once installed.
-  # qwen3-vl-8b:
-  #   path: ollama:qwen3-vl:8b
-  #   type: vision
-  #   size_gb: 5.5
-  #   priority: high
-  #   description: "Qwen3-VL 8B — modern vision-language model (recommended)"
-  #   vision: true
-  #   ollama_model: "qwen3-vl:8b"
+gpu:
+  enabled: true
+  metal_acceleration: true

 storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
-
-ollama:
-  url: http://host.docker.internal:11434  # host.docker.internal → native Ollama on MacBook (NODA2 P1 fix)
-  timeout: 300
-
-# Vision endpoint configuration
-# /vision/models returns all models where vision: true
-vision:
-  default_model: llava-13b
-  ollama_base_url: http://host.docker.internal:11434
--- a/services/swapper-service/config/swapper_config_node3.yaml
+++ b/services/swapper-service/config/swapper_config_node3.yaml
@@ -1,63 +1,37 @@
 # Swapper Configuration for Node #3 (AI/ML Workstation)
-# Single-active LLM scheduler
-# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads
+# Threadripper PRO + RTX 3090 24GB — GPU-intensive workloads
+#
+# NOTE: Swapper is now a runtime gateway / executor only.
+# Source of truth for models is NCS (Node Capabilities Service).
+# No hardcoded model lists.

-swapper:
-  mode: single-active
-  max_concurrent_models: 1
+node_id: noda3
+
+runtimes:
+  ollama:
+    url: http://localhost:11434
+    timeout: 300
+  comfyui:
+    url: http://127.0.0.1:8188
+
+limits:
+  llm_concurrency: 2
+  vision_concurrency: 1
+  max_concurrent_models: 2
  model_swap_timeout: 300
-  gpu_enabled: true
-  metal_acceleration: false  # NVIDIA GPU, not Apple Silicon
-  # Модель для автоматичного завантаження при старті
-  # qwen3-8b - основна модель (4.87 GB), швидка відповідь на перший запит
-  default_model: qwen3-8b

-models:
-  # Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
-  qwen3-8b:
-    path: ollama:qwen3:8b
-    type: llm
-    size_gb: 4.87
-    priority: high
-    description: "Primary LLM for general tasks and conversations"
-    
-  # Vision Model - Qwen3-VL 8B (High Priority) - For image processing
-  qwen3-vl-8b:
-    path: ollama:qwen3-vl:8b
-    type: vision
-    size_gb: 5.72
-    priority: high
-    description: "Vision model for image understanding and processing"
-    
-  # Qwen2.5 7B Instruct (High Priority)
-  qwen2.5-7b-instruct:
-    path: ollama:qwen2.5:7b-instruct-q4_K_M
-    type: llm
-    size_gb: 4.36
-    priority: high
-    description: "Qwen2.5 7B Instruct model"
-    
-  # Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
-  qwen2.5-3b-instruct:
-    path: ollama:qwen2.5:3b-instruct-q4_K_M
-    type: llm
-    size_gb: 1.80
-    priority: medium
-    description: "Lightweight LLM for faster responses"
-    
-  # Math Specialist - Qwen2 Math 7B (High Priority)
-  qwen2-math-7b:
-    path: ollama:qwen2-math:7b
-    type: math
-    size_gb: 4.13
-    priority: high
-    description: "Specialized model for mathematical tasks"
+timeouts:
+  llm_ms: 120000
+  vision_ms: 180000
+  image_gen_ms: 600000
+
+gpu:
+  enabled: true
+  metal_acceleration: false
+  auto_unload_on_oom: true
+  vram_threshold_gb: 22

 storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
-
-ollama:
-  url: http://ollama:11434  # From Docker container to Ollama service
-  timeout: 300