feat(fabric): decommission Swapper from critical path, NCS = source of truth

- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API)
- Node Worker: add NATS subjects for stt/tts/image (stubs ready)
- Node Worker: remove SWAPPER_URL dependency from config
- Router: vision calls go directly to Ollama /api/generate with images
- Router: local LLM calls go directly to Ollama /api/generate
- Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag
- Router: /v1/models now uses NCS global capabilities pool
- NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled)
- Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits
- docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on
- docker-compose.node2-sofiia.yml: same changes for NODA2

Swapper service still runs but is NOT in the critical inference path. Source of truth for models is now NCS -> Ollama /api/tags.

Made-with: Cursor
2026-02-27 04:16:16 -08:00
parent 90080c632a
commit 194c87f53c
11 changed files with 347 additions and 614 deletions
--- a/services/swapper-service/config/swapper_config_node3.yaml
+++ b/services/swapper-service/config/swapper_config_node3.yaml
@@ -1,63 +1,37 @@
 # Swapper Configuration for Node #3 (AI/ML Workstation)
-# Single-active LLM scheduler
-# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads
+# Threadripper PRO + RTX 3090 24GB — GPU-intensive workloads
+#
+# NOTE: Swapper is now a runtime gateway / executor only.
+# Source of truth for models is NCS (Node Capabilities Service).
+# No hardcoded model lists.

-swapper:
-  mode: single-active
-  max_concurrent_models: 1
+node_id: noda3
+
+runtimes:
+  ollama:
+    url: http://localhost:11434
+    timeout: 300
+  comfyui:
+    url: http://127.0.0.1:8188
+
+limits:
+  llm_concurrency: 2
+  vision_concurrency: 1
+  max_concurrent_models: 2
  model_swap_timeout: 300
-  gpu_enabled: true
-  metal_acceleration: false  # NVIDIA GPU, not Apple Silicon
-  # Модель для автоматичного завантаження при старті
-  # qwen3-8b - основна модель (4.87 GB), швидка відповідь на перший запит
-  default_model: qwen3-8b

-models:
-  # Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
-  qwen3-8b:
-    path: ollama:qwen3:8b
-    type: llm
-    size_gb: 4.87
-    priority: high
-    description: "Primary LLM for general tasks and conversations"
-    
-  # Vision Model - Qwen3-VL 8B (High Priority) - For image processing
-  qwen3-vl-8b:
-    path: ollama:qwen3-vl:8b
-    type: vision
-    size_gb: 5.72
-    priority: high
-    description: "Vision model for image understanding and processing"
-    
-  # Qwen2.5 7B Instruct (High Priority)
-  qwen2.5-7b-instruct:
-    path: ollama:qwen2.5:7b-instruct-q4_K_M
-    type: llm
-    size_gb: 4.36
-    priority: high
-    description: "Qwen2.5 7B Instruct model"
-    
-  # Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
-  qwen2.5-3b-instruct:
-    path: ollama:qwen2.5:3b-instruct-q4_K_M
-    type: llm
-    size_gb: 1.80
-    priority: medium
-    description: "Lightweight LLM for faster responses"
-    
-  # Math Specialist - Qwen2 Math 7B (High Priority)
-  qwen2-math-7b:
-    path: ollama:qwen2-math:7b
-    type: math
-    size_gb: 4.13
-    priority: high
-    description: "Specialized model for mathematical tasks"
+timeouts:
+  llm_ms: 120000
+  vision_ms: 180000
+  image_gen_ms: 600000
+
+gpu:
+  enabled: true
+  metal_acceleration: false
+  auto_unload_on_oom: true
+  vram_threshold_gb: 22

 storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
-
-ollama:
-  url: http://ollama:11434  # From Docker container to Ollama service
-  timeout: 300