feat(fabric): decommission Swapper from critical path, NCS = source of truth
- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API)
- Node Worker: add NATS subjects for stt/tts/image (stubs ready)
- Node Worker: remove SWAPPER_URL dependency from config
- Router: vision calls go directly to Ollama /api/generate with images
- Router: local LLM calls go directly to Ollama /api/generate
- Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag
- Router: /v1/models now uses NCS global capabilities pool
- NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled)
- Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits
- docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on
- docker-compose.node2-sofiia.yml: same changes for NODA2

Swapper service still runs but is NOT in the critical inference path.
Source of truth for models is now NCS -> Ollama /api/tags.

Made-with: Cursor
This commit is contained in:
@@ -1,63 +1,37 @@
|
||||
# Swapper Configuration for Node #3 (AI/ML Workstation)
|
||||
# Single-active LLM scheduler
|
||||
# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads
|
||||
# Threadripper PRO + RTX 3090 24GB — GPU-intensive workloads
|
||||
#
|
||||
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||
# Source of truth for models is NCS (Node Capabilities Service).
|
||||
# No hardcoded model lists.
|
||||
|
||||
swapper:
|
||||
mode: single-active
|
||||
max_concurrent_models: 1
|
||||
node_id: noda3
|
||||
|
||||
runtimes:
|
||||
ollama:
|
||||
url: http://localhost:11434
|
||||
timeout: 300
|
||||
comfyui:
|
||||
url: http://127.0.0.1:8188
|
||||
|
||||
limits:
|
||||
llm_concurrency: 2
|
||||
vision_concurrency: 1
|
||||
max_concurrent_models: 2
|
||||
model_swap_timeout: 300
|
||||
gpu_enabled: true
|
||||
metal_acceleration: false # NVIDIA GPU, not Apple Silicon
|
||||
# Model to load automatically at startup
|
||||
# qwen3-8b - primary model (4.87 GB), fast response to the first request
|
||||
default_model: qwen3-8b
|
||||
|
||||
models:
|
||||
# Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
|
||||
qwen3-8b:
|
||||
path: ollama:qwen3:8b
|
||||
type: llm
|
||||
size_gb: 4.87
|
||||
priority: high
|
||||
description: "Primary LLM for general tasks and conversations"
|
||||
|
||||
# Vision Model - Qwen3-VL 8B (High Priority) - For image processing
|
||||
qwen3-vl-8b:
|
||||
path: ollama:qwen3-vl:8b
|
||||
type: vision
|
||||
size_gb: 5.72
|
||||
priority: high
|
||||
description: "Vision model for image understanding and processing"
|
||||
|
||||
# Qwen2.5 7B Instruct (High Priority)
|
||||
qwen2.5-7b-instruct:
|
||||
path: ollama:qwen2.5:7b-instruct-q4_K_M
|
||||
type: llm
|
||||
size_gb: 4.36
|
||||
priority: high
|
||||
description: "Qwen2.5 7B Instruct model"
|
||||
|
||||
# Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
|
||||
qwen2.5-3b-instruct:
|
||||
path: ollama:qwen2.5:3b-instruct-q4_K_M
|
||||
type: llm
|
||||
size_gb: 1.80
|
||||
priority: medium
|
||||
description: "Lightweight LLM for faster responses"
|
||||
|
||||
# Math Specialist - Qwen2 Math 7B (High Priority)
|
||||
qwen2-math-7b:
|
||||
path: ollama:qwen2-math:7b
|
||||
type: math
|
||||
size_gb: 4.13
|
||||
priority: high
|
||||
description: "Specialized model for mathematical tasks"
|
||||
timeouts:
|
||||
llm_ms: 120000
|
||||
vision_ms: 180000
|
||||
image_gen_ms: 600000
|
||||
|
||||
gpu:
|
||||
enabled: true
|
||||
metal_acceleration: false
|
||||
auto_unload_on_oom: true
|
||||
vram_threshold_gb: 22
|
||||
|
||||
storage:
|
||||
models_dir: /app/models
|
||||
cache_dir: /app/cache
|
||||
swap_dir: /app/swap
|
||||
|
||||
ollama:
|
||||
url: http://ollama:11434 # From Docker container to Ollama service
|
||||
timeout: 300
|
||||
|
||||
Reference in New Issue
Block a user