- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API) - Node Worker: add NATS subjects for stt/tts/image (stubs ready) - Node Worker: remove SWAPPER_URL dependency from config - Router: vision calls go directly to Ollama /api/generate with images - Router: local LLM calls go directly to Ollama /api/generate - Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag - Router: /v1/models now uses NCS global capabilities pool - NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled) - Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits - docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on - docker-compose.node2-sofiia.yml: same changes for NODA2 Swapper service still runs but is NOT in the critical inference path. Source of truth for models is now NCS -> Ollama /api/tags. Made-with: Cursor
47 lines
994 B
YAML
47 lines
994 B
YAML
# Swapper Configuration for Node #1 (Production Server)
|
|
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
|
|
#
|
|
# NOTE: Swapper is now a runtime gateway / executor only.
|
|
# Source of truth for models is NCS (Node Capabilities Service).
|
|
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
|
|
|
|
# Identifier of the node this configuration applies to.
node_id: noda1

# Runtime endpoints, one mapping per runtime.
runtimes:
  ollama:
    url: http://172.18.0.1:11434
    # NOTE(review): unit is not stated in this file — presumably seconds;
    # confirm against the code that reads it.
    timeout: 300
  # ComfyUI runtime is currently disabled; uncomment both lines to enable it.
  # comfyui:
  #   url: http://127.0.0.1:8188

# Concurrency caps per task type and for loaded models.
limits:
  llm_concurrency: 2
  vision_concurrency: 1
  max_concurrent_models: 4
  # NOTE(review): unit is not stated in this file — presumably seconds; confirm.
  model_swap_timeout: 300

# Per-task timeouts. The `_ms` suffix suggests milliseconds; the inline
# conversions below assume that.
timeouts:
  llm_ms: 120000  # 2 min
  vision_ms: 180000  # 3 min
  stt_ms: 60000  # 1 min
  tts_ms: 60000  # 1 min
  image_gen_ms: 300000  # 5 min

# GPU settings for this node.
gpu:
  enabled: true
  # Metal is Apple's GPU API; the file header lists an NVIDIA card for this node.
  metal_acceleration: false
  auto_unload_on_oom: true
  # NOTE(review): presumably the VRAM usage (in GB) above which models are
  # unloaded — confirm against the consumer.
  vram_threshold_gb: 18

# Filesystem paths for models, cache, swap space and the Hugging Face cache.
storage:
  models_dir: /app/models
  cache_dir: /app/cache
  swap_dir: /app/swap
  huggingface_cache: /root/.cache/huggingface

# Hugging Face model-loading options.
# NOTE(review): key names mirror `from_pretrained` arguments in transformers;
# presumably they are passed through — confirm against the consumer.
huggingface:
  device: cuda
  torch_dtype: float16
  # NOTE(review): if this is forwarded to transformers, it permits executing
  # Python code shipped inside a model repository — only load trusted models.
  trust_remote_code: true
  low_cpu_mem_usage: true