feat(fabric): decommission Swapper from critical path, NCS = source of truth
- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API)
- Node Worker: add NATS subjects for stt/tts/image (stubs ready)
- Node Worker: remove SWAPPER_URL dependency from config
- Router: vision calls go directly to Ollama /api/generate with images
- Router: local LLM calls go directly to Ollama /api/generate
- Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag
- Router: /v1/models now uses NCS global capabilities pool
- NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled)
- Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits
- docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on
- docker-compose.node2-sofiia.yml: same changes for NODA2

Swapper service still runs but is NOT in the critical inference path. Source of truth for models is now NCS -> Ollama /api/tags.

Made-with: Cursor
This commit is contained in:
@@ -1,90 +1,35 @@
|
||||
# Swapper Configuration for Node #2 (Development Node)
|
||||
# Single-active LLM scheduler
|
||||
# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
|
||||
# Auto-generated configuration with available Ollama models
|
||||
# Swapper Configuration — Default / Fallback
|
||||
#
|
||||
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||
# Source of truth for models is NCS (Node Capabilities Service).
|
||||
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
|
||||
#
|
||||
# Per-node overrides: swapper_config_node1.yaml, swapper_config_node2.yaml
|
||||
|
||||
swapper:
|
||||
mode: single-active
|
||||
max_concurrent_models: 1
|
||||
node_id: default
|
||||
|
||||
runtimes:
|
||||
ollama:
|
||||
url: http://localhost:11434
|
||||
timeout: 300
|
||||
|
||||
limits:
|
||||
llm_concurrency: 2
|
||||
vision_concurrency: 1
|
||||
max_concurrent_models: 2
|
||||
model_swap_timeout: 300
|
||||
gpu_enabled: true
|
||||
metal_acceleration: true # Apple Silicon GPU acceleration
|
||||
# Модель для автоматичного завантаження при старті (опціонально)
|
||||
# Якщо не вказано - моделі завантажуються тільки за запитом
|
||||
# Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
|
||||
default_model: gpt-oss:latest # Модель активується автоматично при старті
|
||||
|
||||
models:
|
||||
# Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks
|
||||
gpt-oss-latest:
|
||||
path: ollama:gpt-oss:latest
|
||||
type: llm
|
||||
size_gb: 13.0
|
||||
priority: high
|
||||
description: "Fast LLM for general tasks and conversations (20.9B params)"
|
||||
|
||||
# Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
|
||||
phi3-latest:
|
||||
path: ollama:phi3:latest
|
||||
type: llm
|
||||
size_gb: 2.2
|
||||
priority: high
|
||||
description: "Lightweight LLM for fast responses (3.8B params)"
|
||||
|
||||
# Code Specialist - StarCoder2 3B (Medium Priority) - Code engineering
|
||||
starcoder2-3b:
|
||||
path: ollama:starcoder2:3b
|
||||
type: code
|
||||
size_gb: 1.7
|
||||
priority: medium
|
||||
description: "Code specialist model for code engineering (3B params)"
|
||||
|
||||
# Reasoning Model - Mistral Nemo 12.2B (High Priority) - Advanced reasoning
|
||||
mistral-nemo-12b:
|
||||
path: ollama:mistral-nemo:12b
|
||||
type: llm
|
||||
size_gb: 7.1
|
||||
priority: high
|
||||
description: "Advanced reasoning model for complex tasks (12.2B params)"
|
||||
|
||||
# Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
|
||||
gemma2-27b:
|
||||
path: ollama:gemma2:27b
|
||||
type: llm
|
||||
size_gb: 15.0
|
||||
priority: medium
|
||||
description: "Reasoning model for strategic tasks (27.2B params)"
|
||||
|
||||
# Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
|
||||
deepseek-coder-33b:
|
||||
path: ollama:deepseek-coder:33b
|
||||
type: code
|
||||
size_gb: 18.0
|
||||
priority: high
|
||||
description: "Advanced code specialist model (33B params)"
|
||||
|
||||
# Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
|
||||
qwen2.5-coder-32b:
|
||||
path: ollama:qwen2.5-coder:32b
|
||||
type: code
|
||||
size_gb: 19.0
|
||||
priority: high
|
||||
description: "Advanced code specialist model (32.8B params)"
|
||||
|
||||
# Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
|
||||
deepseek-r1-70b:
|
||||
path: ollama:deepseek-r1:70b
|
||||
type: llm
|
||||
size_gb: 42.0
|
||||
priority: high
|
||||
description: "Strategic reasoning model (70.6B params, quantized)"
|
||||
timeouts:
|
||||
llm_ms: 120000
|
||||
vision_ms: 180000
|
||||
stt_ms: 60000
|
||||
tts_ms: 60000
|
||||
|
||||
gpu:
|
||||
enabled: false
|
||||
metal_acceleration: false
|
||||
|
||||
storage:
|
||||
models_dir: /app/models
|
||||
cache_dir: /app/cache
|
||||
swap_dir: /app/swap
|
||||
|
||||
ollama:
|
||||
url: http://localhost:11434 # Native Ollama on MacBook (via Pieces OS or brew)
|
||||
timeout: 300
|
||||
|
||||
|
||||
@@ -1,186 +1,37 @@
|
||||
# Swapper Configuration for Node #1 (Production Server)
|
||||
# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
|
||||
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
|
||||
#
|
||||
# ВАЖЛИВО: Ембедінги через зовнішні API:
|
||||
# - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
|
||||
# - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
|
||||
# НЕ використовуємо локальні embedding моделі!
|
||||
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||
# Source of truth for models is NCS (Node Capabilities Service).
|
||||
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
|
||||
|
||||
swapper:
|
||||
mode: multi-active
|
||||
max_concurrent_models: 4 # LLM + OCR + STT + TTS (до 15GB)
|
||||
node_id: noda1
|
||||
|
||||
runtimes:
|
||||
ollama:
|
||||
url: http://172.18.0.1:11434
|
||||
timeout: 300
|
||||
# comfyui:
|
||||
# url: http://127.0.0.1:8188
|
||||
|
||||
limits:
|
||||
llm_concurrency: 2
|
||||
vision_concurrency: 1
|
||||
max_concurrent_models: 4
|
||||
model_swap_timeout: 300
|
||||
gpu_enabled: true
|
||||
|
||||
timeouts:
|
||||
llm_ms: 120000
|
||||
vision_ms: 180000
|
||||
stt_ms: 60000
|
||||
tts_ms: 60000
|
||||
image_gen_ms: 300000
|
||||
|
||||
gpu:
|
||||
enabled: true
|
||||
metal_acceleration: false
|
||||
default_model: qwen3-8b
|
||||
lazy_load_ocr: true
|
||||
lazy_load_audio: true
|
||||
# Автоматичне вивантаження при нестачі VRAM
|
||||
auto_unload_on_oom: true
|
||||
vram_threshold_gb: 18 # Починати вивантажувати при 18GB
|
||||
|
||||
models:
|
||||
# ============================================
|
||||
# LLM MODELS (Ollama) - тільки qwen3
|
||||
# ============================================
|
||||
|
||||
# Primary LLM - Qwen3 8B (includes math, coding, reasoning)
|
||||
qwen3-8b:
|
||||
path: ollama:qwen3:8b
|
||||
type: llm
|
||||
size_gb: 5.2
|
||||
priority: high
|
||||
description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
|
||||
capabilities:
|
||||
- chat
|
||||
- math
|
||||
- coding
|
||||
- reasoning
|
||||
- multilingual
|
||||
|
||||
# ============================================
|
||||
# VISION MODELS (Ollama)
|
||||
# ============================================
|
||||
|
||||
# Vision Model - Qwen3-VL 8B
|
||||
qwen3-vl-8b:
|
||||
path: ollama:qwen3-vl:8b
|
||||
type: vision
|
||||
size_gb: 6.1
|
||||
priority: high
|
||||
description: "Qwen3-VL 8B for image understanding and visual reasoning"
|
||||
capabilities:
|
||||
- image_understanding
|
||||
- visual_qa
|
||||
- diagram_analysis
|
||||
- ocr_basic
|
||||
|
||||
# ============================================
|
||||
# OCR/DOCUMENT MODELS (HuggingFace)
|
||||
# ============================================
|
||||
|
||||
# GOT-OCR2.0 - Best for documents, tables, formulas
|
||||
got-ocr2:
|
||||
path: huggingface:stepfun-ai/GOT-OCR2_0
|
||||
type: ocr
|
||||
size_gb: 7.0
|
||||
priority: high
|
||||
description: "Best OCR for documents, tables, formulas, handwriting"
|
||||
capabilities:
|
||||
- documents
|
||||
- tables
|
||||
- formulas
|
||||
- handwriting
|
||||
- multilingual
|
||||
|
||||
# Donut - Document Understanding (no external OCR, 91% CORD)
|
||||
donut-base:
|
||||
path: huggingface:naver-clova-ix/donut-base
|
||||
type: ocr
|
||||
size_gb: 3.0
|
||||
priority: high
|
||||
description: "Document parsing without OCR engine (91% CORD accuracy)"
|
||||
capabilities:
|
||||
- document_parsing
|
||||
- receipts
|
||||
- forms
|
||||
- invoices
|
||||
|
||||
# Donut fine-tuned for receipts/invoices (CORD dataset)
|
||||
donut-cord:
|
||||
path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
|
||||
type: ocr
|
||||
size_gb: 3.0
|
||||
priority: medium
|
||||
description: "Donut fine-tuned for receipts extraction"
|
||||
capabilities:
|
||||
- receipts
|
||||
- invoices
|
||||
- structured_extraction
|
||||
|
||||
# IBM Granite Docling - Document conversion with structure preservation
|
||||
granite-docling:
|
||||
path: huggingface:ds4sd/docling-ibm-granite-vision-1b
|
||||
type: document
|
||||
size_gb: 2.5
|
||||
priority: high
|
||||
description: "IBM Granite Docling for PDF/document structure extraction"
|
||||
capabilities:
|
||||
- pdf_conversion
|
||||
- table_extraction
|
||||
- formula_extraction
|
||||
- layout_preservation
|
||||
- doctags_format
|
||||
|
||||
# ============================================
|
||||
# AUDIO MODELS - STT (Speech-to-Text)
|
||||
# ============================================
|
||||
|
||||
# Faster Whisper Large-v3 - Best STT quality
|
||||
faster-whisper-large:
|
||||
path: huggingface:Systran/faster-whisper-large-v3
|
||||
type: stt
|
||||
size_gb: 3.0
|
||||
priority: high
|
||||
description: "Faster Whisper Large-v3 - best quality, 99 languages"
|
||||
capabilities:
|
||||
- speech_recognition
|
||||
- transcription
|
||||
- multilingual
|
||||
- timestamps
|
||||
- ukrainian
|
||||
|
||||
# Whisper Small - Fast/lightweight for quick transcription
|
||||
whisper-small:
|
||||
path: huggingface:openai/whisper-small
|
||||
type: stt
|
||||
size_gb: 0.5
|
||||
priority: medium
|
||||
description: "Whisper Small for fast transcription"
|
||||
capabilities:
|
||||
- speech_recognition
|
||||
- transcription
|
||||
|
||||
# ============================================
|
||||
# AUDIO MODELS - TTS (Text-to-Speech)
|
||||
# ============================================
|
||||
|
||||
# Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
|
||||
xtts-v2:
|
||||
path: huggingface:coqui/XTTS-v2
|
||||
type: tts
|
||||
size_gb: 2.0
|
||||
priority: high
|
||||
description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
|
||||
capabilities:
|
||||
- text_to_speech
|
||||
- voice_cloning
|
||||
- multilingual
|
||||
- ukrainian
|
||||
- 17_languages
|
||||
|
||||
# ============================================
|
||||
# IMAGE GENERATION MODELS (HuggingFace/Diffusers)
|
||||
# ============================================
|
||||
|
||||
# FLUX.2 Klein 4B - High quality image generation with lazy loading
|
||||
flux-klein-4b:
|
||||
path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
|
||||
type: image_generation
|
||||
size_gb: 15.4
|
||||
priority: medium
|
||||
description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
|
||||
capabilities:
|
||||
- text_to_image
|
||||
- high_quality
|
||||
- 1024x1024
|
||||
- artistic
|
||||
default_params:
|
||||
num_inference_steps: 50
|
||||
guidance_scale: 4.0
|
||||
width: 1024
|
||||
height: 1024
|
||||
vram_threshold_gb: 18
|
||||
|
||||
storage:
|
||||
models_dir: /app/models
|
||||
@@ -188,33 +39,8 @@ storage:
|
||||
swap_dir: /app/swap
|
||||
huggingface_cache: /root/.cache/huggingface
|
||||
|
||||
ollama:
|
||||
url: http://172.18.0.1:11434
|
||||
timeout: 300
|
||||
|
||||
huggingface:
|
||||
device: cuda
|
||||
torch_dtype: float16
|
||||
trust_remote_code: true
|
||||
low_cpu_mem_usage: true
|
||||
|
||||
# ============================================
|
||||
# EMBEDDING SERVICES (External APIs)
|
||||
# НЕ через Swapper - окремі сервіси!
|
||||
# ============================================
|
||||
#
|
||||
# Text Embeddings:
|
||||
# Service: Memory Service → Cohere API
|
||||
# Model: embed-multilingual-v3.0
|
||||
# Dimension: 1024
|
||||
# Endpoint: Memory Service handles internally
|
||||
#
|
||||
# Image/Multimodal Embeddings:
|
||||
# Service: Vision Encoder (port 8001)
|
||||
# Model: OpenCLIP ViT-L/14
|
||||
# Dimension: 768
|
||||
# Endpoint: http://vision-encoder:8001/embed
|
||||
#
|
||||
# Vector Storage:
|
||||
# Qdrant (port 6333) - separate collections for text vs image embeddings
|
||||
# ВАЖЛИВО: НЕ змішувати embedding spaces в одній колекції!
|
||||
|
||||
@@ -1,126 +1,40 @@
|
||||
# Swapper Configuration for Node #2 (Development Node)
|
||||
# Single-active LLM scheduler
|
||||
# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
|
||||
# Auto-generated configuration with available Ollama models
|
||||
#
|
||||
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||
# Source of truth for models is NCS (Node Capabilities Service).
|
||||
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
|
||||
|
||||
swapper:
|
||||
mode: single-active
|
||||
node_id: noda2
|
||||
|
||||
runtimes:
|
||||
ollama:
|
||||
url: http://host.docker.internal:11434
|
||||
timeout: 300
|
||||
# mlx:
|
||||
# stt_model: whisper-large-v3-turbo
|
||||
# tts_model: kokoro-82m
|
||||
# comfyui:
|
||||
# url: http://127.0.0.1:8188
|
||||
|
||||
limits:
|
||||
llm_concurrency: 1
|
||||
vision_concurrency: 1
|
||||
max_concurrent_models: 1
|
||||
model_swap_timeout: 300
|
||||
gpu_enabled: true
|
||||
metal_acceleration: true # Apple Silicon GPU acceleration
|
||||
# Модель для автоматичного завантаження при старті (опціонально)
|
||||
# Якщо не вказано - моделі завантажуються тільки за запитом
|
||||
# Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
|
||||
# Стартова модель має бути реально встановлена в Ollama на NODA2
|
||||
default_model: qwen3:14b # Модель активується автоматично при старті
|
||||
|
||||
models:
|
||||
# Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks
|
||||
gpt-oss-latest:
|
||||
path: ollama:gpt-oss:latest
|
||||
type: llm
|
||||
size_gb: 13.0
|
||||
priority: high
|
||||
description: "Fast LLM for general tasks and conversations (20.9B params)"
|
||||
|
||||
# Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
|
||||
phi3-latest:
|
||||
path: ollama:phi3:latest
|
||||
type: llm
|
||||
size_gb: 2.2
|
||||
priority: high
|
||||
description: "Lightweight LLM for fast responses (3.8B params)"
|
||||
|
||||
# General Reasoning - Qwen3 14B (High Priority)
|
||||
qwen3-14b:
|
||||
path: ollama:qwen3:14b
|
||||
type: llm
|
||||
size_gb: 9.3
|
||||
priority: high
|
||||
description: "Balanced local model for Sofiia and router fallback"
|
||||
timeouts:
|
||||
llm_ms: 120000
|
||||
vision_ms: 180000
|
||||
stt_ms: 60000
|
||||
tts_ms: 60000
|
||||
image_gen_ms: 300000
|
||||
|
||||
# Reasoning Model - Qwen3.5 35B A3B (High Priority)
|
||||
qwen3.5-35b-a3b:
|
||||
path: ollama:qwen3.5:35b-a3b
|
||||
type: llm
|
||||
size_gb: 22.0
|
||||
priority: high
|
||||
description: "Large reasoning model for complex Sofiia requests"
|
||||
|
||||
# Reasoning Model - GLM 4.7 Flash (High Priority) - Fast general model
|
||||
glm-4.7-flash:
|
||||
path: ollama:glm-4.7-flash:32k
|
||||
type: llm
|
||||
size_gb: 19.0
|
||||
priority: high
|
||||
description: "Multi-purpose reasoning model (fast context)"
|
||||
|
||||
# Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
|
||||
gemma2-27b:
|
||||
path: ollama:gemma2:27b
|
||||
type: llm
|
||||
size_gb: 15.0
|
||||
priority: medium
|
||||
description: "Reasoning model for strategic tasks (27.2B params)"
|
||||
|
||||
# Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
|
||||
deepseek-coder-33b:
|
||||
path: ollama:deepseek-coder:33b
|
||||
type: code
|
||||
size_gb: 18.0
|
||||
priority: high
|
||||
description: "Advanced code specialist model (33B params)"
|
||||
|
||||
# Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
|
||||
qwen2.5-coder-32b:
|
||||
path: ollama:qwen2.5-coder:32b
|
||||
type: code
|
||||
size_gb: 19.0
|
||||
priority: high
|
||||
description: "Advanced code specialist model (32.8B params)"
|
||||
|
||||
# Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
|
||||
deepseek-r1-70b:
|
||||
path: ollama:deepseek-r1:70b
|
||||
type: llm
|
||||
size_gb: 42.0
|
||||
priority: high
|
||||
description: "Strategic reasoning model (70.6B params, quantized)"
|
||||
|
||||
# Vision Model - LLaVA 13B (P0 Fix: NODA2 fallback vision)
|
||||
# Available in Ollama on NODA2 — used until qwen3-vl:8b is installed
|
||||
llava-13b:
|
||||
path: ollama:llava:13b
|
||||
type: vision
|
||||
size_gb: 8.0
|
||||
priority: high
|
||||
description: "LLaVA 13B vision model (multimodal CLIP+LLM). P0 fallback until qwen3-vl:8b."
|
||||
vision: true
|
||||
ollama_model: "llava:13b"
|
||||
|
||||
# Vision Model - Qwen3-VL 8B (RECOMMENDED: install with: ollama pull qwen3-vl:8b)
|
||||
# Better quality than llava:13b. Enable once installed.
|
||||
# qwen3-vl-8b:
|
||||
# path: ollama:qwen3-vl:8b
|
||||
# type: vision
|
||||
# size_gb: 5.5
|
||||
# priority: high
|
||||
# description: "Qwen3-VL 8B — modern vision-language model (recommended)"
|
||||
# vision: true
|
||||
# ollama_model: "qwen3-vl:8b"
|
||||
gpu:
|
||||
enabled: true
|
||||
metal_acceleration: true
|
||||
|
||||
storage:
|
||||
models_dir: /app/models
|
||||
cache_dir: /app/cache
|
||||
swap_dir: /app/swap
|
||||
|
||||
ollama:
|
||||
url: http://host.docker.internal:11434 # host.docker.internal → native Ollama on MacBook (NODA2 P1 fix)
|
||||
timeout: 300
|
||||
|
||||
# Vision endpoint configuration
|
||||
# /vision/models returns all models where vision: true
|
||||
vision:
|
||||
default_model: llava-13b
|
||||
ollama_base_url: http://host.docker.internal:11434
|
||||
|
||||
@@ -1,63 +1,37 @@
|
||||
# Swapper Configuration for Node #3 (AI/ML Workstation)
|
||||
# Single-active LLM scheduler
|
||||
# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads
|
||||
# Threadripper PRO + RTX 3090 24GB — GPU-intensive workloads
|
||||
#
|
||||
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||
# Source of truth for models is NCS (Node Capabilities Service).
|
||||
# No hardcoded model lists.
|
||||
|
||||
swapper:
|
||||
mode: single-active
|
||||
max_concurrent_models: 1
|
||||
node_id: noda3
|
||||
|
||||
runtimes:
|
||||
ollama:
|
||||
url: http://localhost:11434
|
||||
timeout: 300
|
||||
comfyui:
|
||||
url: http://127.0.0.1:8188
|
||||
|
||||
limits:
|
||||
llm_concurrency: 2
|
||||
vision_concurrency: 1
|
||||
max_concurrent_models: 2
|
||||
model_swap_timeout: 300
|
||||
gpu_enabled: true
|
||||
metal_acceleration: false # NVIDIA GPU, not Apple Silicon
|
||||
# Модель для автоматичного завантаження при старті
|
||||
# qwen3-8b - основна модель (4.87 GB), швидка відповідь на перший запит
|
||||
default_model: qwen3-8b
|
||||
|
||||
models:
|
||||
# Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
|
||||
qwen3-8b:
|
||||
path: ollama:qwen3:8b
|
||||
type: llm
|
||||
size_gb: 4.87
|
||||
priority: high
|
||||
description: "Primary LLM for general tasks and conversations"
|
||||
|
||||
# Vision Model - Qwen3-VL 8B (High Priority) - For image processing
|
||||
qwen3-vl-8b:
|
||||
path: ollama:qwen3-vl:8b
|
||||
type: vision
|
||||
size_gb: 5.72
|
||||
priority: high
|
||||
description: "Vision model for image understanding and processing"
|
||||
|
||||
# Qwen2.5 7B Instruct (High Priority)
|
||||
qwen2.5-7b-instruct:
|
||||
path: ollama:qwen2.5:7b-instruct-q4_K_M
|
||||
type: llm
|
||||
size_gb: 4.36
|
||||
priority: high
|
||||
description: "Qwen2.5 7B Instruct model"
|
||||
|
||||
# Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
|
||||
qwen2.5-3b-instruct:
|
||||
path: ollama:qwen2.5:3b-instruct-q4_K_M
|
||||
type: llm
|
||||
size_gb: 1.80
|
||||
priority: medium
|
||||
description: "Lightweight LLM for faster responses"
|
||||
|
||||
# Math Specialist - Qwen2 Math 7B (High Priority)
|
||||
qwen2-math-7b:
|
||||
path: ollama:qwen2-math:7b
|
||||
type: math
|
||||
size_gb: 4.13
|
||||
priority: high
|
||||
description: "Specialized model for mathematical tasks"
|
||||
timeouts:
|
||||
llm_ms: 120000
|
||||
vision_ms: 180000
|
||||
image_gen_ms: 600000
|
||||
|
||||
gpu:
|
||||
enabled: true
|
||||
metal_acceleration: false
|
||||
auto_unload_on_oom: true
|
||||
vram_threshold_gb: 22
|
||||
|
||||
storage:
|
||||
models_dir: /app/models
|
||||
cache_dir: /app/cache
|
||||
swap_dir: /app/swap
|
||||
|
||||
ollama:
|
||||
url: http://ollama:11434 # From Docker container to Ollama service
|
||||
timeout: 300
|
||||
|
||||
Reference in New Issue
Block a user