feat(fabric): decommission Swapper from critical path, NCS = source of truth

- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API)
- Node Worker: add NATS subjects for stt/tts/image (stubs ready)
- Node Worker: remove SWAPPER_URL dependency from config
- Router: vision calls go directly to Ollama /api/generate with images
- Router: local LLM calls go directly to Ollama /api/generate
- Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag
- Router: /v1/models now uses NCS global capabilities pool
- NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled)
- Swapper configs: remove all hardcoded model lists, keep only runtime
  URLs, timeouts, limits
- docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router;
  SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on
- docker-compose.node2-sofiia.yml: same changes for NODA2

Swapper service still runs but is NOT in the critical inference path.
Source of truth for models is now NCS -> Ollama /api/tags.

Made-with: Cursor
This commit is contained in:
Apple
2026-02-27 04:16:16 -08:00
parent 90080c632a
commit 194c87f53c
11 changed files with 347 additions and 614 deletions

View File

@@ -48,9 +48,11 @@ services:
- ROUTER_TOOL_MAX_ROUNDS=${ROUTER_TOOL_MAX_ROUNDS:-10} - ROUTER_TOOL_MAX_ROUNDS=${ROUTER_TOOL_MAX_ROUNDS:-10}
- AGROMATRIX_REVIEW_AUTH_MODE=${AGROMATRIX_REVIEW_AUTH_MODE:-bearer} - AGROMATRIX_REVIEW_AUTH_MODE=${AGROMATRIX_REVIEW_AUTH_MODE:-bearer}
- AGROMATRIX_REVIEW_BEARER_TOKENS=${AGROMATRIX_REVIEW_BEARER_TOKENS} - AGROMATRIX_REVIEW_BEARER_TOKENS=${AGROMATRIX_REVIEW_BEARER_TOKENS}
# ── Node Capabilities (multi-node model selection) ── # ── Fabric Layer (NCS + Node Worker, Swapper being decommissioned) ──
- NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities - NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
- ENABLE_GLOBAL_CAPS_NATS=true - ENABLE_GLOBAL_CAPS_NATS=true
- OLLAMA_URL=http://172.18.0.1:11434
- PREFER_NODE_WORKER=true
volumes: volumes:
- ${DEPLOY_ROOT:-.}/services/router/router_config.yaml:/app/router_config.yaml:ro - ${DEPLOY_ROOT:-.}/services/router/router_config.yaml:/app/router_config.yaml:ro
- ${DEPLOY_ROOT:-.}/services/router/router-config.yml:/app/router-config.yml:ro - ${DEPLOY_ROOT:-.}/services/router/router-config.yml:/app/router-config.yml:ro
@@ -498,8 +500,8 @@ services:
container_name: node-capabilities-node1 container_name: node-capabilities-node1
environment: environment:
- NODE_ID=noda1 - NODE_ID=noda1
- OLLAMA_BASE_URL=http://host.docker.internal:11434 - OLLAMA_BASE_URL=http://172.18.0.1:11434
- SWAPPER_URL=http://swapper-service:8890 - SWAPPER_URL=
- CACHE_TTL_SEC=15 - CACHE_TTL_SEC=15
- ENABLE_NATS_CAPS=true - ENABLE_NATS_CAPS=true
- NATS_URL=nats://nats:4222 - NATS_URL=nats://nats:4222
@@ -527,15 +529,13 @@ services:
environment: environment:
- NODE_ID=noda1 - NODE_ID=noda1
- NATS_URL=nats://nats:4222 - NATS_URL=nats://nats:4222
- OLLAMA_BASE_URL=http://host.docker.internal:11434 - OLLAMA_BASE_URL=http://172.18.0.1:11434
- SWAPPER_URL=http://swapper-service:8890
- NODE_DEFAULT_LLM=qwen3.5:27b - NODE_DEFAULT_LLM=qwen3.5:27b
- NODE_DEFAULT_VISION=qwen3-vl-8b - NODE_DEFAULT_VISION=qwen3-vl:8b
- NODE_WORKER_MAX_CONCURRENCY=2 - NODE_WORKER_MAX_CONCURRENCY=2
- NCS_REPORT_URL=http://node-capabilities:8099 - NCS_REPORT_URL=http://node-capabilities:8099
depends_on: depends_on:
- nats - nats
- swapper-service
networks: networks:
- dagi-network - dagi-network
restart: unless-stopped restart: unless-stopped

View File

@@ -25,9 +25,11 @@ services:
- XAI_API_KEY=${XAI_API_KEY} - XAI_API_KEY=${XAI_API_KEY}
- GROK_API_KEY=${XAI_API_KEY} - GROK_API_KEY=${XAI_API_KEY}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
# ── Node Capabilities (multi-node model selection) ──────────────────── # ── Fabric Layer (NCS + Node Worker, no Swapper dependency) ──────────
- NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities - NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
- ENABLE_GLOBAL_CAPS_NATS=true - ENABLE_GLOBAL_CAPS_NATS=true
- OLLAMA_URL=http://host.docker.internal:11434
- PREFER_NODE_WORKER=true
# ── Persistence backends ────────────────────────────────────────────── # ── Persistence backends ──────────────────────────────────────────────
- ALERT_BACKEND=postgres - ALERT_BACKEND=postgres
- ALERT_DATABASE_URL=${ALERT_DATABASE_URL:-${DATABASE_URL}} - ALERT_DATABASE_URL=${ALERT_DATABASE_URL:-${DATABASE_URL}}
@@ -121,7 +123,7 @@ services:
environment: environment:
- NODE_ID=NODA2 - NODE_ID=NODA2
- OLLAMA_BASE_URL=http://host.docker.internal:11434 - OLLAMA_BASE_URL=http://host.docker.internal:11434
- SWAPPER_URL=http://swapper-service:8890 - SWAPPER_URL=
- LLAMA_SERVER_URL=http://host.docker.internal:11435 - LLAMA_SERVER_URL=http://host.docker.internal:11435
- CACHE_TTL_SEC=15 - CACHE_TTL_SEC=15
- ENABLE_NATS_CAPS=true - ENABLE_NATS_CAPS=true
@@ -147,14 +149,12 @@ services:
- NODE_ID=noda2 - NODE_ID=noda2
- NATS_URL=nats://dagi-nats:4222 - NATS_URL=nats://dagi-nats:4222
- OLLAMA_BASE_URL=http://host.docker.internal:11434 - OLLAMA_BASE_URL=http://host.docker.internal:11434
- SWAPPER_URL=http://swapper-service:8890
- NODE_DEFAULT_LLM=qwen3:14b - NODE_DEFAULT_LLM=qwen3:14b
- NODE_DEFAULT_VISION=llava:13b - NODE_DEFAULT_VISION=llava:13b
- NODE_WORKER_MAX_CONCURRENCY=2 - NODE_WORKER_MAX_CONCURRENCY=2
- NCS_REPORT_URL=http://node-capabilities:8099 - NCS_REPORT_URL=http://node-capabilities:8099
depends_on: depends_on:
- dagi-nats - dagi-nats
- swapper-service
networks: networks:
- dagi-network - dagi-network
restart: unless-stopped restart: unless-stopped

View File

@@ -20,7 +20,7 @@ app = FastAPI(title="Node Capabilities Service", version="1.0.0")
NODE_ID = os.getenv("NODE_ID", "noda2") NODE_ID = os.getenv("NODE_ID", "noda2")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890") SWAPPER_URL = os.getenv("SWAPPER_URL", "") # empty = skip Swapper probing
LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "") LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "")
_cache: Dict[str, Any] = {} _cache: Dict[str, Any] = {}
@@ -74,7 +74,10 @@ async def _collect_ollama() -> Dict[str, Any]:
async def _collect_swapper() -> Dict[str, Any]: async def _collect_swapper() -> Dict[str, Any]:
runtime: Dict[str, Any] = {"base_url": SWAPPER_URL, "status": "unknown", "models": [], "vision_models": [], "active_model": None} runtime: Dict[str, Any] = {"base_url": SWAPPER_URL or "n/a", "status": "unknown", "models": [], "vision_models": [], "active_model": None}
if not SWAPPER_URL:
runtime["status"] = "disabled"
return runtime
try: try:
async with httpx.AsyncClient(timeout=5) as c: async with httpx.AsyncClient(timeout=5) as c:
h = await c.get(f"{SWAPPER_URL}/health") h = await c.get(f"{SWAPPER_URL}/health")

View File

@@ -4,7 +4,6 @@ import os
NODE_ID = os.getenv("NODE_ID", "noda2") NODE_ID = os.getenv("NODE_ID", "noda2")
NATS_URL = os.getenv("NATS_URL", "nats://dagi-nats:4222") NATS_URL = os.getenv("NATS_URL", "nats://dagi-nats:4222")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
DEFAULT_LLM = os.getenv("NODE_DEFAULT_LLM", "qwen3:14b") DEFAULT_LLM = os.getenv("NODE_DEFAULT_LLM", "qwen3:14b")
DEFAULT_VISION = os.getenv("NODE_DEFAULT_VISION", "llava:13b") DEFAULT_VISION = os.getenv("NODE_DEFAULT_VISION", "llava:13b")
MAX_CONCURRENCY = int(os.getenv("NODE_WORKER_MAX_CONCURRENCY", "2")) MAX_CONCURRENCY = int(os.getenv("NODE_WORKER_MAX_CONCURRENCY", "2"))

View File

@@ -0,0 +1,49 @@
"""Ollama vision provider — direct Ollama API with images, no Swapper dependency."""
import logging
from typing import Any, Dict, List, Optional
import httpx
from config import OLLAMA_BASE_URL, DEFAULT_VISION
logger = logging.getLogger("provider.ollama_vision")
def _strip_data_url_prefix(image: str) -> str:
    """Return the part of ``image`` after a ``data:...,`` prefix.

    A value such as ``data:image/png;base64,QUJD`` becomes ``QUJD``; any
    value that does not start with ``data:`` (or has no comma) is returned
    unchanged.
    """
    if image.startswith("data:") and "," in image:
        return image.split(",", 1)[1]
    return image


async def infer(
    images: Optional[List[str]] = None,
    prompt: str = "",
    model: str = "",
    system: str = "",
    max_tokens: int = 1024,
    temperature: float = 0.2,
    timeout_s: float = 60.0,
) -> Dict[str, Any]:
    """Run one non-streaming vision request against Ollama ``/api/generate``.

    Args:
        images: Image strings; ``data:`` URLs are reduced to the part after
            the first comma, other values are forwarded as given. When
            empty or ``None`` no ``images`` field is sent.
        prompt: User prompt; an empty prompt is replaced by
            ``"Describe this image."``.
        model: Model name; falls back to ``DEFAULT_VISION`` when empty.
        system: Optional system prompt; only sent when non-empty.
        max_tokens: Forwarded as ``options.num_predict``.
        temperature: Forwarded as ``options.temperature``.
        timeout_s: Timeout handed to ``httpx.AsyncClient``.

    Returns:
        A dict with ``text`` (the ``response`` field of the reply, ``""``
        when absent), ``model``, ``provider`` (always ``"ollama_vision"``)
        and ``eval_count`` (``0`` when absent).

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error status, and
        any transport error from the HTTP client.
    """
    model = model or DEFAULT_VISION
    payload: Dict[str, Any] = {
        "model": model,
        "prompt": prompt or "Describe this image.",
        "stream": False,
        "options": {"num_predict": max_tokens, "temperature": temperature},
    }
    if images:
        payload["images"] = [_strip_data_url_prefix(img) for img in images]
    if system:
        payload["system"] = system

    # A base URL configured with a trailing slash must not yield "//api/generate".
    base_url = OLLAMA_BASE_URL.rstrip("/")

    async with httpx.AsyncClient(timeout=timeout_s) as c:
        resp = await c.post(f"{base_url}/api/generate", json=payload)
        resp.raise_for_status()
        data = resp.json()

    return {
        "text": data.get("response", ""),
        "model": model,
        "provider": "ollama_vision",
        "eval_count": data.get("eval_count", 0),
    }

View File

@@ -9,7 +9,7 @@ from typing import Any, Dict
import config import config
from models import JobRequest, JobResponse, JobError from models import JobRequest, JobResponse, JobError
from idempotency import IdempotencyStore from idempotency import IdempotencyStore
from providers import ollama, swapper_vision from providers import ollama, ollama_vision
import fabric_metrics as fm import fabric_metrics as fm
logger = logging.getLogger("node-worker") logger = logging.getLogger("node-worker")
@@ -27,9 +27,13 @@ async def start(nats_client):
global _nats_client global _nats_client
_nats_client = nats_client _nats_client = nats_client
nid = config.NODE_ID.lower()
subjects = [ subjects = [
f"node.{config.NODE_ID.lower()}.llm.request", f"node.{nid}.llm.request",
f"node.{config.NODE_ID.lower()}.vision.request", f"node.{nid}.vision.request",
f"node.{nid}.stt.request",
f"node.{nid}.tts.request",
f"node.{nid}.image.request",
] ]
for subj in subjects: for subj in subjects:
await nats_client.subscribe(subj, cb=_handle_request) await nats_client.subscribe(subj, cb=_handle_request)
@@ -160,7 +164,7 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
) )
elif job.required_type == "vision": elif job.required_type == "vision":
result = await asyncio.wait_for( result = await asyncio.wait_for(
swapper_vision.infer( ollama_vision.infer(
images=payload.get("images"), images=payload.get("images"),
prompt=payload.get("prompt", ""), prompt=payload.get("prompt", ""),
model=model, model=model,
@@ -171,11 +175,20 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
), ),
timeout=timeout_s, timeout=timeout_s,
) )
elif job.required_type in ("stt", "tts", "image"):
return JobResponse(
job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
status="error",
error=JobError(
code="NOT_YET_IMPLEMENTED",
message=f"{job.required_type} adapter coming soon; use direct runtime API for now",
),
)
else: else:
return JobResponse( return JobResponse(
job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID, job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
status="error", status="error",
error=JobError(code="UNSUPPORTED_TYPE", message=f"{job.required_type} not implemented"), error=JobError(code="UNSUPPORTED_TYPE", message=f"{job.required_type} not supported"),
) )
logger.info( logger.info(

View File

@@ -877,12 +877,13 @@ app = FastAPI(title="DAARION Router", version="2.0.0")
# Configuration # Configuration
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222") NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890") SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
# All multimodal services now through Swapper OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
STT_URL = os.getenv("STT_URL", "http://swapper-service:8890") # Swapper /stt endpoint PREFER_NODE_WORKER = os.getenv("PREFER_NODE_WORKER", "true").lower() in ("true", "1")
TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890") # Swapper /tts endpoint STT_URL = os.getenv("STT_URL", "http://swapper-service:8890")
VISION_URL = os.getenv("VISION_URL", "http://172.18.0.1:11434") # Host Ollama TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890")
OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890") # Swapper /ocr endpoint VISION_URL = os.getenv("VISION_URL", "http://host.docker.internal:11434")
DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890") # Swapper /document endpoint OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890")
DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890")
CITY_SERVICE_URL = os.getenv("CITY_SERVICE_URL", "http://daarion-city-service:7001") CITY_SERVICE_URL = os.getenv("CITY_SERVICE_URL", "http://daarion-city-service:7001")
# CrewAI Routing Configuration # CrewAI Routing Configuration
@@ -1083,8 +1084,8 @@ async def startup_event():
runtime_guard_engine = None runtime_guard_engine = None
# Log backend URLs # Log backend URLs
logger.info(f"📡 Swapper URL: {SWAPPER_URL}") logger.info(f"📡 Ollama URL: {OLLAMA_URL} (prefer_node_worker={PREFER_NODE_WORKER})")
logger.info(f"📡 STT URL: {STT_URL}") logger.info(f"📡 Swapper URL: {SWAPPER_URL} (legacy, being decommissioned)")
logger.info(f"📡 Vision URL: {VISION_URL}") logger.info(f"📡 Vision URL: {VISION_URL}")
logger.info(f"📡 OCR URL: {OCR_URL}") logger.info(f"📡 OCR URL: {OCR_URL}")
logger.info(f"📡 Neo4j URL: {NEO4J_URI}") logger.info(f"📡 Neo4j URL: {NEO4J_URI}")
@@ -2388,33 +2389,39 @@ async def agent_infer(agent_id: str, request: InferRequest):
logger.warning(f"⚠️ Deterministic AgroMatrix plant flow failed, fallback to generic vision: {e}") logger.warning(f"⚠️ Deterministic AgroMatrix plant flow failed, fallback to generic vision: {e}")
try: try:
# Use Swapper's /vision endpoint (manages model loading) vision_model = "qwen3-vl-8b"
vision_payload = { vision_system = system_prompt or ""
"model": "qwen3-vl-8b", if vision_system and memory_brief_text:
"prompt": request.prompt, vision_system = f"{vision_system}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}"
"images": request.images, # Swapper handles data URL conversion
"max_tokens": request.max_tokens or 1024, clean_images = []
"temperature": request.temperature or 0.7 for img in (request.images or []):
} if "," in img and img.startswith("data:"):
clean_images.append(img.split(",", 1)[1])
# Add system prompt if available
if system_prompt:
if memory_brief_text:
vision_payload["system"] = f"{system_prompt}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}"
else: else:
vision_payload["system"] = system_prompt clean_images.append(img)
logger.info(f"🖼️ Sending to Swapper /vision: {SWAPPER_URL}/vision") logger.info(f"🖼️ Vision inference: model={vision_model} images={len(clean_images)} prefer_nw={PREFER_NODE_WORKER}")
vision_resp = await http_client.post( vision_resp = await http_client.post(
f"{SWAPPER_URL}/vision", f"{OLLAMA_URL}/api/generate",
json=vision_payload, json={
timeout=120.0 "model": vision_model.replace("-", "-vl:").replace("qwen3-vl:", "qwen3-vl:") if ":" not in vision_model else vision_model,
"prompt": request.prompt,
"images": clean_images,
"system": vision_system,
"stream": False,
"options": {
"num_predict": request.max_tokens or 1024,
"temperature": request.temperature or 0.7,
},
},
timeout=120.0,
) )
if vision_resp.status_code == 200: if vision_resp.status_code == 200:
vision_data = vision_resp.json() vision_data = vision_resp.json()
raw_response = vision_data.get("text", "") raw_response = vision_data.get("response", vision_data.get("text", ""))
full_response = _sanitize_vision_text_for_user(raw_response) full_response = _sanitize_vision_text_for_user(raw_response)
vision_web_query = "" vision_web_query = ""
vision_sources: List[Dict[str, str]] = [] vision_sources: List[Dict[str, str]] = []
@@ -2519,14 +2526,23 @@ async def agent_infer(agent_id: str, request: InferRequest):
"Відповідай українською 2-4 реченнями, без службових фраз. " "Відповідай українською 2-4 реченнями, без службових фраз. "
f"Запит користувача: {request.prompt}" f"Запит користувача: {request.prompt}"
) )
# Payload for the single vision retry, sent to Ollama /api/generate.
# Ollama tags are "<name>:<tag>"; only the LAST hyphen of the legacy alias
# separates the tag ("qwen3-vl-8b" -> "qwen3-vl:8b"). Replacing every hyphen
# produced "qwen3-vl:vl-vl:8b", which is not a model tag.
retry_ollama_payload = {
    "model": ":".join(vision_model.rsplit("-", 1)) if ":" not in vision_model else vision_model,
    "prompt": retry_payload["prompt"],
    "images": clean_images,
    "stream": False,
    "options": {"num_predict": request.max_tokens or 1024, "temperature": 0.7},
}
# Forward the system prompt only when the retry payload carries one.
if retry_payload.get("system"):
    retry_ollama_payload["system"] = retry_payload["system"]
retry_resp = await http_client.post( retry_resp = await http_client.post(
f"{SWAPPER_URL}/vision", f"{OLLAMA_URL}/api/generate",
json=retry_payload, json=retry_ollama_payload,
timeout=120.0 timeout=120.0,
) )
if retry_resp.status_code == 200: if retry_resp.status_code == 200:
retry_data = retry_resp.json() retry_data = retry_resp.json()
retry_raw = retry_data.get("text", "") retry_raw = retry_data.get("response", retry_data.get("text", ""))
retry_text = _sanitize_vision_text_for_user(retry_raw) retry_text = _sanitize_vision_text_for_user(retry_raw)
if retry_raw and not retry_text: if retry_raw and not retry_text:
retry_text = _extract_vision_search_facts(retry_raw, max_chars=280) retry_text = _extract_vision_search_facts(retry_raw, max_chars=280)
@@ -2541,7 +2557,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
elif request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and _vision_response_is_blurry(full_response): elif request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and _vision_response_is_blurry(full_response):
full_response = _build_image_fallback_response(request_agent_id, request.prompt) full_response = _build_image_fallback_response(request_agent_id, request.prompt)
full_response = await _finalize_response_text(full_response, "swapper-vision") full_response = await _finalize_response_text(full_response, "ollama-vision")
# Store vision message in agent-specific memory # Store vision message in agent-specific memory
if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and full_response: if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and full_response:
@@ -2567,10 +2583,10 @@ async def agent_infer(agent_id: str, request: InferRequest):
response=full_response, response=full_response,
model="qwen3-vl-8b", model="qwen3-vl-8b",
tokens_used=None, tokens_used=None,
backend="swapper-vision" backend="ollama-vision"
) )
else: else:
logger.error(f"Swapper vision error: {vision_resp.status_code} - {vision_resp.text[:200]}") logger.error(f"Ollama vision error: {vision_resp.status_code} - {vision_resp.text[:200]}")
fallback_response = await _finalize_response_text( fallback_response = await _finalize_response_text(
_build_image_fallback_response(request_agent_id, request.prompt), _build_image_fallback_response(request_agent_id, request.prompt),
"swapper-vision-fallback", "swapper-vision-fallback",
@@ -2579,7 +2595,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
response=fallback_response, response=fallback_response,
model="qwen3-vl-8b", model="qwen3-vl-8b",
tokens_used=None, tokens_used=None,
backend="swapper-vision-fallback" backend="vision-fallback"
) )
except Exception as e: except Exception as e:
@@ -2592,7 +2608,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
response=fallback_response, response=fallback_response,
model="qwen3-vl-8b", model="qwen3-vl-8b",
tokens_used=None, tokens_used=None,
backend="swapper-vision-fallback" backend="vision-fallback"
) )
# ========================================================================= # =========================================================================
@@ -3142,90 +3158,88 @@ async def agent_infer(agent_id: str, request: InferRequest):
logger.warning(f"⚠️ No local model in config, using hardcoded fallback: {local_model}") logger.warning(f"⚠️ No local model in config, using hardcoded fallback: {local_model}")
try: try:
# Ollama tags are "<name>:<tag>": convert only the LAST hyphen of a legacy alias ("deepseek-r1-70b" -> "deepseek-r1:70b"); replacing every hyphen gave "deepseek:r1:70b".
# Check if Swapper is available ollama_model = ":".join(local_model.rsplit("-", 1)) if ":" not in local_model else local_model
health_resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0) logger.info(f"📡 Calling Ollama direct: model={ollama_model}")
if health_resp.status_code == 200:
logger.info(f"📡 Calling Swapper with local model: {local_model}") generate_resp = await http_client.post(
# Generate response via Swapper (which handles model loading) f"{OLLAMA_URL}/api/generate",
generate_resp = await http_client.post( json={
f"{SWAPPER_URL}/generate", "model": ollama_model,
json={ "prompt": request.prompt,
"model": local_model, "system": system_prompt,
"prompt": request.prompt, "stream": False,
"system": system_prompt, "options": {
"max_tokens": request.max_tokens, "num_predict": request.max_tokens or 2048,
"temperature": request.temperature, "temperature": request.temperature or 0.7,
"stream": False
}, },
timeout=300.0 },
) timeout=300.0,
)
if generate_resp.status_code == 200: if generate_resp.status_code == 200:
data = generate_resp.json() data = generate_resp.json()
local_response = _normalize_text_response(data.get("response", "")) local_response = _normalize_text_response(data.get("response", ""))
# Empty-answer gate for selected local top-level agents. if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response):
if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response): logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once")
logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once") retry_prompt = (
retry_prompt = ( f"{request.prompt}\n\n"
f"{request.prompt}\n\n" "Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз."
"Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз."
)
retry_resp = await http_client.post(
f"{SWAPPER_URL}/generate",
json={
"model": local_model,
"prompt": retry_prompt,
"system": system_prompt,
"max_tokens": request.max_tokens,
"temperature": request.temperature,
"stream": False
},
timeout=300.0
)
if retry_resp.status_code == 200:
retry_data = retry_resp.json()
retry_text = _normalize_text_response(retry_data.get("response", ""))
if retry_text and not _needs_empty_answer_recovery(retry_text):
local_response = retry_text
if _needs_empty_answer_recovery(local_response):
local_response = (
"Я не отримав корисну відповідь з першої спроби. "
"Сформулюй запит коротко ще раз, і я відповім конкретно."
)
local_response = await _finalize_response_text(local_response, "swapper+ollama")
# Store in agent-specific memory
if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response:
asyncio.create_task(
memory_retrieval.store_message(
agent_id=request_agent_id,
user_id=user_id,
username=username,
message_text=request.prompt,
response_text=local_response,
chat_id=chat_id
)
)
return InferResponse(
response=local_response,
model=local_model,
tokens_used=data.get("eval_count", 0),
backend="swapper+ollama"
) )
else: retry_resp = await http_client.post(
logger.error(f"❌ Swapper error: {generate_resp.status_code} - {generate_resp.text}") f"{OLLAMA_URL}/api/generate",
json={
"model": ollama_model,
"prompt": retry_prompt,
"system": system_prompt,
"stream": False,
"options": {
"num_predict": request.max_tokens or 2048,
"temperature": request.temperature or 0.7,
},
},
timeout=300.0,
)
if retry_resp.status_code == 200:
retry_data = retry_resp.json()
retry_text = _normalize_text_response(retry_data.get("response", ""))
if retry_text and not _needs_empty_answer_recovery(retry_text):
local_response = retry_text
if _needs_empty_answer_recovery(local_response):
local_response = (
"Я не отримав корисну відповідь з першої спроби. "
"Сформулюй запит коротко ще раз, і я відповім конкретно."
)
local_response = await _finalize_response_text(local_response, "ollama-direct")
if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response:
asyncio.create_task(
memory_retrieval.store_message(
agent_id=request_agent_id,
user_id=user_id,
username=username,
message_text=request.prompt,
response_text=local_response,
chat_id=chat_id
)
)
return InferResponse(
response=local_response,
model=local_model,
tokens_used=data.get("eval_count", 0),
backend="ollama-direct"
)
else:
logger.error(f"❌ Ollama generate error: {generate_resp.status_code} - {generate_resp.text[:200]}")
except Exception as e: except Exception as e:
logger.error(f"Swapper/Ollama error: {e}") logger.error(f"Ollama direct error: {e}")
# Fallback to direct Ollama if Swapper fails
try: try:
logger.info(f"🔄 Falling back to direct Ollama connection") logger.info(f"🔄 Falling back to Ollama with hardcoded model")
generate_resp = await http_client.post( generate_resp = await http_client.post(
f"{VISION_URL}/api/generate", f"{OLLAMA_URL}/api/generate",
json={ json={
"model": "qwen3:8b", # Use actual Ollama model name "model": "qwen3:8b",
"prompt": request.prompt, "prompt": request.prompt,
"system": system_prompt, "system": system_prompt,
"stream": False, "stream": False,
@@ -3526,42 +3540,38 @@ async def documents_versions(doc_id: str, agent_id: str, limit: int = 20):
@app.get("/v1/models") @app.get("/v1/models")
async def list_available_models(): async def list_available_models():
"""List all available models across backends""" """List all available models from NCS (global capabilities pool)."""
models = [] models = []
# Get Swapper models
try: try:
resp = await http_client.get(f"{SWAPPER_URL}/models", timeout=5.0) from global_capabilities_client import get_global_capabilities
if resp.status_code == 200: pool = await get_global_capabilities()
data = resp.json() for m in pool.get("served_models", []):
for m in data.get("models", []): models.append({
models.append({ "id": m.get("name"),
"id": m.get("name"), "backend": m.get("runtime", "unknown"),
"backend": "swapper", "node": m.get("node", "?"),
"size_gb": m.get("size_gb"), "type": m.get("type", "llm"),
"status": m.get("status", "available") "size_gb": m.get("size_gb"),
}) "status": "served",
})
except Exception as e: except Exception as e:
logger.warning(f"Cannot get Swapper models: {e}") logger.warning(f"Cannot get NCS global models: {e}")
# Get Ollama models if not models:
try: try:
resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0) resp = await http_client.get(f"{OLLAMA_URL}/api/tags", timeout=5.0)
if resp.status_code == 200: if resp.status_code == 200:
data = resp.json() for m in resp.json().get("models", []):
for m in data.get("models", []):
# Avoid duplicates
model_name = m.get("name")
if not any(x.get("id") == model_name for x in models):
models.append({ models.append({
"id": model_name, "id": m.get("name"),
"backend": "ollama", "backend": "ollama",
"size_gb": round(m.get("size", 0) / 1e9, 1), "size_gb": round(m.get("size", 0) / 1e9, 1),
"status": "loaded" "status": "loaded",
}) })
except Exception as e: except Exception as e:
logger.warning(f"Cannot get Ollama models: {e}") logger.warning(f"Cannot get Ollama models: {e}")
return {"models": models, "total": len(models)} return {"models": models, "total": len(models)}

View File

@@ -1,90 +1,35 @@
# Swapper Configuration for Node #2 (Development Node) # Swapper Configuration — Default / Fallback
# Single-active LLM scheduler #
# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM) # NOTE: Swapper is now a runtime gateway / executor only.
# Auto-generated configuration with available Ollama models # Source of truth for models is NCS (Node Capabilities Service).
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
#
# Per-node overrides: swapper_config_node1.yaml, swapper_config_node2.yaml
swapper: node_id: default
mode: single-active
max_concurrent_models: 1 runtimes:
ollama:
url: http://localhost:11434
timeout: 300
limits:
llm_concurrency: 2
vision_concurrency: 1
max_concurrent_models: 2
model_swap_timeout: 300 model_swap_timeout: 300
gpu_enabled: true
metal_acceleration: true # Apple Silicon GPU acceleration
# Модель для автоматичного завантаження при старті (опціонально)
# Якщо не вказано - моделі завантажуються тільки за запитом
# Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
default_model: gpt-oss:latest # Модель активується автоматично при старті
models: timeouts:
# Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks llm_ms: 120000
gpt-oss-latest: vision_ms: 180000
path: ollama:gpt-oss:latest stt_ms: 60000
type: llm tts_ms: 60000
size_gb: 13.0
priority: high gpu:
description: "Fast LLM for general tasks and conversations (20.9B params)" enabled: false
metal_acceleration: false
# Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
phi3-latest:
path: ollama:phi3:latest
type: llm
size_gb: 2.2
priority: high
description: "Lightweight LLM for fast responses (3.8B params)"
# Code Specialist - StarCoder2 3B (Medium Priority) - Code engineering
starcoder2-3b:
path: ollama:starcoder2:3b
type: code
size_gb: 1.7
priority: medium
description: "Code specialist model for code engineering (3B params)"
# Reasoning Model - Mistral Nemo 12.2B (High Priority) - Advanced reasoning
mistral-nemo-12b:
path: ollama:mistral-nemo:12b
type: llm
size_gb: 7.1
priority: high
description: "Advanced reasoning model for complex tasks (12.2B params)"
# Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
gemma2-27b:
path: ollama:gemma2:27b
type: llm
size_gb: 15.0
priority: medium
description: "Reasoning model for strategic tasks (27.2B params)"
# Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
deepseek-coder-33b:
path: ollama:deepseek-coder:33b
type: code
size_gb: 18.0
priority: high
description: "Advanced code specialist model (33B params)"
# Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
qwen2.5-coder-32b:
path: ollama:qwen2.5-coder:32b
type: code
size_gb: 19.0
priority: high
description: "Advanced code specialist model (32.8B params)"
# Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
deepseek-r1-70b:
path: ollama:deepseek-r1:70b
type: llm
size_gb: 42.0
priority: high
description: "Strategic reasoning model (70.6B params, quantized)"
storage: storage:
models_dir: /app/models models_dir: /app/models
cache_dir: /app/cache cache_dir: /app/cache
swap_dir: /app/swap swap_dir: /app/swap
ollama:
url: http://localhost:11434 # Native Ollama on MacBook (via Pieces OS or brew)
timeout: 300

View File

@@ -1,186 +1,37 @@
# Swapper Configuration for Node #1 (Production Server) # Swapper Configuration for Node #1 (Production Server)
# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM) # Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
# #
# ВАЖЛИВО: Ембедінги через зовнішні API: # NOTE: Swapper is now a runtime gateway / executor only.
# - Text: Cohere API (embed-multilingual-v3.0, 1024 dim) # Source of truth for models is NCS (Node Capabilities Service).
# - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim) # No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
# НЕ використовуємо локальні embedding моделі!
swapper: node_id: noda1
mode: multi-active
max_concurrent_models: 4 # LLM + OCR + STT + TTS (до 15GB) runtimes:
ollama:
url: http://172.18.0.1:11434
timeout: 300
# comfyui:
# url: http://127.0.0.1:8188
limits:
llm_concurrency: 2
vision_concurrency: 1
max_concurrent_models: 4
model_swap_timeout: 300 model_swap_timeout: 300
gpu_enabled: true
timeouts:
llm_ms: 120000
vision_ms: 180000
stt_ms: 60000
tts_ms: 60000
image_gen_ms: 300000
gpu:
enabled: true
metal_acceleration: false metal_acceleration: false
default_model: qwen3-8b
lazy_load_ocr: true
lazy_load_audio: true
# Автоматичне вивантаження при нестачі VRAM
auto_unload_on_oom: true auto_unload_on_oom: true
vram_threshold_gb: 18 # Починати вивантажувати при 18GB vram_threshold_gb: 18
models:
# ============================================
# LLM MODELS (Ollama) - тільки qwen3
# ============================================
# Primary LLM - Qwen3 8B (includes math, coding, reasoning)
qwen3-8b:
path: ollama:qwen3:8b
type: llm
size_gb: 5.2
priority: high
description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
capabilities:
- chat
- math
- coding
- reasoning
- multilingual
# ============================================
# VISION MODELS (Ollama)
# ============================================
# Vision Model - Qwen3-VL 8B
qwen3-vl-8b:
path: ollama:qwen3-vl:8b
type: vision
size_gb: 6.1
priority: high
description: "Qwen3-VL 8B for image understanding and visual reasoning"
capabilities:
- image_understanding
- visual_qa
- diagram_analysis
- ocr_basic
# ============================================
# OCR/DOCUMENT MODELS (HuggingFace)
# ============================================
# GOT-OCR2.0 - Best for documents, tables, formulas
got-ocr2:
path: huggingface:stepfun-ai/GOT-OCR2_0
type: ocr
size_gb: 7.0
priority: high
description: "Best OCR for documents, tables, formulas, handwriting"
capabilities:
- documents
- tables
- formulas
- handwriting
- multilingual
# Donut - Document Understanding (no external OCR, 91% CORD)
donut-base:
path: huggingface:naver-clova-ix/donut-base
type: ocr
size_gb: 3.0
priority: high
description: "Document parsing without OCR engine (91% CORD accuracy)"
capabilities:
- document_parsing
- receipts
- forms
- invoices
# Donut fine-tuned for receipts/invoices (CORD dataset)
donut-cord:
path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
type: ocr
size_gb: 3.0
priority: medium
description: "Donut fine-tuned for receipts extraction"
capabilities:
- receipts
- invoices
- structured_extraction
# IBM Granite Docling - Document conversion with structure preservation
granite-docling:
path: huggingface:ds4sd/docling-ibm-granite-vision-1b
type: document
size_gb: 2.5
priority: high
description: "IBM Granite Docling for PDF/document structure extraction"
capabilities:
- pdf_conversion
- table_extraction
- formula_extraction
- layout_preservation
- doctags_format
# ============================================
# AUDIO MODELS - STT (Speech-to-Text)
# ============================================
# Faster Whisper Large-v3 - Best STT quality
faster-whisper-large:
path: huggingface:Systran/faster-whisper-large-v3
type: stt
size_gb: 3.0
priority: high
description: "Faster Whisper Large-v3 - best quality, 99 languages"
capabilities:
- speech_recognition
- transcription
- multilingual
- timestamps
- ukrainian
# Whisper Small - Fast/lightweight for quick transcription
whisper-small:
path: huggingface:openai/whisper-small
type: stt
size_gb: 0.5
priority: medium
description: "Whisper Small for fast transcription"
capabilities:
- speech_recognition
- transcription
# ============================================
# AUDIO MODELS - TTS (Text-to-Speech)
# ============================================
# Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
xtts-v2:
path: huggingface:coqui/XTTS-v2
type: tts
size_gb: 2.0
priority: high
description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
capabilities:
- text_to_speech
- voice_cloning
- multilingual
- ukrainian
- 17_languages
# ============================================
# IMAGE GENERATION MODELS (HuggingFace/Diffusers)
# ============================================
# FLUX.2 Klein 4B - High quality image generation with lazy loading
flux-klein-4b:
path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
type: image_generation
size_gb: 15.4
priority: medium
description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
capabilities:
- text_to_image
- high_quality
- 1024x1024
- artistic
default_params:
num_inference_steps: 50
guidance_scale: 4.0
width: 1024
height: 1024
storage: storage:
models_dir: /app/models models_dir: /app/models
@@ -188,33 +39,8 @@ storage:
swap_dir: /app/swap swap_dir: /app/swap
huggingface_cache: /root/.cache/huggingface huggingface_cache: /root/.cache/huggingface
ollama:
url: http://172.18.0.1:11434
timeout: 300
huggingface: huggingface:
device: cuda device: cuda
torch_dtype: float16 torch_dtype: float16
trust_remote_code: true trust_remote_code: true
low_cpu_mem_usage: true low_cpu_mem_usage: true
# ============================================
# EMBEDDING SERVICES (External APIs)
# НЕ через Swapper - окремі сервіси!
# ============================================
#
# Text Embeddings:
# Service: Memory Service → Cohere API
# Model: embed-multilingual-v3.0
# Dimension: 1024
# Endpoint: Memory Service handles internally
#
# Image/Multimodal Embeddings:
# Service: Vision Encoder (port 8001)
# Model: OpenCLIP ViT-L/14
# Dimension: 768
# Endpoint: http://vision-encoder:8001/embed
#
# Vector Storage:
# Qdrant (port 6333) - separate collections for text vs image embeddings
# ВАЖЛИВО: НЕ змішувати embedding spaces в одній колекції!

View File

@@ -1,126 +1,40 @@
# Swapper Configuration for Node #2 (Development Node) # Swapper Configuration for Node #2 (Development Node)
# Single-active LLM scheduler
# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM) # MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
# Auto-generated configuration with available Ollama models #
# NOTE: Swapper is now a runtime gateway / executor only.
# Source of truth for models is NCS (Node Capabilities Service).
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
swapper: node_id: noda2
mode: single-active
runtimes:
ollama:
url: http://host.docker.internal:11434
timeout: 300
# mlx:
# stt_model: whisper-large-v3-turbo
# tts_model: kokoro-82m
# comfyui:
# url: http://127.0.0.1:8188
limits:
llm_concurrency: 1
vision_concurrency: 1
max_concurrent_models: 1 max_concurrent_models: 1
model_swap_timeout: 300 model_swap_timeout: 300
gpu_enabled: true
metal_acceleration: true # Apple Silicon GPU acceleration
# Модель для автоматичного завантаження при старті (опціонально)
# Якщо не вказано - моделі завантажуються тільки за запитом
# Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
# Стартова модель має бути реально встановлена в Ollama на NODA2
default_model: qwen3:14b # Модель активується автоматично при старті
models: timeouts:
# Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks llm_ms: 120000
gpt-oss-latest: vision_ms: 180000
path: ollama:gpt-oss:latest stt_ms: 60000
type: llm tts_ms: 60000
size_gb: 13.0 image_gen_ms: 300000
priority: high
description: "Fast LLM for general tasks and conversations (20.9B params)"
# Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
phi3-latest:
path: ollama:phi3:latest
type: llm
size_gb: 2.2
priority: high
description: "Lightweight LLM for fast responses (3.8B params)"
# General Reasoning - Qwen3 14B (High Priority)
qwen3-14b:
path: ollama:qwen3:14b
type: llm
size_gb: 9.3
priority: high
description: "Balanced local model for Sofiia and router fallback"
# Reasoning Model - Qwen3.5 35B A3B (High Priority) gpu:
qwen3.5-35b-a3b: enabled: true
path: ollama:qwen3.5:35b-a3b metal_acceleration: true
type: llm
size_gb: 22.0
priority: high
description: "Large reasoning model for complex Sofiia requests"
# Reasoning Model - GLM 4.7 Flash (High Priority) - Fast general model
glm-4.7-flash:
path: ollama:glm-4.7-flash:32k
type: llm
size_gb: 19.0
priority: high
description: "Multi-purpose reasoning model (fast context)"
# Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
gemma2-27b:
path: ollama:gemma2:27b
type: llm
size_gb: 15.0
priority: medium
description: "Reasoning model for strategic tasks (27.2B params)"
# Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
deepseek-coder-33b:
path: ollama:deepseek-coder:33b
type: code
size_gb: 18.0
priority: high
description: "Advanced code specialist model (33B params)"
# Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
qwen2.5-coder-32b:
path: ollama:qwen2.5-coder:32b
type: code
size_gb: 19.0
priority: high
description: "Advanced code specialist model (32.8B params)"
# Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
deepseek-r1-70b:
path: ollama:deepseek-r1:70b
type: llm
size_gb: 42.0
priority: high
description: "Strategic reasoning model (70.6B params, quantized)"
# Vision Model - LLaVA 13B (P0 Fix: NODA2 fallback vision)
# Available in Ollama on NODA2 — used until qwen3-vl:8b is installed
llava-13b:
path: ollama:llava:13b
type: vision
size_gb: 8.0
priority: high
description: "LLaVA 13B vision model (multimodal CLIP+LLM). P0 fallback until qwen3-vl:8b."
vision: true
ollama_model: "llava:13b"
# Vision Model - Qwen3-VL 8B (RECOMMENDED: install with: ollama pull qwen3-vl:8b)
# Better quality than llava:13b. Enable once installed.
# qwen3-vl-8b:
# path: ollama:qwen3-vl:8b
# type: vision
# size_gb: 5.5
# priority: high
# description: "Qwen3-VL 8B — modern vision-language model (recommended)"
# vision: true
# ollama_model: "qwen3-vl:8b"
storage: storage:
models_dir: /app/models models_dir: /app/models
cache_dir: /app/cache cache_dir: /app/cache
swap_dir: /app/swap swap_dir: /app/swap
ollama:
url: http://host.docker.internal:11434 # host.docker.internal → native Ollama on MacBook (NODA2 P1 fix)
timeout: 300
# Vision endpoint configuration
# /vision/models returns all models where vision: true
vision:
default_model: llava-13b
ollama_base_url: http://host.docker.internal:11434

View File

@@ -1,63 +1,37 @@
# Swapper Configuration for Node #3 (AI/ML Workstation) # Swapper Configuration for Node #3 (AI/ML Workstation)
# Single-active LLM scheduler # Threadripper PRO + RTX 3090 24GB — GPU-intensive workloads
# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads #
# NOTE: Swapper is now a runtime gateway / executor only.
# Source of truth for models is NCS (Node Capabilities Service).
# No hardcoded model lists.
swapper: node_id: noda3
mode: single-active
max_concurrent_models: 1 runtimes:
ollama:
url: http://localhost:11434
timeout: 300
comfyui:
url: http://127.0.0.1:8188
limits:
llm_concurrency: 2
vision_concurrency: 1
max_concurrent_models: 2
model_swap_timeout: 300 model_swap_timeout: 300
gpu_enabled: true
metal_acceleration: false # NVIDIA GPU, not Apple Silicon
# Модель для автоматичного завантаження при старті
# qwen3-8b - основна модель (4.87 GB), швидка відповідь на перший запит
default_model: qwen3-8b
models: timeouts:
# Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md llm_ms: 120000
qwen3-8b: vision_ms: 180000
path: ollama:qwen3:8b image_gen_ms: 600000
type: llm
size_gb: 4.87 gpu:
priority: high enabled: true
description: "Primary LLM for general tasks and conversations" metal_acceleration: false
auto_unload_on_oom: true
# Vision Model - Qwen3-VL 8B (High Priority) - For image processing vram_threshold_gb: 22
qwen3-vl-8b:
path: ollama:qwen3-vl:8b
type: vision
size_gb: 5.72
priority: high
description: "Vision model for image understanding and processing"
# Qwen2.5 7B Instruct (High Priority)
qwen2.5-7b-instruct:
path: ollama:qwen2.5:7b-instruct-q4_K_M
type: llm
size_gb: 4.36
priority: high
description: "Qwen2.5 7B Instruct model"
# Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
qwen2.5-3b-instruct:
path: ollama:qwen2.5:3b-instruct-q4_K_M
type: llm
size_gb: 1.80
priority: medium
description: "Lightweight LLM for faster responses"
# Math Specialist - Qwen2 Math 7B (High Priority)
qwen2-math-7b:
path: ollama:qwen2-math:7b
type: math
size_gb: 4.13
priority: high
description: "Specialized model for mathematical tasks"
storage: storage:
models_dir: /app/models models_dir: /app/models
cache_dir: /app/cache cache_dir: /app/cache
swap_dir: /app/swap swap_dir: /app/swap
ollama:
url: http://ollama:11434 # From Docker container to Ollama service
timeout: 300