feat(fabric): decommission Swapper from critical path, NCS = source of truth
- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API) - Node Worker: add NATS subjects for stt/tts/image (stubs ready) - Node Worker: remove SWAPPER_URL dependency from config - Router: vision calls go directly to Ollama /api/generate with images - Router: local LLM calls go directly to Ollama /api/generate - Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag - Router: /v1/models now uses NCS global capabilities pool - NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled) - Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits - docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on - docker-compose.node2-sofiia.yml: same changes for NODA2 Swapper service still runs but is NOT in the critical inference path. Source of truth for models is now NCS -> Ollama /api/tags. Made-with: Cursor
This commit is contained in:
@@ -48,9 +48,11 @@ services:
|
|||||||
- ROUTER_TOOL_MAX_ROUNDS=${ROUTER_TOOL_MAX_ROUNDS:-10}
|
- ROUTER_TOOL_MAX_ROUNDS=${ROUTER_TOOL_MAX_ROUNDS:-10}
|
||||||
- AGROMATRIX_REVIEW_AUTH_MODE=${AGROMATRIX_REVIEW_AUTH_MODE:-bearer}
|
- AGROMATRIX_REVIEW_AUTH_MODE=${AGROMATRIX_REVIEW_AUTH_MODE:-bearer}
|
||||||
- AGROMATRIX_REVIEW_BEARER_TOKENS=${AGROMATRIX_REVIEW_BEARER_TOKENS}
|
- AGROMATRIX_REVIEW_BEARER_TOKENS=${AGROMATRIX_REVIEW_BEARER_TOKENS}
|
||||||
# ── Node Capabilities (multi-node model selection) ──
|
# ── Fabric Layer (NCS + Node Worker, Swapper being decommissioned) ──
|
||||||
- NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
|
- NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
|
||||||
- ENABLE_GLOBAL_CAPS_NATS=true
|
- ENABLE_GLOBAL_CAPS_NATS=true
|
||||||
|
- OLLAMA_URL=http://172.18.0.1:11434
|
||||||
|
- PREFER_NODE_WORKER=true
|
||||||
volumes:
|
volumes:
|
||||||
- ${DEPLOY_ROOT:-.}/services/router/router_config.yaml:/app/router_config.yaml:ro
|
- ${DEPLOY_ROOT:-.}/services/router/router_config.yaml:/app/router_config.yaml:ro
|
||||||
- ${DEPLOY_ROOT:-.}/services/router/router-config.yml:/app/router-config.yml:ro
|
- ${DEPLOY_ROOT:-.}/services/router/router-config.yml:/app/router-config.yml:ro
|
||||||
@@ -498,8 +500,8 @@ services:
|
|||||||
container_name: node-capabilities-node1
|
container_name: node-capabilities-node1
|
||||||
environment:
|
environment:
|
||||||
- NODE_ID=noda1
|
- NODE_ID=noda1
|
||||||
- OLLAMA_BASE_URL=http://host.docker.internal:11434
|
- OLLAMA_BASE_URL=http://172.18.0.1:11434
|
||||||
- SWAPPER_URL=http://swapper-service:8890
|
- SWAPPER_URL=
|
||||||
- CACHE_TTL_SEC=15
|
- CACHE_TTL_SEC=15
|
||||||
- ENABLE_NATS_CAPS=true
|
- ENABLE_NATS_CAPS=true
|
||||||
- NATS_URL=nats://nats:4222
|
- NATS_URL=nats://nats:4222
|
||||||
@@ -527,15 +529,13 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- NODE_ID=noda1
|
- NODE_ID=noda1
|
||||||
- NATS_URL=nats://nats:4222
|
- NATS_URL=nats://nats:4222
|
||||||
- OLLAMA_BASE_URL=http://host.docker.internal:11434
|
- OLLAMA_BASE_URL=http://172.18.0.1:11434
|
||||||
- SWAPPER_URL=http://swapper-service:8890
|
|
||||||
- NODE_DEFAULT_LLM=qwen3.5:27b
|
- NODE_DEFAULT_LLM=qwen3.5:27b
|
||||||
- NODE_DEFAULT_VISION=qwen3-vl-8b
|
- NODE_DEFAULT_VISION=qwen3-vl:8b
|
||||||
- NODE_WORKER_MAX_CONCURRENCY=2
|
- NODE_WORKER_MAX_CONCURRENCY=2
|
||||||
- NCS_REPORT_URL=http://node-capabilities:8099
|
- NCS_REPORT_URL=http://node-capabilities:8099
|
||||||
depends_on:
|
depends_on:
|
||||||
- nats
|
- nats
|
||||||
- swapper-service
|
|
||||||
networks:
|
networks:
|
||||||
- dagi-network
|
- dagi-network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|||||||
@@ -25,9 +25,11 @@ services:
|
|||||||
- XAI_API_KEY=${XAI_API_KEY}
|
- XAI_API_KEY=${XAI_API_KEY}
|
||||||
- GROK_API_KEY=${XAI_API_KEY}
|
- GROK_API_KEY=${XAI_API_KEY}
|
||||||
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
|
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
|
||||||
# ── Node Capabilities (multi-node model selection) ────────────────────
|
# ── Fabric Layer (NCS + Node Worker, no Swapper dependency) ──────────
|
||||||
- NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
|
- NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
|
||||||
- ENABLE_GLOBAL_CAPS_NATS=true
|
- ENABLE_GLOBAL_CAPS_NATS=true
|
||||||
|
- OLLAMA_URL=http://host.docker.internal:11434
|
||||||
|
- PREFER_NODE_WORKER=true
|
||||||
# ── Persistence backends ──────────────────────────────────────────────
|
# ── Persistence backends ──────────────────────────────────────────────
|
||||||
- ALERT_BACKEND=postgres
|
- ALERT_BACKEND=postgres
|
||||||
- ALERT_DATABASE_URL=${ALERT_DATABASE_URL:-${DATABASE_URL}}
|
- ALERT_DATABASE_URL=${ALERT_DATABASE_URL:-${DATABASE_URL}}
|
||||||
@@ -121,7 +123,7 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- NODE_ID=NODA2
|
- NODE_ID=NODA2
|
||||||
- OLLAMA_BASE_URL=http://host.docker.internal:11434
|
- OLLAMA_BASE_URL=http://host.docker.internal:11434
|
||||||
- SWAPPER_URL=http://swapper-service:8890
|
- SWAPPER_URL=
|
||||||
- LLAMA_SERVER_URL=http://host.docker.internal:11435
|
- LLAMA_SERVER_URL=http://host.docker.internal:11435
|
||||||
- CACHE_TTL_SEC=15
|
- CACHE_TTL_SEC=15
|
||||||
- ENABLE_NATS_CAPS=true
|
- ENABLE_NATS_CAPS=true
|
||||||
@@ -147,14 +149,12 @@ services:
|
|||||||
- NODE_ID=noda2
|
- NODE_ID=noda2
|
||||||
- NATS_URL=nats://dagi-nats:4222
|
- NATS_URL=nats://dagi-nats:4222
|
||||||
- OLLAMA_BASE_URL=http://host.docker.internal:11434
|
- OLLAMA_BASE_URL=http://host.docker.internal:11434
|
||||||
- SWAPPER_URL=http://swapper-service:8890
|
|
||||||
- NODE_DEFAULT_LLM=qwen3:14b
|
- NODE_DEFAULT_LLM=qwen3:14b
|
||||||
- NODE_DEFAULT_VISION=llava:13b
|
- NODE_DEFAULT_VISION=llava:13b
|
||||||
- NODE_WORKER_MAX_CONCURRENCY=2
|
- NODE_WORKER_MAX_CONCURRENCY=2
|
||||||
- NCS_REPORT_URL=http://node-capabilities:8099
|
- NCS_REPORT_URL=http://node-capabilities:8099
|
||||||
depends_on:
|
depends_on:
|
||||||
- dagi-nats
|
- dagi-nats
|
||||||
- swapper-service
|
|
||||||
networks:
|
networks:
|
||||||
- dagi-network
|
- dagi-network
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ app = FastAPI(title="Node Capabilities Service", version="1.0.0")
|
|||||||
|
|
||||||
NODE_ID = os.getenv("NODE_ID", "noda2")
|
NODE_ID = os.getenv("NODE_ID", "noda2")
|
||||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
|
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
|
||||||
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
|
SWAPPER_URL = os.getenv("SWAPPER_URL", "") # empty = skip Swapper probing
|
||||||
LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "")
|
LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "")
|
||||||
|
|
||||||
_cache: Dict[str, Any] = {}
|
_cache: Dict[str, Any] = {}
|
||||||
@@ -74,7 +74,10 @@ async def _collect_ollama() -> Dict[str, Any]:
|
|||||||
|
|
||||||
|
|
||||||
async def _collect_swapper() -> Dict[str, Any]:
|
async def _collect_swapper() -> Dict[str, Any]:
|
||||||
runtime: Dict[str, Any] = {"base_url": SWAPPER_URL, "status": "unknown", "models": [], "vision_models": [], "active_model": None}
|
runtime: Dict[str, Any] = {"base_url": SWAPPER_URL or "n/a", "status": "unknown", "models": [], "vision_models": [], "active_model": None}
|
||||||
|
if not SWAPPER_URL:
|
||||||
|
runtime["status"] = "disabled"
|
||||||
|
return runtime
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=5) as c:
|
async with httpx.AsyncClient(timeout=5) as c:
|
||||||
h = await c.get(f"{SWAPPER_URL}/health")
|
h = await c.get(f"{SWAPPER_URL}/health")
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ import os
|
|||||||
NODE_ID = os.getenv("NODE_ID", "noda2")
|
NODE_ID = os.getenv("NODE_ID", "noda2")
|
||||||
NATS_URL = os.getenv("NATS_URL", "nats://dagi-nats:4222")
|
NATS_URL = os.getenv("NATS_URL", "nats://dagi-nats:4222")
|
||||||
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
|
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
|
||||||
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
|
|
||||||
DEFAULT_LLM = os.getenv("NODE_DEFAULT_LLM", "qwen3:14b")
|
DEFAULT_LLM = os.getenv("NODE_DEFAULT_LLM", "qwen3:14b")
|
||||||
DEFAULT_VISION = os.getenv("NODE_DEFAULT_VISION", "llava:13b")
|
DEFAULT_VISION = os.getenv("NODE_DEFAULT_VISION", "llava:13b")
|
||||||
MAX_CONCURRENCY = int(os.getenv("NODE_WORKER_MAX_CONCURRENCY", "2"))
|
MAX_CONCURRENCY = int(os.getenv("NODE_WORKER_MAX_CONCURRENCY", "2"))
|
||||||
|
|||||||
49
services/node-worker/providers/ollama_vision.py
Normal file
49
services/node-worker/providers/ollama_vision.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
"""Ollama vision provider — direct Ollama API with images, no Swapper dependency."""
|
||||||
|
import logging
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from config import OLLAMA_BASE_URL, DEFAULT_VISION
|
||||||
|
|
||||||
|
logger = logging.getLogger("provider.ollama_vision")
|
||||||
|
|
||||||
|
|
||||||
|
async def infer(
|
||||||
|
images: Optional[List[str]] = None,
|
||||||
|
prompt: str = "",
|
||||||
|
model: str = "",
|
||||||
|
system: str = "",
|
||||||
|
max_tokens: int = 1024,
|
||||||
|
temperature: float = 0.2,
|
||||||
|
timeout_s: float = 60.0,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
model = model or DEFAULT_VISION
|
||||||
|
|
||||||
|
payload: Dict[str, Any] = {
|
||||||
|
"model": model,
|
||||||
|
"prompt": prompt or "Describe this image.",
|
||||||
|
"stream": False,
|
||||||
|
"options": {"num_predict": max_tokens, "temperature": temperature},
|
||||||
|
}
|
||||||
|
if images:
|
||||||
|
clean = []
|
||||||
|
for img in images:
|
||||||
|
if "," in img and img.startswith("data:"):
|
||||||
|
clean.append(img.split(",", 1)[1])
|
||||||
|
else:
|
||||||
|
clean.append(img)
|
||||||
|
payload["images"] = clean
|
||||||
|
if system:
|
||||||
|
payload["system"] = system
|
||||||
|
|
||||||
|
async with httpx.AsyncClient(timeout=timeout_s) as c:
|
||||||
|
resp = await c.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload)
|
||||||
|
resp.raise_for_status()
|
||||||
|
data = resp.json()
|
||||||
|
return {
|
||||||
|
"text": data.get("response", ""),
|
||||||
|
"model": model,
|
||||||
|
"provider": "ollama_vision",
|
||||||
|
"eval_count": data.get("eval_count", 0),
|
||||||
|
}
|
||||||
@@ -9,7 +9,7 @@ from typing import Any, Dict
|
|||||||
import config
|
import config
|
||||||
from models import JobRequest, JobResponse, JobError
|
from models import JobRequest, JobResponse, JobError
|
||||||
from idempotency import IdempotencyStore
|
from idempotency import IdempotencyStore
|
||||||
from providers import ollama, swapper_vision
|
from providers import ollama, ollama_vision
|
||||||
import fabric_metrics as fm
|
import fabric_metrics as fm
|
||||||
|
|
||||||
logger = logging.getLogger("node-worker")
|
logger = logging.getLogger("node-worker")
|
||||||
@@ -27,9 +27,13 @@ async def start(nats_client):
|
|||||||
global _nats_client
|
global _nats_client
|
||||||
_nats_client = nats_client
|
_nats_client = nats_client
|
||||||
|
|
||||||
|
nid = config.NODE_ID.lower()
|
||||||
subjects = [
|
subjects = [
|
||||||
f"node.{config.NODE_ID.lower()}.llm.request",
|
f"node.{nid}.llm.request",
|
||||||
f"node.{config.NODE_ID.lower()}.vision.request",
|
f"node.{nid}.vision.request",
|
||||||
|
f"node.{nid}.stt.request",
|
||||||
|
f"node.{nid}.tts.request",
|
||||||
|
f"node.{nid}.image.request",
|
||||||
]
|
]
|
||||||
for subj in subjects:
|
for subj in subjects:
|
||||||
await nats_client.subscribe(subj, cb=_handle_request)
|
await nats_client.subscribe(subj, cb=_handle_request)
|
||||||
@@ -160,7 +164,7 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
|
|||||||
)
|
)
|
||||||
elif job.required_type == "vision":
|
elif job.required_type == "vision":
|
||||||
result = await asyncio.wait_for(
|
result = await asyncio.wait_for(
|
||||||
swapper_vision.infer(
|
ollama_vision.infer(
|
||||||
images=payload.get("images"),
|
images=payload.get("images"),
|
||||||
prompt=payload.get("prompt", ""),
|
prompt=payload.get("prompt", ""),
|
||||||
model=model,
|
model=model,
|
||||||
@@ -171,11 +175,20 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
|
|||||||
),
|
),
|
||||||
timeout=timeout_s,
|
timeout=timeout_s,
|
||||||
)
|
)
|
||||||
|
elif job.required_type in ("stt", "tts", "image"):
|
||||||
|
return JobResponse(
|
||||||
|
job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
|
||||||
|
status="error",
|
||||||
|
error=JobError(
|
||||||
|
code="NOT_YET_IMPLEMENTED",
|
||||||
|
message=f"{job.required_type} adapter coming soon; use direct runtime API for now",
|
||||||
|
),
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
return JobResponse(
|
return JobResponse(
|
||||||
job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
|
job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
|
||||||
status="error",
|
status="error",
|
||||||
error=JobError(code="UNSUPPORTED_TYPE", message=f"{job.required_type} not implemented"),
|
error=JobError(code="UNSUPPORTED_TYPE", message=f"{job.required_type} not supported"),
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
@@ -877,12 +877,13 @@ app = FastAPI(title="DAARION Router", version="2.0.0")
|
|||||||
# Configuration
|
# Configuration
|
||||||
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
|
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
|
||||||
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
|
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
|
||||||
# All multimodal services now through Swapper
|
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||||
STT_URL = os.getenv("STT_URL", "http://swapper-service:8890") # Swapper /stt endpoint
|
PREFER_NODE_WORKER = os.getenv("PREFER_NODE_WORKER", "true").lower() in ("true", "1")
|
||||||
TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890") # Swapper /tts endpoint
|
STT_URL = os.getenv("STT_URL", "http://swapper-service:8890")
|
||||||
VISION_URL = os.getenv("VISION_URL", "http://172.18.0.1:11434") # Host Ollama
|
TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890")
|
||||||
OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890") # Swapper /ocr endpoint
|
VISION_URL = os.getenv("VISION_URL", "http://host.docker.internal:11434")
|
||||||
DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890") # Swapper /document endpoint
|
OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890")
|
||||||
|
DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890")
|
||||||
CITY_SERVICE_URL = os.getenv("CITY_SERVICE_URL", "http://daarion-city-service:7001")
|
CITY_SERVICE_URL = os.getenv("CITY_SERVICE_URL", "http://daarion-city-service:7001")
|
||||||
|
|
||||||
# CrewAI Routing Configuration
|
# CrewAI Routing Configuration
|
||||||
@@ -1083,8 +1084,8 @@ async def startup_event():
|
|||||||
runtime_guard_engine = None
|
runtime_guard_engine = None
|
||||||
|
|
||||||
# Log backend URLs
|
# Log backend URLs
|
||||||
logger.info(f"📡 Swapper URL: {SWAPPER_URL}")
|
logger.info(f"📡 Ollama URL: {OLLAMA_URL} (prefer_node_worker={PREFER_NODE_WORKER})")
|
||||||
logger.info(f"📡 STT URL: {STT_URL}")
|
logger.info(f"📡 Swapper URL: {SWAPPER_URL} (legacy, being decommissioned)")
|
||||||
logger.info(f"📡 Vision URL: {VISION_URL}")
|
logger.info(f"📡 Vision URL: {VISION_URL}")
|
||||||
logger.info(f"📡 OCR URL: {OCR_URL}")
|
logger.info(f"📡 OCR URL: {OCR_URL}")
|
||||||
logger.info(f"📡 Neo4j URL: {NEO4J_URI}")
|
logger.info(f"📡 Neo4j URL: {NEO4J_URI}")
|
||||||
@@ -2388,33 +2389,39 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
logger.warning(f"⚠️ Deterministic AgroMatrix plant flow failed, fallback to generic vision: {e}")
|
logger.warning(f"⚠️ Deterministic AgroMatrix plant flow failed, fallback to generic vision: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use Swapper's /vision endpoint (manages model loading)
|
vision_model = "qwen3-vl-8b"
|
||||||
vision_payload = {
|
vision_system = system_prompt or ""
|
||||||
"model": "qwen3-vl-8b",
|
if vision_system and memory_brief_text:
|
||||||
"prompt": request.prompt,
|
vision_system = f"{vision_system}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}"
|
||||||
"images": request.images, # Swapper handles data URL conversion
|
|
||||||
"max_tokens": request.max_tokens or 1024,
|
|
||||||
"temperature": request.temperature or 0.7
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add system prompt if available
|
clean_images = []
|
||||||
if system_prompt:
|
for img in (request.images or []):
|
||||||
if memory_brief_text:
|
if "," in img and img.startswith("data:"):
|
||||||
vision_payload["system"] = f"{system_prompt}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}"
|
clean_images.append(img.split(",", 1)[1])
|
||||||
else:
|
else:
|
||||||
vision_payload["system"] = system_prompt
|
clean_images.append(img)
|
||||||
|
|
||||||
logger.info(f"🖼️ Sending to Swapper /vision: {SWAPPER_URL}/vision")
|
logger.info(f"🖼️ Vision inference: model={vision_model} images={len(clean_images)} prefer_nw={PREFER_NODE_WORKER}")
|
||||||
|
|
||||||
vision_resp = await http_client.post(
|
vision_resp = await http_client.post(
|
||||||
f"{SWAPPER_URL}/vision",
|
f"{OLLAMA_URL}/api/generate",
|
||||||
json=vision_payload,
|
json={
|
||||||
timeout=120.0
|
"model": vision_model.replace("-", "-vl:").replace("qwen3-vl:", "qwen3-vl:") if ":" not in vision_model else vision_model,
|
||||||
|
"prompt": request.prompt,
|
||||||
|
"images": clean_images,
|
||||||
|
"system": vision_system,
|
||||||
|
"stream": False,
|
||||||
|
"options": {
|
||||||
|
"num_predict": request.max_tokens or 1024,
|
||||||
|
"temperature": request.temperature or 0.7,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
timeout=120.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
if vision_resp.status_code == 200:
|
if vision_resp.status_code == 200:
|
||||||
vision_data = vision_resp.json()
|
vision_data = vision_resp.json()
|
||||||
raw_response = vision_data.get("text", "")
|
raw_response = vision_data.get("response", vision_data.get("text", ""))
|
||||||
full_response = _sanitize_vision_text_for_user(raw_response)
|
full_response = _sanitize_vision_text_for_user(raw_response)
|
||||||
vision_web_query = ""
|
vision_web_query = ""
|
||||||
vision_sources: List[Dict[str, str]] = []
|
vision_sources: List[Dict[str, str]] = []
|
||||||
@@ -2519,14 +2526,23 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
"Відповідай українською 2-4 реченнями, без службових фраз. "
|
"Відповідай українською 2-4 реченнями, без службових фраз. "
|
||||||
f"Запит користувача: {request.prompt}"
|
f"Запит користувача: {request.prompt}"
|
||||||
)
|
)
|
||||||
|
retry_ollama_payload = {
|
||||||
|
"model": vision_model.replace("-", "-vl:").replace("qwen3-vl:", "qwen3-vl:") if ":" not in vision_model else vision_model,
|
||||||
|
"prompt": retry_payload["prompt"],
|
||||||
|
"images": clean_images,
|
||||||
|
"stream": False,
|
||||||
|
"options": {"num_predict": request.max_tokens or 1024, "temperature": 0.7},
|
||||||
|
}
|
||||||
|
if retry_payload.get("system"):
|
||||||
|
retry_ollama_payload["system"] = retry_payload["system"]
|
||||||
retry_resp = await http_client.post(
|
retry_resp = await http_client.post(
|
||||||
f"{SWAPPER_URL}/vision",
|
f"{OLLAMA_URL}/api/generate",
|
||||||
json=retry_payload,
|
json=retry_ollama_payload,
|
||||||
timeout=120.0
|
timeout=120.0,
|
||||||
)
|
)
|
||||||
if retry_resp.status_code == 200:
|
if retry_resp.status_code == 200:
|
||||||
retry_data = retry_resp.json()
|
retry_data = retry_resp.json()
|
||||||
retry_raw = retry_data.get("text", "")
|
retry_raw = retry_data.get("response", retry_data.get("text", ""))
|
||||||
retry_text = _sanitize_vision_text_for_user(retry_raw)
|
retry_text = _sanitize_vision_text_for_user(retry_raw)
|
||||||
if retry_raw and not retry_text:
|
if retry_raw and not retry_text:
|
||||||
retry_text = _extract_vision_search_facts(retry_raw, max_chars=280)
|
retry_text = _extract_vision_search_facts(retry_raw, max_chars=280)
|
||||||
@@ -2541,7 +2557,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
elif request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and _vision_response_is_blurry(full_response):
|
elif request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and _vision_response_is_blurry(full_response):
|
||||||
full_response = _build_image_fallback_response(request_agent_id, request.prompt)
|
full_response = _build_image_fallback_response(request_agent_id, request.prompt)
|
||||||
|
|
||||||
full_response = await _finalize_response_text(full_response, "swapper-vision")
|
full_response = await _finalize_response_text(full_response, "ollama-vision")
|
||||||
|
|
||||||
# Store vision message in agent-specific memory
|
# Store vision message in agent-specific memory
|
||||||
if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and full_response:
|
if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and full_response:
|
||||||
@@ -2567,10 +2583,10 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
response=full_response,
|
response=full_response,
|
||||||
model="qwen3-vl-8b",
|
model="qwen3-vl-8b",
|
||||||
tokens_used=None,
|
tokens_used=None,
|
||||||
backend="swapper-vision"
|
backend="ollama-vision"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.error(f"❌ Swapper vision error: {vision_resp.status_code} - {vision_resp.text[:200]}")
|
logger.error(f"❌ Ollama vision error: {vision_resp.status_code} - {vision_resp.text[:200]}")
|
||||||
fallback_response = await _finalize_response_text(
|
fallback_response = await _finalize_response_text(
|
||||||
_build_image_fallback_response(request_agent_id, request.prompt),
|
_build_image_fallback_response(request_agent_id, request.prompt),
|
||||||
"swapper-vision-fallback",
|
"swapper-vision-fallback",
|
||||||
@@ -2579,7 +2595,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
response=fallback_response,
|
response=fallback_response,
|
||||||
model="qwen3-vl-8b",
|
model="qwen3-vl-8b",
|
||||||
tokens_used=None,
|
tokens_used=None,
|
||||||
backend="swapper-vision-fallback"
|
backend="vision-fallback"
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -2592,7 +2608,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
response=fallback_response,
|
response=fallback_response,
|
||||||
model="qwen3-vl-8b",
|
model="qwen3-vl-8b",
|
||||||
tokens_used=None,
|
tokens_used=None,
|
||||||
backend="swapper-vision-fallback"
|
backend="vision-fallback"
|
||||||
)
|
)
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
@@ -3142,29 +3158,27 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
logger.warning(f"⚠️ No local model in config, using hardcoded fallback: {local_model}")
|
logger.warning(f"⚠️ No local model in config, using hardcoded fallback: {local_model}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Check if Swapper is available
|
ollama_model = local_model.replace("-", ":") if ":" not in local_model else local_model
|
||||||
health_resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0)
|
logger.info(f"📡 Calling Ollama direct: model={ollama_model}")
|
||||||
if health_resp.status_code == 200:
|
|
||||||
logger.info(f"📡 Calling Swapper with local model: {local_model}")
|
|
||||||
# Generate response via Swapper (which handles model loading)
|
|
||||||
generate_resp = await http_client.post(
|
generate_resp = await http_client.post(
|
||||||
f"{SWAPPER_URL}/generate",
|
f"{OLLAMA_URL}/api/generate",
|
||||||
json={
|
json={
|
||||||
"model": local_model,
|
"model": ollama_model,
|
||||||
"prompt": request.prompt,
|
"prompt": request.prompt,
|
||||||
"system": system_prompt,
|
"system": system_prompt,
|
||||||
"max_tokens": request.max_tokens,
|
"stream": False,
|
||||||
"temperature": request.temperature,
|
"options": {
|
||||||
"stream": False
|
"num_predict": request.max_tokens or 2048,
|
||||||
|
"temperature": request.temperature or 0.7,
|
||||||
},
|
},
|
||||||
timeout=300.0
|
},
|
||||||
|
timeout=300.0,
|
||||||
)
|
)
|
||||||
|
|
||||||
if generate_resp.status_code == 200:
|
if generate_resp.status_code == 200:
|
||||||
data = generate_resp.json()
|
data = generate_resp.json()
|
||||||
local_response = _normalize_text_response(data.get("response", ""))
|
local_response = _normalize_text_response(data.get("response", ""))
|
||||||
|
|
||||||
# Empty-answer gate for selected local top-level agents.
|
|
||||||
if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response):
|
if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response):
|
||||||
logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once")
|
logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once")
|
||||||
retry_prompt = (
|
retry_prompt = (
|
||||||
@@ -3172,16 +3186,18 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
"Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз."
|
"Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз."
|
||||||
)
|
)
|
||||||
retry_resp = await http_client.post(
|
retry_resp = await http_client.post(
|
||||||
f"{SWAPPER_URL}/generate",
|
f"{OLLAMA_URL}/api/generate",
|
||||||
json={
|
json={
|
||||||
"model": local_model,
|
"model": ollama_model,
|
||||||
"prompt": retry_prompt,
|
"prompt": retry_prompt,
|
||||||
"system": system_prompt,
|
"system": system_prompt,
|
||||||
"max_tokens": request.max_tokens,
|
"stream": False,
|
||||||
"temperature": request.temperature,
|
"options": {
|
||||||
"stream": False
|
"num_predict": request.max_tokens or 2048,
|
||||||
|
"temperature": request.temperature or 0.7,
|
||||||
},
|
},
|
||||||
timeout=300.0
|
},
|
||||||
|
timeout=300.0,
|
||||||
)
|
)
|
||||||
if retry_resp.status_code == 200:
|
if retry_resp.status_code == 200:
|
||||||
retry_data = retry_resp.json()
|
retry_data = retry_resp.json()
|
||||||
@@ -3194,9 +3210,8 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
"Я не отримав корисну відповідь з першої спроби. "
|
"Я не отримав корисну відповідь з першої спроби. "
|
||||||
"Сформулюй запит коротко ще раз, і я відповім конкретно."
|
"Сформулюй запит коротко ще раз, і я відповім конкретно."
|
||||||
)
|
)
|
||||||
local_response = await _finalize_response_text(local_response, "swapper+ollama")
|
local_response = await _finalize_response_text(local_response, "ollama-direct")
|
||||||
|
|
||||||
# Store in agent-specific memory
|
|
||||||
if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response:
|
if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response:
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
memory_retrieval.store_message(
|
memory_retrieval.store_message(
|
||||||
@@ -3213,19 +3228,18 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
|||||||
response=local_response,
|
response=local_response,
|
||||||
model=local_model,
|
model=local_model,
|
||||||
tokens_used=data.get("eval_count", 0),
|
tokens_used=data.get("eval_count", 0),
|
||||||
backend="swapper+ollama"
|
backend="ollama-direct"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.error(f"❌ Swapper error: {generate_resp.status_code} - {generate_resp.text}")
|
logger.error(f"❌ Ollama generate error: {generate_resp.status_code} - {generate_resp.text[:200]}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"❌ Swapper/Ollama error: {e}")
|
logger.error(f"❌ Ollama direct error: {e}")
|
||||||
# Fallback to direct Ollama if Swapper fails
|
|
||||||
try:
|
try:
|
||||||
logger.info(f"🔄 Falling back to direct Ollama connection")
|
logger.info(f"🔄 Falling back to Ollama with hardcoded model")
|
||||||
generate_resp = await http_client.post(
|
generate_resp = await http_client.post(
|
||||||
f"{VISION_URL}/api/generate",
|
f"{OLLAMA_URL}/api/generate",
|
||||||
json={
|
json={
|
||||||
"model": "qwen3:8b", # Use actual Ollama model name
|
"model": "qwen3:8b",
|
||||||
"prompt": request.prompt,
|
"prompt": request.prompt,
|
||||||
"system": system_prompt,
|
"system": system_prompt,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
@@ -3526,38 +3540,34 @@ async def documents_versions(doc_id: str, agent_id: str, limit: int = 20):
|
|||||||
|
|
||||||
@app.get("/v1/models")
|
@app.get("/v1/models")
|
||||||
async def list_available_models():
|
async def list_available_models():
|
||||||
"""List all available models across backends"""
|
"""List all available models from NCS (global capabilities pool)."""
|
||||||
models = []
|
models = []
|
||||||
|
|
||||||
# Get Swapper models
|
|
||||||
try:
|
try:
|
||||||
resp = await http_client.get(f"{SWAPPER_URL}/models", timeout=5.0)
|
from global_capabilities_client import get_global_capabilities
|
||||||
if resp.status_code == 200:
|
pool = await get_global_capabilities()
|
||||||
data = resp.json()
|
for m in pool.get("served_models", []):
|
||||||
for m in data.get("models", []):
|
|
||||||
models.append({
|
models.append({
|
||||||
"id": m.get("name"),
|
"id": m.get("name"),
|
||||||
"backend": "swapper",
|
"backend": m.get("runtime", "unknown"),
|
||||||
|
"node": m.get("node", "?"),
|
||||||
|
"type": m.get("type", "llm"),
|
||||||
"size_gb": m.get("size_gb"),
|
"size_gb": m.get("size_gb"),
|
||||||
"status": m.get("status", "available")
|
"status": "served",
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Cannot get Swapper models: {e}")
|
logger.warning(f"Cannot get NCS global models: {e}")
|
||||||
|
|
||||||
# Get Ollama models
|
if not models:
|
||||||
try:
|
try:
|
||||||
resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0)
|
resp = await http_client.get(f"{OLLAMA_URL}/api/tags", timeout=5.0)
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
data = resp.json()
|
for m in resp.json().get("models", []):
|
||||||
for m in data.get("models", []):
|
|
||||||
# Avoid duplicates
|
|
||||||
model_name = m.get("name")
|
|
||||||
if not any(x.get("id") == model_name for x in models):
|
|
||||||
models.append({
|
models.append({
|
||||||
"id": model_name,
|
"id": m.get("name"),
|
||||||
"backend": "ollama",
|
"backend": "ollama",
|
||||||
"size_gb": round(m.get("size", 0) / 1e9, 1),
|
"size_gb": round(m.get("size", 0) / 1e9, 1),
|
||||||
"status": "loaded"
|
"status": "loaded",
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Cannot get Ollama models: {e}")
|
logger.warning(f"Cannot get Ollama models: {e}")
|
||||||
|
|||||||
@@ -1,90 +1,35 @@
|
|||||||
# Swapper Configuration for Node #2 (Development Node)
|
# Swapper Configuration — Default / Fallback
|
||||||
# Single-active LLM scheduler
|
#
|
||||||
# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
|
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||||
# Auto-generated configuration with available Ollama models
|
# Source of truth for models is NCS (Node Capabilities Service).
|
||||||
|
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
|
||||||
|
#
|
||||||
|
# Per-node overrides: swapper_config_node1.yaml, swapper_config_node2.yaml
|
||||||
|
|
||||||
swapper:
|
node_id: default
|
||||||
mode: single-active
|
|
||||||
max_concurrent_models: 1
|
runtimes:
|
||||||
|
ollama:
|
||||||
|
url: http://localhost:11434
|
||||||
|
timeout: 300
|
||||||
|
|
||||||
|
limits:
|
||||||
|
llm_concurrency: 2
|
||||||
|
vision_concurrency: 1
|
||||||
|
max_concurrent_models: 2
|
||||||
model_swap_timeout: 300
|
model_swap_timeout: 300
|
||||||
gpu_enabled: true
|
|
||||||
metal_acceleration: true # Apple Silicon GPU acceleration
|
|
||||||
# Модель для автоматичного завантаження при старті (опціонально)
|
|
||||||
# Якщо не вказано - моделі завантажуються тільки за запитом
|
|
||||||
# Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
|
|
||||||
default_model: gpt-oss:latest # Модель активується автоматично при старті
|
|
||||||
|
|
||||||
models:
|
timeouts:
|
||||||
# Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks
|
llm_ms: 120000
|
||||||
gpt-oss-latest:
|
vision_ms: 180000
|
||||||
path: ollama:gpt-oss:latest
|
stt_ms: 60000
|
||||||
type: llm
|
tts_ms: 60000
|
||||||
size_gb: 13.0
|
|
||||||
priority: high
|
|
||||||
description: "Fast LLM for general tasks and conversations (20.9B params)"
|
|
||||||
|
|
||||||
# Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
|
gpu:
|
||||||
phi3-latest:
|
enabled: false
|
||||||
path: ollama:phi3:latest
|
metal_acceleration: false
|
||||||
type: llm
|
|
||||||
size_gb: 2.2
|
|
||||||
priority: high
|
|
||||||
description: "Lightweight LLM for fast responses (3.8B params)"
|
|
||||||
|
|
||||||
# Code Specialist - StarCoder2 3B (Medium Priority) - Code engineering
|
|
||||||
starcoder2-3b:
|
|
||||||
path: ollama:starcoder2:3b
|
|
||||||
type: code
|
|
||||||
size_gb: 1.7
|
|
||||||
priority: medium
|
|
||||||
description: "Code specialist model for code engineering (3B params)"
|
|
||||||
|
|
||||||
# Reasoning Model - Mistral Nemo 12.2B (High Priority) - Advanced reasoning
|
|
||||||
mistral-nemo-12b:
|
|
||||||
path: ollama:mistral-nemo:12b
|
|
||||||
type: llm
|
|
||||||
size_gb: 7.1
|
|
||||||
priority: high
|
|
||||||
description: "Advanced reasoning model for complex tasks (12.2B params)"
|
|
||||||
|
|
||||||
# Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
|
|
||||||
gemma2-27b:
|
|
||||||
path: ollama:gemma2:27b
|
|
||||||
type: llm
|
|
||||||
size_gb: 15.0
|
|
||||||
priority: medium
|
|
||||||
description: "Reasoning model for strategic tasks (27.2B params)"
|
|
||||||
|
|
||||||
# Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
|
|
||||||
deepseek-coder-33b:
|
|
||||||
path: ollama:deepseek-coder:33b
|
|
||||||
type: code
|
|
||||||
size_gb: 18.0
|
|
||||||
priority: high
|
|
||||||
description: "Advanced code specialist model (33B params)"
|
|
||||||
|
|
||||||
# Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
|
|
||||||
qwen2.5-coder-32b:
|
|
||||||
path: ollama:qwen2.5-coder:32b
|
|
||||||
type: code
|
|
||||||
size_gb: 19.0
|
|
||||||
priority: high
|
|
||||||
description: "Advanced code specialist model (32.8B params)"
|
|
||||||
|
|
||||||
# Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
|
|
||||||
deepseek-r1-70b:
|
|
||||||
path: ollama:deepseek-r1:70b
|
|
||||||
type: llm
|
|
||||||
size_gb: 42.0
|
|
||||||
priority: high
|
|
||||||
description: "Strategic reasoning model (70.6B params, quantized)"
|
|
||||||
|
|
||||||
storage:
|
storage:
|
||||||
models_dir: /app/models
|
models_dir: /app/models
|
||||||
cache_dir: /app/cache
|
cache_dir: /app/cache
|
||||||
swap_dir: /app/swap
|
swap_dir: /app/swap
|
||||||
|
|
||||||
ollama:
|
|
||||||
url: http://localhost:11434 # Native Ollama on MacBook (via Pieces OS or brew)
|
|
||||||
timeout: 300
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,186 +1,37 @@
|
|||||||
# Swapper Configuration for Node #1 (Production Server)
|
# Swapper Configuration for Node #1 (Production Server)
|
||||||
# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio
|
|
||||||
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
|
# Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM)
|
||||||
#
|
#
|
||||||
# ВАЖЛИВО: Ембедінги через зовнішні API:
|
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||||
# - Text: Cohere API (embed-multilingual-v3.0, 1024 dim)
|
# Source of truth for models is NCS (Node Capabilities Service).
|
||||||
# - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim)
|
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
|
||||||
# НЕ використовуємо локальні embedding моделі!
|
|
||||||
|
|
||||||
swapper:
|
node_id: noda1
|
||||||
mode: multi-active
|
|
||||||
max_concurrent_models: 4 # LLM + OCR + STT + TTS (до 15GB)
|
runtimes:
|
||||||
|
ollama:
|
||||||
|
url: http://172.18.0.1:11434
|
||||||
|
timeout: 300
|
||||||
|
# comfyui:
|
||||||
|
# url: http://127.0.0.1:8188
|
||||||
|
|
||||||
|
limits:
|
||||||
|
llm_concurrency: 2
|
||||||
|
vision_concurrency: 1
|
||||||
|
max_concurrent_models: 4
|
||||||
model_swap_timeout: 300
|
model_swap_timeout: 300
|
||||||
gpu_enabled: true
|
|
||||||
|
timeouts:
|
||||||
|
llm_ms: 120000
|
||||||
|
vision_ms: 180000
|
||||||
|
stt_ms: 60000
|
||||||
|
tts_ms: 60000
|
||||||
|
image_gen_ms: 300000
|
||||||
|
|
||||||
|
gpu:
|
||||||
|
enabled: true
|
||||||
metal_acceleration: false
|
metal_acceleration: false
|
||||||
default_model: qwen3-8b
|
|
||||||
lazy_load_ocr: true
|
|
||||||
lazy_load_audio: true
|
|
||||||
# Автоматичне вивантаження при нестачі VRAM
|
|
||||||
auto_unload_on_oom: true
|
auto_unload_on_oom: true
|
||||||
vram_threshold_gb: 18 # Починати вивантажувати при 18GB
|
vram_threshold_gb: 18
|
||||||
|
|
||||||
models:
|
|
||||||
# ============================================
|
|
||||||
# LLM MODELS (Ollama) - тільки qwen3
|
|
||||||
# ============================================
|
|
||||||
|
|
||||||
# Primary LLM - Qwen3 8B (includes math, coding, reasoning)
|
|
||||||
qwen3-8b:
|
|
||||||
path: ollama:qwen3:8b
|
|
||||||
type: llm
|
|
||||||
size_gb: 5.2
|
|
||||||
priority: high
|
|
||||||
description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities"
|
|
||||||
capabilities:
|
|
||||||
- chat
|
|
||||||
- math
|
|
||||||
- coding
|
|
||||||
- reasoning
|
|
||||||
- multilingual
|
|
||||||
|
|
||||||
# ============================================
|
|
||||||
# VISION MODELS (Ollama)
|
|
||||||
# ============================================
|
|
||||||
|
|
||||||
# Vision Model - Qwen3-VL 8B
|
|
||||||
qwen3-vl-8b:
|
|
||||||
path: ollama:qwen3-vl:8b
|
|
||||||
type: vision
|
|
||||||
size_gb: 6.1
|
|
||||||
priority: high
|
|
||||||
description: "Qwen3-VL 8B for image understanding and visual reasoning"
|
|
||||||
capabilities:
|
|
||||||
- image_understanding
|
|
||||||
- visual_qa
|
|
||||||
- diagram_analysis
|
|
||||||
- ocr_basic
|
|
||||||
|
|
||||||
# ============================================
|
|
||||||
# OCR/DOCUMENT MODELS (HuggingFace)
|
|
||||||
# ============================================
|
|
||||||
|
|
||||||
# GOT-OCR2.0 - Best for documents, tables, formulas
|
|
||||||
got-ocr2:
|
|
||||||
path: huggingface:stepfun-ai/GOT-OCR2_0
|
|
||||||
type: ocr
|
|
||||||
size_gb: 7.0
|
|
||||||
priority: high
|
|
||||||
description: "Best OCR for documents, tables, formulas, handwriting"
|
|
||||||
capabilities:
|
|
||||||
- documents
|
|
||||||
- tables
|
|
||||||
- formulas
|
|
||||||
- handwriting
|
|
||||||
- multilingual
|
|
||||||
|
|
||||||
# Donut - Document Understanding (no external OCR, 91% CORD)
|
|
||||||
donut-base:
|
|
||||||
path: huggingface:naver-clova-ix/donut-base
|
|
||||||
type: ocr
|
|
||||||
size_gb: 3.0
|
|
||||||
priority: high
|
|
||||||
description: "Document parsing without OCR engine (91% CORD accuracy)"
|
|
||||||
capabilities:
|
|
||||||
- document_parsing
|
|
||||||
- receipts
|
|
||||||
- forms
|
|
||||||
- invoices
|
|
||||||
|
|
||||||
# Donut fine-tuned for receipts/invoices (CORD dataset)
|
|
||||||
donut-cord:
|
|
||||||
path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2
|
|
||||||
type: ocr
|
|
||||||
size_gb: 3.0
|
|
||||||
priority: medium
|
|
||||||
description: "Donut fine-tuned for receipts extraction"
|
|
||||||
capabilities:
|
|
||||||
- receipts
|
|
||||||
- invoices
|
|
||||||
- structured_extraction
|
|
||||||
|
|
||||||
# IBM Granite Docling - Document conversion with structure preservation
|
|
||||||
granite-docling:
|
|
||||||
path: huggingface:ds4sd/docling-ibm-granite-vision-1b
|
|
||||||
type: document
|
|
||||||
size_gb: 2.5
|
|
||||||
priority: high
|
|
||||||
description: "IBM Granite Docling for PDF/document structure extraction"
|
|
||||||
capabilities:
|
|
||||||
- pdf_conversion
|
|
||||||
- table_extraction
|
|
||||||
- formula_extraction
|
|
||||||
- layout_preservation
|
|
||||||
- doctags_format
|
|
||||||
|
|
||||||
# ============================================
|
|
||||||
# AUDIO MODELS - STT (Speech-to-Text)
|
|
||||||
# ============================================
|
|
||||||
|
|
||||||
# Faster Whisper Large-v3 - Best STT quality
|
|
||||||
faster-whisper-large:
|
|
||||||
path: huggingface:Systran/faster-whisper-large-v3
|
|
||||||
type: stt
|
|
||||||
size_gb: 3.0
|
|
||||||
priority: high
|
|
||||||
description: "Faster Whisper Large-v3 - best quality, 99 languages"
|
|
||||||
capabilities:
|
|
||||||
- speech_recognition
|
|
||||||
- transcription
|
|
||||||
- multilingual
|
|
||||||
- timestamps
|
|
||||||
- ukrainian
|
|
||||||
|
|
||||||
# Whisper Small - Fast/lightweight for quick transcription
|
|
||||||
whisper-small:
|
|
||||||
path: huggingface:openai/whisper-small
|
|
||||||
type: stt
|
|
||||||
size_gb: 0.5
|
|
||||||
priority: medium
|
|
||||||
description: "Whisper Small for fast transcription"
|
|
||||||
capabilities:
|
|
||||||
- speech_recognition
|
|
||||||
- transcription
|
|
||||||
|
|
||||||
# ============================================
|
|
||||||
# AUDIO MODELS - TTS (Text-to-Speech)
|
|
||||||
# ============================================
|
|
||||||
|
|
||||||
# Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support
|
|
||||||
xtts-v2:
|
|
||||||
path: huggingface:coqui/XTTS-v2
|
|
||||||
type: tts
|
|
||||||
size_gb: 2.0
|
|
||||||
priority: high
|
|
||||||
description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support"
|
|
||||||
capabilities:
|
|
||||||
- text_to_speech
|
|
||||||
- voice_cloning
|
|
||||||
- multilingual
|
|
||||||
- ukrainian
|
|
||||||
- 17_languages
|
|
||||||
|
|
||||||
# ============================================
|
|
||||||
# IMAGE GENERATION MODELS (HuggingFace/Diffusers)
|
|
||||||
# ============================================
|
|
||||||
|
|
||||||
# FLUX.2 Klein 4B - High quality image generation with lazy loading
|
|
||||||
flux-klein-4b:
|
|
||||||
path: huggingface:black-forest-labs/FLUX.2-klein-base-4B
|
|
||||||
type: image_generation
|
|
||||||
size_gb: 15.4
|
|
||||||
priority: medium
|
|
||||||
description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand"
|
|
||||||
capabilities:
|
|
||||||
- text_to_image
|
|
||||||
- high_quality
|
|
||||||
- 1024x1024
|
|
||||||
- artistic
|
|
||||||
default_params:
|
|
||||||
num_inference_steps: 50
|
|
||||||
guidance_scale: 4.0
|
|
||||||
width: 1024
|
|
||||||
height: 1024
|
|
||||||
|
|
||||||
storage:
|
storage:
|
||||||
models_dir: /app/models
|
models_dir: /app/models
|
||||||
@@ -188,33 +39,8 @@ storage:
|
|||||||
swap_dir: /app/swap
|
swap_dir: /app/swap
|
||||||
huggingface_cache: /root/.cache/huggingface
|
huggingface_cache: /root/.cache/huggingface
|
||||||
|
|
||||||
ollama:
|
|
||||||
url: http://172.18.0.1:11434
|
|
||||||
timeout: 300
|
|
||||||
|
|
||||||
huggingface:
|
huggingface:
|
||||||
device: cuda
|
device: cuda
|
||||||
torch_dtype: float16
|
torch_dtype: float16
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
low_cpu_mem_usage: true
|
low_cpu_mem_usage: true
|
||||||
|
|
||||||
# ============================================
|
|
||||||
# EMBEDDING SERVICES (External APIs)
|
|
||||||
# НЕ через Swapper - окремі сервіси!
|
|
||||||
# ============================================
|
|
||||||
#
|
|
||||||
# Text Embeddings:
|
|
||||||
# Service: Memory Service → Cohere API
|
|
||||||
# Model: embed-multilingual-v3.0
|
|
||||||
# Dimension: 1024
|
|
||||||
# Endpoint: Memory Service handles internally
|
|
||||||
#
|
|
||||||
# Image/Multimodal Embeddings:
|
|
||||||
# Service: Vision Encoder (port 8001)
|
|
||||||
# Model: OpenCLIP ViT-L/14
|
|
||||||
# Dimension: 768
|
|
||||||
# Endpoint: http://vision-encoder:8001/embed
|
|
||||||
#
|
|
||||||
# Vector Storage:
|
|
||||||
# Qdrant (port 6333) - separate collections for text vs image embeddings
|
|
||||||
# ВАЖЛИВО: НЕ змішувати embedding spaces в одній колекції!
|
|
||||||
|
|||||||
@@ -1,126 +1,40 @@
|
|||||||
# Swapper Configuration for Node #2 (Development Node)
|
# Swapper Configuration for Node #2 (Development Node)
|
||||||
# Single-active LLM scheduler
|
|
||||||
# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
|
# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM)
|
||||||
# Auto-generated configuration with available Ollama models
|
#
|
||||||
|
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||||
|
# Source of truth for models is NCS (Node Capabilities Service).
|
||||||
|
# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup.
|
||||||
|
|
||||||
swapper:
|
node_id: noda2
|
||||||
mode: single-active
|
|
||||||
|
runtimes:
|
||||||
|
ollama:
|
||||||
|
url: http://host.docker.internal:11434
|
||||||
|
timeout: 300
|
||||||
|
# mlx:
|
||||||
|
# stt_model: whisper-large-v3-turbo
|
||||||
|
# tts_model: kokoro-82m
|
||||||
|
# comfyui:
|
||||||
|
# url: http://127.0.0.1:8188
|
||||||
|
|
||||||
|
limits:
|
||||||
|
llm_concurrency: 1
|
||||||
|
vision_concurrency: 1
|
||||||
max_concurrent_models: 1
|
max_concurrent_models: 1
|
||||||
model_swap_timeout: 300
|
model_swap_timeout: 300
|
||||||
gpu_enabled: true
|
|
||||||
metal_acceleration: true # Apple Silicon GPU acceleration
|
|
||||||
# Модель для автоматичного завантаження при старті (опціонально)
|
|
||||||
# Якщо не вказано - моделі завантажуються тільки за запитом
|
|
||||||
# Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель)
|
|
||||||
# Стартова модель має бути реально встановлена в Ollama на NODA2
|
|
||||||
default_model: qwen3:14b # Модель активується автоматично при старті
|
|
||||||
|
|
||||||
models:
|
timeouts:
|
||||||
# Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks
|
llm_ms: 120000
|
||||||
gpt-oss-latest:
|
vision_ms: 180000
|
||||||
path: ollama:gpt-oss:latest
|
stt_ms: 60000
|
||||||
type: llm
|
tts_ms: 60000
|
||||||
size_gb: 13.0
|
image_gen_ms: 300000
|
||||||
priority: high
|
|
||||||
description: "Fast LLM for general tasks and conversations (20.9B params)"
|
|
||||||
|
|
||||||
# Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses
|
gpu:
|
||||||
phi3-latest:
|
enabled: true
|
||||||
path: ollama:phi3:latest
|
metal_acceleration: true
|
||||||
type: llm
|
|
||||||
size_gb: 2.2
|
|
||||||
priority: high
|
|
||||||
description: "Lightweight LLM for fast responses (3.8B params)"
|
|
||||||
|
|
||||||
# General Reasoning - Qwen3 14B (High Priority)
|
|
||||||
qwen3-14b:
|
|
||||||
path: ollama:qwen3:14b
|
|
||||||
type: llm
|
|
||||||
size_gb: 9.3
|
|
||||||
priority: high
|
|
||||||
description: "Balanced local model for Sofiia and router fallback"
|
|
||||||
|
|
||||||
# Reasoning Model - Qwen3.5 35B A3B (High Priority)
|
|
||||||
qwen3.5-35b-a3b:
|
|
||||||
path: ollama:qwen3.5:35b-a3b
|
|
||||||
type: llm
|
|
||||||
size_gb: 22.0
|
|
||||||
priority: high
|
|
||||||
description: "Large reasoning model for complex Sofiia requests"
|
|
||||||
|
|
||||||
# Reasoning Model - GLM 4.7 Flash (High Priority) - Fast general model
|
|
||||||
glm-4.7-flash:
|
|
||||||
path: ollama:glm-4.7-flash:32k
|
|
||||||
type: llm
|
|
||||||
size_gb: 19.0
|
|
||||||
priority: high
|
|
||||||
description: "Multi-purpose reasoning model (fast context)"
|
|
||||||
|
|
||||||
# Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning
|
|
||||||
gemma2-27b:
|
|
||||||
path: ollama:gemma2:27b
|
|
||||||
type: llm
|
|
||||||
size_gb: 15.0
|
|
||||||
priority: medium
|
|
||||||
description: "Reasoning model for strategic tasks (27.2B params)"
|
|
||||||
|
|
||||||
# Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks
|
|
||||||
deepseek-coder-33b:
|
|
||||||
path: ollama:deepseek-coder:33b
|
|
||||||
type: code
|
|
||||||
size_gb: 18.0
|
|
||||||
priority: high
|
|
||||||
description: "Advanced code specialist model (33B params)"
|
|
||||||
|
|
||||||
# Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks
|
|
||||||
qwen2.5-coder-32b:
|
|
||||||
path: ollama:qwen2.5-coder:32b
|
|
||||||
type: code
|
|
||||||
size_gb: 19.0
|
|
||||||
priority: high
|
|
||||||
description: "Advanced code specialist model (32.8B params)"
|
|
||||||
|
|
||||||
# Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model)
|
|
||||||
deepseek-r1-70b:
|
|
||||||
path: ollama:deepseek-r1:70b
|
|
||||||
type: llm
|
|
||||||
size_gb: 42.0
|
|
||||||
priority: high
|
|
||||||
description: "Strategic reasoning model (70.6B params, quantized)"
|
|
||||||
|
|
||||||
# Vision Model - LLaVA 13B (P0 Fix: NODA2 fallback vision)
|
|
||||||
# Available in Ollama on NODA2 — used until qwen3-vl:8b is installed
|
|
||||||
llava-13b:
|
|
||||||
path: ollama:llava:13b
|
|
||||||
type: vision
|
|
||||||
size_gb: 8.0
|
|
||||||
priority: high
|
|
||||||
description: "LLaVA 13B vision model (multimodal CLIP+LLM). P0 fallback until qwen3-vl:8b."
|
|
||||||
vision: true
|
|
||||||
ollama_model: "llava:13b"
|
|
||||||
|
|
||||||
# Vision Model - Qwen3-VL 8B (RECOMMENDED: install with: ollama pull qwen3-vl:8b)
|
|
||||||
# Better quality than llava:13b. Enable once installed.
|
|
||||||
# qwen3-vl-8b:
|
|
||||||
# path: ollama:qwen3-vl:8b
|
|
||||||
# type: vision
|
|
||||||
# size_gb: 5.5
|
|
||||||
# priority: high
|
|
||||||
# description: "Qwen3-VL 8B — modern vision-language model (recommended)"
|
|
||||||
# vision: true
|
|
||||||
# ollama_model: "qwen3-vl:8b"
|
|
||||||
|
|
||||||
storage:
|
storage:
|
||||||
models_dir: /app/models
|
models_dir: /app/models
|
||||||
cache_dir: /app/cache
|
cache_dir: /app/cache
|
||||||
swap_dir: /app/swap
|
swap_dir: /app/swap
|
||||||
|
|
||||||
ollama:
|
|
||||||
url: http://host.docker.internal:11434 # host.docker.internal → native Ollama on MacBook (NODA2 P1 fix)
|
|
||||||
timeout: 300
|
|
||||||
|
|
||||||
# Vision endpoint configuration
|
|
||||||
# /vision/models returns all models where vision: true
|
|
||||||
vision:
|
|
||||||
default_model: llava-13b
|
|
||||||
ollama_base_url: http://host.docker.internal:11434
|
|
||||||
|
|||||||
@@ -1,63 +1,37 @@
|
|||||||
# Swapper Configuration for Node #3 (AI/ML Workstation)
|
# Swapper Configuration for Node #3 (AI/ML Workstation)
|
||||||
# Single-active LLM scheduler
|
# Threadripper PRO + RTX 3090 24GB — GPU-intensive workloads
|
||||||
# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads
|
#
|
||||||
|
# NOTE: Swapper is now a runtime gateway / executor only.
|
||||||
|
# Source of truth for models is NCS (Node Capabilities Service).
|
||||||
|
# No hardcoded model lists.
|
||||||
|
|
||||||
swapper:
|
node_id: noda3
|
||||||
mode: single-active
|
|
||||||
max_concurrent_models: 1
|
runtimes:
|
||||||
|
ollama:
|
||||||
|
url: http://localhost:11434
|
||||||
|
timeout: 300
|
||||||
|
comfyui:
|
||||||
|
url: http://127.0.0.1:8188
|
||||||
|
|
||||||
|
limits:
|
||||||
|
llm_concurrency: 2
|
||||||
|
vision_concurrency: 1
|
||||||
|
max_concurrent_models: 2
|
||||||
model_swap_timeout: 300
|
model_swap_timeout: 300
|
||||||
gpu_enabled: true
|
|
||||||
metal_acceleration: false # NVIDIA GPU, not Apple Silicon
|
|
||||||
# Модель для автоматичного завантаження при старті
|
|
||||||
# qwen3-8b - основна модель (4.87 GB), швидка відповідь на перший запит
|
|
||||||
default_model: qwen3-8b
|
|
||||||
|
|
||||||
models:
|
timeouts:
|
||||||
# Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md
|
llm_ms: 120000
|
||||||
qwen3-8b:
|
vision_ms: 180000
|
||||||
path: ollama:qwen3:8b
|
image_gen_ms: 600000
|
||||||
type: llm
|
|
||||||
size_gb: 4.87
|
|
||||||
priority: high
|
|
||||||
description: "Primary LLM for general tasks and conversations"
|
|
||||||
|
|
||||||
# Vision Model - Qwen3-VL 8B (High Priority) - For image processing
|
gpu:
|
||||||
qwen3-vl-8b:
|
enabled: true
|
||||||
path: ollama:qwen3-vl:8b
|
metal_acceleration: false
|
||||||
type: vision
|
auto_unload_on_oom: true
|
||||||
size_gb: 5.72
|
vram_threshold_gb: 22
|
||||||
priority: high
|
|
||||||
description: "Vision model for image understanding and processing"
|
|
||||||
|
|
||||||
# Qwen2.5 7B Instruct (High Priority)
|
|
||||||
qwen2.5-7b-instruct:
|
|
||||||
path: ollama:qwen2.5:7b-instruct-q4_K_M
|
|
||||||
type: llm
|
|
||||||
size_gb: 4.36
|
|
||||||
priority: high
|
|
||||||
description: "Qwen2.5 7B Instruct model"
|
|
||||||
|
|
||||||
# Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority)
|
|
||||||
qwen2.5-3b-instruct:
|
|
||||||
path: ollama:qwen2.5:3b-instruct-q4_K_M
|
|
||||||
type: llm
|
|
||||||
size_gb: 1.80
|
|
||||||
priority: medium
|
|
||||||
description: "Lightweight LLM for faster responses"
|
|
||||||
|
|
||||||
# Math Specialist - Qwen2 Math 7B (High Priority)
|
|
||||||
qwen2-math-7b:
|
|
||||||
path: ollama:qwen2-math:7b
|
|
||||||
type: math
|
|
||||||
size_gb: 4.13
|
|
||||||
priority: high
|
|
||||||
description: "Specialized model for mathematical tasks"
|
|
||||||
|
|
||||||
storage:
|
storage:
|
||||||
models_dir: /app/models
|
models_dir: /app/models
|
||||||
cache_dir: /app/cache
|
cache_dir: /app/cache
|
||||||
swap_dir: /app/swap
|
swap_dir: /app/swap
|
||||||
|
|
||||||
ollama:
|
|
||||||
url: http://ollama:11434 # From Docker container to Ollama service
|
|
||||||
timeout: 300
|
|
||||||
|
|||||||
Reference in New Issue
Block a user