diff --git a/docker-compose.node1.yml b/docker-compose.node1.yml index 2a3a1b19..1ab1602b 100644 --- a/docker-compose.node1.yml +++ b/docker-compose.node1.yml @@ -48,9 +48,11 @@ services: - ROUTER_TOOL_MAX_ROUNDS=${ROUTER_TOOL_MAX_ROUNDS:-10} - AGROMATRIX_REVIEW_AUTH_MODE=${AGROMATRIX_REVIEW_AUTH_MODE:-bearer} - AGROMATRIX_REVIEW_BEARER_TOKENS=${AGROMATRIX_REVIEW_BEARER_TOKENS} - # ── Node Capabilities (multi-node model selection) ── + # ── Fabric Layer (NCS + Node Worker, Swapper being decommissioned) ── - NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities - ENABLE_GLOBAL_CAPS_NATS=true + - OLLAMA_URL=http://172.18.0.1:11434 + - PREFER_NODE_WORKER=true volumes: - ${DEPLOY_ROOT:-.}/services/router/router_config.yaml:/app/router_config.yaml:ro - ${DEPLOY_ROOT:-.}/services/router/router-config.yml:/app/router-config.yml:ro @@ -498,8 +500,8 @@ services: container_name: node-capabilities-node1 environment: - NODE_ID=noda1 - - OLLAMA_BASE_URL=http://host.docker.internal:11434 - - SWAPPER_URL=http://swapper-service:8890 + - OLLAMA_BASE_URL=http://172.18.0.1:11434 + - SWAPPER_URL= - CACHE_TTL_SEC=15 - ENABLE_NATS_CAPS=true - NATS_URL=nats://nats:4222 @@ -527,15 +529,13 @@ services: environment: - NODE_ID=noda1 - NATS_URL=nats://nats:4222 - - OLLAMA_BASE_URL=http://host.docker.internal:11434 - - SWAPPER_URL=http://swapper-service:8890 + - OLLAMA_BASE_URL=http://172.18.0.1:11434 - NODE_DEFAULT_LLM=qwen3.5:27b - - NODE_DEFAULT_VISION=qwen3-vl-8b + - NODE_DEFAULT_VISION=qwen3-vl:8b - NODE_WORKER_MAX_CONCURRENCY=2 - NCS_REPORT_URL=http://node-capabilities:8099 depends_on: - nats - - swapper-service networks: - dagi-network restart: unless-stopped diff --git a/docker-compose.node2-sofiia.yml b/docker-compose.node2-sofiia.yml index 564cf77d..a0a8a84f 100644 --- a/docker-compose.node2-sofiia.yml +++ b/docker-compose.node2-sofiia.yml @@ -25,9 +25,11 @@ services: - XAI_API_KEY=${XAI_API_KEY} - GROK_API_KEY=${XAI_API_KEY} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} - # ── Node Capabilities (multi-node model selection) ──────────────────── + # ── Fabric Layer (NCS + Node Worker, no Swapper dependency) ────────── - NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities - ENABLE_GLOBAL_CAPS_NATS=true + - OLLAMA_URL=http://host.docker.internal:11434 + - PREFER_NODE_WORKER=true # ── Persistence backends ────────────────────────────────────────────── - ALERT_BACKEND=postgres - ALERT_DATABASE_URL=${ALERT_DATABASE_URL:-${DATABASE_URL}} @@ -121,7 +123,7 @@ services: environment: - NODE_ID=NODA2 - OLLAMA_BASE_URL=http://host.docker.internal:11434 - - SWAPPER_URL=http://swapper-service:8890 + - SWAPPER_URL= - LLAMA_SERVER_URL=http://host.docker.internal:11435 - CACHE_TTL_SEC=15 - ENABLE_NATS_CAPS=true @@ -147,14 +149,12 @@ services: - NODE_ID=noda2 - NATS_URL=nats://dagi-nats:4222 - OLLAMA_BASE_URL=http://host.docker.internal:11434 - - SWAPPER_URL=http://swapper-service:8890 - NODE_DEFAULT_LLM=qwen3:14b - NODE_DEFAULT_VISION=llava:13b - NODE_WORKER_MAX_CONCURRENCY=2 - NCS_REPORT_URL=http://node-capabilities:8099 depends_on: - dagi-nats - - swapper-service networks: - dagi-network restart: unless-stopped diff --git a/services/node-capabilities/main.py b/services/node-capabilities/main.py index be0fb7bb..ee3094dd 100644 --- a/services/node-capabilities/main.py +++ b/services/node-capabilities/main.py @@ -20,7 +20,7 @@ app = FastAPI(title="Node Capabilities Service", version="1.0.0") NODE_ID = os.getenv("NODE_ID", "noda2") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") -SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890") +SWAPPER_URL = os.getenv("SWAPPER_URL", "") # empty = skip Swapper probing LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "") _cache: Dict[str, Any] = {} @@ -74,7 +74,10 @@ async def _collect_ollama() -> Dict[str, Any]: async def _collect_swapper() -> Dict[str, Any]: - runtime: Dict[str, Any] = {"base_url": SWAPPER_URL, "status": "unknown", "models": [], "vision_models": [], "active_model": None} + runtime: Dict[str, Any] = {"base_url": SWAPPER_URL or "n/a", "status": "unknown", "models": [], "vision_models": [], "active_model": None} + if not SWAPPER_URL: + runtime["status"] = "disabled" + return runtime try: async with httpx.AsyncClient(timeout=5) as c: h = await c.get(f"{SWAPPER_URL}/health") diff --git a/services/node-worker/config.py b/services/node-worker/config.py index 959a774b..cba9137f 100644 --- a/services/node-worker/config.py +++ b/services/node-worker/config.py @@ -4,7 +4,6 @@ import os NODE_ID = os.getenv("NODE_ID", "noda2") NATS_URL = os.getenv("NATS_URL", "nats://dagi-nats:4222") OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434") -SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890") DEFAULT_LLM = os.getenv("NODE_DEFAULT_LLM", "qwen3:14b") DEFAULT_VISION = os.getenv("NODE_DEFAULT_VISION", "llava:13b") MAX_CONCURRENCY = int(os.getenv("NODE_WORKER_MAX_CONCURRENCY", "2")) diff --git a/services/node-worker/providers/ollama_vision.py b/services/node-worker/providers/ollama_vision.py new file mode 100644 index 00000000..d09f170d --- /dev/null +++ b/services/node-worker/providers/ollama_vision.py @@ -0,0 +1,49 @@ +"""Ollama vision provider — direct Ollama API with images, no Swapper dependency.""" +import logging +from typing import Any, Dict, List, Optional + +import httpx + +from config import OLLAMA_BASE_URL, DEFAULT_VISION + +logger = logging.getLogger("provider.ollama_vision") + + +async def infer( + images: Optional[List[str]] = None, + prompt: str = "", + model: str = "", + system: str = "", + max_tokens: int = 1024, + temperature: float = 0.2, + timeout_s: float = 60.0, +) -> Dict[str, Any]: + model = model or DEFAULT_VISION + + payload: Dict[str, Any] = { + "model": model, + "prompt": prompt or "Describe this image.", + "stream": False, + "options": {"num_predict": max_tokens, "temperature": temperature}, + } + if images: + clean = [] + for img in images: + if "," in img and img.startswith("data:"): + clean.append(img.split(",", 1)[1]) + else: + clean.append(img) + payload["images"] = clean + if system: + payload["system"] = system + + async with httpx.AsyncClient(timeout=timeout_s) as c: + resp = await c.post(f"{OLLAMA_BASE_URL}/api/generate", json=payload) + resp.raise_for_status() + data = resp.json() + return { + "text": data.get("response", ""), + "model": model, + "provider": "ollama_vision", + "eval_count": data.get("eval_count", 0), + } diff --git a/services/node-worker/worker.py b/services/node-worker/worker.py index 82118cc6..13697fb3 100644 --- a/services/node-worker/worker.py +++ b/services/node-worker/worker.py @@ -9,7 +9,7 @@ from typing import Any, Dict import config from models import JobRequest, JobResponse, JobError from idempotency import IdempotencyStore -from providers import ollama, swapper_vision +from providers import ollama, ollama_vision import fabric_metrics as fm logger = logging.getLogger("node-worker") @@ -27,9 +27,13 @@ async def start(nats_client): global _nats_client _nats_client = nats_client + nid = config.NODE_ID.lower() subjects = [ - f"node.{config.NODE_ID.lower()}.llm.request", - f"node.{config.NODE_ID.lower()}.vision.request", + f"node.{nid}.llm.request", + f"node.{nid}.vision.request", + f"node.{nid}.stt.request", + f"node.{nid}.tts.request", + f"node.{nid}.image.request", ] for subj in subjects: await nats_client.subscribe(subj, cb=_handle_request) @@ -160,7 +164,7 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse: ) elif job.required_type == "vision": result = await asyncio.wait_for( - swapper_vision.infer( + ollama_vision.infer( images=payload.get("images"), prompt=payload.get("prompt", ""), model=model, @@ -171,11 +175,20 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse: ), timeout=timeout_s, ) + elif job.required_type in ("stt", "tts", "image"): + return JobResponse( + job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID, + status="error", + error=JobError( + code="NOT_YET_IMPLEMENTED", + message=f"{job.required_type} adapter coming soon; use direct runtime API for now", + ), + ) else: return JobResponse( job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID, status="error", - error=JobError(code="UNSUPPORTED_TYPE", message=f"{job.required_type} not implemented"), + error=JobError(code="UNSUPPORTED_TYPE", message=f"{job.required_type} not supported"), ) logger.info( diff --git a/services/router/main.py b/services/router/main.py index e8f3d2cb..c131c8bd 100644 --- a/services/router/main.py +++ b/services/router/main.py @@ -877,12 +877,13 @@ app = FastAPI(title="DAARION Router", version="2.0.0") # Configuration NATS_URL = os.getenv("NATS_URL", "nats://nats:4222") SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890") -# All multimodal services now through Swapper -STT_URL = os.getenv("STT_URL", "http://swapper-service:8890") # Swapper /stt endpoint -TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890") # Swapper /tts endpoint -VISION_URL = os.getenv("VISION_URL", "http://172.18.0.1:11434") # Host Ollama -OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890") # Swapper /ocr endpoint -DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890") # Swapper /document endpoint +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") +PREFER_NODE_WORKER = os.getenv("PREFER_NODE_WORKER", "true").lower() in ("true", "1") +STT_URL = os.getenv("STT_URL", "http://swapper-service:8890") +TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890") +VISION_URL = os.getenv("VISION_URL", "http://host.docker.internal:11434") +OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890") +DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890") CITY_SERVICE_URL = os.getenv("CITY_SERVICE_URL", "http://daarion-city-service:7001") # CrewAI Routing Configuration @@ -1083,8 +1084,8 @@ async def startup_event(): runtime_guard_engine = None # Log backend URLs - logger.info(f"📡 Swapper URL: {SWAPPER_URL}") - logger.info(f"📡 STT URL: {STT_URL}") + logger.info(f"📡 Ollama URL: {OLLAMA_URL} (prefer_node_worker={PREFER_NODE_WORKER})") + logger.info(f"📡 Swapper URL: {SWAPPER_URL} (legacy, being decommissioned)") logger.info(f"📡 Vision URL: {VISION_URL}") logger.info(f"📡 OCR URL: {OCR_URL}") logger.info(f"📡 Neo4j URL: {NEO4J_URI}") @@ -2388,33 +2389,39 @@ async def agent_infer(agent_id: str, request: InferRequest): logger.warning(f"⚠️ Deterministic AgroMatrix plant flow failed, fallback to generic vision: {e}") try: - # Use Swapper's /vision endpoint (manages model loading) - vision_payload = { - "model": "qwen3-vl-8b", - "prompt": request.prompt, - "images": request.images, # Swapper handles data URL conversion - "max_tokens": request.max_tokens or 1024, - "temperature": request.temperature or 0.7 - } - - # Add system prompt if available - if system_prompt: - if memory_brief_text: - vision_payload["system"] = f"{system_prompt}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}" + vision_model = "qwen3-vl-8b" + vision_system = system_prompt or "" + if vision_system and memory_brief_text: + vision_system = f"{vision_system}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}" + + clean_images = [] + for img in (request.images or []): + if "," in img and img.startswith("data:"): + clean_images.append(img.split(",", 1)[1]) else: - vision_payload["system"] = system_prompt - - logger.info(f"🖼️ Sending to Swapper /vision: {SWAPPER_URL}/vision") - + clean_images.append(img) + + logger.info(f"🖼️ Vision inference: model={vision_model} images={len(clean_images)} prefer_nw={PREFER_NODE_WORKER}") + vision_resp = await http_client.post( - f"{SWAPPER_URL}/vision", - json=vision_payload, - timeout=120.0 + f"{OLLAMA_URL}/api/generate", + json={ + "model": vision_model.replace("-", "-vl:").replace("qwen3-vl:", "qwen3-vl:") if ":" not in vision_model else vision_model, + "prompt": request.prompt, + "images": clean_images, + "system": vision_system, + "stream": False, + "options": { + "num_predict": request.max_tokens or 1024, + "temperature": request.temperature or 0.7, + }, + }, + timeout=120.0, ) if vision_resp.status_code == 200: vision_data = vision_resp.json() - raw_response = vision_data.get("text", "") + raw_response = vision_data.get("response", vision_data.get("text", "")) full_response = _sanitize_vision_text_for_user(raw_response) vision_web_query = "" vision_sources: List[Dict[str, str]] = [] @@ -2519,14 +2526,23 @@ async def agent_infer(agent_id: str, request: InferRequest): "Відповідай українською 2-4 реченнями, без службових фраз. " f"Запит користувача: {request.prompt}" ) + retry_ollama_payload = { + "model": vision_model.replace("-", "-vl:").replace("qwen3-vl:", "qwen3-vl:") if ":" not in vision_model else vision_model, + "prompt": retry_payload["prompt"], + "images": clean_images, + "stream": False, + "options": {"num_predict": request.max_tokens or 1024, "temperature": 0.7}, + } + if retry_payload.get("system"): + retry_ollama_payload["system"] = retry_payload["system"] retry_resp = await http_client.post( - f"{SWAPPER_URL}/vision", - json=retry_payload, - timeout=120.0 + f"{OLLAMA_URL}/api/generate", + json=retry_ollama_payload, + timeout=120.0, ) if retry_resp.status_code == 200: retry_data = retry_resp.json() - retry_raw = retry_data.get("text", "") + retry_raw = retry_data.get("response", retry_data.get("text", "")) retry_text = _sanitize_vision_text_for_user(retry_raw) if retry_raw and not retry_text: retry_text = _extract_vision_search_facts(retry_raw, max_chars=280) @@ -2541,7 +2557,7 @@ async def agent_infer(agent_id: str, request: InferRequest): elif request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and _vision_response_is_blurry(full_response): full_response = _build_image_fallback_response(request_agent_id, request.prompt) - full_response = await _finalize_response_text(full_response, "swapper-vision") + full_response = await _finalize_response_text(full_response, "ollama-vision") # Store vision message in agent-specific memory if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and full_response: @@ -2567,10 +2583,10 @@ async def agent_infer(agent_id: str, request: InferRequest): response=full_response, model="qwen3-vl-8b", tokens_used=None, - backend="swapper-vision" + backend="ollama-vision" ) else: - logger.error(f"❌ Swapper vision error: {vision_resp.status_code} - {vision_resp.text[:200]}") + logger.error(f"❌ Ollama vision error: {vision_resp.status_code} - {vision_resp.text[:200]}") fallback_response = await _finalize_response_text( _build_image_fallback_response(request_agent_id, request.prompt), "swapper-vision-fallback", @@ -2579,7 +2595,7 @@ async def agent_infer(agent_id: str, request: InferRequest): response=fallback_response, model="qwen3-vl-8b", tokens_used=None, - backend="swapper-vision-fallback" + backend="vision-fallback" ) except Exception as e: @@ -2592,7 +2608,7 @@ async def agent_infer(agent_id: str, request: InferRequest): response=fallback_response, model="qwen3-vl-8b", tokens_used=None, - backend="swapper-vision-fallback" + backend="vision-fallback" ) # ========================================================================= @@ -3142,90 +3158,88 @@ async def agent_infer(agent_id: str, request: InferRequest): logger.warning(f"⚠️ No local model in config, using hardcoded fallback: {local_model}") try: - # Check if Swapper is available - health_resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0) - if health_resp.status_code == 200: - logger.info(f"📡 Calling Swapper with local model: {local_model}") - # Generate response via Swapper (which handles model loading) - generate_resp = await http_client.post( - f"{SWAPPER_URL}/generate", - json={ - "model": local_model, - "prompt": request.prompt, - "system": system_prompt, - "max_tokens": request.max_tokens, - "temperature": request.temperature, - "stream": False + ollama_model = local_model.replace("-", ":") if ":" not in local_model else local_model + logger.info(f"📡 Calling Ollama direct: model={ollama_model}") + + generate_resp = await http_client.post( + f"{OLLAMA_URL}/api/generate", + json={ + "model": ollama_model, + "prompt": request.prompt, + "system": system_prompt, + "stream": False, + "options": { + "num_predict": request.max_tokens or 2048, + "temperature": request.temperature or 0.7, }, - timeout=300.0 - ) - - if generate_resp.status_code == 200: - data = generate_resp.json() - local_response = _normalize_text_response(data.get("response", "")) + }, + timeout=300.0, + ) + if generate_resp.status_code == 200: + data = generate_resp.json() + local_response = _normalize_text_response(data.get("response", "")) - # Empty-answer gate for selected local top-level agents. - if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response): - logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once") - retry_prompt = ( - f"{request.prompt}\n\n" - "Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз." - ) - retry_resp = await http_client.post( - f"{SWAPPER_URL}/generate", - json={ - "model": local_model, - "prompt": retry_prompt, - "system": system_prompt, - "max_tokens": request.max_tokens, - "temperature": request.temperature, - "stream": False - }, - timeout=300.0 - ) - if retry_resp.status_code == 200: - retry_data = retry_resp.json() - retry_text = _normalize_text_response(retry_data.get("response", "")) - if retry_text and not _needs_empty_answer_recovery(retry_text): - local_response = retry_text - - if _needs_empty_answer_recovery(local_response): - local_response = ( - "Я не отримав корисну відповідь з першої спроби. " - "Сформулюй запит коротко ще раз, і я відповім конкретно." - ) - local_response = await _finalize_response_text(local_response, "swapper+ollama") - - # Store in agent-specific memory - if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response: - asyncio.create_task( - memory_retrieval.store_message( - agent_id=request_agent_id, - user_id=user_id, - username=username, - message_text=request.prompt, - response_text=local_response, - chat_id=chat_id - ) - ) - - return InferResponse( - response=local_response, - model=local_model, - tokens_used=data.get("eval_count", 0), - backend="swapper+ollama" + if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response): + logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once") + retry_prompt = ( + f"{request.prompt}\n\n" + "Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз." ) - else: - logger.error(f"❌ Swapper error: {generate_resp.status_code} - {generate_resp.text}") + retry_resp = await http_client.post( + f"{OLLAMA_URL}/api/generate", + json={ + "model": ollama_model, + "prompt": retry_prompt, + "system": system_prompt, + "stream": False, + "options": { + "num_predict": request.max_tokens or 2048, + "temperature": request.temperature or 0.7, + }, + }, + timeout=300.0, + ) + if retry_resp.status_code == 200: + retry_data = retry_resp.json() + retry_text = _normalize_text_response(retry_data.get("response", "")) + if retry_text and not _needs_empty_answer_recovery(retry_text): + local_response = retry_text + + if _needs_empty_answer_recovery(local_response): + local_response = ( + "Я не отримав корисну відповідь з першої спроби. " + "Сформулюй запит коротко ще раз, і я відповім конкретно." + ) + local_response = await _finalize_response_text(local_response, "ollama-direct") + + if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response: + asyncio.create_task( + memory_retrieval.store_message( + agent_id=request_agent_id, + user_id=user_id, + username=username, + message_text=request.prompt, + response_text=local_response, + chat_id=chat_id + ) + ) + + return InferResponse( + response=local_response, + model=local_model, + tokens_used=data.get("eval_count", 0), + backend="ollama-direct" + ) + else: + logger.error(f"❌ Ollama generate error: {generate_resp.status_code} - {generate_resp.text[:200]}") except Exception as e: - logger.error(f"❌ Swapper/Ollama error: {e}") - # Fallback to direct Ollama if Swapper fails + logger.error(f"❌ Ollama direct error: {e}") try: - logger.info(f"🔄 Falling back to direct Ollama connection") + logger.info(f"🔄 Falling back to Ollama with hardcoded model") generate_resp = await http_client.post( - f"{VISION_URL}/api/generate", + f"{OLLAMA_URL}/api/generate", json={ - "model": "qwen3:8b", # Use actual Ollama model name + "model": "qwen3:8b", "prompt": request.prompt, "system": system_prompt, "stream": False, @@ -3526,42 +3540,38 @@ async def documents_versions(doc_id: str, agent_id: str, limit: int = 20): @app.get("/v1/models") async def list_available_models(): - """List all available models across backends""" + """List all available models from NCS (global capabilities pool).""" models = [] - - # Get Swapper models + try: - resp = await http_client.get(f"{SWAPPER_URL}/models", timeout=5.0) - if resp.status_code == 200: - data = resp.json() - for m in data.get("models", []): - models.append({ - "id": m.get("name"), - "backend": "swapper", - "size_gb": m.get("size_gb"), - "status": m.get("status", "available") - }) + from global_capabilities_client import get_global_capabilities + pool = await get_global_capabilities() + for m in pool.get("served_models", []): + models.append({ + "id": m.get("name"), + "backend": m.get("runtime", "unknown"), + "node": m.get("node", "?"), + "type": m.get("type", "llm"), + "size_gb": m.get("size_gb"), + "status": "served", + }) except Exception as e: - logger.warning(f"Cannot get Swapper models: {e}") - - # Get Ollama models - try: - resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0) - if resp.status_code == 200: - data = resp.json() - for m in data.get("models", []): - # Avoid duplicates - model_name = m.get("name") - if not any(x.get("id") == model_name for x in models): + logger.warning(f"Cannot get NCS global models: {e}") + + if not models: + try: + resp = await http_client.get(f"{OLLAMA_URL}/api/tags", timeout=5.0) + if resp.status_code == 200: + for m in resp.json().get("models", []): models.append({ - "id": model_name, + "id": m.get("name"), "backend": "ollama", "size_gb": round(m.get("size", 0) / 1e9, 1), - "status": "loaded" + "status": "loaded", }) - except Exception as e: - logger.warning(f"Cannot get Ollama models: {e}") - + except Exception as e: + logger.warning(f"Cannot get Ollama models: {e}") + return {"models": models, "total": len(models)} diff --git a/services/swapper-service/config/swapper_config.yaml b/services/swapper-service/config/swapper_config.yaml index 6721a408..db41f28e 100644 --- a/services/swapper-service/config/swapper_config.yaml +++ b/services/swapper-service/config/swapper_config.yaml @@ -1,90 +1,35 @@ -# Swapper Configuration for Node #2 (Development Node) -# Single-active LLM scheduler -# MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM) -# Auto-generated configuration with available Ollama models +# Swapper Configuration — Default / Fallback +# +# NOTE: Swapper is now a runtime gateway / executor only. +# Source of truth for models is NCS (Node Capabilities Service). +# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup. +# +# Per-node overrides: swapper_config_node1.yaml, swapper_config_node2.yaml -swapper: - mode: single-active - max_concurrent_models: 1 +node_id: default + +runtimes: + ollama: + url: http://localhost:11434 + timeout: 300 + +limits: + llm_concurrency: 2 + vision_concurrency: 1 + max_concurrent_models: 2 model_swap_timeout: 300 - gpu_enabled: true - metal_acceleration: true # Apple Silicon GPU acceleration - # Модель для автоматичного завантаження при старті (опціонально) - # Якщо не вказано - моделі завантажуються тільки за запитом - # Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель) - default_model: gpt-oss:latest # Модель активується автоматично при старті -models: - # Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks - gpt-oss-latest: - path: ollama:gpt-oss:latest - type: llm - size_gb: 13.0 - priority: high - description: "Fast LLM for general tasks and conversations (20.9B params)" - - # Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses - phi3-latest: - path: ollama:phi3:latest - type: llm - size_gb: 2.2 - priority: high - description: "Lightweight LLM for fast responses (3.8B params)" - - # Code Specialist - StarCoder2 3B (Medium Priority) - Code engineering - starcoder2-3b: - path: ollama:starcoder2:3b - type: code - size_gb: 1.7 - priority: medium - description: "Code specialist model for code engineering (3B params)" - - # Reasoning Model - Mistral Nemo 12.2B (High Priority) - Advanced reasoning - mistral-nemo-12b: - path: ollama:mistral-nemo:12b - type: llm - size_gb: 7.1 - priority: high - description: "Advanced reasoning model for complex tasks (12.2B params)" - - # Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning - gemma2-27b: - path: ollama:gemma2:27b - type: llm - size_gb: 15.0 - priority: medium - description: "Reasoning model for strategic tasks (27.2B params)" - - # Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks - deepseek-coder-33b: - path: ollama:deepseek-coder:33b - type: code - size_gb: 18.0 - priority: high - description: "Advanced code specialist model (33B params)" - - # Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks - qwen2.5-coder-32b: - path: ollama:qwen2.5-coder:32b - type: code - size_gb: 19.0 - priority: high - description: "Advanced code specialist model (32.8B params)" - - # Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model) - deepseek-r1-70b: - path: ollama:deepseek-r1:70b - type: llm - size_gb: 42.0 - priority: high - description: "Strategic reasoning model (70.6B params, quantized)" +timeouts: + llm_ms: 120000 + vision_ms: 180000 + stt_ms: 60000 + tts_ms: 60000 + +gpu: + enabled: false + metal_acceleration: false storage: models_dir: /app/models cache_dir: /app/cache swap_dir: /app/swap - -ollama: - url: http://localhost:11434 # Native Ollama on MacBook (via Pieces OS or brew) - timeout: 300 - diff --git a/services/swapper-service/config/swapper_config_node1.yaml b/services/swapper-service/config/swapper_config_node1.yaml index aa22ac8f..4a69f22f 100644 --- a/services/swapper-service/config/swapper_config_node1.yaml +++ b/services/swapper-service/config/swapper_config_node1.yaml @@ -1,186 +1,37 @@ # Swapper Configuration for Node #1 (Production Server) -# Optimized Multimodal Stack: LLM + Vision + OCR + Document + Audio # Hetzner GEX44 - NVIDIA RTX 4000 SFF Ada (20GB VRAM) # -# ВАЖЛИВО: Ембедінги через зовнішні API: -# - Text: Cohere API (embed-multilingual-v3.0, 1024 dim) -# - Image: Vision Encoder (OpenCLIP ViT-L/14, 768 dim) -# НЕ використовуємо локальні embedding моделі! +# NOTE: Swapper is now a runtime gateway / executor only. +# Source of truth for models is NCS (Node Capabilities Service). +# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup. -swapper: - mode: multi-active - max_concurrent_models: 4 # LLM + OCR + STT + TTS (до 15GB) +node_id: noda1 + +runtimes: + ollama: + url: http://172.18.0.1:11434 + timeout: 300 + # comfyui: + # url: http://127.0.0.1:8188 + +limits: + llm_concurrency: 2 + vision_concurrency: 1 + max_concurrent_models: 4 model_swap_timeout: 300 - gpu_enabled: true + +timeouts: + llm_ms: 120000 + vision_ms: 180000 + stt_ms: 60000 + tts_ms: 60000 + image_gen_ms: 300000 + +gpu: + enabled: true metal_acceleration: false - default_model: qwen3-8b - lazy_load_ocr: true - lazy_load_audio: true - # Автоматичне вивантаження при нестачі VRAM auto_unload_on_oom: true - vram_threshold_gb: 18 # Починати вивантажувати при 18GB - -models: - # ============================================ - # LLM MODELS (Ollama) - тільки qwen3 - # ============================================ - - # Primary LLM - Qwen3 8B (includes math, coding, reasoning) - qwen3-8b: - path: ollama:qwen3:8b - type: llm - size_gb: 5.2 - priority: high - description: "Qwen3 8B - primary LLM with math, coding, reasoning capabilities" - capabilities: - - chat - - math - - coding - - reasoning - - multilingual - - # ============================================ - # VISION MODELS (Ollama) - # ============================================ - - # Vision Model - Qwen3-VL 8B - qwen3-vl-8b: - path: ollama:qwen3-vl:8b - type: vision - size_gb: 6.1 - priority: high - description: "Qwen3-VL 8B for image understanding and visual reasoning" - capabilities: - - image_understanding - - visual_qa - - diagram_analysis - - ocr_basic - - # ============================================ - # OCR/DOCUMENT MODELS (HuggingFace) - # ============================================ - - # GOT-OCR2.0 - Best for documents, tables, formulas - got-ocr2: - path: huggingface:stepfun-ai/GOT-OCR2_0 - type: ocr - size_gb: 7.0 - priority: high - description: "Best OCR for documents, tables, formulas, handwriting" - capabilities: - - documents - - tables - - formulas - - handwriting - - multilingual - - # Donut - Document Understanding (no external OCR, 91% CORD) - donut-base: - path: huggingface:naver-clova-ix/donut-base - type: ocr - size_gb: 3.0 - priority: high - description: "Document parsing without OCR engine (91% CORD accuracy)" - capabilities: - - document_parsing - - receipts - - forms - - invoices - - # Donut fine-tuned for receipts/invoices (CORD dataset) - donut-cord: - path: huggingface:naver-clova-ix/donut-base-finetuned-cord-v2 - type: ocr - size_gb: 3.0 - priority: medium - description: "Donut fine-tuned for receipts extraction" - capabilities: - - receipts - - invoices - - structured_extraction - - # IBM Granite Docling - Document conversion with structure preservation - granite-docling: - path: huggingface:ds4sd/docling-ibm-granite-vision-1b - type: document - size_gb: 2.5 - priority: high - description: "IBM Granite Docling for PDF/document structure extraction" - capabilities: - - pdf_conversion - - table_extraction - - formula_extraction - - layout_preservation - - doctags_format - - # ============================================ - # AUDIO MODELS - STT (Speech-to-Text) - # ============================================ - - # Faster Whisper Large-v3 - Best STT quality - faster-whisper-large: - path: huggingface:Systran/faster-whisper-large-v3 - type: stt - size_gb: 3.0 - priority: high - description: "Faster Whisper Large-v3 - best quality, 99 languages" - capabilities: - - speech_recognition - - transcription - - multilingual - - timestamps - - ukrainian - - # Whisper Small - Fast/lightweight for quick transcription - whisper-small: - path: huggingface:openai/whisper-small - type: stt - size_gb: 0.5 - priority: medium - description: "Whisper Small for fast transcription" - capabilities: - - speech_recognition - - transcription - - # ============================================ - # AUDIO MODELS - TTS (Text-to-Speech) - # ============================================ - - # Coqui XTTS-v2 - Best multilingual TTS with Ukrainian support - xtts-v2: - path: huggingface:coqui/XTTS-v2 - type: tts - size_gb: 2.0 - priority: high - description: "XTTS-v2 multilingual TTS with voice cloning, Ukrainian support" - capabilities: - - text_to_speech - - voice_cloning - - multilingual - - ukrainian - - 17_languages - - # ============================================ - # IMAGE GENERATION MODELS (HuggingFace/Diffusers) - # ============================================ - - # FLUX.2 Klein 4B - High quality image generation with lazy loading - flux-klein-4b: - path: huggingface:black-forest-labs/FLUX.2-klein-base-4B - type: image_generation - size_gb: 15.4 - priority: medium - description: "FLUX.2 Klein 4B - high quality image generation, lazy loaded on demand" - capabilities: - - text_to_image - - high_quality - - 1024x1024 - - artistic - default_params: - num_inference_steps: 50 - guidance_scale: 4.0 - width: 1024 - height: 1024 + vram_threshold_gb: 18 storage: models_dir: /app/models @@ -188,33 +39,8 @@ storage: swap_dir: /app/swap huggingface_cache: /root/.cache/huggingface -ollama: - url: http://172.18.0.1:11434 - timeout: 300 - huggingface: device: cuda torch_dtype: float16 trust_remote_code: true low_cpu_mem_usage: true - -# ============================================ -# EMBEDDING SERVICES (External APIs) -# НЕ через Swapper - окремі сервіси! -# ============================================ -# -# Text Embeddings: -# Service: Memory Service → Cohere API -# Model: embed-multilingual-v3.0 -# Dimension: 1024 -# Endpoint: Memory Service handles internally -# -# Image/Multimodal Embeddings: -# Service: Vision Encoder (port 8001) -# Model: OpenCLIP ViT-L/14 -# Dimension: 768 -# Endpoint: http://vision-encoder:8001/embed -# -# Vector Storage: -# Qdrant (port 6333) - separate collections for text vs image embeddings -# ВАЖЛИВО: НЕ змішувати embedding spaces в одній колекції! diff --git a/services/swapper-service/config/swapper_config_node2.yaml b/services/swapper-service/config/swapper_config_node2.yaml index 5cf6f055..bf596111 100644 --- a/services/swapper-service/config/swapper_config_node2.yaml +++ b/services/swapper-service/config/swapper_config_node2.yaml @@ -1,126 +1,40 @@ # Swapper Configuration for Node #2 (Development Node) -# Single-active LLM scheduler # MacBook Pro M4 Max - Apple Silicon (40-core GPU, 64GB RAM) -# Auto-generated configuration with available Ollama models +# +# NOTE: Swapper is now a runtime gateway / executor only. +# Source of truth for models is NCS (Node Capabilities Service). +# No hardcoded model lists — Swapper queries NCS or Ollama /api/tags at startup. -swapper: - mode: single-active +node_id: noda2 + +runtimes: + ollama: + url: http://host.docker.internal:11434 + timeout: 300 + # mlx: + # stt_model: whisper-large-v3-turbo + # tts_model: kokoro-82m + # comfyui: + # url: http://127.0.0.1:8188 + +limits: + llm_concurrency: 1 + vision_concurrency: 1 max_concurrent_models: 1 model_swap_timeout: 300 - gpu_enabled: true - metal_acceleration: true # Apple Silicon GPU acceleration - # Модель для автоматичного завантаження при старті (опціонально) - # Якщо не вказано - моделі завантажуються тільки за запитом - # Рекомендовано: gpt-oss:latest (швидка модель) або phi3:latest (легка модель) - # Стартова модель має бути реально встановлена в Ollama на NODA2 - default_model: qwen3:14b # Модель активується автоматично при старті -models: - # Fast LLM - GPT-OSS 20B (High Priority) - Main model for general tasks - gpt-oss-latest: - path: ollama:gpt-oss:latest - type: llm - size_gb: 13.0 - priority: high - description: "Fast LLM for general tasks and conversations (20.9B params)" - - # Lightweight LLM - Phi3 3.8B (High Priority) - Fast responses - phi3-latest: - path: ollama:phi3:latest - type: llm - size_gb: 2.2 - priority: high - description: "Lightweight LLM for fast responses (3.8B params)" - - # General Reasoning - Qwen3 14B (High Priority) - qwen3-14b: - path: ollama:qwen3:14b - type: llm - size_gb: 9.3 - priority: high - description: "Balanced local model for Sofiia and router fallback" +timeouts: + llm_ms: 120000 + vision_ms: 180000 + stt_ms: 60000 + tts_ms: 60000 + image_gen_ms: 300000 - # Reasoning Model - Qwen3.5 35B A3B (High Priority) - qwen3.5-35b-a3b: - path: ollama:qwen3.5:35b-a3b - type: llm - size_gb: 22.0 - priority: high - description: "Large reasoning model for complex Sofiia requests" - - # Reasoning Model - GLM 4.7 Flash (High Priority) - Fast general model - glm-4.7-flash: - path: ollama:glm-4.7-flash:32k - type: llm - size_gb: 19.0 - priority: high - description: "Multi-purpose reasoning model (fast context)" - - # Reasoning Model - Gemma2 27B (Medium Priority) - Strategic reasoning - gemma2-27b: - path: ollama:gemma2:27b - type: llm - size_gb: 15.0 - priority: medium - description: "Reasoning model for strategic tasks (27.2B params)" - - # Code Specialist - DeepSeek Coder 33B (High Priority) - Advanced code tasks - deepseek-coder-33b: - path: ollama:deepseek-coder:33b - type: code - size_gb: 18.0 - priority: high - description: "Advanced code specialist model (33B params)" - - # Code Specialist - Qwen2.5 Coder 32B (High Priority) - Advanced code tasks - qwen2.5-coder-32b: - path: ollama:qwen2.5-coder:32b - type: code - size_gb: 19.0 - priority: high - description: "Advanced code specialist model (32.8B params)" - - # Reasoning Model - DeepSeek R1 70B (High Priority) - Strategic reasoning (large model) - deepseek-r1-70b: - path: ollama:deepseek-r1:70b - type: llm - size_gb: 42.0 - priority: high - description: "Strategic reasoning model (70.6B params, quantized)" - - # Vision Model - LLaVA 13B (P0 Fix: NODA2 fallback vision) - # Available in Ollama on NODA2 — used until qwen3-vl:8b is installed - llava-13b: - path: ollama:llava:13b - type: vision - size_gb: 8.0 - priority: high - description: "LLaVA 13B vision model (multimodal CLIP+LLM). P0 fallback until qwen3-vl:8b." - vision: true - ollama_model: "llava:13b" - - # Vision Model - Qwen3-VL 8B (RECOMMENDED: install with: ollama pull qwen3-vl:8b) - # Better quality than llava:13b. Enable once installed. - # qwen3-vl-8b: - # path: ollama:qwen3-vl:8b - # type: vision - # size_gb: 5.5 - # priority: high - # description: "Qwen3-VL 8B — modern vision-language model (recommended)" - # vision: true - # ollama_model: "qwen3-vl:8b" +gpu: + enabled: true + metal_acceleration: true storage: models_dir: /app/models cache_dir: /app/cache swap_dir: /app/swap - -ollama: - url: http://host.docker.internal:11434 # host.docker.internal → native Ollama on MacBook (NODA2 P1 fix) - timeout: 300 - -# Vision endpoint configuration -# /vision/models returns all models where vision: true -vision: - default_model: llava-13b - ollama_base_url: http://host.docker.internal:11434 diff --git a/services/swapper-service/config/swapper_config_node3.yaml b/services/swapper-service/config/swapper_config_node3.yaml index 6169c62e..84608bd4 100644 --- a/services/swapper-service/config/swapper_config_node3.yaml +++ b/services/swapper-service/config/swapper_config_node3.yaml @@ -1,63 +1,37 @@ # Swapper Configuration for Node #3 (AI/ML Workstation) -# Single-active LLM scheduler -# Threadripper PRO + RTX 3090 24GB - GPU-intensive workloads +# Threadripper PRO + RTX 3090 24GB — GPU-intensive workloads +# +# NOTE: Swapper is now a runtime gateway / executor only. +# Source of truth for models is NCS (Node Capabilities Service). +# No hardcoded model lists. -swapper: - mode: single-active - max_concurrent_models: 1 +node_id: noda3 + +runtimes: + ollama: + url: http://localhost:11434 + timeout: 300 + comfyui: + url: http://127.0.0.1:8188 + +limits: + llm_concurrency: 2 + vision_concurrency: 1 + max_concurrent_models: 2 model_swap_timeout: 300 - gpu_enabled: true - metal_acceleration: false # NVIDIA GPU, not Apple Silicon - # Модель для автоматичного завантаження при старті - # qwen3-8b - основна модель (4.87 GB), швидка відповідь на перший запит - default_model: qwen3-8b -models: - # Primary LLM - Qwen3 8B (High Priority) - Main model from INFRASTRUCTURE.md - qwen3-8b: - path: ollama:qwen3:8b - type: llm - size_gb: 4.87 - priority: high - description: "Primary LLM for general tasks and conversations" - - # Vision Model - Qwen3-VL 8B (High Priority) - For image processing - qwen3-vl-8b: - path: ollama:qwen3-vl:8b - type: vision - size_gb: 5.72 - priority: high - description: "Vision model for image understanding and processing" - - # Qwen2.5 7B Instruct (High Priority) - qwen2.5-7b-instruct: - path: ollama:qwen2.5:7b-instruct-q4_K_M - type: llm - size_gb: 4.36 - priority: high - description: "Qwen2.5 7B Instruct model" - - # Lightweight LLM - Qwen2.5 3B Instruct (Medium Priority) - qwen2.5-3b-instruct: - path: ollama:qwen2.5:3b-instruct-q4_K_M - type: llm - size_gb: 1.80 - priority: medium - description: "Lightweight LLM for faster responses" - - # Math Specialist - Qwen2 Math 7B (High Priority) - qwen2-math-7b: - path: ollama:qwen2-math:7b - type: math - size_gb: 4.13 - priority: high - description: "Specialized model for mathematical tasks" +timeouts: + llm_ms: 120000 + vision_ms: 180000 + image_gen_ms: 600000 + +gpu: + enabled: true + metal_acceleration: false + auto_unload_on_oom: true + vram_threshold_gb: 22 storage: models_dir: /app/models cache_dir: /app/cache swap_dir: /app/swap - -ollama: - url: http://ollama:11434 # From Docker container to Ollama service - timeout: 300