diff --git a/ops/node2_model_capabilities.yml b/ops/node2_model_capabilities.yml
new file mode 100644
index 00000000..ae88fee6
--- /dev/null
+++ b/ops/node2_model_capabilities.yml
@@ -0,0 +1,270 @@
+# NODA2 Model Capabilities — Machine-readable
+# Node: MacBook Pro M4 Max, 64GB unified memory
+# Date: 2026-02-27
+# Usage: router offload decisions, swapper config, capability queries
+
+node_id: noda2
+hostname: MacBook-Pro.local
+hardware:
+  cpu: "Apple M4 Max"
+  ram_gb: 64
+  unified_memory: true
+  gpu: "Apple Silicon (MPS/Metal)"
+
+# ─── LLM HEAVY ────────────────────────────────────────────────────────────────
+llm_heavy:
+  - name: qwen3.5:35b-a3b
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 9.3
+    params: "34.6B MoE (3B active)"
+    quality: high
+    speed: fast_for_size
+    recommended: true
+    note: "Primary reasoning model. MoE architecture: only ~3B params are active per token, so it is fast for its 34.6B size."
+
+  - name: glm-4.7-flash:32k
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 19.0
+    params: "~32B"
+    quality: high
+    speed: medium
+    recommended: false
+    note: "Duplicate of q4_K_M variant. Keep only one."
+
+  - name: deepseek-r1:70b
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 42.0
+    params: "70B"
+    quality: very_high
+    speed: slow
+    recommended: false
+    note: "Rarely needed. Only for max-quality reasoning tasks."
+
+  - name: Qwen3.5-35B-A3B-Q4_K_M.gguf
+    backend: llama-server
+    base_url: http://localhost:11435
+    size_gb: 20.0
+    params: "34.6B"
+    quality: high
+    speed: fast
+    recommended: false
+    note: "DUPLICATE of qwen3.5:35b-a3b in Ollama. Same model, different backend. Remove or unify."
+
+# ─── LLM SMALL/MEDIUM ─────────────────────────────────────────────────────────
+llm_small:
+  - name: qwen3:14b
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 9.3
+    params: "14B"
+    quality: medium_high
+    speed: fast
+    note: "Good all-rounder. Used by swapper as default."
+
+  - name: gemma3:latest
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 3.3
+    params: "4B"
+    quality: medium
+    speed: very_fast
+    recommended: true
+    note: "Best small model for routing, classification, quick Q&A."
+
+  - name: phi3:latest
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 2.2
+    params: "3.8B"
+    quality: medium
+    speed: very_fast
+    note: "Lightweight general model. Can be replaced by gemma3."
+
+  - name: gpt-oss:latest
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 13.0
+    params: "20.9B"
+    quality: medium_high
+    speed: medium
+    note: "Old model. Not recommended for new workloads."
+
+  - name: mistral-nemo:12b
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 7.1
+    params: "12B"
+    quality: medium
+    speed: medium
+    note: "Old. Not recommended for new workloads."
+
+# ─── CODE ─────────────────────────────────────────────────────────────────────
+code:
+  - name: deepseek-coder:33b
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 18.0
+    params: "33B"
+    quality: high
+    speed: medium
+    note: "Good for code review. Heavy; cold start is slow."
+
+  - name: starcoder2:3b
+    backend: ollama
+    base_url: http://localhost:11434
+    size_gb: 1.7
+    params: "3B"
+    quality: medium
+    speed: very_fast
+    note: "Fast code completion. Limited context."
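+
+# Hedged usage sketch (not part of the capability schema): before offloading to
+# any entry above, the router can verify a model cold-starts by hitting Ollama's
+# /api/generate endpoint, which takes a model name and prompt. The model below
+# is this file's recommended primary; substitute any "name:" from the lists.
+#
+#   curl -s http://localhost:11434/api/generate \
+#     -d '{"model": "qwen3.5:35b-a3b", "prompt": "ping", "stream": false}'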
+
+# ─── VISION ───────────────────────────────────────────────────────────────────
+vision_fast:
+  - name: llava:13b
+    backend: ollama
+    base_url: http://localhost:11434
+    swapper_url: http://localhost:8890/vision
+    size_gb: 8.0
+    params: "13B"
+    quality: medium
+    speed: medium
+    installed: true
+    active_in_swapper: true
+    note: "P0 fallback. CLIP-based. Outdated architecture but functional."
+
+vision_hq:
+  - name: MiniCPM-V-4_5
+    backend: hf_transformers
+    size_gb: 16.0
+    params: "8B"
+    quality: high
+    speed: medium
+    installed: true
+    active_in_swapper: false
+    note: "RECOMMENDED for P2. Better than llava:13b. Needs integration into Swapper."
+
+  - name: Qwen3-VL-32B-Instruct
+    backend: hf
+    size_gb: 0
+    installed: false
+    note: "Only refs/ placeholder — NOT downloaded. Would require 65+ GB."
+
+# ─── EMBEDDING ────────────────────────────────────────────────────────────────
+embedding:
+  - name: sentence-transformers/all-MiniLM-L6-v2
+    backend: open-webui
+    size_gb: 0.1
+    installed: true
+    active: true
+    note: "Used by open-webui for RAG. Not exposed as standalone endpoint."
+
+  - name: TaylorAI/bge-micro-v2
+    backend: open-webui
+    size_gb: 0.05
+    installed: true
+    active: true
+    note: "Auxiliary embedding in open-webui."
+
+  - name: Qwen/Qwen2.5-7B-Instruct
+    backend: hf_transformers
+    size_gb: 14.0
+    installed: true
+    active: false
+    note: "HF weights only. Can be used for embedding but not configured."
+
+# ─── STT ──────────────────────────────────────────────────────────────────────
+stt:
+  - name: mlx-community/whisper-large-v3-turbo-asr-fp16
+    backend: mlx
+    size_gb: 1.5
+    device: mps_metal
+    quality: high
+    speed: fast
+    installed: true
+    active: false
+    note: "READY — installed, MLX-accelerated. NOT integrated into Swapper or memory-service. P2 task."
+
+  - name: whisper-base
+    backend: open-webui
+    size_gb: 0.15
+    device: cpu
+    quality: low
+    installed: true
+    active: true
+    note: "Active in open-webui only. Low quality."
+
+# ─── TTS ──────────────────────────────────────────────────────────────────────
+tts:
+  - name: mlx-community/Kokoro-82M-bf16
+    backend: mlx
+    size_gb: 0.35
+    device: mps_metal
+    quality: high
+    speed: fast
+    installed: true
+    active: false
+    note: "READY — installed, MLX-accelerated. NOT integrated into any service. P2 task."
+
+# ─── IMAGE GENERATION ─────────────────────────────────────────────────────────
+image_gen:
+  - name: flux2-dev-Q8_0.gguf
+    backend: comfyui
+    path: ~/ComfyUI/models/checkpoints/
+    size_gb: 33.0
+    quality: very_high
+    installed: true
+    active: false
+    note: "ComfyUI + llama.cpp GGUF format. FLUX.2-dev — state-of-the-art image gen."
+
+  - name: ltx-2-19b-distilled-fp8.safetensors
+    backend: comfyui
+    path: ~/Documents/ComfyUI/models/checkpoints/
+    size_gb: 25.0
+    type: video_gen
+    quality: very_high
+    installed: true
+    active: false
+    note: "LTX-2 19B — video generation. Very large."
+
+  - name: z_image_turbo_bf16.safetensors
+    backend: comfyui
+    size_gb: 11.0
+    quality: high
+    installed: true
+    active: false
+    note: "Fast image gen via ComfyUI."
+
+  - name: SDXL-base-1.0 + refiner
+    backend: comfyui_symlink
+    size_gb: 72.0
+    quality: medium
+    installed: true
+    active: false
+    note: "Legacy SDXL. Very large. Consider cleanup."
+
+  - name: sdxl_sofia_lora_v1.safetensors
+    backend: comfyui_lora
+    size_gb: 0.08
+    installed: true
+    active: false
+    note: "Custom Sofiia appearance LoRA for SDXL."
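+
+# Integration sketch for the image_gen entries above (hedged; ComfyUI is offline
+# on this node): a running ComfyUI instance exposes an HTTP API where
+# POST /prompt queues a workflow graph, so a thin proxy could submit jobs before
+# a full wrapper exists. Port 8188 is ComfyUI's default, and workflow_api.json
+# stands in for any workflow exported via "Save (API Format)":
+#
+#   curl -s -X POST http://localhost:8188/prompt \
+#     -H "Content-Type: application/json" \
+#     -d "{\"prompt\": $(cat workflow_api.json)}"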
+ +# ─── OCR ────────────────────────────────────────────────────────────────────── +ocr: + - name: "(none configured)" + note: "Swapper /ocr returns 405. No dedicated OCR model. Could use llava:13b or MiniCPM-V as OCR." + +# ─── ROUTING POLICY (suggestions) ──────────────────────────────────────────── +routing_policy: + nats_subjects: + vision_request: "node.noda2.vision.request" + vision_reply: "node.noda2.vision.reply.*" + stt_request: "node.noda2.stt.request" + llm_request: "node.noda2.llm.request" + default_vision_model: "llava:13b" + recommended_vision_model: "MiniCPM-V-4_5 (needs integration)" + default_llm: "qwen3.5:35b-a3b" + default_small_llm: "gemma3:latest" diff --git a/ops/node2_model_gaps.yml b/ops/node2_model_gaps.yml new file mode 100644 index 00000000..39684bc5 --- /dev/null +++ b/ops/node2_model_gaps.yml @@ -0,0 +1,255 @@ +# NODA2 Model Gaps & Optimization Analysis +# Date: 2026-02-27 +# Node: MacBook Pro M4 Max (NODA2) +# Status: READ-ONLY analysis — no changes applied + +# ─── MODELS INSTALLED BUT UNUSED ───────────────────────────────────────────── +models_installed_but_unused: + - id: GAP-01 + model: "mlx-community/whisper-large-v3-turbo-asr-fp16" + size_gb: 1.5 + installed: true + integrated: false + priority: P2 + action: > + Integrate into memory-service or swapper as STT backend. + Uses MLX for Metal acceleration — fast on M4 Max. + Candidate: POST /transcribe endpoint in memory-service. + + - id: GAP-02 + model: "mlx-community/Kokoro-82M-bf16" + size_gb: 0.35 + installed: true + integrated: false + priority: P2 + action: > + Integrate as TTS backend. Kokoro via MLX is fast on Apple Silicon. + Candidate: POST /tts endpoint in memory-service or standalone service. + + - id: GAP-03 + model: "openbmb/MiniCPM-V-4_5" + size_gb: 16.0 + installed: true + integrated: false + priority: P2 + action: > + Better vision quality than llava:13b. + Option A: Serve via Ollama (needs GGUF conversion or mlx-lm). + Option B: Direct HF inference via memory-service endpoint. + Option C: Dedicated FastAPI microservice on port 8893. + + - id: GAP-04 + model: "Qwen/Qwen2.5-7B-Instruct" + size_gb: 14.0 + installed: true + integrated: false + priority: P3 + action: > + HF weights sitting idle. If needed — convert to Ollama (modelfile) + or use via transformers. Otherwise candidate for cleanup. + + - id: GAP-05 + model: "Qwen/Qwen2.5-1.5B-Instruct" + size_gb: 2.9 + installed: true + integrated: false + priority: P3 + action: > + Very small model. Could serve as ultra-fast routing/classification + if converted to Ollama. Otherwise cleanup. + + - id: GAP-06 + model: "flux2-dev-Q8_0.gguf + ltx-2-19b + SDXL + FLUX.1-*" + size_gb_total: 141 + installed: true + integrated: false + priority: P3 + action: > + ComfyUI stack running but not exposed as API endpoint. + No REST API service for Sofiia to call image/video generation. + Would need a ComfyUI API wrapper (comfyui-api-proxy) to integrate. + +# ─── MODELS CONFIGURED BUT MISSING ─────────────────────────────────────────── +models_configured_but_missing: + - id: MISS-01 + model: "gemma2:27b" + configured_in: swapper_config_node2.yaml + installed: false + priority: P2 + action: > + Swapper references gemma2:27b but it's NOT in ollama list. + Either: ollama pull gemma2:27b (15GB) or remove from swapper config. + Recommend: remove (gemma3:latest is 3.3GB and more modern). 
+ + - id: MISS-02 + model: "qwen2.5-coder:32b" + configured_in: swapper_config_node2.yaml + installed: false + priority: P2 + action: > + Swapper references qwen2.5-coder:32b but NOT in ollama list. + Either: pull (19GB) or remove from config. + deepseek-coder:33b already covers this role. + + - id: MISS-03 + model: "gpt-oss:latest" + configured_in: swapper_config_node2.yaml + installed: true + priority: P3 + note: "Available in Ollama. Old model, low usage." + + - id: MISS-04 + model: "Qwen3-VL-32B-Instruct" + configured_in: "NOT configured (only refs/ placeholder in HF cache)" + installed: false + priority: P3 + note: "Only 4KB placeholder. Would need 65+ GB download to use." + +# ─── DUPLICATED MODELS ─────────────────────────────────────────────────────── +duplicated_models: + - id: DUP-01 + model: "Qwen3.5-35B-A3B" + instances: + - backend: ollama + name: "qwen3.5:35b-a3b" + size_gb: 9.3 + path: "~/.ollama/models/" + - backend: llama-server + name: "Qwen3.5-35B-A3B-Q4_K_M.gguf" + size_gb: 20.0 + path: "~/Library/Application Support/llama.cpp/models/" + total_duplicate_gb: 29.3 + priority: P1 + action: > + Two instances of same model (different quant format, same weights). + Ollama uses optimized .gguf internally (~9.3GB). + llama-server uses separate 20GB .gguf file. + RECOMMEND: Stop llama-server process and delete the 20GB file. + Use ollama as single Qwen3.5 backend. + Saves: 20 GB disk, eliminates port confusion. + + - id: DUP-02 + model: "glm-4.7-flash" + instances: + - name: "glm-4.7-flash:32k" + size_gb: 19.0 + - name: "glm-4.7-flash:q4_K_M" + size_gb: 19.0 + total_duplicate_gb: 38.0 + priority: P2 + action: > + Two identical GLM-4.7-flash quants. Choose one (32k recommended + for longer context), remove the other. Saves 19 GB. + +# ─── GPU / MEMORY CONFLICTS ────────────────────────────────────────────────── +gpu_conflicts: + - id: GPU-01 + description: > + llama-server (port 11435) and Ollama (port 11434) both use + Metal/MPS for the SAME model (Qwen3.5-35B-A3B). + On Apple Silicon unified memory, this means both could load + model weights simultaneously → 20GB + 9.3GB = 29.3GB consumed + from 64GB unified memory without any active inference. + severity: medium + priority: P1 + action: "Stop llama-server. Use only Ollama for this model." + + - id: GPU-02 + description: > + No single-model-in-VRAM policy on NODA2 (unlike NODA1). + Ollama can keep multiple models warm simultaneously. + On 64GB unified memory this is less critical than on NODA1's 20GB, + but for large models (deepseek-r1:70b = 42GB) concurrent loading + can cause swap pressure. + severity: low + priority: P3 + action: > + Set OLLAMA_MAX_LOADED_MODELS=1 in Ollama env if strict policy needed. + Current 64GB RAM is usually sufficient for 1-2 medium models. 
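+
+# Hedged how-to for GPU-02: with the native (non-Docker) Ollama on macOS,
+# environment variables are set for the login session via launchctl and are
+# picked up when the server restarts. OLLAMA_MAX_LOADED_MODELS caps how many
+# models stay loaded at once; OLLAMA_KEEP_ALIVE (optional, value illustrative)
+# shortens how long an idle model stays warm.
+#
+#   launchctl setenv OLLAMA_MAX_LOADED_MODELS 1
+#   launchctl setenv OLLAMA_KEEP_ALIVE 10m
+#   # restart the Ollama app/process afterwards so it re-reads the environment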
+ +# ─── MISALIGNED SWAPPER CONFIG ─────────────────────────────────────────────── +misaligned_swapper_config: + - id: SW-01 + issue: "gemma2:27b referenced in swapper but not installed in Ollama" + severity: medium + priority: P2 + fix: "Remove gemma2-27b entry from swapper_config_node2.yaml or install" + + - id: SW-02 + issue: "qwen2.5-coder:32b referenced in swapper but not installed" + severity: medium + priority: P2 + fix: "Remove qwen2.5-coder-32b entry or install model" + + - id: SW-03 + issue: "ollama.url in swapper_config set to host.docker.internal:11434 (FIXED in P1)" + severity: resolved + note: "Fixed 2026-02-27" + + - id: SW-04 + issue: "STT/TTS sections are empty — whisper and kokoro installed but not configured" + severity: medium + priority: P2 + fix: > + Add stt section pointing to mlx-community/whisper-large-v3-turbo-asr-fp16 + Add tts section pointing to mlx-community/Kokoro-82M-bf16 + + - id: SW-05 + issue: "No OCR model configured — /ocr returns 405" + severity: low + priority: P3 + fix: "Either configure llava-13b as OCR fallback or integrate got-ocr2/granite-docling" + +# ─── REDUNDANT BACKENDS ────────────────────────────────────────────────────── +redundant_backends: + - id: RED-01 + description: "llama-server (port 11435) running alongside Ollama for identical model" + redundant: llama-server + preferred: ollama + priority: P1 + action: "Kill llama-server process, delete 20GB .gguf file, update router profile to use port 11434" + + - id: RED-02 + description: "open-webui whisper-base (low quality) vs mlx whisper-large-v3-turbo (high quality, idle)" + redundant: open-webui whisper-base + preferred: mlx whisper-large-v3-turbo + priority: P2 + action: "Integrate mlx whisper as service, configure open-webui to use external STT URL" + +# ─── RECOMMENDED CLEANUP ───────────────────────────────────────────────────── +recommended_cleanup: + immediate_P1: + - action: "Stop llama-server process + delete ~/Library/Application Support/llama.cpp/models/Qwen3.5-35B-A3B-Q4_K_M.gguf" + savings_gb: 20 + risk: low + + next_sprint_P2: + - action: "Remove glm-4.7-flash:q4_K_M from Ollama (keep :32k)" + savings_gb: 19 + risk: low + - action: "Remove gemma2-27b and qwen2.5-coder:32b from swapper_config_node2.yaml (not installed)" + savings_gb: 0 + risk: none + - action: "Integrate whisper-large-v3-turbo-asr-fp16 as STT endpoint" + savings_gb: 0 + risk: medium + - action: "Integrate Kokoro-82M-bf16 as TTS endpoint" + savings_gb: 0 + risk: medium + - action: "Integrate MiniCPM-V-4_5 as vision endpoint (replace llava:13b)" + savings_gb: 0 + risk: medium + + optional_P3: + - action: "Clean old HF models: Qwen/Qwen2.5-7B-Instruct + Qwen2.5-1.5B if not needed" + savings_gb: 17 + risk: low + - action: "Clean SDXL-base-1.0 from hf_models (72GB!) 
if ComfyUI FLUX covers use case" + savings_gb: 72 + risk: low + - action: "Remove gpt-oss:latest and mistral-nemo:12b from Ollama (old, replaced by qwen3)" + savings_gb: 20 + risk: low + + total_potential_savings_gb: 149 + immediate_savings_gb: 20 diff --git a/ops/node2_models_inventory_20260227.json b/ops/node2_models_inventory_20260227.json new file mode 100644 index 00000000..f0ea440b --- /dev/null +++ b/ops/node2_models_inventory_20260227.json @@ -0,0 +1,141 @@ +{ + "node_id": "noda2", + "hostname": "MacBook-Pro.local", + "timestamp": "2026-02-27T10:00:00Z", + "hardware": { + "cpu": "Apple M4 Max", + "ram_gb": 64, + "unified_memory": true, + "storage_total_tb": 1.8, + "storage_free_gb": 634, + "os": "macOS 26.3 (Darwin arm64 25.3.0)" + }, + + "backends": [ + { + "id": "ollama-main", + "type": "ollama", + "version": "0.17.1", + "port": 11434, + "base_url": "http://localhost:11434", + "gpu_mode": "Apple Silicon MPS/Metal (unified memory)", + "currently_loaded": null, + "models_count": 12, + "models": [ + {"name": "qwen3.5:35b-a3b", "type": "llm", "size_gb": 9.3, "params": "14.8B MoE", "running": false, "modified": "2026-02-26", "tags": ["reasoning", "primary"]}, + {"name": "qwen3:14b", "type": "llm", "size_gb": 9.3, "params": "14B", "running": false, "modified": "2026-02-26", "tags": ["general"]}, + {"name": "gemma3:latest", "type": "llm", "size_gb": 3.3, "params": "4B", "running": false, "modified": "2026-02-25", "tags": ["general", "fast"]}, + {"name": "glm-4.7-flash:32k", "type": "llm", "size_gb": 19.0, "params": "~32B", "running": false, "modified": "2026-02-09", "tags": ["long-context"]}, + {"name": "glm-4.7-flash:q4_K_M","type": "llm", "size_gb": 19.0, "params": "~32B", "running": false, "modified": "2026-02-09", "tags": ["duplicate-of-32k"]}, + {"name": "llava:13b", "type": "vision", "size_gb": 8.0, "params": "13B", "running": false, "modified": "2025-11-27", "tags": ["vision", "multimodal", "p0-fallback"], "vision_capable": true}, + {"name": "mistral-nemo:12b", "type": "llm", "size_gb": 7.1, "params": "12B", "running": false, "modified": "2025-11-21", "tags": ["general", "old"]}, + {"name": "deepseek-coder:33b", "type": "code", "size_gb": 18.0, "params": "33B", "running": false, "modified": "2025-11-21", "tags": ["code", "heavy"]}, + {"name": "deepseek-r1:70b", "type": "llm", "size_gb": 42.0, "params": "70B", "running": false, "modified": "2025-11-21", "tags": ["reasoning", "very-heavy"]}, + {"name": "starcoder2:3b", "type": "code", "size_gb": 1.7, "params": "3B", "running": false, "modified": "2025-11-21", "tags": ["code", "small"]}, + {"name": "phi3:latest", "type": "llm", "size_gb": 2.2, "params": "3.8B", "running": false, "modified": "2025-11-21", "tags": ["small", "fast"]}, + {"name": "gpt-oss:latest", "type": "llm", "size_gb": 13.0, "params": "20.9B", "running": false, "modified": "2025-11-21", "tags": ["general", "old"]} + ] + }, + { + "id": "llama-server-cpu", + "type": "llama.cpp", + "port": 11435, + "base_url": "http://localhost:11435", + "gpu_mode": "Apple Silicon Metal (via llama.cpp)", + "model_file": "/Users/apple/Library/Application Support/llama.cpp/models/Qwen3.5-35B-A3B-Q4_K_M.gguf", + "model_name": "Qwen3.5-35B-A3B-Q4_K_M", + "size_gb": 20.0, + "currently_loaded": true, + "note": "DUPLICATE — same model as qwen3.5:35b-a3b in Ollama. Two instances of identical model consume 2x disk." 
+ }, + { + "id": "swapper-service", + "type": "swapper", + "port": 8890, + "base_url": "http://localhost:8890", + "status": "healthy", + "active_model": null, + "mode": "single-active", + "ollama_url": "http://host.docker.internal:11434", + "gpu_enabled": true, + "metal_acceleration": true, + "llm_models_configured": 9, + "vision_models_configured": 1, + "stt_models_configured": 0, + "tts_models_configured": 0, + "vision_models": [ + {"name": "llava-13b", "ollama_model": "llava:13b", "size_gb": 8.0, "status": "unloaded"} + ] + }, + { + "id": "open-webui", + "type": "open-webui", + "port": 8080, + "base_url": "http://localhost:8080", + "ollama_base_url": "http://host.docker.internal:11434", + "whisper_model": "base", + "whisper_device": "cpu", + "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", + "note": "Internal WebUI for Ollama. Has Whisper-base STT (CPU) built-in." + } + ], + + "hf_cache_models": [ + {"name": "mlx-community/whisper-large-v3-turbo-asr-fp16", "type": "stt", "size_gb": 1.5, "backend": "mlx", "installed": true, "active": false, "note": "MLX Whisper — ready to use, NOT integrated into Swapper"}, + {"name": "mlx-community/Kokoro-82M-bf16", "type": "tts", "size_gb": 0.35, "backend": "mlx", "installed": true, "active": false, "note": "Kokoro TTS — ready to use, NOT integrated"}, + {"name": "openbmb/MiniCPM-V-4_5", "type": "vision", "size_gb": 16.0, "backend": "hf", "installed": true, "active": false, "note": "MiniCPM-V 4.5 — high-quality vision model, NOT in Swapper"}, + {"name": "Qwen/Qwen2.5-7B-Instruct", "type": "llm", "size_gb": 14.0, "backend": "hf", "installed": true, "active": false, "note": "HF weights only — not serving"}, + {"name": "Qwen/Qwen2.5-1.5B-Instruct", "type": "llm", "size_gb": 2.9, "backend": "hf", "installed": true, "active": false, "note": "HF weights only — not serving"}, + {"name": "Qwen/Qwen3-VL-32B-Instruct", "type": "vision", "size_gb": 0.004,"backend": "hf", "installed": false, "active": false, "note": "Only refs/ directory — model NOT downloaded (4KB, just placeholder)"}, + {"name": "Aquiles-ai/FLUX.2-dev", "type": "image_gen", "size_gb": 105, "backend": "hf/comfyui", "installed": true, "active": false, "note": "FLUX.2-dev (105GB) — VERY large, used by ComfyUI"}, + {"name": "google/gemma-3-12b-it-qat-q4_0-unquantized", "type": "llm", "size_gb": 0.004,"backend": "hf", "installed": false, "active": false, "note": "Only refs/ — NOT downloaded"} + ], + + "comfyui_models": { + "location": "~/Documents/ComfyUI/models + ~/ComfyUI/models", + "total_size_gb": 101, + "checkpoints": [ + {"name": "flux2-dev-Q8_0.gguf", "size_gb": 33.0, "type": "image_gen", "path": "~/ComfyUI/models/checkpoints/"}, + {"name": "ltx-2-19b-distilled-fp8.safetensors","size_gb": 25.0, "type": "video_gen", "path": "~/Documents/ComfyUI/models/checkpoints/"} + ], + "diffusion_models": [ + {"name": "z_image_turbo_bf16.safetensors", "size_gb": 11.0, "type": "image_gen"} + ], + "loras": [ + {"name": "ltx-2-19b-distilled-lora-384.safetensors", "size_gb": 7.1, "type": "video_gen"}, + {"name": "sdxl_sofia_lora_v1.safetensors", "size_gb": 0.08, "type": "image_gen", "note": "Sofiia custom LoRA"}, + {"name": "ip-adapter-faceid-plusv2_sdxl_lora.safetensors", "size_gb": 0.35, "type": "image_gen"}, + {"name": "sdxl_agent_lora_smoke.safetensors", "size_gb": 0.02, "type": "image_gen"} + ], + "hf_linked": [ + {"name": "FLUX.1-dev", "size_gb": 40, "location": "~/hf_models/FLUX.1-dev"}, + {"name": "FLUX.1-schnell", "size_gb": 12, "location": "~/hf_models/FLUX.1-schnell"}, + {"name": 
"SDXL-base-1.0", "size_gb": 72, "location": "~/hf_models/stabilityai_sdxl_base_1.0"} + ] + }, + + "docker_containers": [ + {"name": "dagi-router-node2", "status": "Up 13m (healthy)", "port": "127.0.0.1:9102→8000"}, + {"name": "dagi-gateway-node2", "status": "Up 19h (healthy)", "port": "0.0.0.0:9300"}, + {"name": "swapper-service-node2", "status": "Up 14m (healthy)", "port": "0.0.0.0:8890", "note": "P1: should be 127.0.0.1:8890 (not yet applied via --remove-orphans)"}, + {"name": "dagi-memory-service-node2", "status": "Up 19h (healthy)", "port": "0.0.0.0:8000"}, + {"name": "dagi-qdrant-node2", "status": "Up 19h (healthy)", "port": "0.0.0.0:6333-6334"}, + {"name": "dagi-postgres-node2", "status": "Up 19h (healthy)", "port": "0.0.0.0:5433"}, + {"name": "dagi-neo4j-node2", "status": "Up 19h (healthy)", "port": "0.0.0.0:7474,7687"}, + {"name": "dagi-nats-node2", "status": "Up 2h", "port": "0.0.0.0:4222,8222"}, + {"name": "open-webui", "status": "Up 19h (healthy)", "port": "0.0.0.0:8080"}, + {"name": "dagi-postgres", "status": "Up 19h (healthy)", "port": "0.0.0.0:5432"}, + {"name": "dagi-redis", "status": "Up 19h (healthy)", "port": "0.0.0.0:6379"}, + {"name": "sofiia-console", "status": "Up 13m", "port": "127.0.0.1:8002"} + ], + + "non_docker_processes": [ + {"name": "ollama", "port": 11434, "type": "native", "model_serving": "all ollama models"}, + {"name": "llama-server", "port": 11435, "type": "llama.cpp", "model": "Qwen3.5-35B-A3B-Q4_K_M.gguf", "note": "DUPLICATE — same model as in Ollama"}, + {"name": "spacebot", "port": 19898, "type": "telegram-bot","note": "Sofiia Telegram bot"}, + {"name": "gitea", "port": 3000, "type": "git-server", "note": "Self-hosted Git"}, + {"name": "opencode", "port": 3456, "type": "ai-coding", "note": "AI coding tool"}, + {"name": "Pieces OS", "port": 39300, "type": "ai-assistant","note": "Pieces OS — dev AI tools, not LLM serving"}, + {"name": "memory-service", "port": 8000, "type": "uvicorn", "note": "Running outside Docker (dev mode)"} + ] +} diff --git a/ops/node2_models_inventory_20260227.md b/ops/node2_models_inventory_20260227.md new file mode 100644 index 00000000..890459ad --- /dev/null +++ b/ops/node2_models_inventory_20260227.md @@ -0,0 +1,188 @@ +# NODA2 Full Model Inventory +**MacBook Pro M4 Max — 64GB unified memory** +**Date:** 2026-02-27 +**Status:** Read-only audit, no changes applied + +--- + +## Загальна статистика + +| Параметр | Значення | +|----------|---------| +| Всього моделей (унікальних) | **20+** | +| Реально використовуються зараз | **3** (qwen3.5 via Ollama/llama-server, llava:13b via Swapper) | +| Встановлені але невикористовувані | **12** (whisper, kokoro, MiniCPM-V, ComfyUI stack, FLUX, ...) | +| Дублюються | **2** (Qwen3.5-35B in Ollama + llama-server; GLM-4.7-flash:32k + :q4_K_M) | +| Налаштовані в swapper, але відсутні в Ollama | **2** (gemma2:27b, qwen2.5-coder:32b) | +| Є VL/vision-capable | **3** (llava:13b active, MiniCPM-V-4_5 idle, Qwen3-VL-32B not downloaded) | +| Можна перейти на single-model policy | **Частково** (64GB дозволяє тримати 1-2 великих моделі) | +| Потенційна економія диска (cleanup) | **~149 GB** | +| Негайна економія (P1 llama-server) | **~20 GB** | + +--- + +## A. Система + +| Параметр | Значення | +|----------|---------| +| CPU | Apple M4 Max | +| RAM | 64 GB unified | +| OS | macOS 26.3 (Darwin arm64) | +| Storage free | 634 GB | +| Ollama | v0.17.1 | +| Docker | 29.2.1 / Compose v5.0.2 | + +--- + +## B. 
+## B. Backends (active)
+
+| Backend | Port | Status | Model |
+|---------|------|--------|-------|
+| Ollama (native) | 11434 | ✅ Running | 12 models, none loaded |
+| llama-server (llama.cpp) | 11435 | ✅ Running | Qwen3.5-35B-A3B-Q4_K_M.gguf |
+| Swapper (Docker) | 8890 | ✅ Healthy | active_model=null |
+| open-webui (Docker) | 8080 | ✅ Healthy | Whisper-base (STT, CPU) |
+| memory-service (uvicorn) | 8000 | ✅ Running | — |
+| ComfyUI | — | ❌ Not running | — (offline) |
+
+---
+
+## C. Ollama Models (12 total)
+
+| Model | Size | Type | GPU | Swapper | Recommendation |
+|--------|--------|-----|-----|---------|--------------|
+| qwen3.5:35b-a3b | 9.3 GB | LLM (MoE) | Metal | ✅ yes | 🟢 PRIMARY |
+| qwen3:14b | 9.3 GB | LLM | Metal | ✅ yes | 🟢 Keep |
+| gemma3:latest | 3.3 GB | LLM (small) | Metal | ❌ no | 🟢 Add to swapper |
+| glm-4.7-flash:32k | 19 GB | LLM | Metal | ✅ yes | 🟡 Keep one, remove the other |
+| glm-4.7-flash:q4_K_M | 19 GB | LLM | Metal | ❌ no | 🔴 DUPLICATE → remove |
+| **llava:13b** | 8.0 GB | **Vision** | Metal | ✅ yes | 🟡 P0 fallback, replace |
+| mistral-nemo:12b | 7.1 GB | LLM | Metal | ❌ no | 🔴 Old → remove |
+| deepseek-coder:33b | 18 GB | Code | Metal | ✅ yes | 🟡 Heavy, keep if needed |
+| deepseek-r1:70b | 42 GB | LLM | Metal | ✅ yes | 🟡 Very heavy, on-demand |
+| starcoder2:3b | 1.7 GB | Code | Metal | ❌ no | 🟡 Keep for fast code |
+| phi3:latest | 2.2 GB | LLM | Metal | ✅ yes | 🟡 Replace with gemma3 |
+| gpt-oss:latest | 13 GB | LLM | Metal | ✅ yes | 🔴 Old → remove |
+
+---
+
+## D. llama-server (duplicate!)
+
+| Parameter | Value |
+|----------|---------|
+| Process | `llama-server -m .../Qwen3.5-35B-A3B-Q4_K_M.gguf --port 11435` |
+| File | `~/Library/Application Support/llama.cpp/models/Qwen3.5-35B-A3B-Q4_K_M.gguf` |
+| Size | **20 GB** |
+| Type | Identical model to `qwen3.5:35b-a3b` in Ollama |
+| Problem | DUP-01: 20 GB of duplication; also confuses `router-config.node2.yml` |
+| Recommendation | 🔴 **P1: Stop the process and delete the file. Savings: 20 GB** |
+
+---
+
+## E. Swapper (swapper_config_node2.yaml)
+
+| Parameter | Value |
+|----------|---------|
+| Status | ✅ healthy |
+| active_model | null |
+| ollama.url | http://host.docker.internal:11434 ✅ (fixed in P1) |
+| GPU | Metal enabled |
+| LLM models | 9 (2 of them not installed in Ollama!) |
+| Vision models | 1 (llava-13b) ✅ after P0 |
+| STT models | 0 ❌ (whisper installed but not wired up) |
+| TTS models | 0 ❌ (kokoro installed but not wired up) |
+
+---
+
+## F. HuggingFace Cache: installed but unused
+
+| Model | Type | Size | State | Priority |
+|--------|-----|--------|------|-----------|
+| whisper-large-v3-turbo-asr-fp16 (MLX) | STT | 1.5 GB | ✅ Installed | P2 integrate |
+| Kokoro-82M-bf16 (MLX) | TTS | 0.35 GB | ✅ Installed | P2 integrate |
+| MiniCPM-V-4_5 | Vision | 16 GB | ✅ Installed | P2 integrate |
+| Qwen2.5-7B-Instruct | LLM | 14 GB | ✅ Installed | P3 cleanup |
+| Qwen2.5-1.5B-Instruct | LLM | 2.9 GB | ✅ Installed | P3 cleanup |
+| Qwen3-VL-32B-Instruct | Vision | 4 KB | ❌ Only refs | — |
+| Aquiles-ai/FLUX.2-dev | Image gen | 105 GB | ✅ Installed | P3 ComfyUI |
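+
+Wiring the idle MLX Whisper above (GAP-01) is a small job once a host service is
+chosen. Below is a minimal sketch, assuming the `mlx-whisper`, `fastapi`, and
+`python-multipart` packages; the `/transcribe` route follows the suggestion in
+`node2_model_gaps.yml`, while the filename and port are illustrative, not
+existing config:
+
+```python
+# Minimal STT endpoint sketch for GAP-01
+# (assumes: pip install mlx-whisper fastapi uvicorn python-multipart)
+import tempfile
+
+import mlx_whisper
+from fastapi import FastAPI, UploadFile
+
+app = FastAPI()
+MODEL = "mlx-community/whisper-large-v3-turbo-asr-fp16"  # already in the HF cache
+
+@app.post("/transcribe")
+async def transcribe(file: UploadFile):
+    # mlx_whisper.transcribe takes a file path, so spool the upload to disk first
+    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
+        tmp.write(await file.read())
+        tmp.flush()
+        result = mlx_whisper.transcribe(tmp.name, path_or_hf_repo=MODEL)
+    return {"text": result["text"]}
+
+# Run (illustrative): uvicorn stt_service:app --host 127.0.0.1 --port 8891
+```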
+
+---
+
+## G. ComfyUI Stack (offline, not integrated)
+
+| Model | Type | Size | Quality |
+|--------|-----|--------|--------|
+| flux2-dev-Q8_0.gguf | Image gen | 33 GB | Very high |
+| ltx-2-19b-distilled-fp8 | **Video gen** | 25 GB | Very high |
+| z_image_turbo_bf16 | Image gen | 11 GB | High |
+| SDXL-base-1.0 (hf_models) | Image gen | 72 GB | Medium (old) |
+| sdxl_sofia_lora_v1 | LoRA (image) | 0.08 GB | — |
+
+**Total size (ComfyUI + linked):** ~200+ GB
+**Platform integration:** ❌ No REST API wrapper
+
+---
+
+## H. Gaps Summary
+
+### 🔴 P1: Immediate
+| ID | Problem | Action |
+|----|----------|-----|
+| DUP-01 | llama-server duplicates Ollama (20 GB) | Stop the process, delete the file |
+
+### 🟡 P2: This week
+| ID | Problem | Action |
+|----|----------|-----|
+| GAP-01 | whisper-large-v3-turbo installed, not integrated | Add an STT endpoint |
+| GAP-02 | Kokoro TTS installed, not integrated | Add a TTS endpoint |
+| GAP-03 | MiniCPM-V-4_5 installed, not integrated | Replace llava:13b in swapper |
+| SW-01 | gemma2:27b in swapper config, not installed | Remove from config |
+| SW-02 | qwen2.5-coder:32b in swapper config, not installed | Remove from config |
+| DUP-02 | glm-4.7-flash in two identical variants (38 GB) | Keep only :32k |
+
+### 🔵 P3: Next sprint
+| ID | Problem | Action |
+|----|----------|-----|
+| GAP-04/05 | Qwen2.5 HF weights (17 GB) idle | Clean up or convert |
+| RED-02 | open-webui whisper-base vs MLX whisper | Upgrade |
+| GAP-06 | ComfyUI has no API wrapper | Integrate, or leave as manual |
+| — | SDXL-base-1.0 (72 GB) if not needed | Clean up |
+
+---
+
+## I. Executive Summary (15 bullets)
+
+1. **12 models in Ollama**, none loaded right now; a model hot-starts in 2-5 s on request
+2. **llama-server duplicates Ollama**: 20 GB of wasted disk plus port confusion (11434 vs 11435) → P1 cleanup
+3. **Vision RESTORED (P0)**: `llava:13b` is active in Swapper, inference ~3.5 s on GPU
+4. **MiniCPM-V-4_5 (16 GB) installed but not wired up**: noticeably better than llava:13b → P2 upgrade
+5. **Whisper-large-v3-turbo (MLX, 1.5 GB) is ready**, but the stack has no STT → P2 integrate (see the sketch in section F)
+6. **Kokoro TTS (MLX, 0.35 GB) is ready**, but the stack has no TTS → P2 integrate
+7. **2 models are in the swapper config but absent from Ollama**: gemma2:27b, qwen2.5-coder:32b → remove from config
+8. **GLM-4.7-flash is duplicated**: two variants of the same 19 GB model → 19 GB wasted
+9. **ComfyUI stack (200+ GB)**: FLUX.2-dev, LTX-2 video, SDXL; offline, no API; huge capability with no integration
+10. **open-webui** runs whisper-base (CPU, low quality) plus its own embeddings; several inconsistencies
+11. **STT and TTS in Swapper are empty**: both /stt/models and /tts/models return `[]`
+12. **Qwen3.5-35B-A3B is the recommended primary model**: MoE, 9.3 GB, fast on the M4 Max
+13. **gemma3:latest (3.3 GB) is not in Swapper**: the best "small fast" candidate for quick tasks
+14. **Single-model policy**: less critical with 64 GB RAM than on NODA1 (20 GB), but still recommended for hygiene
+15. **Potential disk savings: 149 GB** (20 GB immediately), with no loss of capability
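+
+Items 7 and 11 above can be re-checked mechanically. A small sketch, using only
+endpoints already named in this audit (Ollama's `/api/tags`, Swapper's
+`/stt/models` and `/tts/models`); assumes the `requests` package:
+
+```python
+# Cross-check: swapper-configured models vs. what Ollama actually has installed
+import requests
+
+OLLAMA = "http://localhost:11434"
+SWAPPER = "http://localhost:8890"
+
+# Ollama's /api/tags lists installed models under models[].name
+tags = requests.get(f"{OLLAMA}/api/tags", timeout=5).json()
+installed = {m["name"] for m in tags["models"]}
+
+# SW-01 / SW-02: models the swapper config references but Ollama lacks
+for model in ("gemma2:27b", "qwen2.5-coder:32b"):
+    print(model, "->", "installed" if model in installed else "MISSING from Ollama")
+
+# Item 11: the STT/TTS registries that currently return []
+for path in ("/stt/models", "/tts/models"):
+    r = requests.get(f"{SWAPPER}{path}", timeout=5)
+    print(path, "->", r.status_code, r.text[:120])
+```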
+
+---
+
+## Recommended minimal configuration (if optimizing)
+
+```
+Primary LLM: qwen3.5:35b-a3b (Ollama) — 9.3 GB
+Small/fast LLM: gemma3:latest (Ollama) — 3.3 GB
+Vision: MiniCPM-V-4_5 (HF → integrate) — 16 GB
+STT: whisper-large-v3-turbo (MLX) — 1.5 GB
+TTS: Kokoro-82M-bf16 (MLX) — 0.35 GB
+Code: deepseek-coder:33b (on-demand) — 18 GB
+Image gen: flux2-dev via ComfyUI (manual) — 33 GB
+Video gen: ltx-2 via ComfyUI (manual) — 25 GB
+─────────────────────────────────────────────────────────
+Total active: ~30 GB (primary use case, fits in 64 GB RAM)
+```
+
+This would cover text, vision, STT, TTS, and code, all on one laptop, without NODA1.
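+
+For completeness, this is roughly how a peer node would call NODA2's primary LLM
+under the `routing_policy` subjects from `node2_model_capabilities.yml`. A hedged
+sketch with the `nats-py` client against the `dagi-nats-node2` container on port
+4222; the JSON payload shape is an assumption, not a documented contract:
+
+```python
+# Request/reply over NATS to NODA2's LLM subject (payload schema assumed)
+import asyncio
+import json
+
+import nats  # pip install nats-py
+
+async def main():
+    nc = await nats.connect("nats://localhost:4222")  # dagi-nats-node2
+    payload = json.dumps({"model": "qwen3.5:35b-a3b", "prompt": "ping"}).encode()
+    # Subject comes from routing_policy.nats_subjects.llm_request
+    reply = await nc.request("node.noda2.llm.request", payload, timeout=30)
+    print(reply.data.decode())
+    await nc.drain()
+
+asyncio.run(main())
+```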