node2: full model inventory audit 2026-02-27
Read-only audit of all installed models on NODA2 (MacBook M4 Max): - 12 Ollama models, 1 llama-server duplicate, 16 HF cache models - ComfyUI stack (200+ GB): FLUX.2-dev, LTX-2 video, SDXL - Whisper-large-v3-turbo (MLX, 1.5GB) + Kokoro TTS (MLX, 0.35GB) installed but unused - MiniCPM-V-4_5 (16GB) installed but not in Swapper (better than llava:13b) - Key finding: 149GB cleanup potential; llama-server duplicates Ollama (P1, 20GB) Artifacts: - ops/node2_models_inventory_20260227.json - ops/node2_models_inventory_20260227.md - ops/node2_model_capabilities.yml - ops/node2_model_gaps.yml Made-with: Cursor
This commit is contained in:
255
ops/node2_model_gaps.yml
Normal file
255
ops/node2_model_gaps.yml
Normal file
@@ -0,0 +1,255 @@
---
# NODA2 Model Gaps & Optimization Analysis
# Date: 2026-02-27
# Node: MacBook Pro M4 Max (NODA2)
# Status: READ-ONLY analysis — no changes applied

# ─── MODELS INSTALLED BUT UNUSED ─────────────────────────────────────────────
models_installed_but_unused:
  - id: GAP-01
    model: "mlx-community/whisper-large-v3-turbo-asr-fp16"
    size_gb: 1.5
    installed: true
    integrated: false
    priority: P2
    action: >
      Integrate into memory-service or swapper as STT backend.
      Uses MLX for Metal acceleration — fast on M4 Max.
      Candidate: POST /transcribe endpoint in memory-service.

  - id: GAP-02
    model: "mlx-community/Kokoro-82M-bf16"
    size_gb: 0.35
    installed: true
    integrated: false
    priority: P2
    action: >
      Integrate as TTS backend. Kokoro via MLX is fast on Apple Silicon.
      Candidate: POST /tts endpoint in memory-service or standalone service.

  - id: GAP-03
    model: "openbmb/MiniCPM-V-4_5"
    size_gb: 16.0
    installed: true
    integrated: false
    priority: P2
    action: >
      Better vision quality than llava:13b.
      Option A: Serve via Ollama (needs GGUF conversion or mlx-lm).
      Option B: Direct HF inference via memory-service endpoint.
      Option C: Dedicated FastAPI microservice on port 8893.

  - id: GAP-04
    model: "Qwen/Qwen2.5-7B-Instruct"
    size_gb: 14.0
    installed: true
    integrated: false
    priority: P3
    action: >
      HF weights sitting idle. If needed — convert to Ollama (modelfile)
      or use via transformers. Otherwise candidate for cleanup.

  - id: GAP-05
    model: "Qwen/Qwen2.5-1.5B-Instruct"
    size_gb: 2.9
    installed: true
    integrated: false
    priority: P3
    action: >
      Very small model. Could serve as ultra-fast routing/classification
      if converted to Ollama. Otherwise cleanup.

  - id: GAP-06
    model: "flux2-dev-Q8_0.gguf + ltx-2-19b + SDXL + FLUX.1-*"
    size_gb_total: 141
    installed: true
    integrated: false
    priority: P3
    action: >
      ComfyUI stack running but not exposed as API endpoint.
      No REST API service for Sofiia to call image/video generation.
      Would need a ComfyUI API wrapper (comfyui-api-proxy) to integrate.

# ─── MODELS CONFIGURED BUT MISSING ───────────────────────────────────────────
models_configured_but_missing:
  - id: MISS-01
    model: "gemma2:27b"
    configured_in: swapper_config_node2.yaml
    installed: false
    priority: P2
    action: >
      Swapper references gemma2:27b but it's NOT in ollama list.
      Either: ollama pull gemma2:27b (15GB) or remove from swapper config.
      Recommend: remove (gemma3:latest is 3.3GB and more modern).

  - id: MISS-02
    model: "qwen2.5-coder:32b"
    configured_in: swapper_config_node2.yaml
    installed: false
    priority: P2
    action: >
      Swapper references qwen2.5-coder:32b but NOT in ollama list.
      Either: pull (19GB) or remove from config.
      deepseek-coder:33b already covers this role.

  - id: MISS-03
    model: "gpt-oss:latest"
    configured_in: swapper_config_node2.yaml
    installed: true
    priority: P3
    note: "Available in Ollama. Old model, low usage."

  - id: MISS-04
    model: "Qwen3-VL-32B-Instruct"
    configured_in: "NOT configured (only refs/ placeholder in HF cache)"
    installed: false
    priority: P3
    note: "Only 4KB placeholder. Would need 65+ GB download to use."

# ─── DUPLICATED MODELS ───────────────────────────────────────────────────────
duplicated_models:
  - id: DUP-01
    model: "Qwen3.5-35B-A3B"
    instances:
      - backend: ollama
        name: "qwen3.5:35b-a3b"
        size_gb: 9.3
        path: "~/.ollama/models/"
      - backend: llama-server
        name: "Qwen3.5-35B-A3B-Q4_K_M.gguf"
        size_gb: 20.0
        path: "~/Library/Application Support/llama.cpp/models/"
    total_duplicate_gb: 29.3
    priority: P1
    action: >
      Two instances of same model (different quant format, same weights).
      Ollama uses optimized .gguf internally (~9.3GB).
      llama-server uses separate 20GB .gguf file.
      RECOMMEND: Stop llama-server process and delete the 20GB file.
      Use ollama as single Qwen3.5 backend.
      Saves: 20 GB disk, eliminates port confusion.

  - id: DUP-02
    model: "glm-4.7-flash"
    instances:
      - name: "glm-4.7-flash:32k"
        size_gb: 19.0
      - name: "glm-4.7-flash:q4_K_M"
        size_gb: 19.0
    total_duplicate_gb: 38.0
    priority: P2
    action: >
      Two identical GLM-4.7-flash quants. Choose one (32k recommended
      for longer context), remove the other. Saves 19 GB.

# ─── GPU / MEMORY CONFLICTS ──────────────────────────────────────────────────
gpu_conflicts:
  - id: GPU-01
    description: >
      llama-server (port 11435) and Ollama (port 11434) both use
      Metal/MPS for the SAME model (Qwen3.5-35B-A3B).
      On Apple Silicon unified memory, this means both could load
      model weights simultaneously → 20GB + 9.3GB = 29.3GB consumed
      from 64GB unified memory without any active inference.
    severity: medium
    priority: P1
    action: "Stop llama-server. Use only Ollama for this model."

  - id: GPU-02
    description: >
      No single-model-in-VRAM policy on NODA2 (unlike NODA1).
      Ollama can keep multiple models warm simultaneously.
      On 64GB unified memory this is less critical than on NODA1's 20GB,
      but for large models (deepseek-r1:70b = 42GB) concurrent loading
      can cause swap pressure.
    severity: low
    priority: P3
    action: >
      Set OLLAMA_MAX_LOADED_MODELS=1 in Ollama env if strict policy needed.
      Current 64GB RAM is usually sufficient for 1-2 medium models.

# ─── MISALIGNED SWAPPER CONFIG ───────────────────────────────────────────────
misaligned_swapper_config:
  - id: SW-01
    issue: "gemma2:27b referenced in swapper but not installed in Ollama"
    severity: medium
    priority: P2
    fix: "Remove gemma2-27b entry from swapper_config_node2.yaml or install"

  - id: SW-02
    issue: "qwen2.5-coder:32b referenced in swapper but not installed"
    severity: medium
    priority: P2
    fix: "Remove qwen2.5-coder-32b entry or install model"

  - id: SW-03
    issue: "ollama.url in swapper_config set to host.docker.internal:11434 (FIXED in P1)"
    severity: resolved
    note: "Fixed 2026-02-27"

  - id: SW-04
    issue: "STT/TTS sections are empty — whisper and kokoro installed but not configured"
    severity: medium
    priority: P2
    fix: >
      Add stt section pointing to mlx-community/whisper-large-v3-turbo-asr-fp16
      Add tts section pointing to mlx-community/Kokoro-82M-bf16

  - id: SW-05
    issue: "No OCR model configured — /ocr returns 405"
    severity: low
    priority: P3
    fix: "Either configure llava-13b as OCR fallback or integrate got-ocr2/granite-docling"

# ─── REDUNDANT BACKENDS ──────────────────────────────────────────────────────
redundant_backends:
  - id: RED-01
    description: "llama-server (port 11435) running alongside Ollama for identical model"
    redundant: llama-server
    preferred: ollama
    priority: P1
    action: "Kill llama-server process, delete 20GB .gguf file, update router profile to use port 11434"

  - id: RED-02
    description: "open-webui whisper-base (low quality) vs mlx whisper-large-v3-turbo (high quality, idle)"
    redundant: open-webui whisper-base
    preferred: mlx whisper-large-v3-turbo
    priority: P2
    action: "Integrate mlx whisper as service, configure open-webui to use external STT URL"

# ─── RECOMMENDED CLEANUP ─────────────────────────────────────────────────────
recommended_cleanup:
  immediate_P1:
    - action: "Stop llama-server process + delete ~/Library/Application Support/llama.cpp/models/Qwen3.5-35B-A3B-Q4_K_M.gguf"
      savings_gb: 20
      risk: low

  next_sprint_P2:
    - action: "Remove glm-4.7-flash:q4_K_M from Ollama (keep :32k)"
      savings_gb: 19
      risk: low
    - action: "Remove gemma2-27b and qwen2.5-coder:32b from swapper_config_node2.yaml (not installed)"
      savings_gb: 0
      risk: none
    - action: "Integrate whisper-large-v3-turbo-asr-fp16 as STT endpoint"
      savings_gb: 0
      risk: medium
    - action: "Integrate Kokoro-82M-bf16 as TTS endpoint"
      savings_gb: 0
      risk: medium
    - action: "Integrate MiniCPM-V-4_5 as vision endpoint (replace llava:13b)"
      savings_gb: 0
      risk: medium

  optional_P3:
    - action: "Clean old HF models: Qwen/Qwen2.5-7B-Instruct + Qwen2.5-1.5B if not needed"
      savings_gb: 17
      risk: low
    - action: "Clean SDXL-base-1.0 from hf_models (72GB!) if ComfyUI FLUX covers use case"
      savings_gb: 72
      risk: low
    - action: "Remove gpt-oss:latest and mistral-nemo:12b from Ollama (old, replaced by qwen3)"
      savings_gb: 20
      risk: low

total_potential_savings_gb: 149
immediate_savings_gb: 20
||||
Reference in New Issue
Block a user