node2: full model inventory audit 2026-02-27
Read-only audit of all installed models on NODA2 (MacBook M4 Max): - 12 Ollama models, 1 llama-server duplicate, 16 HF cache models - ComfyUI stack (200+ GB): FLUX.2-dev, LTX-2 video, SDXL - Whisper-large-v3-turbo (MLX, 1.5GB) + Kokoro TTS (MLX, 0.35GB) installed but unused - MiniCPM-V-4_5 (16GB) installed but not in Swapper (better than llava:13b) - Key finding: 149GB cleanup potential; llama-server duplicates Ollama (P1, 20GB) Artifacts: - ops/node2_models_inventory_20260227.json - ops/node2_models_inventory_20260227.md - ops/node2_model_capabilities.yml - ops/node2_model_gaps.yml Made-with: Cursor
This commit is contained in:
255
ops/node2_model_gaps.yml
Normal file
255
ops/node2_model_gaps.yml
Normal file
@@ -0,0 +1,255 @@
---
# NODA2 Model Gaps & Optimization Analysis
# Date: 2026-02-27
# Node: MacBook Pro M4 Max (NODA2)
# Status: READ-ONLY analysis — no changes applied

# ─── MODELS INSTALLED BUT UNUSED ─────────────────────────────────────────────
models_installed_but_unused:
  - id: GAP-01
    model: "mlx-community/whisper-large-v3-turbo-asr-fp16"
    size_gb: 1.5
    installed: true
    integrated: false
    priority: P2
    action: >
      Integrate into memory-service or swapper as STT backend.
      Uses MLX for Metal acceleration — fast on M4 Max.
      Candidate: POST /transcribe endpoint in memory-service.

  - id: GAP-02
    model: "mlx-community/Kokoro-82M-bf16"
    size_gb: 0.35
    installed: true
    integrated: false
    priority: P2
    action: >
      Integrate as TTS backend. Kokoro via MLX is fast on Apple Silicon.
      Candidate: POST /tts endpoint in memory-service or standalone service.

  - id: GAP-03
    model: "openbmb/MiniCPM-V-4_5"
    size_gb: 16.0
    installed: true
    integrated: false
    priority: P2
    action: >
      Better vision quality than llava:13b.
      Option A: Serve via Ollama (needs GGUF conversion or mlx-lm).
      Option B: Direct HF inference via memory-service endpoint.
      Option C: Dedicated FastAPI microservice on port 8893.

  - id: GAP-04
    model: "Qwen/Qwen2.5-7B-Instruct"
    size_gb: 14.0
    installed: true
    integrated: false
    priority: P3
    action: >
      HF weights sitting idle. If needed — convert to Ollama (modelfile)
      or use via transformers. Otherwise candidate for cleanup.

  - id: GAP-05
    model: "Qwen/Qwen2.5-1.5B-Instruct"
    size_gb: 2.9
    installed: true
    integrated: false
    priority: P3
    action: >
      Very small model. Could serve as ultra-fast routing/classification
      if converted to Ollama. Otherwise cleanup.

  - id: GAP-06
    model: "flux2-dev-Q8_0.gguf + ltx-2-19b + SDXL + FLUX.1-*"
    size_gb_total: 141
    installed: true
    integrated: false
    priority: P3
    action: >
      ComfyUI stack running but not exposed as API endpoint.
      No REST API service for Sofiia to call image/video generation.
      Would need a ComfyUI API wrapper (comfyui-api-proxy) to integrate.

# ─── MODELS CONFIGURED BUT MISSING ───────────────────────────────────────────
models_configured_but_missing:
  - id: MISS-01
    model: "gemma2:27b"
    configured_in: swapper_config_node2.yaml
    installed: false
    priority: P2
    action: >
      Swapper references gemma2:27b but it's NOT in ollama list.
      Either: ollama pull gemma2:27b (15GB) or remove from swapper config.
      Recommend: remove (gemma3:latest is 3.3GB and more modern).

  - id: MISS-02
    model: "qwen2.5-coder:32b"
    configured_in: swapper_config_node2.yaml
    installed: false
    priority: P2
    action: >
      Swapper references qwen2.5-coder:32b but NOT in ollama list.
      Either: pull (19GB) or remove from config.
      deepseek-coder:33b already covers this role.

  - id: MISS-03
    model: "gpt-oss:latest"
    configured_in: swapper_config_node2.yaml
    installed: true
    priority: P3
    note: "Available in Ollama. Old model, low usage."

  - id: MISS-04
    model: "Qwen3-VL-32B-Instruct"
    configured_in: "NOT configured (only refs/ placeholder in HF cache)"
    installed: false
    priority: P3
    note: "Only 4KB placeholder. Would need 65+ GB download to use."

# ─── DUPLICATED MODELS ───────────────────────────────────────────────────────
duplicated_models:
  - id: DUP-01
    model: "Qwen3.5-35B-A3B"
    instances:
      - backend: ollama
        name: "qwen3.5:35b-a3b"
        size_gb: 9.3
        path: "~/.ollama/models/"
      - backend: llama-server
        name: "Qwen3.5-35B-A3B-Q4_K_M.gguf"
        size_gb: 20.0
        path: "~/Library/Application Support/llama.cpp/models/"
    total_duplicate_gb: 29.3
    priority: P1
    action: >
      Two instances of same model (different quant format, same weights).
      Ollama uses optimized .gguf internally (~9.3GB).
      llama-server uses separate 20GB .gguf file.
      RECOMMEND: Stop llama-server process and delete the 20GB file.
      Use ollama as single Qwen3.5 backend.
      Saves: 20 GB disk, eliminates port confusion.

  - id: DUP-02
    model: "glm-4.7-flash"
    instances:
      - name: "glm-4.7-flash:32k"
        size_gb: 19.0
      - name: "glm-4.7-flash:q4_K_M"
        size_gb: 19.0
    total_duplicate_gb: 38.0
    priority: P2
    action: >
      Two identical GLM-4.7-flash quants. Choose one (32k recommended
      for longer context), remove the other. Saves 19 GB.

# ─── GPU / MEMORY CONFLICTS ──────────────────────────────────────────────────
gpu_conflicts:
  - id: GPU-01
    description: >
      llama-server (port 11435) and Ollama (port 11434) both use
      Metal/MPS for the SAME model (Qwen3.5-35B-A3B).
      On Apple Silicon unified memory, this means both could load
      model weights simultaneously → 20GB + 9.3GB = 29.3GB consumed
      from 64GB unified memory without any active inference.
    severity: medium
    priority: P1
    action: "Stop llama-server. Use only Ollama for this model."

  - id: GPU-02
    description: >
      No single-model-in-VRAM policy on NODA2 (unlike NODA1).
      Ollama can keep multiple models warm simultaneously.
      On 64GB unified memory this is less critical than on NODA1's 20GB,
      but for large models (deepseek-r1:70b = 42GB) concurrent loading
      can cause swap pressure.
    severity: low
    priority: P3
    action: >
      Set OLLAMA_MAX_LOADED_MODELS=1 in Ollama env if strict policy needed.
      Current 64GB RAM is usually sufficient for 1-2 medium models.

# ─── MISALIGNED SWAPPER CONFIG ───────────────────────────────────────────────
misaligned_swapper_config:
  - id: SW-01
    issue: "gemma2:27b referenced in swapper but not installed in Ollama"
    severity: medium
    priority: P2
    fix: "Remove gemma2-27b entry from swapper_config_node2.yaml or install"

  - id: SW-02
    issue: "qwen2.5-coder:32b referenced in swapper but not installed"
    severity: medium
    priority: P2
    fix: "Remove qwen2.5-coder-32b entry or install model"

  - id: SW-03
    issue: "ollama.url in swapper_config set to host.docker.internal:11434 (FIXED in P1)"
    severity: resolved
    note: "Fixed 2026-02-27"

  - id: SW-04
    issue: "STT/TTS sections are empty — whisper and kokoro installed but not configured"
    severity: medium
    priority: P2
    fix: >
      Add stt section pointing to mlx-community/whisper-large-v3-turbo-asr-fp16
      Add tts section pointing to mlx-community/Kokoro-82M-bf16

  - id: SW-05
    issue: "No OCR model configured — /ocr returns 405"
    severity: low
    priority: P3
    fix: "Either configure llava-13b as OCR fallback or integrate got-ocr2/granite-docling"

# ─── REDUNDANT BACKENDS ──────────────────────────────────────────────────────
redundant_backends:
  - id: RED-01
    description: "llama-server (port 11435) running alongside Ollama for identical model"
    redundant: llama-server
    preferred: ollama
    priority: P1
    action: "Kill llama-server process, delete 20GB .gguf file, update router profile to use port 11434"

  - id: RED-02
    description: "open-webui whisper-base (low quality) vs mlx whisper-large-v3-turbo (high quality, idle)"
    redundant: open-webui whisper-base
    preferred: mlx whisper-large-v3-turbo
    priority: P2
    action: "Integrate mlx whisper as service, configure open-webui to use external STT URL"

# ─── RECOMMENDED CLEANUP ─────────────────────────────────────────────────────
recommended_cleanup:
  immediate_P1:
    - action: "Stop llama-server process + delete ~/Library/Application Support/llama.cpp/models/Qwen3.5-35B-A3B-Q4_K_M.gguf"
      savings_gb: 20
      risk: low

  next_sprint_P2:
    - action: "Remove glm-4.7-flash:q4_K_M from Ollama (keep :32k)"
      savings_gb: 19
      risk: low
    - action: "Remove gemma2-27b and qwen2.5-coder:32b from swapper_config_node2.yaml (not installed)"
      savings_gb: 0
      risk: none
    - action: "Integrate whisper-large-v3-turbo-asr-fp16 as STT endpoint"
      savings_gb: 0
      risk: medium
    - action: "Integrate Kokoro-82M-bf16 as TTS endpoint"
      savings_gb: 0
      risk: medium
    - action: "Integrate MiniCPM-V-4_5 as vision endpoint (replace llava:13b)"
      savings_gb: 0
      risk: medium

  optional_P3:
    - action: "Clean old HF models: Qwen/Qwen2.5-7B-Instruct + Qwen2.5-1.5B if not needed"
      savings_gb: 17
      risk: low
    - action: "Clean SDXL-base-1.0 from hf_models (72GB!) if ComfyUI FLUX covers use case"
      savings_gb: 72
      risk: low
    - action: "Remove gpt-oss:latest and mistral-nemo:12b from Ollama (old, replaced by qwen3)"
      savings_gb: 20
      risk: low

total_potential_savings_gb: 149
immediate_savings_gb: 20
||||
Reference in New Issue
Block a user