Read-only audit of all installed models on NODA2 (MacBook M4 Max):

- 12 Ollama models, 1 llama-server duplicate, 16 HF cache models
- ComfyUI stack (200+ GB): FLUX.2-dev, LTX-2 video, SDXL
- Whisper-large-v3-turbo (MLX, 1.5 GB) + Kokoro TTS (MLX, 0.35 GB) installed but unused
- MiniCPM-V-4_5 (16 GB) installed but not in Swapper (better than llava:13b)
- Key finding: 149 GB cleanup potential; llama-server duplicates Ollama (P1, 20 GB)

Artifacts:

- ops/node2_models_inventory_20260227.json
- ops/node2_models_inventory_20260227.md
- ops/node2_model_capabilities.yml
- ops/node2_model_gaps.yml

Made-with: Cursor
256 lines
9.4 KiB
YAML
# NODA2 Model Gaps & Optimization Analysis
# Date: 2026-02-27
# Node: MacBook Pro M4 Max (NODA2)
# Status: READ-ONLY analysis — no changes applied

# ─── MODELS INSTALLED BUT UNUSED ─────────────────────────────────────────────

models_installed_but_unused:

  - id: GAP-01
    model: "mlx-community/whisper-large-v3-turbo-asr-fp16"
    size_gb: 1.5
    installed: true
    integrated: false
    priority: P2
    action: >
      Integrate into memory-service or the swapper as the STT backend.
      Uses MLX for Metal acceleration, so it is fast on the M4 Max.
      Candidate: a POST /transcribe endpoint in memory-service.

  - id: GAP-02
    model: "mlx-community/Kokoro-82M-bf16"
    size_gb: 0.35
    installed: true
    integrated: false
    priority: P2
    action: >
      Integrate as the TTS backend. Kokoro via MLX is fast on Apple Silicon.
      Candidate: a POST /tts endpoint in memory-service, or a standalone service.

  - id: GAP-03
    model: "openbmb/MiniCPM-V-4_5"
    size_gb: 16.0
    installed: true
    integrated: false
    priority: P2
    action: >
      Better vision quality than llava:13b.
      Option A: serve via Ollama (needs GGUF conversion or mlx-lm).
      Option B: direct HF inference via a memory-service endpoint.
      Option C: a dedicated FastAPI microservice on port 8893.

  - id: GAP-04
    model: "Qwen/Qwen2.5-7B-Instruct"
    size_gb: 14.0
    installed: true
    integrated: false
    priority: P3
    action: >
      HF weights sitting idle. If needed, convert for Ollama (Modelfile)
      or use via transformers. Otherwise a candidate for cleanup.

  - id: GAP-05
    model: "Qwen/Qwen2.5-1.5B-Instruct"
    size_gb: 2.9
    installed: true
    integrated: false
    priority: P3
    action: >
      Very small model. Could serve as an ultra-fast routing/classification
      tier if converted for Ollama. Otherwise cleanup.

  - id: GAP-06
    model: "flux2-dev-Q8_0.gguf + ltx-2-19b + SDXL + FLUX.1-*"
    size_gb_total: 141
    installed: true
    integrated: false
    priority: P3
    action: >
      The ComfyUI stack is running but not exposed as an API endpoint,
      so there is no REST service for Sofiia to call for image/video
      generation. Would need a ComfyUI API wrapper (comfyui-api-proxy)
      to integrate.

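# A minimal sketch of the GAP-04 conversion path, commented out so this file
# stays pure analysis. The cache path revision (<rev>), output name, and
# quant type are assumptions, not values verified on this machine; the flow
# is llama.cpp's HF-to-GGUF converter followed by `ollama create` from a
# one-line Modelfile:
#
#   python llama.cpp/convert_hf_to_gguf.py \
#       ~/.cache/huggingface/hub/models--Qwen--Qwen2.5-7B-Instruct/snapshots/<rev> \
#       --outfile qwen2.5-7b-instruct.gguf --outtype q8_0
#   printf 'FROM ./qwen2.5-7b-instruct.gguf\n' > Modelfile
#   ollama create qwen2.5:7b-instruct -f Modelfile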
# ─── MODELS CONFIGURED BUT MISSING ───────────────────────────────────────────

models_configured_but_missing:

  - id: MISS-01
    model: "gemma2:27b"
    configured_in: swapper_config_node2.yaml
    installed: false
    priority: P2
    action: >
      The swapper references gemma2:27b, but it is NOT in `ollama list`.
      Either `ollama pull gemma2:27b` (15 GB) or remove it from the swapper
      config. Recommend: remove (gemma3:latest is 3.3 GB and more modern).

  - id: MISS-02
    model: "qwen2.5-coder:32b"
    configured_in: swapper_config_node2.yaml
    installed: false
    priority: P2
    action: >
      The swapper references qwen2.5-coder:32b, but it is NOT in `ollama list`.
      Either pull it (19 GB) or remove it from the config.
      deepseek-coder:33b already covers this role.

  - id: MISS-03
    model: "gpt-oss:latest"
    configured_in: swapper_config_node2.yaml
    installed: true
    priority: P3
    note: "Available in Ollama. Old model, low usage."

  - id: MISS-04
    model: "Qwen3-VL-32B-Instruct"
    configured_in: "NOT configured (only a refs/ placeholder in the HF cache)"
    installed: false
    priority: P3
    note: "Only a 4 KB placeholder. Would need a 65+ GB download to use."

# ─── DUPLICATED MODELS ───────────────────────────────────────────────────────

duplicated_models:

  - id: DUP-01
    model: "Qwen3.5-35B-A3B"
    instances:
      - backend: ollama
        name: "qwen3.5:35b-a3b"
        size_gb: 9.3
        path: "~/.ollama/models/"
      - backend: llama-server
        name: "Qwen3.5-35B-A3B-Q4_K_M.gguf"
        size_gb: 20.0
        path: "~/Library/Application Support/llama.cpp/models/"
    total_duplicate_gb: 29.3
    priority: P1
    action: >
      Two instances of the same model (different quant formats, same weights).
      Ollama stores an optimized .gguf internally (~9.3 GB); llama-server uses
      a separate 20 GB .gguf file.
      RECOMMEND: stop the llama-server process and delete the 20 GB file.
      Use Ollama as the single Qwen3.5 backend.
      Saves 20 GB of disk and eliminates port confusion.

  - id: DUP-02
    model: "glm-4.7-flash"
    instances:
      - name: "glm-4.7-flash:32k"
        size_gb: 19.0
      - name: "glm-4.7-flash:q4_K_M"
        size_gb: 19.0
    total_duplicate_gb: 38.0
    priority: P2
    action: >
      Two identical GLM-4.7-flash quants. Choose one (:32k recommended for
      longer context) and remove the other. Saves 19 GB.

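# The DUP-02 removal is a single Ollama CLI call, kept here as a commented
# sketch; listing the tags before and after is a sensible guard:
#
#   ollama list | grep glm-4.7-flash
#   ollama rm glm-4.7-flash:q4_K_M
#   ollama list | grep glm-4.7-flash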
# ─── GPU / MEMORY CONFLICTS ──────────────────────────────────────────────────

gpu_conflicts:

  - id: GPU-01
    description: >
      llama-server (port 11435) and Ollama (port 11434) both use Metal/MPS
      for the SAME model (Qwen3.5-35B-A3B). On Apple Silicon unified memory
      this means both could load the model weights simultaneously:
      20 GB + 9.3 GB = 29.3 GB consumed out of 64 GB unified memory
      without any active inference.
    severity: medium
    priority: P1
    action: "Stop llama-server. Use only Ollama for this model."

  - id: GPU-02
    description: >
      No single-model-in-VRAM policy on NODA2 (unlike NODA1), so Ollama can
      keep multiple models warm simultaneously. On 64 GB unified memory this
      is less critical than on NODA1's 20 GB, but for large models
      (deepseek-r1:70b = 42 GB) concurrent loading can cause swap pressure.
    severity: low
    priority: P3
    action: >
      Set OLLAMA_MAX_LOADED_MODELS=1 in the Ollama environment if a strict
      policy is needed. The current 64 GB of RAM is usually sufficient for
      one or two medium models.

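# Sketch of the GPU-02 mitigation on macOS, assuming Ollama runs as the menu
# bar app (environment is then set via launchctl, per Ollama's FAQ, and the
# app must be restarted to pick it up); the keep-alive value is an example,
# not a measured recommendation:
#
#   launchctl setenv OLLAMA_MAX_LOADED_MODELS 1
#   launchctl setenv OLLAMA_KEEP_ALIVE 5m    # optional: unload idle models sooner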
# ─── MISALIGNED SWAPPER CONFIG ───────────────────────────────────────────────

misaligned_swapper_config:

  - id: SW-01
    issue: "gemma2:27b referenced in swapper but not installed in Ollama"
    severity: medium
    priority: P2
    fix: "Remove the gemma2-27b entry from swapper_config_node2.yaml, or install the model"

  - id: SW-02
    issue: "qwen2.5-coder:32b referenced in swapper but not installed"
    severity: medium
    priority: P2
    fix: "Remove the qwen2.5-coder-32b entry, or install the model"

  - id: SW-03
    issue: "ollama.url in swapper_config set to host.docker.internal:11434 (FIXED in P1)"
    severity: resolved
    note: "Fixed 2026-02-27"

  - id: SW-04
    issue: "STT/TTS sections are empty — whisper and kokoro are installed but not configured"
    severity: medium
    priority: P2
    fix: >
      Add an stt section pointing to mlx-community/whisper-large-v3-turbo-asr-fp16
      and a tts section pointing to mlx-community/Kokoro-82M-bf16.

  - id: SW-05
    issue: "No OCR model configured — /ocr returns 405"
    severity: low
    priority: P3
    fix: "Either configure llava-13b as an OCR fallback, or integrate got-ocr2/granite-docling"

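# Hypothetical shape of the SW-04 fix in swapper_config_node2.yaml, commented
# out because the swapper's actual STT/TTS schema is not shown in this audit;
# the key names (stt, tts, backend, endpoint) are assumptions:
#
#   stt:
#     model: mlx-community/whisper-large-v3-turbo-asr-fp16
#     backend: mlx
#     endpoint: /transcribe
#   tts:
#     model: mlx-community/Kokoro-82M-bf16
#     backend: mlx
#     endpoint: /tts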
# ─── REDUNDANT BACKENDS ──────────────────────────────────────────────────────

redundant_backends:

  - id: RED-01
    description: "llama-server (port 11435) running alongside Ollama for an identical model"
    redundant: llama-server
    preferred: ollama
    priority: P1
    action: "Kill the llama-server process, delete the 20 GB .gguf file, and update the router profile to use port 11434"

  - id: RED-02
    description: "open-webui whisper-base (low quality) vs MLX whisper-large-v3-turbo (high quality, idle)"
    redundant: open-webui whisper-base
    preferred: mlx whisper-large-v3-turbo
    priority: P2
    action: "Integrate MLX whisper as a service and point open-webui at the external STT URL"

# ─── RECOMMENDED CLEANUP ─────────────────────────────────────────────────────

recommended_cleanup:

  immediate_P1:
    - action: "Stop the llama-server process and delete ~/Library/Application Support/llama.cpp/models/Qwen3.5-35B-A3B-Q4_K_M.gguf"
      savings_gb: 20
      risk: low

  next_sprint_P2:
    - action: "Remove glm-4.7-flash:q4_K_M from Ollama (keep :32k)"
      savings_gb: 19
      risk: low
    - action: "Remove gemma2-27b and qwen2.5-coder:32b from swapper_config_node2.yaml (not installed)"
      savings_gb: 0
      risk: none
    - action: "Integrate whisper-large-v3-turbo-asr-fp16 as the STT endpoint"
      savings_gb: 0
      risk: medium
    - action: "Integrate Kokoro-82M-bf16 as the TTS endpoint"
      savings_gb: 0
      risk: medium
    - action: "Integrate MiniCPM-V-4_5 as the vision endpoint (replacing llava:13b)"
      savings_gb: 0
      risk: medium

  optional_P3:
    - action: "Clean old HF models: Qwen/Qwen2.5-7B-Instruct + Qwen2.5-1.5B-Instruct if not needed"
      savings_gb: 17
      risk: low
    - action: "Clean SDXL-base-1.0 from hf_models (72 GB!) if the ComfyUI FLUX stack covers the use case"
      savings_gb: 72
      risk: low
    - action: "Remove gpt-oss:latest and mistral-nemo:12b from Ollama (old, replaced by qwen3)"
      savings_gb: 20
      risk: low

total_potential_savings_gb: 149
immediate_savings_gb: 20
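# The immediate P1 step above, sketched as commented shell. The process name
# and file path are as reported earlier in this file; confirming with pgrep
# before killing is a cheap safety check:
#
#   pgrep -fl llama-server    # confirm the duplicate backend is running
#   pkill -f llama-server
#   rm "$HOME/Library/Application Support/llama.cpp/models/Qwen3.5-35B-A3B-Q4_K_M.gguf"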