node2: fix Sofiia routing determinism + Node Capabilities Service

Bug fixes:
- Bug A: GROK_API_KEY env mismatch — router expected GROK_API_KEY but only
  XAI_API_KEY was present. Added GROK_API_KEY=${XAI_API_KEY} alias in compose.
- Bug B: 'grok' profile missing in router-config.node2.yml — added cloud_grok
  profile (provider: grok, model: grok-2-1212). Sofiia now has
  default_llm=cloud_grok with fallback_llm=local_default_coder.
- Bug C: Router silently defaulted to cloud DeepSeek when profile was unknown.
  Now falls back to agent.fallback_llm or local_default_coder with WARNING log.
  Hardcoded Ollama URL (172.18.0.1) replaced with config-driven base_url.

New service: Node Capabilities Service (NCS)
- services/node-capabilities/ — FastAPI microservice exposing live model
  inventory from Ollama, Swapper, and llama-server.
- GET /capabilities — canonical JSON with served_models[] and inventory_only[]
- GET /capabilities/models — flat list of served models
- POST /capabilities/refresh — force cache refresh
- Cache TTL 15s, bound to 127.0.0.1:8099
- services/router/capabilities_client.py — async client with TTL cache

Artifacts:
- ops/node2_models_audit.md — 3-layer model view (served/disk/cloud)
- ops/node2_models_audit.yml — machine-readable audit
- ops/node2_capabilities_example.json — sample NCS output (14 served models)

Made-with: Cursor
This commit is contained in:
Apple
2026-02-27 02:07:40 -08:00
parent 3965f68fac
commit e2a3ae342a
10 changed files with 867 additions and 33 deletions

View File

@@ -0,0 +1,7 @@
# Node Capabilities Service image: slim Python base running the FastAPI app via uvicorn.
FROM python:3.11-slim
WORKDIR /app
# Copy and install dependencies first so this layer is cached across code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY main.py .
# Service port; host binding (e.g. 127.0.0.1 only) is decided by the compose port mapping.
EXPOSE 8099
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8099"]

View File

@@ -0,0 +1,245 @@
"""Node Capabilities Service — exposes live model inventory for router decisions."""
import os
import time
import logging
from typing import Any, Dict, List, Optional
from fastapi import FastAPI
from fastapi.responses import JSONResponse
import httpx
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("node-capabilities")
app = FastAPI(title="Node Capabilities Service", version="1.0.0")
NODE_ID = os.getenv("NODE_ID", "noda2")
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "")
_cache: Dict[str, Any] = {}
_cache_ts: float = 0
CACHE_TTL = int(os.getenv("CACHE_TTL_SEC", "15"))
def _classify_model(name: str) -> str:
nl = name.lower()
if any(k in nl for k in ("vl", "vision", "llava", "minicpm-v", "clip")):
return "vision"
if any(k in nl for k in ("coder", "starcoder", "codellama", "code")):
return "code"
if any(k in nl for k in ("embed", "bge", "minilm", "e5-")):
return "embedding"
if any(k in nl for k in ("whisper", "stt")):
return "stt"
if any(k in nl for k in ("kokoro", "tts", "bark", "coqui", "xtts")):
return "tts"
if any(k in nl for k in ("flux", "sdxl", "stable-diffusion", "ltx")):
return "image_gen"
return "llm"
async def _collect_ollama() -> Dict[str, Any]:
    """Query the Ollama runtime for installed models and which are loaded.

    Returns a dict with base_url, status ("ok" / "unknown" / "error: ..."),
    and a models list; on any failure the error is recorded in status and
    logged, never raised.
    """
    result: Dict[str, Any] = {"base_url": OLLAMA_BASE_URL, "status": "unknown", "models": []}
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            tags_resp = await client.get(f"{OLLAMA_BASE_URL}/api/tags")
            if tags_resp.status_code == 200:
                result["status"] = "ok"
                for entry in tags_resp.json().get("models", []):
                    model_name = entry.get("name", "")
                    size_bytes = entry.get("size", 0)
                    result["models"].append({
                        "name": model_name,
                        "size_bytes": size_bytes,
                        "size_gb": round(size_bytes / 1e9, 1),
                        "type": _classify_model(model_name),
                        "modified": entry.get("modified_at", "")[:10],
                    })
            # /api/ps lists models currently loaded in memory; flag them.
            ps_resp = await client.get(f"{OLLAMA_BASE_URL}/api/ps")
            if ps_resp.status_code == 200:
                loaded = {m.get("name", "") for m in ps_resp.json().get("models", [])}
                for model in result["models"]:
                    model["running"] = model["name"] in loaded
    except Exception as e:
        result["status"] = f"error: {e}"
        logger.warning(f"Ollama collector failed: {e}")
    return result
async def _collect_swapper() -> Dict[str, Any]:
    """Query the Swapper service for health, LLM models, and vision models.

    Returns a dict with base_url, status, active_model, models[], and
    vision_models[]; any failure is captured in status and logged.
    """
    info: Dict[str, Any] = {
        "base_url": SWAPPER_URL,
        "status": "unknown",
        "models": [],
        "vision_models": [],
        "active_model": None,
    }
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            health_resp = await client.get(f"{SWAPPER_URL}/health")
            if health_resp.status_code == 200:
                health = health_resp.json()
                info["status"] = health.get("status", "ok")
                info["active_model"] = health.get("active_model")
            models_resp = await client.get(f"{SWAPPER_URL}/models")
            if models_resp.status_code == 200:
                info["models"].extend(
                    {
                        "name": m.get("name", ""),
                        "type": m.get("type", "llm"),
                        "size_gb": m.get("size_gb", 0),
                        "status": m.get("status", "unknown"),
                    }
                    for m in models_resp.json().get("models", [])
                )
            vision_resp = await client.get(f"{SWAPPER_URL}/vision/models")
            if vision_resp.status_code == 200:
                # Vision models come from a dedicated endpoint and are always typed "vision".
                info["vision_models"].extend(
                    {
                        "name": m.get("name", ""),
                        "type": "vision",
                        "size_gb": m.get("size_gb", 0),
                        "status": m.get("status", "unknown"),
                    }
                    for m in vision_resp.json().get("models", [])
                )
    except Exception as e:
        info["status"] = f"error: {e}"
        logger.warning(f"Swapper collector failed: {e}")
    return info
async def _collect_llama_server() -> Optional[Dict[str, Any]]:
    """Query an optional llama-server runtime for its served models.

    Returns None when LLAMA_SERVER_URL is unset (collector disabled);
    otherwise a dict with base_url, status, and models[]. Failures are
    captured in status and logged, never raised.
    """
    if not LLAMA_SERVER_URL:
        return None
    runtime: Dict[str, Any] = {"base_url": LLAMA_SERVER_URL, "status": "unknown", "models": []}
    try:
        async with httpx.AsyncClient(timeout=5) as c:
            r = await c.get(f"{LLAMA_SERVER_URL}/v1/models")
            if r.status_code == 200:
                data = r.json()
                runtime["status"] = "ok"
                # OpenAI-style payloads use "data"; fall back to "models".
                for m in data.get("data", data.get("models", [])):
                    name = m.get("id", m.get("name", "unknown"))
                    runtime["models"].append({"name": name, "type": "llm"})
    except Exception as e:
        runtime["status"] = f"error: {e}"
        # Fix: log the failure like the other collectors do (was silent).
        logger.warning(f"llama-server collector failed: {e}")
    return runtime
def _collect_disk_inventory() -> List[Dict[str, Any]]:
    """Scan known model directories — NOT for routing, only inventory.

    Returns one entry per weight file (>100 MB, known suffix) found under
    the scan roots. Fixes: stat() each file once instead of twice, and
    contain per-file errors (broken symlink, permission denied) so a single
    bad file no longer aborts the rest of that source's scan.
    """
    import pathlib
    weight_suffixes = (".gguf", ".safetensors", ".bin", ".pt")
    min_size_bytes = 100_000_000  # ignore small auxiliary files (tokenizers, configs)
    home = pathlib.Path.home()
    inventory: List[Dict[str, Any]] = []
    scan_dirs = [
        ("cursor_worktrees", home / ".cursor" / "worktrees"),
        ("jan_ai", home / "Library" / "Application Support" / "Jan"),
        ("hf_cache", home / ".cache" / "huggingface" / "hub"),
        ("comfyui_main", home / "ComfyUI" / "models"),
        ("comfyui_docs", home / "Documents" / "ComfyUI" / "models"),
        ("llama_cpp", home / "Library" / "Application Support" / "llama.cpp" / "models"),
        ("hf_models", home / "hf_models"),
    ]
    for source, base in scan_dirs:
        if not base.exists():
            continue
        try:
            for f in base.rglob("*"):
                if f.suffix not in weight_suffixes:
                    continue
                try:
                    size = f.stat().st_size  # single stat per candidate file
                except OSError:
                    continue  # broken symlink / unreadable entry: skip just this file
                if size <= min_size_bytes:
                    continue
                inventory.append({
                    "name": f.stem,
                    "path": str(f.relative_to(home)),
                    "source": source,
                    "size_gb": round(size / 1e9, 1),
                    "type": _classify_model(f.stem),
                    "served": False,
                })
        except Exception:
            # Best-effort inventory: an unreadable tree must not break the scan.
            pass
    return inventory
def _build_served_models(ollama: Dict, swapper: Dict, llama: Optional[Dict]) -> List[Dict[str, Any]]:
"""Merge all served models into a flat canonical list."""
served: List[Dict[str, Any]] = []
seen = set()
for m in ollama.get("models", []):
key = m["name"]
if key not in seen:
seen.add(key)
served.append({**m, "runtime": "ollama", "base_url": ollama["base_url"]})
for m in swapper.get("vision_models", []):
key = f"swapper:{m['name']}"
if key not in seen:
seen.add(key)
served.append({**m, "runtime": "swapper", "base_url": swapper["base_url"]})
if llama:
for m in llama.get("models", []):
key = f"llama:{m['name']}"
if key not in seen:
seen.add(key)
served.append({**m, "runtime": "llama_server", "base_url": llama["base_url"]})
return served
async def _build_capabilities() -> Dict[str, Any]:
    """Assemble the full capabilities payload, serving the cached copy while fresh.

    The cache is module-global; a non-empty payload younger than CACHE_TTL
    is returned as-is, otherwise every collector runs again sequentially.
    """
    global _cache, _cache_ts
    if _cache and (time.time() - _cache_ts) < CACHE_TTL:
        return _cache
    # Cache miss or expired: poll each runtime, then scan disk inventory.
    ollama_info = await _collect_ollama()
    swapper_info = await _collect_swapper()
    llama_info = await _collect_llama_server()
    disk_inventory = _collect_disk_inventory()
    served = _build_served_models(ollama_info, swapper_info, llama_info)
    runtimes: Dict[str, Any] = {"ollama": ollama_info, "swapper": swapper_info}
    if llama_info:
        # llama-server is optional; omit the key entirely when disabled.
        runtimes["llama_server"] = llama_info
    payload: Dict[str, Any] = {
        "node_id": NODE_ID,
        "updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "runtimes": runtimes,
        "served_models": served,
        "served_count": len(served),
        "inventory_only": disk_inventory,
        "inventory_count": len(disk_inventory),
    }
    _cache = payload
    _cache_ts = time.time()
    return payload
@app.get("/healthz")
async def healthz():
return {"status": "ok", "node_id": NODE_ID}
@app.get("/capabilities")
async def capabilities():
data = await _build_capabilities()
return JSONResponse(content=data)
@app.get("/capabilities/models")
async def capabilities_models():
data = await _build_capabilities()
return JSONResponse(content={"node_id": data["node_id"], "served_models": data["served_models"]})
@app.post("/capabilities/refresh")
async def capabilities_refresh():
global _cache_ts
_cache_ts = 0
data = await _build_capabilities()
return JSONResponse(content={"refreshed": True, "served_count": data["served_count"]})
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "8099")))

View File

@@ -0,0 +1,3 @@
# Runtime dependencies for the Node Capabilities Service.
fastapi>=0.110.0
uvicorn>=0.29.0
# Async HTTP client used by the runtime collectors.
httpx>=0.27.0