node2: fix Sofiia routing determinism + Node Capabilities Service
Bug fixes:
- Bug A: GROK_API_KEY env mismatch — router expected GROK_API_KEY but only
XAI_API_KEY was present. Added GROK_API_KEY=${XAI_API_KEY} alias in compose.
- Bug B: 'grok' profile missing in router-config.node2.yml — added cloud_grok
profile (provider: grok, model: grok-2-1212). Sofiia now has
default_llm=cloud_grok with fallback_llm=local_default_coder.
- Bug C: Router silently defaulted to cloud DeepSeek when profile was unknown.
Now falls back to agent.fallback_llm or local_default_coder with WARNING log.
Hardcoded Ollama URL (172.18.0.1) replaced with config-driven base_url.
New service: Node Capabilities Service (NCS)
- services/node-capabilities/ — FastAPI microservice exposing live model
inventory from Ollama, Swapper, and llama-server.
- GET /capabilities — canonical JSON with served_models[] and inventory_only[]
- GET /capabilities/models — flat list of served models
- POST /capabilities/refresh — force cache refresh
- Cache TTL 15s, bound to 127.0.0.1:8099
- services/router/capabilities_client.py — async client with TTL cache
Artifacts:
- ops/node2_models_audit.md — 3-layer model view (served/disk/cloud)
- ops/node2_models_audit.yml — machine-readable audit
- ops/node2_capabilities_example.json — sample NCS output (14 served models)
Made-with: Cursor
This commit is contained in:
7
services/node-capabilities/Dockerfile
Normal file
7
services/node-capabilities/Dockerfile
Normal file
@@ -0,0 +1,7 @@
|
||||
# Node Capabilities Service — FastAPI app served by uvicorn on port 8099.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies first so this layer is cached across code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

EXPOSE 8099

# Bind 0.0.0.0 inside the container; host exposure (127.0.0.1:8099) is the
# compose file's responsibility.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8099"]
|
||||
245
services/node-capabilities/main.py
Normal file
245
services/node-capabilities/main.py
Normal file
@@ -0,0 +1,245 @@
|
||||
"""Node Capabilities Service — exposes live model inventory for router decisions."""
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import JSONResponse
|
||||
import httpx
|
||||
|
||||
# Basic logging; collectors emit WARNING lines when a backend is unreachable.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("node-capabilities")

app = FastAPI(title="Node Capabilities Service", version="1.0.0")

# Node identity included in every response.
# NOTE(review): default is "noda2" while the commit title says "node2" —
# confirm the spelling is intentional (it is overridable via NODE_ID env).
NODE_ID = os.getenv("NODE_ID", "noda2")
# Backend endpoints. An empty LLAMA_SERVER_URL disables the llama-server collector.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434")
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "")

# Module-level response cache shared by all handlers (single-process uvicorn).
_cache: Dict[str, Any] = {}
_cache_ts: float = 0  # epoch seconds of last rebuild; 0 forces a rebuild
CACHE_TTL = int(os.getenv("CACHE_TTL_SEC", "15"))  # seconds
|
||||
|
||||
|
||||
def _classify_model(name: str) -> str:
|
||||
nl = name.lower()
|
||||
if any(k in nl for k in ("vl", "vision", "llava", "minicpm-v", "clip")):
|
||||
return "vision"
|
||||
if any(k in nl for k in ("coder", "starcoder", "codellama", "code")):
|
||||
return "code"
|
||||
if any(k in nl for k in ("embed", "bge", "minilm", "e5-")):
|
||||
return "embedding"
|
||||
if any(k in nl for k in ("whisper", "stt")):
|
||||
return "stt"
|
||||
if any(k in nl for k in ("kokoro", "tts", "bark", "coqui", "xtts")):
|
||||
return "tts"
|
||||
if any(k in nl for k in ("flux", "sdxl", "stable-diffusion", "ltx")):
|
||||
return "image_gen"
|
||||
return "llm"
|
||||
|
||||
|
||||
async def _collect_ollama() -> Dict[str, Any]:
    """Collect the Ollama model catalog (/api/tags) and running set (/api/ps).

    Returns a runtime descriptor with base_url, status, and a models list;
    on any failure the status field carries the error text and the partial
    result is returned rather than raising.
    """
    info: Dict[str, Any] = {"base_url": OLLAMA_BASE_URL, "status": "unknown", "models": []}
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            tags = await client.get(f"{OLLAMA_BASE_URL}/api/tags")
            if tags.status_code == 200:
                info["status"] = "ok"
                for entry in tags.json().get("models", []):
                    name = entry.get("name", "")
                    size = entry.get("size", 0)
                    info["models"].append({
                        "name": name,
                        "size_bytes": size,
                        "size_gb": round(size / 1e9, 1),
                        "type": _classify_model(name),
                        # modified_at is ISO-8601; keep just the date part
                        "modified": entry.get("modified_at", "")[:10],
                    })
            # Mark which catalog entries are currently loaded in memory.
            ps = await client.get(f"{OLLAMA_BASE_URL}/api/ps")
            if ps.status_code == 200:
                active = {entry.get("name", "") for entry in ps.json().get("models", [])}
                for model in info["models"]:
                    model["running"] = model["name"] in active
    except Exception as exc:
        info["status"] = f"error: {exc}"
        logger.warning(f"Ollama collector failed: {exc}")
    return info
|
||||
|
||||
|
||||
async def _collect_swapper() -> Dict[str, Any]:
    """Collect Swapper health, swappable models, and vision models.

    Each sub-request is independent: a non-200 on one endpoint leaves the
    corresponding list empty without affecting the others. Any transport
    error sets an "error: ..." status and returns the partial result.
    """
    info: Dict[str, Any] = {
        "base_url": SWAPPER_URL,
        "status": "unknown",
        "models": [],
        "vision_models": [],
        "active_model": None,
    }
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            health = await client.get(f"{SWAPPER_URL}/health")
            if health.status_code == 200:
                payload = health.json()
                info["status"] = payload.get("status", "ok")
                info["active_model"] = payload.get("active_model")

            models_resp = await client.get(f"{SWAPPER_URL}/models")
            if models_resp.status_code == 200:
                for item in models_resp.json().get("models", []):
                    info["models"].append({
                        "name": item.get("name", ""),
                        "type": item.get("type", "llm"),
                        "size_gb": item.get("size_gb", 0),
                        "status": item.get("status", "unknown"),
                    })

            vision_resp = await client.get(f"{SWAPPER_URL}/vision/models")
            if vision_resp.status_code == 200:
                for item in vision_resp.json().get("models", []):
                    info["vision_models"].append({
                        "name": item.get("name", ""),
                        "type": "vision",  # vision endpoint entries are always typed vision
                        "size_gb": item.get("size_gb", 0),
                        "status": item.get("status", "unknown"),
                    })
    except Exception as exc:
        info["status"] = f"error: {exc}"
        logger.warning(f"Swapper collector failed: {exc}")
    return info
|
||||
|
||||
|
||||
async def _collect_llama_server() -> Optional[Dict[str, Any]]:
    """Collect models from an OpenAI-compatible llama-server endpoint.

    Returns None when LLAMA_SERVER_URL is unset (collector disabled).
    Accepts both OpenAI-style {"data": [...]} and {"models": [...]}
    response shapes; failures are folded into the status field.
    """
    if not LLAMA_SERVER_URL:
        return None
    info: Dict[str, Any] = {"base_url": LLAMA_SERVER_URL, "status": "unknown", "models": []}
    try:
        async with httpx.AsyncClient(timeout=5) as client:
            resp = await client.get(f"{LLAMA_SERVER_URL}/v1/models")
            if resp.status_code == 200:
                body = resp.json()
                info["status"] = "ok"
                entries = body.get("data", body.get("models", []))
                info["models"] = [
                    {"name": item.get("id", item.get("name", "unknown")), "type": "llm"}
                    for item in entries
                ]
    except Exception as exc:
        info["status"] = f"error: {exc}"
    return info
|
||||
|
||||
|
||||
def _collect_disk_inventory() -> List[Dict[str, Any]]:
    """Scan known model directories — NOT for routing, only inventory.

    Returns one record per weight file (> 100 MB with a known weight
    extension) found under each known source directory.

    Fixes vs. previous version: ``f.stat()`` can raise OSError on broken
    symlinks or permission-denied entries; previously that exception hit
    the single outer ``try`` and silently aborted the REST of that
    directory's scan. Now a bad entry is skipped individually, and
    ``stat()`` is called once per candidate instead of twice.
    """
    import pathlib

    WEIGHT_SUFFIXES = (".gguf", ".safetensors", ".bin", ".pt")
    MIN_SIZE_BYTES = 100_000_000  # skip tokenizer/config shards and small files

    home = pathlib.Path.home()
    scan_dirs = [
        ("cursor_worktrees", home / ".cursor" / "worktrees"),
        ("jan_ai", home / "Library" / "Application Support" / "Jan"),
        ("hf_cache", home / ".cache" / "huggingface" / "hub"),
        ("comfyui_main", home / "ComfyUI" / "models"),
        ("comfyui_docs", home / "Documents" / "ComfyUI" / "models"),
        ("llama_cpp", home / "Library" / "Application Support" / "llama.cpp" / "models"),
        ("hf_models", home / "hf_models"),
    ]

    inventory: List[Dict[str, Any]] = []
    for source, base in scan_dirs:
        if not base.exists():
            continue
        try:
            for f in base.rglob("*"):
                if f.suffix not in WEIGHT_SUFFIXES:
                    continue
                try:
                    size = f.stat().st_size
                except OSError:
                    # Broken symlink / permission error: skip this entry only.
                    continue
                if size <= MIN_SIZE_BYTES:
                    continue
                inventory.append({
                    "name": f.stem,
                    "path": str(f.relative_to(home)),
                    "source": source,
                    "size_gb": round(size / 1e9, 1),
                    "type": _classify_model(f.stem),
                    "served": False,  # disk presence only; never routed to
                })
        except Exception:
            # Best-effort: an unreadable tree must not fail the whole report.
            pass

    return inventory
|
||||
|
||||
|
||||
def _build_served_models(ollama: Dict, swapper: Dict, llama: Optional[Dict]) -> List[Dict[str, Any]]:
|
||||
"""Merge all served models into a flat canonical list."""
|
||||
served: List[Dict[str, Any]] = []
|
||||
seen = set()
|
||||
|
||||
for m in ollama.get("models", []):
|
||||
key = m["name"]
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
served.append({**m, "runtime": "ollama", "base_url": ollama["base_url"]})
|
||||
|
||||
for m in swapper.get("vision_models", []):
|
||||
key = f"swapper:{m['name']}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
served.append({**m, "runtime": "swapper", "base_url": swapper["base_url"]})
|
||||
|
||||
if llama:
|
||||
for m in llama.get("models", []):
|
||||
key = f"llama:{m['name']}"
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
served.append({**m, "runtime": "llama_server", "base_url": llama["base_url"]})
|
||||
|
||||
return served
|
||||
|
||||
|
||||
async def _build_capabilities() -> Dict[str, Any]:
    """Assemble the full capabilities payload, serving from cache while fresh.

    The module-level ``_cache``/``_cache_ts`` pair implements a simple TTL
    cache; all endpoint handlers funnel through this function.
    """
    global _cache, _cache_ts

    # Fast path: return the cached payload while it is within the TTL.
    if _cache and (time.time() - _cache_ts) < CACHE_TTL:
        return _cache

    ollama_info = await _collect_ollama()
    swapper_info = await _collect_swapper()
    llama_info = await _collect_llama_server()
    disk_inventory = _collect_disk_inventory()
    served = _build_served_models(ollama_info, swapper_info, llama_info)

    runtimes: Dict[str, Any] = {"ollama": ollama_info, "swapper": swapper_info}
    if llama_info:
        # llama-server is optional; only report it when configured.
        runtimes["llama_server"] = llama_info

    payload = {
        "node_id": NODE_ID,
        "updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "runtimes": runtimes,
        "served_models": served,
        "served_count": len(served),
        "inventory_only": disk_inventory,
        "inventory_count": len(disk_inventory),
    }

    _cache = payload
    _cache_ts = time.time()
    return payload
|
||||
|
||||
|
||||
@app.get("/healthz")
async def healthz():
    """Liveness probe: static OK plus the node identifier."""
    payload = {"status": "ok", "node_id": NODE_ID}
    return payload
|
||||
|
||||
|
||||
@app.get("/capabilities")
async def capabilities():
    """Full canonical capabilities document (cached for up to CACHE_TTL seconds)."""
    payload = await _build_capabilities()
    return JSONResponse(content=payload)
|
||||
|
||||
|
||||
@app.get("/capabilities/models")
async def capabilities_models():
    """Flat served-model list, omitting runtime details and disk inventory."""
    payload = await _build_capabilities()
    body = {"node_id": payload["node_id"], "served_models": payload["served_models"]}
    return JSONResponse(content=body)
|
||||
|
||||
|
||||
@app.post("/capabilities/refresh")
async def capabilities_refresh():
    """Force a cache rebuild and report the resulting served-model count."""
    global _cache_ts
    # Zeroing the timestamp makes the TTL check fail, forcing a rebuild.
    _cache_ts = 0
    payload = await _build_capabilities()
    return JSONResponse(content={"refreshed": True, "served_count": payload["served_count"]})
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Local/dev entry point; containers start uvicorn via the Dockerfile CMD.
    import uvicorn

    port = int(os.getenv("PORT", "8099"))
    uvicorn.run(app, host="0.0.0.0", port=port)
|
||||
3
services/node-capabilities/requirements.txt
Normal file
3
services/node-capabilities/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
# Runtime dependencies for the Node Capabilities Service.
fastapi>=0.110.0   # web framework (app object + routing)
uvicorn>=0.29.0    # ASGI server used by the Dockerfile CMD
httpx>=0.27.0      # async HTTP client for the backend collectors
|
||||
Reference in New Issue
Block a user