fix(nodes): Normalize Router/Swapper endpoints and fix NODE2 display

Major changes:
- Normalize get_node_endpoints to use ENV vars (ROUTER_BASE_URL, SWAPPER_BASE_URL)
- Remove node_id-based URL selection logic
- Add a direct Swapper API call in get_node_swapper_detail as a fallback when node_cache has no metrics for the node
- Fix Swapper API endpoint (/models instead of /api/v1/models)
- Add router_healthy and router_version to node_heartbeat fallback
- Add ENV vars to docker-compose for Router/Swapper URLs

Documentation:
- Add TASK_PHASE_NODE2_ROUTER_SWAPPER_FIX.md with full task description
- Add NODE2_GUARDIAN_SETUP.md with setup instructions

This fixes:
- Swapper models not showing for NODE1 and NODE2
- DAGI Router agents not showing for NODE2
- Router/Swapper showing as Down/Degraded when they're actually up
This commit is contained in:
Apple
2025-12-02 03:13:01 -08:00
parent 5f07a6b3ae
commit f95810e8a7
5 changed files with 509 additions and 45 deletions

View File

@@ -4388,43 +4388,86 @@ async def get_node_swapper_detail(node_id: str):
"""
Get detailed Swapper Service status for a node.
Used by Node Cabinet to show loaded models and health.
Returns fallback data if metrics not found (instead of 404).
First tries to get data from node_cache (populated by node-guardian).
If not found, attempts direct call to Swapper API as fallback.
Returns fallback data if both fail (instead of 404).
"""
import httpx
try:
# Fetch from node_cache
# First, try to fetch from node_cache (preferred - populated by node-guardian)
metrics = await repo_city.get_node_metrics(node_id)
if not metrics:
# Return fallback instead of 404 - allows UI to show pending state
logger.info(f"Swapper metrics not found for {node_id}, returning fallback")
if metrics:
# Parse swapper state (stored as JSONB)
state = metrics.get("swapper_state") or {}
models_data = state.get("models", [])
models = [
SwapperModel(
name=m.get("name", "unknown"),
# Swapper uses "status": "loaded" not "loaded": true
loaded=m.get("status") == "loaded" or m.get("loaded", False),
type=m.get("type"),
vram_gb=m.get("size_gb") or m.get("vram_gb")
)
for m in models_data
]
return NodeSwapperDetail(
node_id=node_id,
healthy=False,
models_loaded=0,
models_total=0,
models=[]
healthy=metrics.get("swapper_healthy", False),
models_loaded=metrics.get("swapper_models_loaded", 0),
models_total=metrics.get("swapper_models_total", 0),
models=models
)
# Parse swapper state (stored as JSONB)
state = metrics.get("swapper_state") or {}
models_data = state.get("models", [])
models = [
SwapperModel(
name=m.get("name", "unknown"),
# Swapper uses "status": "loaded" not "loaded": true
loaded=m.get("status") == "loaded" or m.get("loaded", False),
type=m.get("type"),
vram_gb=m.get("size_gb") or m.get("vram_gb")
)
for m in models_data
]
# Fallback: try direct call to Swapper API
logger.info(f"Swapper metrics not found in cache for {node_id}, trying direct API call")
endpoints = await repo_city.get_node_endpoints(node_id)
swapper_url = endpoints.get("swapper_url")
if swapper_url:
try:
async with httpx.AsyncClient(timeout=5.0) as client:
# Try to get models from Swapper (endpoint: /models, not /api/v1/models)
resp = await client.get(f"{swapper_url}/models")
if resp.status_code == 200:
data = resp.json()
models_list = data.get("models", []) if isinstance(data, dict) else data
models = [
SwapperModel(
name=m.get("name", "unknown"),
loaded=m.get("status") == "loaded" or m.get("loaded", False),
type=m.get("type"),
vram_gb=m.get("size_gb") or m.get("vram_gb")
)
for m in models_list
]
loaded_count = sum(1 for m in models if m.loaded)
logger.info(f"✅ Direct Swapper API call successful: {loaded_count}/{len(models)} models loaded")
return NodeSwapperDetail(
node_id=node_id,
healthy=True,
models_loaded=loaded_count,
models_total=len(models),
models=models
)
except Exception as api_error:
logger.warning(f"Direct Swapper API call failed for {node_id} at {swapper_url}: {api_error}")
# Final fallback: return empty state
logger.info(f"Swapper data unavailable for {node_id}, returning fallback")
return NodeSwapperDetail(
node_id=node_id,
healthy=metrics.get("swapper_healthy", False),
models_loaded=metrics.get("swapper_models_loaded", 0),
models_total=metrics.get("swapper_models_total", 0),
models=models
healthy=False,
models_loaded=0,
models_total=0,
models=[]
)
except HTTPException:
raise