feat: implement Swapper metrics collection and UI

This commit is contained in:
Apple
2025-11-30 15:12:49 -08:00
parent 5b5160ad8b
commit fd814b2059
11 changed files with 1224 additions and 4543 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,39 @@
# TASK 038: Dynamic discovery of Node Guardian / Steward if cache is empty
if not data.get("guardian_agent") or not data.get("steward_agent"):
dynamic_agents = await pool.fetch("""
SELECT id, display_name, kind, public_slug
FROM agents
WHERE node_id = $1
AND (kind IN ('node_guardian', 'node_steward') OR kind IN ('infra_monitor', 'infra_ops'))
AND COALESCE(is_archived, false) = false
""", node_id)
if not data.get("guardian_agent"):
# Prefer 'node_guardian', fallback to 'infra_monitor'
guardian_candidates = [a for a in dynamic_agents if a['kind'] == 'node_guardian']
monitor_candidates = [a for a in dynamic_agents if a['kind'] == 'infra_monitor']
guardian = guardian_candidates[0] if guardian_candidates else (monitor_candidates[0] if monitor_candidates else None)
if guardian:
data["guardian_agent"] = {
"id": guardian["id"],
"name": guardian["display_name"],
"kind": guardian["kind"],
"slug": guardian["public_slug"],
}
if not data.get("steward_agent"):
# Prefer 'node_steward', fallback to 'infra_ops'
steward_candidates = [a for a in dynamic_agents if a['kind'] == 'node_steward']
ops_candidates = [a for a in dynamic_agents if a['kind'] == 'infra_ops']
steward = steward_candidates[0] if steward_candidates else (ops_candidates[0] if ops_candidates else None)
if steward:
data["steward_agent"] = {
"id": steward["id"],
"name": steward["display_name"],
"kind": steward["kind"],
"slug": steward["public_slug"],
}

View File

@@ -0,0 +1,16 @@
# Fetch MicroDAOs where orchestrator is on this node
print(f"DEBUG: Fetching microdaos for node {node_id}")
try:
microdaos = await pool.fetch("""
SELECT m.id, m.slug, m.name, COUNT(cr.id) as rooms_count
FROM microdaos m
JOIN agents a ON m.orchestrator_agent_id = a.id
LEFT JOIN city_rooms cr ON cr.microdao_id::text = m.id
WHERE a.node_id = $1
GROUP BY m.id, m.slug, m.name
ORDER BY m.name
""", node_id)
print(f"DEBUG: Microdaos fetched: {len(microdaos)}")
except Exception as e:
print(f"DEBUG: Error fetching microdaos: {e}")
raise e

View File

@@ -4057,6 +4057,52 @@ async def get_node_self_healing_status(node_id: str):
)
@router.get("/internal/node/{node_id}/swapper", response_model=NodeSwapperDetail)
async def get_node_swapper_detail(node_id: str):
    """
    Get detailed Swapper Service status for a node.
    Used by Node Cabinet to show loaded models and health.

    Args:
        node_id: Identifier of the node, taken from the URL path.

    Returns:
        A NodeSwapperDetail built from the node's metrics. If anything other
        than an HTTPException is raised while building it, the error is logged
        and an unhealthy detail with zero counts and no models is returned.

    Raises:
        HTTPException: 404 when no metrics are found for ``node_id``.
    """
    import json  # only needed when the swapper state arrives as JSON text

    try:
        # Fetch from node_cache
        metrics = await repo_city.get_node_metrics(node_id)
        if not metrics:
            raise HTTPException(status_code=404, detail="Node not found")

        # Parse swapper state (stored as JSONB).
        # NOTE(review): depending on how the DB driver is configured, a JSONB
        # value can come back as raw JSON text instead of a decoded dict;
        # decode it here so `.get` below does not fail and silently turn the
        # response into the unhealthy fallback. Confirm against repo_city.
        state = metrics.get("swapper_state") or {}
        if isinstance(state, (str, bytes, bytearray)):
            state = json.loads(state) or {}

        # `or []` also covers a "models" key that is present but null.
        models_data = state.get("models") or []
        models = [
            SwapperModel(
                name=m.get("name", "unknown"),
                loaded=m.get("loaded", False),
                type=m.get("type"),
                vram_gb=m.get("vram_gb"),
            )
            for m in models_data
        ]

        # `or` defaults cover keys that exist but hold None (e.g. NULL
        # columns), matching the values used by the fallback response below.
        return NodeSwapperDetail(
            node_id=node_id,
            healthy=metrics.get("swapper_healthy") or False,
            models_loaded=metrics.get("swapper_models_loaded") or 0,
            models_total=metrics.get("swapper_models_total") or 0,
            models=models,
        )
    except HTTPException:
        # Let deliberate HTTP errors (the 404 above) reach the client as-is.
        raise
    except Exception as e:
        # Best-effort endpoint: report the node's swapper as unhealthy rather
        # than failing the whole request.
        logger.error(f"Failed to get swapper detail for {node_id}: {e}")
        return NodeSwapperDetail(
            node_id=node_id,
            healthy=False,
            models_loaded=0,
            models_total=0,
            models=[],
        )
@router.get("/internal/node/{node_id}/directory-check")
async def check_node_in_directory(node_id: str):
"""