feat: implement Swapper metrics collection and UI
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
39
services/city-service/repo_city.py_fragment.py
Normal file
39
services/city-service/repo_city.py_fragment.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# TASK 038: Dynamic discovery of Node Guardian / Steward if cache is empty
|
||||
if not data.get("guardian_agent") or not data.get("steward_agent"):
|
||||
dynamic_agents = await pool.fetch("""
|
||||
SELECT id, display_name, kind, public_slug
|
||||
FROM agents
|
||||
WHERE node_id = $1
|
||||
AND (kind IN ('node_guardian', 'node_steward') OR kind IN ('infra_monitor', 'infra_ops'))
|
||||
AND COALESCE(is_archived, false) = false
|
||||
""", node_id)
|
||||
|
||||
if not data.get("guardian_agent"):
|
||||
# Prefer 'node_guardian', fallback to 'infra_monitor'
|
||||
guardian_candidates = [a for a in dynamic_agents if a['kind'] == 'node_guardian']
|
||||
monitor_candidates = [a for a in dynamic_agents if a['kind'] == 'infra_monitor']
|
||||
|
||||
guardian = guardian_candidates[0] if guardian_candidates else (monitor_candidates[0] if monitor_candidates else None)
|
||||
|
||||
if guardian:
|
||||
data["guardian_agent"] = {
|
||||
"id": guardian["id"],
|
||||
"name": guardian["display_name"],
|
||||
"kind": guardian["kind"],
|
||||
"slug": guardian["public_slug"],
|
||||
}
|
||||
|
||||
if not data.get("steward_agent"):
|
||||
# Prefer 'node_steward', fallback to 'infra_ops'
|
||||
steward_candidates = [a for a in dynamic_agents if a['kind'] == 'node_steward']
|
||||
ops_candidates = [a for a in dynamic_agents if a['kind'] == 'infra_ops']
|
||||
|
||||
steward = steward_candidates[0] if steward_candidates else (ops_candidates[0] if ops_candidates else None)
|
||||
|
||||
if steward:
|
||||
data["steward_agent"] = {
|
||||
"id": steward["id"],
|
||||
"name": steward["display_name"],
|
||||
"kind": steward["kind"],
|
||||
"slug": steward["public_slug"],
|
||||
}
|
||||
16
services/city-service/repo_city.py_fragment_debug.py
Normal file
16
services/city-service/repo_city.py_fragment_debug.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# Fetch MicroDAOs where orchestrator is on this node
|
||||
print(f"DEBUG: Fetching microdaos for node {node_id}")
|
||||
try:
|
||||
microdaos = await pool.fetch("""
|
||||
SELECT m.id, m.slug, m.name, COUNT(cr.id) as rooms_count
|
||||
FROM microdaos m
|
||||
JOIN agents a ON m.orchestrator_agent_id = a.id
|
||||
LEFT JOIN city_rooms cr ON cr.microdao_id::text = m.id
|
||||
WHERE a.node_id = $1
|
||||
GROUP BY m.id, m.slug, m.name
|
||||
ORDER BY m.name
|
||||
""", node_id)
|
||||
print(f"DEBUG: Microdaos fetched: {len(microdaos)}")
|
||||
except Exception as e:
|
||||
print(f"DEBUG: Error fetching microdaos: {e}")
|
||||
raise e
|
||||
@@ -4057,6 +4057,52 @@ async def get_node_self_healing_status(node_id: str):
|
||||
)
|
||||
|
||||
|
||||
@router.get("/internal/node/{node_id}/swapper", response_model=NodeSwapperDetail)
async def get_node_swapper_detail(node_id: str):
    """
    Get detailed Swapper Service status for a node.
    Used by Node Cabinet to show loaded models and health.

    Args:
        node_id: Identifier of the node, taken from the URL path.

    Returns:
        NodeSwapperDetail built from the node metrics. If anything other than
        the 404 below goes wrong, the failure is logged and an "unhealthy,
        no models" NodeSwapperDetail is returned instead of an error response.

    Raises:
        HTTPException: 404 when no metrics are found for the node.
    """
    # Local import keeps this block self-contained regardless of file imports.
    import json

    try:
        # Fetch from node_cache
        metrics = await repo_city.get_node_metrics(node_id)
        if not metrics:
            raise HTTPException(status_code=404, detail="Node not found")

        # Parse swapper state (stored as JSONB). Depending on how the driver is
        # configured the value may arrive as JSON text rather than a dict, so
        # decode it here instead of failing on `.get` and reporting the node
        # as unhealthy.
        state = metrics.get("swapper_state") or {}
        if isinstance(state, (str, bytes, bytearray)):
            state = json.loads(state)
        if not isinstance(state, dict):
            state = {}

        # `or []` also covers an explicit `"models": null` in the stored state.
        models_data = state.get("models") or []

        models = [
            SwapperModel(
                name=m.get("name") or "unknown",
                loaded=m.get("loaded") or False,
                type=m.get("type"),
                vram_gb=m.get("vram_gb"),
            )
            for m in models_data
            if isinstance(m, dict)  # skip malformed entries instead of failing
        ]

        # `or <default>` (rather than a .get default) also handles keys that
        # are present but hold None.
        return NodeSwapperDetail(
            node_id=node_id,
            healthy=metrics.get("swapper_healthy") or False,
            models_loaded=metrics.get("swapper_models_loaded") or 0,
            models_total=metrics.get("swapper_models_total") or 0,
            models=models,
        )
    except HTTPException:
        # Let the deliberate 404 pass through untouched.
        raise
    except Exception as e:
        # Best-effort endpoint: log with traceback and degrade to an empty,
        # unhealthy status rather than failing the request.
        logger.exception("Failed to get swapper detail for %s: %s", node_id, e)
        return NodeSwapperDetail(
            node_id=node_id,
            healthy=False,
            models_loaded=0,
            models_total=0,
            models=[],
        )
|
||||
|
||||
|
||||
@router.get("/internal/node/{node_id}/directory-check")
|
||||
async def check_node_in_directory(node_id: str):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user