feat: DAGI Router v2 - new endpoints, hooks, and UI card

This commit is contained in:
Apple
2025-12-01 05:21:43 -08:00
parent 53f31adbf0
commit e3accd4df0
221 changed files with 999 additions and 261 deletions

View File

@@ -4103,6 +4103,198 @@ async def get_node_swapper_detail(node_id: str):
)
@router.get("/internal/node/{node_id}/dagi-router/health")
async def get_dagi_router_health(node_id: str):
    """
    Get DAGI Router health status for a node.
    Always returns 200 with status="down" if router is unavailable.

    The router's ``/health`` endpoint is probed with a 3 second timeout.

    Returns a dict with:
      - node_id:     the requested node id
      - status:      "up" when the router reports status == "healthy",
                     "degraded" for any other 200 response, "down" otherwise
      - version:     value of "version" in the router response (None when down)
      - agent_count: value of "agent_count" in the router response (0 when down)
      - latency_ms:  round-trip time of the probe rounded to 2 decimals, or
                     None when no response was received
      - error:       short reason, present only when status is "down"
    """
    import httpx
    import time

    # Node-specific router URLs
    # NOTE: the same mapping is repeated in get_dagi_router_agents; keep in sync.
    NODE_ROUTER_URLS = {
        "node-1-hetzner-gex44": "http://dagi-router:9102",
        "node-2-macbook-m4max": "http://localhost:9102",  # Local router on NODE2
    }

    def _down(error, latency_ms=None):
        """Build the uniform payload used for every "down" outcome."""
        return {
            "node_id": node_id,
            "status": "down",
            "version": None,
            "agent_count": 0,
            "latency_ms": None if latency_ms is None else round(latency_ms, 2),
            "error": error,
        }

    base_url = NODE_ROUTER_URLS.get(node_id)
    if not base_url:
        return _down("No router URL configured for this node")

    try:
        start = time.monotonic()
        async with httpx.AsyncClient(timeout=3.0) as client:
            resp = await client.get(f"{base_url}/health")
            latency_ms = (time.monotonic() - start) * 1000.0

        if resp.status_code != 200:
            # The router answered, so latency is still meaningful; report it
            # with the same rounding as the success path.
            return _down(f"Router returned HTTP {resp.status_code}", latency_ms)

        data = resp.json()
        return {
            "node_id": node_id,
            "status": "up" if data.get("status") == "healthy" else "degraded",
            "version": data.get("version"),
            "agent_count": data.get("agent_count", 0),
            "latency_ms": round(latency_ms, 2),
        }
    except Exception as e:
        # Deliberately broad: this endpoint must never fail, whatever the
        # router (or its response body) does.
        logger.warning(f"DAGI Router health check failed for {node_id}: {e}")
        # Many httpx timeout errors have an empty message, so include the type.
        return _down(f"{type(e).__name__}: {e}")
@router.get("/internal/node/{node_id}/dagi-router/agents")
async def get_dagi_router_agents(node_id: str):
    """
    Get list of agents registered with DAGI Router for a node.
    Compares with DB to identify phantom/stale agents.

    Each returned agent has one of these statuses:
      - "active":  reported by the router and found in the DB
      - "phantom": reported by the router but not found in the DB
      - "stale":   found in the DB but not reported by the router

    A router agent and a DB agent are considered the same agent when any of
    the router-side identifiers (id / name / slug) equals the DB agent's id
    or slug. Failures talking to the router or the DB are logged and treated
    as an empty list on that side, so the endpoint itself does not fail.

    Returns a dict with node_id, total, active, phantom, stale and the
    combined ``agents`` list.
    """
    import httpx

    # NOTE: the same mapping is repeated in get_dagi_router_health; keep in sync.
    NODE_ROUTER_URLS = {
        "node-1-hetzner-gex44": "http://dagi-router:9102",
        "node-2-macbook-m4max": "http://localhost:9102",
    }

    def _identifiers(record, keys):
        """Return the non-empty values of `keys` in `record`, as strings."""
        return [str(record[key]) for key in keys if record.get(key)]

    base_url = NODE_ROUTER_URLS.get(node_id)
    router_agents = []

    # Try to get agents from router
    if base_url:
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                resp = await client.get(f"{base_url}/agents")
            if resp.status_code == 200:
                data = resp.json()
                # `or []` guards against an explicit `"agents": null`.
                router_agents = data.get("agents", []) or []
        except Exception as e:
            logger.warning(f"Failed to get agents from router for {node_id}: {e}")

    # Get agents from DB for this node
    try:
        db_agents = await repo_city.get_agents_for_node(node_id)
        # Index BOTH id and slug: the router may identify an agent by either,
        # and indexing only one of them makes a single agent show up twice
        # (once as "phantom", once as "stale").
        db_identifiers = set()
        for db_agent in db_agents:
            db_identifiers.update(_identifiers(db_agent, ("id", "slug")))
    except Exception as e:
        logger.warning(f"Failed to get DB agents for {node_id}: {e}")
        db_agents = []
        db_identifiers = set()

    # Build combined list
    result_agents = []
    router_identifiers = set()
    for ra in router_agents:
        # Skip malformed entries instead of failing the whole request.
        if not isinstance(ra, dict):
            continue
        agent_id = ra.get("id") or ra.get("name") or ra.get("slug")
        if not agent_id:
            continue

        candidates = _identifiers(ra, ("id", "name", "slug"))
        router_identifiers.update(candidates)

        # Check if in DB
        has_db_record = not db_identifiers.isdisjoint(candidates)
        result_agents.append({
            "id": agent_id,
            "name": ra.get("name"),
            "kind": ra.get("kind"),
            "runtime": ra.get("runtime") or f"{node_id}-router",
            "node_id": node_id,
            "last_seen_at": ra.get("last_seen_at"),
            "status": "active" if has_db_record else "phantom",
            "has_db_record": has_db_record,
        })

    # Add stale agents (in DB but not in router)
    for db_agent in db_agents:
        agent_id = db_agent.get("id") or db_agent.get("slug")
        if not agent_id:
            continue
        if not router_identifiers.isdisjoint(_identifiers(db_agent, ("id", "slug"))):
            continue  # already listed above as "active"
        result_agents.append({
            "id": agent_id,
            "name": db_agent.get("display_name") or db_agent.get("name"),
            "kind": db_agent.get("kind"),
            "runtime": None,
            "node_id": node_id,
            "last_seen_at": None,
            "status": "stale",
            "has_db_record": True,
        })

    # Count by status
    counts = {"active": 0, "phantom": 0, "stale": 0}
    for agent in result_agents:
        counts[agent["status"]] += 1

    return {
        "node_id": node_id,
        "total": len(result_agents),
        "active": counts["active"],
        "phantom": counts["phantom"],
        "stale": counts["stale"],
        "agents": result_agents,
    }
@router.get("/internal/node/{node_id}/dagi-router/summary")
async def get_dagi_router_summary(node_id: str):
    """
    Get combined DAGI Router status summary for a node.
    Includes health, agent counts, and last audit timestamp.

    Returns a dict with:
      - node_id
      - status, version, latency_ms: taken from the health probe
      - router_agent_count: agent_count reported by the router's health probe
      - db_agent_count:     number of listed agents that have a DB record
                            (active + stale; phantom agents are excluded)
      - active, phantom, stale: per-status counts from the agents comparison
      - last_audit_at: ISO-8601 timestamp of the newest row in
                       dagi_audit_reports for this node, or None if there is
                       none or the query failed
    """
    import asyncio

    # The health probe and the agents comparison are independent, so run them
    # concurrently rather than adding their timeouts together.
    health, agents_info = await asyncio.gather(
        get_dagi_router_health(node_id),
        get_dagi_router_agents(node_id),
    )

    # Get last audit timestamp (best effort: a failure only leaves it as None)
    last_audit_at = None
    try:
        pool = await repo_city.get_pool()
        row = await pool.fetchrow("""
            SELECT MAX(created_at) as last_audit
            FROM dagi_audit_reports
            WHERE node_id = $1
        """, node_id)
        if row and row["last_audit"]:
            last_audit_at = row["last_audit"].isoformat()
    except Exception as e:
        logger.warning(f"Failed to get last audit for {node_id}: {e}")

    active = agents_info.get("active", 0)
    phantom = agents_info.get("phantom", 0)
    stale = agents_info.get("stale", 0)

    return {
        "node_id": node_id,
        "status": health.get("status", "down"),
        "version": health.get("version"),
        "latency_ms": health.get("latency_ms"),
        "router_agent_count": health.get("agent_count", 0),
        # "total" also counts phantom agents, which have no DB record, so the
        # DB-backed count is active + stale.
        "db_agent_count": active + stale,
        "active": active,
        "phantom": phantom,
        "stale": stale,
        "last_audit_at": last_audit_at,
    }
@router.get("/internal/node/{node_id}/directory-check")
async def check_node_in_directory(node_id: str):
"""