From 88188ed693bf630e752fc74af3e22c05b0979694 Mon Sep 17 00:00:00 2001 From: Apple Date: Tue, 2 Dec 2025 07:02:08 -0800 Subject: [PATCH] fix(node2): Use node_cache router_healthy for DAGI Router agents status - Fix get_dagi_router_agents to use router_healthy from node_cache first - Fallback to direct API call only if cache is unavailable - This fixes NODE2 agents showing as 'stale' when router is actually healthy - Fix CITY_SERVICE_URL in scripts (remove /api/city, use /api) --- scripts/start-node2-guardian.sh | 13 ++++++++- services/city-service/routes_city.py | 41 ++++++++++++++++++---------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/scripts/start-node2-guardian.sh b/scripts/start-node2-guardian.sh index 7e516590..f61c29c8 100755 --- a/scripts/start-node2-guardian.sh +++ b/scripts/start-node2-guardian.sh @@ -26,7 +26,7 @@ export NODE_NAME="НОДА2" export NODE_ENVIRONMENT="development" export NODE_ROLES="gpu,ai_runtime" export NODE_HOSTNAME="$(hostname)" -export CITY_SERVICE_URL="https://daarion.space/api/city" +export CITY_SERVICE_URL="https://daarion.space/api" export NODE_SWAPPER_URL="http://localhost:8890" export NODE_ROUTER_URL="http://localhost:9102" export GUARDIAN_INTERVAL="60" @@ -38,6 +38,17 @@ echo " Swapper URL: $NODE_SWAPPER_URL" echo " Router URL: $NODE_ROUTER_URL" echo "" +# Export environment variables for node-guardian-loop +export NODE_ID +export NODE_NAME +export NODE_ENVIRONMENT +export NODE_ROLES +export NODE_HOSTNAME +export CITY_SERVICE_URL +export NODE_SWAPPER_URL +export NODE_ROUTER_URL +export GUARDIAN_INTERVAL + # Run node-guardian-loop python3 "$PROJECT_ROOT/scripts/node-guardian-loop.py" \ --node-id "$NODE_ID" \ diff --git a/services/city-service/routes_city.py b/services/city-service/routes_city.py index 31260e3e..beab72d1 100644 --- a/services/city-service/routes_city.py +++ b/services/city-service/routes_city.py @@ -4571,23 +4571,34 @@ async def get_dagi_router_agents(node_id: str): """ import httpx - # Get router URL from database (node-specific) - endpoints = await repo_city.get_node_endpoints(node_id) - base_url = endpoints.get("router_url") + # First, try to get router health from node_cache (populated by node-guardian) + # This is preferred for remote nodes (like NODE2) where direct connection may not work router_healthy = False + try: + metrics = await repo_city.get_node_metrics(node_id) + if metrics and metrics.get("router_healthy") is not None: + router_healthy = metrics.get("router_healthy", False) + logger.debug(f"Using router_healthy from node_cache for {node_id}: {router_healthy}") + except Exception as e: + logger.debug(f"Failed to get cached router health for {node_id}: {e}") - # Check if router is healthy - if base_url: - try: - async with httpx.AsyncClient(timeout=3.0) as client: - resp = await client.get(f"{base_url}/health") - if resp.status_code == 200: - data = resp.json() - # Router can return "healthy" or "ok" - status = data.get("status", "").lower() - router_healthy = status in ("healthy", "ok") - except Exception as e: - logger.warning(f"Failed to check router health for {node_id} at {base_url}: {e}") + # Fallback: try direct health check (only works for local nodes like NODE1) + if router_healthy is False: + endpoints = await repo_city.get_node_endpoints(node_id) + base_url = endpoints.get("router_url") + + if base_url: + try: + async with httpx.AsyncClient(timeout=3.0) as client: + resp = await client.get(f"{base_url}/health") + if resp.status_code == 200: + data = resp.json() + # Router can return "healthy" or "ok" + status = data.get("status", "").lower() + router_healthy = status in ("healthy", "ok") + logger.debug(f"Direct router health check for {node_id}: {router_healthy}") + except Exception as e: + logger.debug(f"Failed to check router health for {node_id} at {base_url}: {e}") # Get agents from DB for this node try: