fix(node2): Use node_cache router_healthy for DAGI Router agents status
- Fix get_dagi_router_agents to use router_healthy from node_cache first
- Fall back to a direct API call only if the cache is unavailable
- This fixes NODE2 agents showing as 'stale' when the router is actually healthy
- Fix CITY_SERVICE_URL in scripts (remove /api/city, use /api)
This commit is contained in:
@@ -4571,23 +4571,34 @@ async def get_dagi_router_agents(node_id: str):
|
||||
"""
|
||||
import httpx
|
||||
|
||||
# Get router URL from database (node-specific)
|
||||
endpoints = await repo_city.get_node_endpoints(node_id)
|
||||
base_url = endpoints.get("router_url")
|
||||
# First, try to get router health from node_cache (populated by node-guardian)
|
||||
# This is preferred for remote nodes (like NODE2) where direct connection may not work
|
||||
router_healthy = False
|
||||
try:
|
||||
metrics = await repo_city.get_node_metrics(node_id)
|
||||
if metrics and metrics.get("router_healthy") is not None:
|
||||
router_healthy = metrics.get("router_healthy", False)
|
||||
logger.debug(f"Using router_healthy from node_cache for {node_id}: {router_healthy}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get cached router health for {node_id}: {e}")
|
||||
|
||||
# Check if router is healthy
|
||||
if base_url:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=3.0) as client:
|
||||
resp = await client.get(f"{base_url}/health")
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
# Router can return "healthy" or "ok"
|
||||
status = data.get("status", "").lower()
|
||||
router_healthy = status in ("healthy", "ok")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to check router health for {node_id} at {base_url}: {e}")
|
||||
# Fallback: try direct health check (only works for local nodes like NODE1)
|
||||
if router_healthy is False:
|
||||
endpoints = await repo_city.get_node_endpoints(node_id)
|
||||
base_url = endpoints.get("router_url")
|
||||
|
||||
if base_url:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=3.0) as client:
|
||||
resp = await client.get(f"{base_url}/health")
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
# Router can return "healthy" or "ok"
|
||||
status = data.get("status", "").lower()
|
||||
router_healthy = status in ("healthy", "ok")
|
||||
logger.debug(f"Direct router health check for {node_id}: {router_healthy}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to check router health for {node_id} at {base_url}: {e}")
|
||||
|
||||
# Get agents from DB for this node
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user