fix(node2): Add detailed logging for router_healthy determination
This commit is contained in:
@@ -4576,14 +4576,21 @@ async def get_dagi_router_agents(node_id: str):
|
|||||||
router_healthy = None
|
router_healthy = None
|
||||||
try:
|
try:
|
||||||
metrics = await repo_city.get_node_metrics(node_id)
|
metrics = await repo_city.get_node_metrics(node_id)
|
||||||
if metrics and metrics.get("router_healthy") is not None:
|
if metrics:
|
||||||
router_healthy = bool(metrics.get("router_healthy", False))
|
router_healthy_raw = metrics.get("router_healthy")
|
||||||
logger.info(f"Using router_healthy from node_cache for {node_id}: {router_healthy}")
|
if router_healthy_raw is not None:
|
||||||
|
router_healthy = bool(router_healthy_raw)
|
||||||
|
logger.info(f"[{node_id}] Using router_healthy from node_cache: {router_healthy} (raw: {router_healthy_raw})")
|
||||||
|
else:
|
||||||
|
logger.debug(f"[{node_id}] router_healthy is None in node_cache")
|
||||||
|
else:
|
||||||
|
logger.debug(f"[{node_id}] No metrics found in node_cache")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Failed to get cached router health for {node_id}: {e}")
|
logger.warning(f"[{node_id}] Failed to get cached router health: {e}")
|
||||||
|
|
||||||
# Fallback: try direct health check (only works for local nodes like NODE1)
|
# Fallback: try direct health check (only works for local nodes like NODE1)
|
||||||
if router_healthy is None:
|
if router_healthy is None:
|
||||||
|
logger.info(f"[{node_id}] router_healthy is None, trying direct health check")
|
||||||
endpoints = await repo_city.get_node_endpoints(node_id)
|
endpoints = await repo_city.get_node_endpoints(node_id)
|
||||||
base_url = endpoints.get("router_url")
|
base_url = endpoints.get("router_url")
|
||||||
|
|
||||||
@@ -4596,9 +4603,16 @@ async def get_dagi_router_agents(node_id: str):
|
|||||||
# Router can return "healthy" or "ok"
|
# Router can return "healthy" or "ok"
|
||||||
status = data.get("status", "").lower()
|
status = data.get("status", "").lower()
|
||||||
router_healthy = status in ("healthy", "ok")
|
router_healthy = status in ("healthy", "ok")
|
||||||
logger.debug(f"Direct router health check for {node_id}: {router_healthy}")
|
logger.info(f"[{node_id}] Direct router health check: {router_healthy}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Failed to check router health for {node_id} at {base_url}: {e}")
|
logger.warning(f"[{node_id}] Failed to check router health at {base_url}: {e}")
|
||||||
|
|
||||||
|
# Final fallback
|
||||||
|
if router_healthy is None:
|
||||||
|
router_healthy = False
|
||||||
|
logger.warning(f"[{node_id}] router_healthy is None after all checks, defaulting to False")
|
||||||
|
|
||||||
|
logger.info(f"[{node_id}] Final router_healthy value: {router_healthy}")
|
||||||
|
|
||||||
# Get agents from DB for this node
|
# Get agents from DB for this node
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user