feat: add router health metrics to node_cache and node-guardian
- Add migration 042_node_cache_router_metrics.sql
- Node guardian now collects router health and sends it in the heartbeat
- City-service uses the cached router_healthy value from node_cache
- This allows the NODE2 router status to be displayed correctly
This commit is contained in:
@@ -3373,7 +3373,7 @@ async def get_node_endpoints(node_id: str) -> Dict[str, str]:
|
||||
|
||||
async def get_node_metrics(node_id: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Отримати розширені метрики ноди (включаючи Swapper).
|
||||
Отримати розширені метрики ноди (включаючи Swapper та Router).
|
||||
"""
|
||||
pool = await get_pool()
|
||||
|
||||
@@ -3385,7 +3385,9 @@ async def get_node_metrics(node_id: str) -> Optional[Dict[str, Any]]:
|
||||
swapper_models_total,
|
||||
swapper_state,
|
||||
router_url,
|
||||
swapper_url
|
||||
swapper_url,
|
||||
router_healthy,
|
||||
router_version
|
||||
FROM node_cache
|
||||
WHERE node_id = $1
|
||||
""", node_id)
|
||||
|
||||
@@ -4086,13 +4086,29 @@ async def get_node_swapper_detail(node_id: str):
|
||||
async def get_dagi_router_health(node_id: str):
|
||||
"""
|
||||
Get DAGI Router health status for a node.
|
||||
Always returns 200 with status="down" if router is unavailable.
|
||||
Uses node-specific router_url from node_cache.
|
||||
First checks node_cache for cached router_healthy status (from node-guardian).
|
||||
Falls back to direct health check if node is local (NODE1).
|
||||
"""
|
||||
import httpx
|
||||
import time
|
||||
|
||||
# Get router URL from database (node-specific)
|
||||
# First, try to get cached router health from node_cache
|
||||
# This is populated by node-guardian which has direct access to the router
|
||||
try:
|
||||
metrics = await repo_city.get_node_metrics(node_id)
|
||||
if metrics and metrics.get("router_healthy") is not None:
|
||||
return {
|
||||
"node_id": node_id,
|
||||
"status": "up" if metrics.get("router_healthy") else "down",
|
||||
"version": metrics.get("router_version"),
|
||||
"agent_count": 0, # TODO: get from node_cache
|
||||
"latency_ms": None,
|
||||
"source": "node_cache"
|
||||
}
|
||||
except Exception as e:
|
||||
logger.debug(f"Failed to get cached router health for {node_id}: {e}")
|
||||
|
||||
# Fallback: try direct health check (only works for NODE1 which is local to city-service)
|
||||
endpoints = await repo_city.get_node_endpoints(node_id)
|
||||
base_url = endpoints.get("router_url")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user