feat: add router health metrics to node_cache and node-guardian

- Add migration 042_node_cache_router_metrics.sql
- node-guardian now collects router health and sends it in the heartbeat
- City-service uses cached router_healthy from node_cache
- This allows NODE2 router status to be displayed correctly (the direct health check only works for NODE1, which is local to city-service)
This commit is contained in:
Apple
2025-12-01 08:03:46 -08:00
parent 9b9a72ffbd
commit a818f2ac2f
4 changed files with 108 additions and 5 deletions

View File

@@ -3373,7 +3373,7 @@ async def get_node_endpoints(node_id: str) -> Dict[str, str]:
async def get_node_metrics(node_id: str) -> Optional[Dict[str, Any]]:
"""
Отримати розширені метрики ноди (включаючи Swapper).
Отримати розширені метрики ноди (включаючи Swapper та Router).
"""
pool = await get_pool()
@@ -3385,7 +3385,9 @@ async def get_node_metrics(node_id: str) -> Optional[Dict[str, Any]]:
swapper_models_total,
swapper_state,
router_url,
swapper_url
swapper_url,
router_healthy,
router_version
FROM node_cache
WHERE node_id = $1
""", node_id)

View File

@@ -4086,13 +4086,29 @@ async def get_node_swapper_detail(node_id: str):
async def get_dagi_router_health(node_id: str):
"""
Get DAGI Router health status for a node.
Always returns 200 with status="down" if router is unavailable.
Uses node-specific router_url from node_cache.
First checks node_cache for cached router_healthy status (from node-guardian).
Falls back to direct health check if node is local (NODE1).
"""
import httpx
import time
# Get router URL from database (node-specific)
# First, try to get cached router health from node_cache
# This is populated by node-guardian which has direct access to the router
try:
metrics = await repo_city.get_node_metrics(node_id)
if metrics and metrics.get("router_healthy") is not None:
return {
"node_id": node_id,
"status": "up" if metrics.get("router_healthy") else "down",
"version": metrics.get("router_version"),
"agent_count": 0, # TODO: get from node_cache
"latency_ms": None,
"source": "node_cache"
}
except Exception as e:
logger.debug(f"Failed to get cached router health for {node_id}: {e}")
# Fallback: try direct health check (only works for NODE1 which is local to city-service)
endpoints = await repo_city.get_node_endpoints(node_id)
base_url = endpoints.get("router_url")