feat: add router health metrics to node_cache and node-guardian
- Add migration 042_node_cache_router_metrics.sql
- Node guardian now collects router health and sends it in the heartbeat
- City-service uses the cached router_healthy value from node_cache
- This allows the NODE2 router status to be displayed correctly
This commit is contained in:
71
migrations/042_node_cache_router_metrics.sql
Normal file
71
migrations/042_node_cache_router_metrics.sql
Normal file
@@ -0,0 +1,71 @@
|
||||
-- Migration 042: router health metrics on node_cache.
-- These columns hold the router health status that node-guardian collects.

ALTER TABLE node_cache
    ADD COLUMN IF NOT EXISTS router_healthy BOOLEAN DEFAULT FALSE,
    ADD COLUMN IF NOT EXISTS router_version TEXT;

COMMENT ON COLUMN node_cache.router_healthy
    IS 'Whether DAGI Router is healthy on this node';
COMMENT ON COLUMN node_cache.router_version
    IS 'Version of DAGI Router on this node';
|
||||
|
||||
-- Update fn_node_heartbeat to handle router metrics
--
-- fn_node_heartbeat(p_node_id, p_metrics)
--   Records a heartbeat for the node_cache row whose node_id = p_node_id:
--   sets status to 'online', stamps last_heartbeat / updated_at with NOW(),
--   and merges the metrics found in p_metrics into the row.
--
-- Parameters:
--   p_node_id  node_id of the row to update.
--   p_metrics  jsonb object of metrics; defaults to '{}'. Every key is
--              optional. A key that is absent (or JSON null) leaves the
--              stored column value untouched.
--
-- Returns (jsonb):
--   {"success": true,  "node_id": ..., "timestamp": ...} when a row was updated
--   {"success": false, "should_self_register": true, "message": ...}
--                                                 when no such row exists
--
-- Raises: a cast error if a supplied metric cannot be converted to the
--   column type used below (e.g. a non-numeric cpu_usage).
CREATE OR REPLACE FUNCTION fn_node_heartbeat(
    p_node_id text,
    p_metrics jsonb DEFAULT '{}'::jsonb
) RETURNS jsonb AS $$
DECLARE
    v_swapper_state jsonb;
BEGIN
    -- Unknown node: tell the caller to register itself before any metric
    -- value is parsed.
    IF NOT EXISTS (SELECT 1 FROM node_cache WHERE node_id = p_node_id) THEN
        RETURN jsonb_build_object(
            'success', false,
            'should_self_register', true,
            'message', 'Node not found in cache'
        );
    END IF;

    -- Handle swapper_state - only update if provided and not empty.
    -- A JSON null is a non-NULL jsonb value, so it has to be excluded
    -- explicitly; otherwise COALESCE below would overwrite the stored state
    -- with 'null'. A missing key yields SQL NULL and falls through to ELSE.
    v_swapper_state := CASE
        WHEN p_metrics ? 'swapper_state'
             AND p_metrics->'swapper_state' NOT IN ('{}'::jsonb, 'null'::jsonb)
        THEN p_metrics->'swapper_state'
        ELSE NULL
    END;

    -- Update node_cache with metrics; COALESCE keeps the current column value
    -- for every metric that was not supplied.
    UPDATE node_cache SET
        last_heartbeat = NOW(),
        status = 'online',
        cpu_usage = COALESCE((p_metrics->>'cpu_usage')::float, cpu_usage),
        gpu_vram_used = COALESCE((p_metrics->>'gpu_vram_used')::int, gpu_vram_used),
        ram_used = COALESCE((p_metrics->>'ram_used')::bigint, ram_used),
        disk_used = COALESCE((p_metrics->>'disk_used')::bigint, disk_used),
        agent_count_router = COALESCE((p_metrics->>'agent_count_router')::int, agent_count_router),
        agent_count_system = COALESCE((p_metrics->>'agent_count_system')::int, agent_count_system),
        dagi_router_url = COALESCE(p_metrics->>'dagi_router_url', dagi_router_url),
        -- Swapper metrics
        swapper_healthy = COALESCE((p_metrics->>'swapper_healthy')::boolean, swapper_healthy),
        swapper_models_loaded = COALESCE((p_metrics->>'swapper_models_loaded')::int, swapper_models_loaded),
        swapper_models_total = COALESCE((p_metrics->>'swapper_models_total')::int, swapper_models_total),
        swapper_state = COALESCE(v_swapper_state, swapper_state),
        -- Router metrics
        router_healthy = COALESCE((p_metrics->>'router_healthy')::boolean, router_healthy),
        router_version = COALESCE(p_metrics->>'router_version', router_version),
        -- Node-specific URLs (update if provided)
        swapper_url = COALESCE(p_metrics->>'swapper_url', swapper_url),
        router_url = COALESCE(p_metrics->>'router_url', router_url),
        updated_at = NOW()
    WHERE node_id = p_node_id;

    -- The row can disappear between the existence check and the UPDATE
    -- (concurrent delete); do not report success when nothing was written.
    IF NOT FOUND THEN
        RETURN jsonb_build_object(
            'success', false,
            'should_self_register', true,
            'message', 'Node not found in cache'
        );
    END IF;

    RETURN jsonb_build_object(
        'success', true,
        'node_id', p_node_id,
        'timestamp', NOW()
    );
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
Reference in New Issue
Block a user