### Backend (city-service) - Node Registry + Self-Healing API (migration 039) - Improved get_all_nodes() with robust fallback for node_registry/node_cache - Agent Prompts Runtime API for DAGI Router integration - DAGI Router Audit endpoints (phantom/stale detection) - Node Agents API (Guardian/Steward) - Node metrics extended (CPU/GPU/RAM/Disk) ### Frontend (apps/web) - Node Directory with improved error handling - Node Cabinet with metrics cards - DAGI Router Card component - Node Metrics Card component - useDAGIAudit hook ### Scripts - check-invariants.py - deploy verification - node-bootstrap.sh - node self-registration - node-guardian-loop.py - continuous self-healing - dagi_agent_audit.py - DAGI audit utility ### Migrations - 034: Agent prompts seed - 035: Agent DAGI audit - 036: Node metrics extended - 037: Node agents complete - 038: Agent prompts full coverage - 039: Node registry self-healing ### Tests - test_infra_smoke.py - test_agent_prompts_runtime.py - test_dagi_router_api.py ### Documentation - DEPLOY_CHECKLIST_2024_11_30.md - Multiple TASK_PHASE docs
312 lines
11 KiB
PL/PgSQL
312 lines
11 KiB
PL/PgSQL
-- Migration 039: Node Registry for Self-Healing
|
||
-- Створення node_registry як єдиного джерела істини для нод
|
||
-- Частина TASK_PHASE_NODE_SELF_HEALING_v1
|
||
|
||
-- ============================================================================
|
||
-- 1. Створити таблицю node_registry
|
||
-- ============================================================================
|
||
|
||
CREATE TABLE IF NOT EXISTS node_registry (
|
||
id text PRIMARY KEY, -- node_id (напр. node-2-macbook-m4max)
|
||
name text NOT NULL, -- Людська назва ноди
|
||
hostname text, -- Hostname ноди
|
||
environment text NOT NULL CHECK (environment IN ('production', 'development', 'staging')),
|
||
roles text[] NOT NULL DEFAULT '{}', -- ['gpu', 'ai_runtime', 'storage', ...]
|
||
description text, -- Опис ноди
|
||
owner_id text, -- ID власника (user/microdao)
|
||
config jsonb DEFAULT '{}', -- Додаткова конфігурація
|
||
is_active boolean NOT NULL DEFAULT true,
|
||
registered_at timestamptz NOT NULL DEFAULT now(),
|
||
updated_at timestamptz NOT NULL DEFAULT now(),
|
||
last_self_registration timestamptz, -- Остання самореєстрація
|
||
self_registration_count integer DEFAULT 0
|
||
);
|
||
|
||
-- Індекси
|
||
CREATE INDEX IF NOT EXISTS idx_node_registry_active ON node_registry(is_active) WHERE is_active = true;
|
||
CREATE INDEX IF NOT EXISTS idx_node_registry_environment ON node_registry(environment);
|
||
CREATE INDEX IF NOT EXISTS idx_node_registry_updated ON node_registry(updated_at DESC);
|
||
|
||
-- ============================================================================
|
||
-- 2. Оновити node_cache - додати зв'язок з registry
|
||
-- ============================================================================
|
||
|
||
-- Перевірити що node_cache.node_id є foreign key до node_registry
|
||
-- (опційно, можна не додавати FK для гнучкості)
|
||
|
||
-- Додати поле для статусу self-healing
|
||
ALTER TABLE node_cache ADD COLUMN IF NOT EXISTS self_healing_status text DEFAULT 'healthy';
|
||
ALTER TABLE node_cache ADD COLUMN IF NOT EXISTS self_healing_last_check timestamptz;
|
||
ALTER TABLE node_cache ADD COLUMN IF NOT EXISTS self_healing_errors jsonb DEFAULT '[]';
|
||
|
||
-- ============================================================================
|
||
-- 3. Зареєструвати існуючі ноди
|
||
-- ============================================================================
|
||
|
||
-- NODE1: Hetzner GEX44 Production
|
||
INSERT INTO node_registry (
|
||
id,
|
||
name,
|
||
hostname,
|
||
environment,
|
||
roles,
|
||
description,
|
||
is_active,
|
||
registered_at,
|
||
updated_at
|
||
) VALUES (
|
||
'node-1-hetzner-gex44',
|
||
'NODE1 — Hetzner GEX44',
|
||
'node1.daarion.space',
|
||
'production',
|
||
ARRAY['production', 'gpu', 'ai_runtime', 'storage', 'matrix'],
|
||
'Production server with RTX 4090, hosts Matrix Synapse, DAGI Router, main services',
|
||
true,
|
||
NOW(),
|
||
NOW()
|
||
)
|
||
ON CONFLICT (id) DO UPDATE SET
|
||
name = EXCLUDED.name,
|
||
hostname = EXCLUDED.hostname,
|
||
environment = EXCLUDED.environment,
|
||
roles = EXCLUDED.roles,
|
||
description = EXCLUDED.description,
|
||
is_active = true,
|
||
updated_at = NOW();
|
||
|
||
-- NODE2: MacBook Pro M4 Max Development
|
||
INSERT INTO node_registry (
|
||
id,
|
||
name,
|
||
hostname,
|
||
environment,
|
||
roles,
|
||
description,
|
||
is_active,
|
||
registered_at,
|
||
updated_at
|
||
) VALUES (
|
||
'node-2-macbook-m4max',
|
||
'NODE2 — MacBook Pro M4 Max',
|
||
'node2.local',
|
||
'development',
|
||
ARRAY['development', 'gpu', 'ai_runtime', 'testing'],
|
||
'Development node with M4 Max GPU (40GB unified memory), local AI models',
|
||
true,
|
||
NOW(),
|
||
NOW()
|
||
)
|
||
ON CONFLICT (id) DO UPDATE SET
|
||
name = EXCLUDED.name,
|
||
hostname = EXCLUDED.hostname,
|
||
environment = EXCLUDED.environment,
|
||
roles = EXCLUDED.roles,
|
||
description = EXCLUDED.description,
|
||
is_active = true,
|
||
updated_at = NOW();
|
||
|
||
-- ============================================================================
|
||
-- 4. Переконатися що node_cache має записи для обох нод
|
||
-- ============================================================================
|
||
|
||
-- NODE1
|
||
INSERT INTO node_cache (node_id, last_heartbeat, self_healing_status)
|
||
VALUES ('node-1-hetzner-gex44', NOW(), 'healthy')
|
||
ON CONFLICT (node_id) DO UPDATE SET
|
||
self_healing_status = 'healthy',
|
||
self_healing_last_check = NOW();
|
||
|
||
-- NODE2
|
||
INSERT INTO node_cache (node_id, last_heartbeat, self_healing_status)
|
||
VALUES ('node-2-macbook-m4max', NOW(), 'healthy')
|
||
ON CONFLICT (node_id) DO UPDATE SET
|
||
self_healing_status = 'healthy',
|
||
self_healing_last_check = NOW();
|
||
|
||
-- ============================================================================
|
||
-- 5. View для Node Directory (з'єднання registry + cache)
|
||
-- ============================================================================
|
||
|
||
CREATE OR REPLACE VIEW v_nodes_directory AS
|
||
SELECT
|
||
r.id,
|
||
r.name,
|
||
r.hostname,
|
||
r.environment,
|
||
r.roles,
|
||
r.description,
|
||
r.is_active,
|
||
r.registered_at,
|
||
r.updated_at,
|
||
r.last_self_registration,
|
||
-- Cache data (metrics)
|
||
c.cpu_model,
|
||
c.cpu_cores,
|
||
c.cpu_usage,
|
||
c.gpu_model,
|
||
c.gpu_vram_total,
|
||
c.gpu_vram_used,
|
||
c.ram_total,
|
||
c.ram_used,
|
||
c.disk_total,
|
||
c.disk_used,
|
||
c.agent_count_router,
|
||
c.agent_count_system,
|
||
c.last_heartbeat,
|
||
c.dagi_router_url,
|
||
c.guardian_agent_id,
|
||
c.steward_agent_id,
|
||
c.self_healing_status,
|
||
c.self_healing_last_check,
|
||
-- Derived fields
|
||
CASE
|
||
WHEN c.last_heartbeat IS NULL THEN 'offline'
|
||
WHEN c.last_heartbeat < NOW() - INTERVAL '10 minutes' THEN 'stale'
|
||
ELSE 'online'
|
||
END AS connection_status,
|
||
EXTRACT(EPOCH FROM (NOW() - c.last_heartbeat)) / 60 AS heartbeat_age_minutes
|
||
FROM node_registry r
|
||
LEFT JOIN node_cache c ON c.node_id = r.id
|
||
WHERE r.is_active = true;
|
||
|
||
-- ============================================================================
|
||
-- 6. Функція для self-registration
|
||
-- ============================================================================
|
||
|
||
CREATE OR REPLACE FUNCTION fn_node_self_register(
|
||
p_node_id text,
|
||
p_name text,
|
||
p_hostname text DEFAULT NULL,
|
||
p_environment text DEFAULT 'development',
|
||
p_roles text[] DEFAULT '{}'
|
||
) RETURNS jsonb AS $$
|
||
DECLARE
|
||
v_result jsonb;
|
||
v_is_new boolean := false;
|
||
BEGIN
|
||
-- Перевірити чи нода вже існує
|
||
IF NOT EXISTS (SELECT 1 FROM node_registry WHERE id = p_node_id) THEN
|
||
v_is_new := true;
|
||
END IF;
|
||
|
||
-- Insert or update node_registry
|
||
INSERT INTO node_registry (
|
||
id, name, hostname, environment, roles,
|
||
is_active, registered_at, updated_at,
|
||
last_self_registration, self_registration_count
|
||
) VALUES (
|
||
p_node_id, p_name, p_hostname, p_environment, p_roles,
|
||
true, NOW(), NOW(), NOW(), 1
|
||
)
|
||
ON CONFLICT (id) DO UPDATE SET
|
||
name = COALESCE(NULLIF(p_name, ''), node_registry.name),
|
||
hostname = COALESCE(p_hostname, node_registry.hostname),
|
||
environment = COALESCE(NULLIF(p_environment, ''), node_registry.environment),
|
||
roles = CASE
|
||
WHEN array_length(p_roles, 1) > 0 THEN p_roles
|
||
ELSE node_registry.roles
|
||
END,
|
||
is_active = true,
|
||
updated_at = NOW(),
|
||
last_self_registration = NOW(),
|
||
self_registration_count = COALESCE(node_registry.self_registration_count, 0) + 1;
|
||
|
||
-- Ensure node_cache entry exists
|
||
INSERT INTO node_cache (node_id, last_heartbeat, self_healing_status)
|
||
VALUES (p_node_id, NOW(), 'healthy')
|
||
ON CONFLICT (node_id) DO UPDATE SET
|
||
last_heartbeat = NOW(),
|
||
self_healing_status = 'healthy',
|
||
self_healing_last_check = NOW();
|
||
|
||
-- Return result
|
||
v_result := jsonb_build_object(
|
||
'success', true,
|
||
'node_id', p_node_id,
|
||
'is_new', v_is_new,
|
||
'message', CASE WHEN v_is_new THEN 'Node registered' ELSE 'Node updated' END
|
||
);
|
||
|
||
RETURN v_result;
|
||
END;
|
||
$$ LANGUAGE plpgsql;
|
||
|
||
-- ============================================================================
|
||
-- 7. Функція для оновлення heartbeat
|
||
-- ============================================================================
|
||
|
||
CREATE OR REPLACE FUNCTION fn_node_heartbeat(
|
||
p_node_id text,
|
||
p_metrics jsonb DEFAULT NULL
|
||
) RETURNS jsonb AS $$
|
||
DECLARE
|
||
v_node_exists boolean;
|
||
BEGIN
|
||
-- Перевірити чи нода зареєстрована
|
||
SELECT EXISTS(SELECT 1 FROM node_registry WHERE id = p_node_id AND is_active = true)
|
||
INTO v_node_exists;
|
||
|
||
IF NOT v_node_exists THEN
|
||
RETURN jsonb_build_object(
|
||
'success', false,
|
||
'error', 'Node not registered',
|
||
'should_self_register', true
|
||
);
|
||
END IF;
|
||
|
||
-- Оновити node_cache
|
||
UPDATE node_cache SET
|
||
last_heartbeat = NOW(),
|
||
self_healing_status = 'healthy',
|
||
cpu_usage = COALESCE((p_metrics->>'cpu_usage')::numeric, cpu_usage),
|
||
gpu_vram_used = COALESCE((p_metrics->>'gpu_vram_used')::integer, gpu_vram_used),
|
||
ram_used = COALESCE((p_metrics->>'ram_used')::integer, ram_used),
|
||
disk_used = COALESCE((p_metrics->>'disk_used')::integer, disk_used),
|
||
agent_count_router = COALESCE((p_metrics->>'agent_count_router')::integer, agent_count_router),
|
||
agent_count_system = COALESCE((p_metrics->>'agent_count_system')::integer, agent_count_system)
|
||
WHERE node_id = p_node_id;
|
||
|
||
-- Також оновити updated_at в registry
|
||
UPDATE node_registry SET updated_at = NOW()
|
||
WHERE id = p_node_id;
|
||
|
||
RETURN jsonb_build_object(
|
||
'success', true,
|
||
'node_id', p_node_id,
|
||
'heartbeat_at', NOW()
|
||
);
|
||
END;
|
||
$$ LANGUAGE plpgsql;
|
||
|
||
-- ============================================================================
|
||
-- 8. Коментарі
|
||
-- ============================================================================
|
||
|
||
COMMENT ON TABLE node_registry IS 'Реєстр нод DAARION — єдине джерело істини для Node Directory';
|
||
COMMENT ON COLUMN node_registry.id IS 'Унікальний ідентифікатор ноди';
|
||
COMMENT ON COLUMN node_registry.roles IS 'Ролі ноди: gpu, ai_runtime, storage, matrix, development, production';
|
||
COMMENT ON COLUMN node_registry.last_self_registration IS 'Остання успішна самореєстрація ноди';
|
||
COMMENT ON COLUMN node_registry.self_registration_count IS 'Кількість разів, коли нода реєструвала себе';
|
||
|
||
COMMENT ON FUNCTION fn_node_self_register IS 'Самореєстрація ноди — викликається Node Bootstrap або Guardian';
|
||
COMMENT ON FUNCTION fn_node_heartbeat IS 'Heartbeat ноди з оновленням метрик';
|
||
|
||
COMMENT ON VIEW v_nodes_directory IS 'View для Node Directory — з''єднує registry + cache + derived статуси';
|
||
|
||
-- ============================================================================
|
||
-- 9. Результат
|
||
-- ============================================================================
|
||
|
||
SELECT 'Migration 039 completed: Node Registry for Self-Healing' AS result;
|
||
|
||
-- Показати зареєстровані ноди
|
||
SELECT
|
||
id,
|
||
name,
|
||
environment,
|
||
roles,
|
||
is_active
|
||
FROM node_registry
|
||
ORDER BY registered_at;
|
||
|