Files
microdao-daarion/migrations/039_node_registry_self_healing.sql
Apple bca81dc719 feat: Node Self-Healing, DAGI Audit, Agent Prompts, Infra Invariants
### Backend (city-service)
- Node Registry + Self-Healing API (migration 039)
- Improved get_all_nodes() with robust fallback for node_registry/node_cache
- Agent Prompts Runtime API for DAGI Router integration
- DAGI Router Audit endpoints (phantom/stale detection)
- Node Agents API (Guardian/Steward)
- Node metrics extended (CPU/GPU/RAM/Disk)

### Frontend (apps/web)
- Node Directory with improved error handling
- Node Cabinet with metrics cards
- DAGI Router Card component
- Node Metrics Card component
- useDAGIAudit hook

### Scripts
- check-invariants.py - deploy verification
- node-bootstrap.sh - node self-registration
- node-guardian-loop.py - continuous self-healing
- dagi_agent_audit.py - DAGI audit utility

### Migrations
- 034: Agent prompts seed
- 035: Agent DAGI audit
- 036: Node metrics extended
- 037: Node agents complete
- 038: Agent prompts full coverage
- 039: Node registry self-healing

### Tests
- test_infra_smoke.py
- test_agent_prompts_runtime.py
- test_dagi_router_api.py

### Documentation
- DEPLOY_CHECKLIST_2024_11_30.md
- Multiple TASK_PHASE docs
2025-11-30 13:52:01 -08:00

312 lines
11 KiB
PL/PgSQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- Migration 039: Node Registry for Self-Healing
-- Створення node_registry як єдиного джерела істини для нод
-- Частина TASK_PHASE_NODE_SELF_HEALING_v1
-- ============================================================================
-- 1. Створити таблицю node_registry
-- ============================================================================
CREATE TABLE IF NOT EXISTS node_registry (
id text PRIMARY KEY, -- node_id (напр. node-2-macbook-m4max)
name text NOT NULL, -- Людська назва ноди
hostname text, -- Hostname ноди
environment text NOT NULL CHECK (environment IN ('production', 'development', 'staging')),
roles text[] NOT NULL DEFAULT '{}', -- ['gpu', 'ai_runtime', 'storage', ...]
description text, -- Опис ноди
owner_id text, -- ID власника (user/microdao)
config jsonb DEFAULT '{}', -- Додаткова конфігурація
is_active boolean NOT NULL DEFAULT true,
registered_at timestamptz NOT NULL DEFAULT now(),
updated_at timestamptz NOT NULL DEFAULT now(),
last_self_registration timestamptz, -- Остання самореєстрація
self_registration_count integer DEFAULT 0
);
-- Індекси
CREATE INDEX IF NOT EXISTS idx_node_registry_active ON node_registry(is_active) WHERE is_active = true;
CREATE INDEX IF NOT EXISTS idx_node_registry_environment ON node_registry(environment);
CREATE INDEX IF NOT EXISTS idx_node_registry_updated ON node_registry(updated_at DESC);
-- ============================================================================
-- 2. Оновити node_cache - додати зв'язок з registry
-- ============================================================================
-- Перевірити що node_cache.node_id є foreign key до node_registry
-- (опційно, можна не додавати FK для гнучкості)
-- Додати поле для статусу self-healing
ALTER TABLE node_cache ADD COLUMN IF NOT EXISTS self_healing_status text DEFAULT 'healthy';
ALTER TABLE node_cache ADD COLUMN IF NOT EXISTS self_healing_last_check timestamptz;
ALTER TABLE node_cache ADD COLUMN IF NOT EXISTS self_healing_errors jsonb DEFAULT '[]';
-- ============================================================================
-- 3. Зареєструвати існуючі ноди
-- ============================================================================
-- NODE1: Hetzner GEX44 Production
INSERT INTO node_registry (
id,
name,
hostname,
environment,
roles,
description,
is_active,
registered_at,
updated_at
) VALUES (
'node-1-hetzner-gex44',
'NODE1 — Hetzner GEX44',
'node1.daarion.space',
'production',
ARRAY['production', 'gpu', 'ai_runtime', 'storage', 'matrix'],
'Production server with RTX 4090, hosts Matrix Synapse, DAGI Router, main services',
true,
NOW(),
NOW()
)
ON CONFLICT (id) DO UPDATE SET
name = EXCLUDED.name,
hostname = EXCLUDED.hostname,
environment = EXCLUDED.environment,
roles = EXCLUDED.roles,
description = EXCLUDED.description,
is_active = true,
updated_at = NOW();
-- NODE2: MacBook Pro M4 Max Development
INSERT INTO node_registry (
id,
name,
hostname,
environment,
roles,
description,
is_active,
registered_at,
updated_at
) VALUES (
'node-2-macbook-m4max',
'NODE2 — MacBook Pro M4 Max',
'node2.local',
'development',
ARRAY['development', 'gpu', 'ai_runtime', 'testing'],
'Development node with M4 Max GPU (40GB unified memory), local AI models',
true,
NOW(),
NOW()
)
ON CONFLICT (id) DO UPDATE SET
name = EXCLUDED.name,
hostname = EXCLUDED.hostname,
environment = EXCLUDED.environment,
roles = EXCLUDED.roles,
description = EXCLUDED.description,
is_active = true,
updated_at = NOW();
-- ============================================================================
-- 4. Переконатися що node_cache має записи для обох нод
-- ============================================================================
-- NODE1
INSERT INTO node_cache (node_id, last_heartbeat, self_healing_status)
VALUES ('node-1-hetzner-gex44', NOW(), 'healthy')
ON CONFLICT (node_id) DO UPDATE SET
self_healing_status = 'healthy',
self_healing_last_check = NOW();
-- NODE2
INSERT INTO node_cache (node_id, last_heartbeat, self_healing_status)
VALUES ('node-2-macbook-m4max', NOW(), 'healthy')
ON CONFLICT (node_id) DO UPDATE SET
self_healing_status = 'healthy',
self_healing_last_check = NOW();
-- ============================================================================
-- 5. View для Node Directory (з'єднання registry + cache)
-- ============================================================================
CREATE OR REPLACE VIEW v_nodes_directory AS
SELECT
r.id,
r.name,
r.hostname,
r.environment,
r.roles,
r.description,
r.is_active,
r.registered_at,
r.updated_at,
r.last_self_registration,
-- Cache data (metrics)
c.cpu_model,
c.cpu_cores,
c.cpu_usage,
c.gpu_model,
c.gpu_vram_total,
c.gpu_vram_used,
c.ram_total,
c.ram_used,
c.disk_total,
c.disk_used,
c.agent_count_router,
c.agent_count_system,
c.last_heartbeat,
c.dagi_router_url,
c.guardian_agent_id,
c.steward_agent_id,
c.self_healing_status,
c.self_healing_last_check,
-- Derived fields
CASE
WHEN c.last_heartbeat IS NULL THEN 'offline'
WHEN c.last_heartbeat < NOW() - INTERVAL '10 minutes' THEN 'stale'
ELSE 'online'
END AS connection_status,
EXTRACT(EPOCH FROM (NOW() - c.last_heartbeat)) / 60 AS heartbeat_age_minutes
FROM node_registry r
LEFT JOIN node_cache c ON c.node_id = r.id
WHERE r.is_active = true;
-- ============================================================================
-- 6. Функція для self-registration
-- ============================================================================
CREATE OR REPLACE FUNCTION fn_node_self_register(
p_node_id text,
p_name text,
p_hostname text DEFAULT NULL,
p_environment text DEFAULT 'development',
p_roles text[] DEFAULT '{}'
) RETURNS jsonb AS $$
DECLARE
v_result jsonb;
v_is_new boolean := false;
BEGIN
-- Перевірити чи нода вже існує
IF NOT EXISTS (SELECT 1 FROM node_registry WHERE id = p_node_id) THEN
v_is_new := true;
END IF;
-- Insert or update node_registry
INSERT INTO node_registry (
id, name, hostname, environment, roles,
is_active, registered_at, updated_at,
last_self_registration, self_registration_count
) VALUES (
p_node_id, p_name, p_hostname, p_environment, p_roles,
true, NOW(), NOW(), NOW(), 1
)
ON CONFLICT (id) DO UPDATE SET
name = COALESCE(NULLIF(p_name, ''), node_registry.name),
hostname = COALESCE(p_hostname, node_registry.hostname),
environment = COALESCE(NULLIF(p_environment, ''), node_registry.environment),
roles = CASE
WHEN array_length(p_roles, 1) > 0 THEN p_roles
ELSE node_registry.roles
END,
is_active = true,
updated_at = NOW(),
last_self_registration = NOW(),
self_registration_count = COALESCE(node_registry.self_registration_count, 0) + 1;
-- Ensure node_cache entry exists
INSERT INTO node_cache (node_id, last_heartbeat, self_healing_status)
VALUES (p_node_id, NOW(), 'healthy')
ON CONFLICT (node_id) DO UPDATE SET
last_heartbeat = NOW(),
self_healing_status = 'healthy',
self_healing_last_check = NOW();
-- Return result
v_result := jsonb_build_object(
'success', true,
'node_id', p_node_id,
'is_new', v_is_new,
'message', CASE WHEN v_is_new THEN 'Node registered' ELSE 'Node updated' END
);
RETURN v_result;
END;
$$ LANGUAGE plpgsql;
-- ============================================================================
-- 7. Функція для оновлення heartbeat
-- ============================================================================
CREATE OR REPLACE FUNCTION fn_node_heartbeat(
p_node_id text,
p_metrics jsonb DEFAULT NULL
) RETURNS jsonb AS $$
DECLARE
v_node_exists boolean;
BEGIN
-- Перевірити чи нода зареєстрована
SELECT EXISTS(SELECT 1 FROM node_registry WHERE id = p_node_id AND is_active = true)
INTO v_node_exists;
IF NOT v_node_exists THEN
RETURN jsonb_build_object(
'success', false,
'error', 'Node not registered',
'should_self_register', true
);
END IF;
-- Оновити node_cache
UPDATE node_cache SET
last_heartbeat = NOW(),
self_healing_status = 'healthy',
cpu_usage = COALESCE((p_metrics->>'cpu_usage')::numeric, cpu_usage),
gpu_vram_used = COALESCE((p_metrics->>'gpu_vram_used')::integer, gpu_vram_used),
ram_used = COALESCE((p_metrics->>'ram_used')::integer, ram_used),
disk_used = COALESCE((p_metrics->>'disk_used')::integer, disk_used),
agent_count_router = COALESCE((p_metrics->>'agent_count_router')::integer, agent_count_router),
agent_count_system = COALESCE((p_metrics->>'agent_count_system')::integer, agent_count_system)
WHERE node_id = p_node_id;
-- Також оновити updated_at в registry
UPDATE node_registry SET updated_at = NOW()
WHERE id = p_node_id;
RETURN jsonb_build_object(
'success', true,
'node_id', p_node_id,
'heartbeat_at', NOW()
);
END;
$$ LANGUAGE plpgsql;
-- ============================================================================
-- 8. Коментарі
-- ============================================================================
COMMENT ON TABLE node_registry IS 'Реєстр нод DAARION — єдине джерело істини для Node Directory';
COMMENT ON COLUMN node_registry.id IS 'Унікальний ідентифікатор ноди';
COMMENT ON COLUMN node_registry.roles IS 'Ролі ноди: gpu, ai_runtime, storage, matrix, development, production';
COMMENT ON COLUMN node_registry.last_self_registration IS 'Остання успішна самореєстрація ноди';
COMMENT ON COLUMN node_registry.self_registration_count IS 'Кількість разів, коли нода реєструвала себе';
COMMENT ON FUNCTION fn_node_self_register IS 'Самореєстрація ноди — викликається Node Bootstrap або Guardian';
COMMENT ON FUNCTION fn_node_heartbeat IS 'Heartbeat ноди з оновленням метрик';
COMMENT ON VIEW v_nodes_directory IS 'View для Node Directory — з''єднує registry + cache + derived статуси';
-- ============================================================================
-- 9. Результат
-- ============================================================================
SELECT 'Migration 039 completed: Node Registry for Self-Healing' AS result;
-- Показати зареєстровані ноди
SELECT
id,
name,
environment,
roles,
is_active
FROM node_registry
ORDER BY registered_at;