Some checks failed
Update Documentation / update-repos-info (push) Has been cancelled
- Capability Registry (Postgres heartbeat) - NATS Client (підписка на streams) - Job Executor (виконання jobs) - Metrics Exporter (Prometheus) - Dockerfile для deployment - Виправлено server_name в NATS (emptyDir) TODO: Реальна реалізація embed/retrieve/summarize, Matrix Gateway, Auth
176 lines
7.2 KiB
Python
176 lines
7.2 KiB
Python
"""
|
||
Capability Registry — реєстрація воркерів в Postgres
|
||
"""
|
||
|
||
import asyncio
import json
import os
import platform
import subprocess
from datetime import datetime, timezone
from typing import Any, Dict, Optional

import asyncpg


class CapabilityRegistry:
    """Registers a worker node and its capabilities in a Postgres table.

    Lifecycle: ``register()`` (creates the table if needed, upserts the row),
    periodic ``update_heartbeat()``, and ``unregister()`` on shutdown (marks
    the row ``offline`` rather than deleting it, so history is kept).

    Args:
        postgres_url: asyncpg-compatible DSN; empty/None disables registration.
        node_id: unique worker identifier (primary key in the table).
        tier: worker tier label (e.g. "1", "2").
        region: deployment region label.
    """

    def __init__(self, postgres_url: str, node_id: str, tier: str, region: str):
        self.postgres_url = postgres_url
        self.node_id = node_id
        self.tier = tier
        self.region = region
        # Lazily created in connect(); stays None when registration is disabled
        # or the connection failed, and every method no-ops on None.
        self.pool: Optional[asyncpg.Pool] = None

    async def connect(self):
        """Connect to Postgres and ensure the capabilities table exists.

        Best-effort: on a missing URL or connection failure the pool is left
        as None and registration is silently skipped (the worker can still
        run without a registry).
        """
        if not self.postgres_url:
            print("⚠️ CAPABILITY_REGISTRY не встановлено, пропускаємо реєстрацію")
            return

        try:
            self.pool = await asyncpg.create_pool(self.postgres_url)
            await self._ensure_table()
        except Exception as e:
            print(f"❌ Помилка підключення до Postgres: {e}")
            self.pool = None

    async def _ensure_table(self):
        """Create the worker_capabilities table and its indexes if missing."""
        async with self.pool.acquire() as conn:
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS worker_capabilities (
                    node_id VARCHAR(255) PRIMARY KEY,
                    tier VARCHAR(10) NOT NULL,
                    region VARCHAR(50),
                    trust_zone VARCHAR(50),
                    hardware JSONB NOT NULL,
                    capabilities JSONB NOT NULL,
                    status VARCHAR(20) NOT NULL,
                    last_heartbeat TIMESTAMP WITH TIME ZONE NOT NULL,
                    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
                );

                CREATE INDEX IF NOT EXISTS idx_worker_capabilities_tier
                    ON worker_capabilities(tier);
                CREATE INDEX IF NOT EXISTS idx_worker_capabilities_status
                    ON worker_capabilities(status);
            """)

    async def _detect_hardware(self) -> Dict[str, Any]:
        """Auto-detect hardware capabilities (CPU, RAM, NVIDIA GPU, CUDA).

        Returns:
            Dict with keys: cpu_cores, ram_gb, gpu, gpu_model, vram_gb,
            cuda_version. GPU fields keep their defaults when nvidia-smi
            is absent or fails.
        """
        hardware: Dict[str, Any] = {
            "cpu_cores": os.cpu_count() or 1,
            "ram_gb": self._get_ram_gb(),
            "gpu": False,
            "gpu_model": None,
            "vram_gb": 0,
            "cuda_version": None,
        }

        # NVIDIA GPU probe; any failure (no driver, timeout) leaves defaults.
        try:
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"],
                capture_output=True, text=True, timeout=5)
            if result.returncode == 0:
                lines = result.stdout.strip().split("\n")
                if lines:
                    # Only the first GPU is reported.
                    gpu_info = lines[0].split(",")
                    hardware["gpu"] = True
                    hardware["gpu_model"] = gpu_info[0].strip()
                    # VRAM format: "XXXXX MiB"
                    vram_str = gpu_info[1].strip().replace("MiB", "").strip()
                    hardware["vram_gb"] = int(vram_str) // 1024 if vram_str.isdigit() else 0

                # CUDA toolkit version via nvcc, e.g. "... release 12.2, V12.2.91"
                cuda_result = subprocess.run(
                    ["nvcc", "--version"],
                    capture_output=True, text=True, timeout=5)
                if cuda_result.returncode == 0:
                    for line in cuda_result.stdout.split("\n"):
                        if "release" in line.lower():
                            parts = line.split()
                            for i, part in enumerate(parts):
                                if part.lower() == "release" and i + 1 < len(parts):
                                    hardware["cuda_version"] = parts[i + 1].rstrip(",")
                                    break
                            break  # first "release" line is authoritative
        except Exception:
            pass  # GPU not found or probe failed — best-effort detection

        return hardware

    def _get_ram_gb(self) -> int:
        """Return total RAM in whole GB; 8 on unknown platforms or errors."""
        try:
            if platform.system() == "Linux":
                # /proc/meminfo reports MemTotal in kB.
                with open("/proc/meminfo") as f:
                    for line in f:
                        if line.startswith("MemTotal:"):
                            return int(line.split()[1]) // (1024 * 1024)
            elif platform.system() == "Darwin":  # macOS
                result = subprocess.run(["sysctl", "-n", "hw.memsize"],
                                        capture_output=True, text=True)
                if result.returncode == 0:
                    # hw.memsize is in bytes.
                    return int(result.stdout.strip()) // (1024 ** 3)
        except Exception:
            pass
        return 8  # Conservative default

    async def register(self):
        """Register (upsert) this worker's capabilities row.

        No-op when the registry is unconfigured or unreachable.
        """
        await self.connect()
        if not self.pool:
            return

        hardware = await self._detect_hardware()
        capabilities = {
            "max_batch": 1000,
            "max_tokens": 8192,
            "models": ["cohere/embed-multilingual-v3.0"],
            "embedding_dim": 1024,
            "supported_jobs": ["embed", "summarize", "index"],
        }

        async with self.pool.acquire() as conn:
            # Upsert keyed on node_id; trust_zone is included in the update
            # set so a re-registered node never keeps a stale trust zone.
            await conn.execute("""
                INSERT INTO worker_capabilities
                    (node_id, tier, region, trust_zone, hardware, capabilities, status, last_heartbeat)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                ON CONFLICT (node_id) DO UPDATE SET
                    tier = EXCLUDED.tier,
                    region = EXCLUDED.region,
                    trust_zone = EXCLUDED.trust_zone,
                    hardware = EXCLUDED.hardware,
                    capabilities = EXCLUDED.capabilities,
                    status = EXCLUDED.status,
                    last_heartbeat = EXCLUDED.last_heartbeat,
                    updated_at = CURRENT_TIMESTAMP
            """, self.node_id, self.tier, self.region, "internal",
                json.dumps(hardware), json.dumps(capabilities), "ready",
                # Timezone-aware: the column is TIMESTAMP WITH TIME ZONE and
                # datetime.utcnow() is deprecated (naive).
                datetime.now(timezone.utc))

        print(f"✅ Worker зареєстровано: {self.node_id} (Tier {self.tier})")

    async def update_heartbeat(self):
        """Refresh last_heartbeat for this node; no-op when unregistered."""
        if not self.pool:
            return

        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE worker_capabilities
                SET last_heartbeat = CURRENT_TIMESTAMP
                WHERE node_id = $1
            """, self.node_id)

    async def unregister(self):
        """Mark this node offline (row is kept, not deleted)."""
        if not self.pool:
            return

        async with self.pool.acquire() as conn:
            await conn.execute("""
                UPDATE worker_capabilities
                SET status = 'offline'
                WHERE node_id = $1
            """, self.node_id)

        print(f"✅ Worker видалено з реєстру: {self.node_id}")