🔧 Worker Daemon: базова реалізація v1
Some checks failed
Update Documentation / update-repos-info (push) Has been cancelled
Some checks failed
Update Documentation / update-repos-info (push) Has been cancelled
- Capability Registry (Postgres heartbeat) - NATS Client (підписка на streams) - Job Executor (виконання jobs) - Metrics Exporter (Prometheus) - Dockerfile для deployment - Виправлено server_name в NATS (emptyDir) TODO: Реальна реалізація embed/retrieve/summarize, Matrix Gateway, Auth
This commit is contained in:
175
infrastructure/worker-daemon/worker/registry.py
Normal file
175
infrastructure/worker-daemon/worker/registry.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""
|
||||
Capability Registry — реєстрація воркерів в Postgres
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import asyncpg
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
|
||||
class CapabilityRegistry:
|
||||
def __init__(self, postgres_url: str, node_id: str, tier: str, region: str):
|
||||
self.postgres_url = postgres_url
|
||||
self.node_id = node_id
|
||||
self.tier = tier
|
||||
self.region = region
|
||||
self.pool: Optional[asyncpg.Pool] = None
|
||||
|
||||
async def connect(self):
|
||||
"""Підключення до Postgres"""
|
||||
if not self.postgres_url:
|
||||
print("⚠️ CAPABILITY_REGISTRY не встановлено, пропускаємо реєстрацію")
|
||||
return
|
||||
|
||||
try:
|
||||
self.pool = await asyncpg.create_pool(self.postgres_url)
|
||||
await self._ensure_table()
|
||||
except Exception as e:
|
||||
print(f"❌ Помилка підключення до Postgres: {e}")
|
||||
self.pool = None
|
||||
|
||||
async def _ensure_table(self):
|
||||
"""Створення таблиці worker_capabilities якщо не існує"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS worker_capabilities (
|
||||
node_id VARCHAR(255) PRIMARY KEY,
|
||||
tier VARCHAR(10) NOT NULL,
|
||||
region VARCHAR(50),
|
||||
trust_zone VARCHAR(50),
|
||||
hardware JSONB NOT NULL,
|
||||
capabilities JSONB NOT NULL,
|
||||
status VARCHAR(20) NOT NULL,
|
||||
last_heartbeat TIMESTAMP WITH TIME ZONE NOT NULL,
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_worker_capabilities_tier
|
||||
ON worker_capabilities(tier);
|
||||
CREATE INDEX IF NOT EXISTS idx_worker_capabilities_status
|
||||
ON worker_capabilities(status);
|
||||
""")
|
||||
|
||||
async def _detect_hardware(self) -> Dict[str, Any]:
|
||||
"""Автоматичне визначення hardware capabilities"""
|
||||
hardware = {
|
||||
"cpu_cores": os.cpu_count() or 1,
|
||||
"ram_gb": self._get_ram_gb(),
|
||||
"gpu": False,
|
||||
"gpu_model": None,
|
||||
"vram_gb": 0,
|
||||
"cuda_version": None
|
||||
}
|
||||
|
||||
# Перевірка GPU (NVIDIA)
|
||||
try:
|
||||
result = subprocess.run(["nvidia-smi", "--query-gpu=name,memory.total", "--format=csv,noheader"],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
if result.returncode == 0:
|
||||
lines = result.stdout.strip().split("\n")
|
||||
if lines:
|
||||
gpu_info = lines[0].split(",")
|
||||
hardware["gpu"] = True
|
||||
hardware["gpu_model"] = gpu_info[0].strip()
|
||||
# Парсинг VRAM (формат: "XXXXX MiB")
|
||||
vram_str = gpu_info[1].strip().replace("MiB", "").strip()
|
||||
hardware["vram_gb"] = int(vram_str) // 1024 if vram_str.isdigit() else 0
|
||||
|
||||
# CUDA version
|
||||
cuda_result = subprocess.run(["nvcc", "--version"],
|
||||
capture_output=True, text=True, timeout=5)
|
||||
if cuda_result.returncode == 0:
|
||||
for line in cuda_result.stdout.split("\n"):
|
||||
if "release" in line.lower():
|
||||
# Парсинг версії CUDA
|
||||
parts = line.split()
|
||||
for i, part in enumerate(parts):
|
||||
if part.lower() == "release":
|
||||
if i + 1 < len(parts):
|
||||
hardware["cuda_version"] = parts[i + 1].rstrip(",")
|
||||
except Exception:
|
||||
pass # GPU не знайдено або помилка
|
||||
|
||||
return hardware
|
||||
|
||||
def _get_ram_gb(self) -> int:
|
||||
"""Отримання RAM в GB"""
|
||||
try:
|
||||
if platform.system() == "Linux":
|
||||
with open("/proc/meminfo") as f:
|
||||
for line in f:
|
||||
if line.startswith("MemTotal:"):
|
||||
return int(line.split()[1]) // (1024 * 1024)
|
||||
elif platform.system() == "Darwin": # macOS
|
||||
result = subprocess.run(["sysctl", "-n", "hw.memsize"],
|
||||
capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
return int(result.stdout.strip()) // (1024 ** 3)
|
||||
except Exception:
|
||||
pass
|
||||
return 8 # Default
|
||||
|
||||
async def register(self):
|
||||
"""Реєстрація воркера"""
|
||||
await self.connect()
|
||||
if not self.pool:
|
||||
return
|
||||
|
||||
hardware = await self._detect_hardware()
|
||||
capabilities = {
|
||||
"max_batch": 1000,
|
||||
"max_tokens": 8192,
|
||||
"models": ["cohere/embed-multilingual-v3.0"],
|
||||
"embedding_dim": 1024,
|
||||
"supported_jobs": ["embed", "summarize", "index"]
|
||||
}
|
||||
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
INSERT INTO worker_capabilities
|
||||
(node_id, tier, region, trust_zone, hardware, capabilities, status, last_heartbeat)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
ON CONFLICT (node_id) DO UPDATE SET
|
||||
tier = EXCLUDED.tier,
|
||||
region = EXCLUDED.region,
|
||||
hardware = EXCLUDED.hardware,
|
||||
capabilities = EXCLUDED.capabilities,
|
||||
status = EXCLUDED.status,
|
||||
last_heartbeat = EXCLUDED.last_heartbeat,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
""", self.node_id, self.tier, self.region, "internal",
|
||||
json.dumps(hardware), json.dumps(capabilities), "ready", datetime.utcnow())
|
||||
|
||||
print(f"✅ Worker зареєстровано: {self.node_id} (Tier {self.tier})")
|
||||
|
||||
async def update_heartbeat(self):
|
||||
"""Оновлення heartbeat"""
|
||||
if not self.pool:
|
||||
return
|
||||
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
UPDATE worker_capabilities
|
||||
SET last_heartbeat = CURRENT_TIMESTAMP
|
||||
WHERE node_id = $1
|
||||
""", self.node_id)
|
||||
|
||||
async def unregister(self):
|
||||
"""Видалення реєстрації"""
|
||||
if not self.pool:
|
||||
return
|
||||
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
UPDATE worker_capabilities
|
||||
SET status = 'offline'
|
||||
WHERE node_id = $1
|
||||
""", self.node_id)
|
||||
|
||||
print(f"✅ Worker видалено з реєстру: {self.node_id}")
|
||||
Reference in New Issue
Block a user