480 lines
16 KiB
Python
Executable File
480 lines
16 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
DAARION Node Guardian Self-Healing Loop
|
|
|
|
Періодично перевіряє стан ноди та виконує self-healing якщо потрібно.
|
|
Запускається як фоновий процес на кожній ноді.
|
|
|
|
Використання:
|
|
python scripts/node-guardian-loop.py
|
|
python scripts/node-guardian-loop.py --node-id node-2-macbook-m4max
|
|
python scripts/node-guardian-loop.py --interval 300 # 5 хвилин
|
|
|
|
Environment variables:
|
|
CITY_SERVICE_URL - URL city-service
|
|
NODE_ID - ID ноди
|
|
NODE_NAME - Назва ноди (для self-registration)
|
|
NODE_ENVIRONMENT - production/development
|
|
NODE_ROLES - Ролі через кому
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, Any, Optional
|
|
|
|
try:
|
|
import httpx
|
|
except ImportError:
|
|
print("❌ httpx not installed. Run: pip install httpx")
|
|
sys.exit(1)
|
|
|
|
|
|
# ==============================================================================
|
|
# Configuration
|
|
# ==============================================================================
|
|
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s [NODE-GUARDIAN] %(levelname)s: %(message)s',
|
|
datefmt='%Y-%m-%d %H:%M:%S'
|
|
)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
DEFAULT_INTERVAL = 60 # seconds
|
|
DEFAULT_CITY_URL = "http://localhost:7001"
|
|
|
|
|
|
# ==============================================================================
|
|
# Self-Healing Logic
|
|
# ==============================================================================
|
|
|
|
class NodeGuardian:
|
|
"""Node Guardian — self-healing agent for DAARION nodes"""
|
|
|
|
def __init__(
|
|
self,
|
|
node_id: str,
|
|
node_name: str,
|
|
city_url: str,
|
|
environment: str = "development",
|
|
roles: list = None,
|
|
hostname: str = None
|
|
):
|
|
self.node_id = node_id
|
|
self.node_name = node_name
|
|
self.city_url = city_url.rstrip("/")
|
|
self.environment = environment
|
|
self.roles = roles or []
|
|
self.hostname = hostname
|
|
|
|
self.client = httpx.AsyncClient(timeout=10.0)
|
|
self.healing_attempts = 0
|
|
self.last_successful_check = None
|
|
|
|
async def close(self):
|
|
await self.client.aclose()
|
|
|
|
async def check_visibility(self) -> bool:
|
|
"""Перевірити чи нода видима в Node Directory"""
|
|
try:
|
|
response = await self.client.get(
|
|
f"{self.city_url}/city/internal/node/{self.node_id}/directory-check"
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
return data.get("visible_in_directory", False)
|
|
|
|
return False
|
|
except Exception as e:
|
|
logger.error(f"Visibility check failed: {e}")
|
|
return False
|
|
|
|
async def get_self_healing_status(self) -> Dict[str, Any]:
|
|
"""Отримати статус self-healing"""
|
|
try:
|
|
response = await self.client.get(
|
|
f"{self.city_url}/city/internal/node/{self.node_id}/self-healing/status"
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
return response.json()
|
|
|
|
return {"registered": False, "status": "unknown"}
|
|
except Exception as e:
|
|
logger.error(f"Status check failed: {e}")
|
|
return {"registered": False, "status": "error", "error": str(e)}
|
|
|
|
async def self_register(self) -> bool:
|
|
"""Виконати самореєстрацію"""
|
|
try:
|
|
payload = {
|
|
"id": self.node_id,
|
|
"name": self.node_name,
|
|
"hostname": self.hostname,
|
|
"environment": self.environment,
|
|
"roles": self.roles
|
|
}
|
|
|
|
response = await self.client.post(
|
|
f"{self.city_url}/city/internal/nodes/register-or-update",
|
|
json=payload
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
if data.get("success"):
|
|
logger.info(f"✅ Self-registration successful: {data.get('message')}")
|
|
return True
|
|
else:
|
|
logger.warning(f"Self-registration returned false: {data}")
|
|
else:
|
|
logger.error(f"Self-registration failed: HTTP {response.status_code}")
|
|
|
|
return False
|
|
except Exception as e:
|
|
logger.error(f"Self-registration error: {e}")
|
|
return False
|
|
|
|
async def send_heartbeat(self, metrics: Dict = None) -> bool:
|
|
"""Відправити heartbeat"""
|
|
try:
|
|
payload = {"metrics": metrics or {}}
|
|
|
|
response = await self.client.post(
|
|
f"{self.city_url}/city/internal/node/{self.node_id}/heartbeat",
|
|
json=payload
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
|
|
if data.get("should_self_register"):
|
|
logger.warning("⚠️ Server requests self-registration")
|
|
return await self.self_register()
|
|
|
|
if data.get("success"):
|
|
return True
|
|
|
|
return False
|
|
except Exception as e:
|
|
logger.error(f"Heartbeat failed: {e}")
|
|
return False
|
|
|
|
async def trigger_healing(self) -> Dict[str, Any]:
|
|
"""Тригернути self-healing через API"""
|
|
try:
|
|
response = await self.client.post(
|
|
f"{self.city_url}/city/internal/node/{self.node_id}/self-healing/trigger"
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
return response.json()
|
|
else:
|
|
return {"error": f"HTTP {response.status_code}"}
|
|
except Exception as e:
|
|
return {"error": str(e)}
|
|
|
|
async def collect_metrics(self) -> Dict[str, Any]:
|
|
"""Зібрати метрики ноди та Swapper"""
|
|
metrics = {
|
|
"cpu_usage": 0.0, # TODO: Implement real metrics
|
|
"gpu_vram_used": 0,
|
|
"ram_used": 0,
|
|
"disk_used": 0,
|
|
"agent_count_router": 0,
|
|
"agent_count_system": 0,
|
|
"dagi_router_url": "http://dagi-router:9102",
|
|
# Swapper defaults
|
|
"swapper_healthy": False,
|
|
"swapper_models_loaded": 0,
|
|
"swapper_models_total": 0,
|
|
"swapper_state": {}
|
|
}
|
|
|
|
# Collect Swapper Metrics
|
|
swapper_url = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
|
|
try:
|
|
# Check health (Swapper uses /health, not /healthz)
|
|
try:
|
|
r = await self.client.get(f"{swapper_url}/health", timeout=3.0)
|
|
if r.status_code == 200:
|
|
health_data = r.json()
|
|
# Swapper can return "status": "healthy" or "ok"
|
|
status = health_data.get("status", "").lower()
|
|
metrics["swapper_healthy"] = status in ("healthy", "ok")
|
|
else:
|
|
metrics["swapper_healthy"] = False
|
|
except Exception:
|
|
metrics["swapper_healthy"] = False
|
|
|
|
# Check models (Swapper uses /models, not /v1/models)
|
|
try:
|
|
r = await self.client.get(f"{swapper_url}/models", timeout=5.0)
|
|
if r.status_code == 200:
|
|
data = r.json()
|
|
models = data.get("models", [])
|
|
metrics["swapper_models_total"] = len(models)
|
|
# Swapper uses "status": "loaded" not "loaded": true
|
|
metrics["swapper_models_loaded"] = sum(1 for m in models if m.get("status") == "loaded")
|
|
metrics["swapper_state"] = data
|
|
except Exception as e:
|
|
# logger.warning(f"Failed to fetch Swapper models: {e}")
|
|
# Swapper might not be ready or not deployed on this node
|
|
pass
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Swapper metrics collection failed: {e}")
|
|
|
|
return metrics
|
|
|
|
async def run_health_check(self) -> Dict[str, Any]:
|
|
"""
|
|
Виконати повну перевірку здоров'я ноди.
|
|
|
|
Returns:
|
|
{
|
|
"healthy": bool,
|
|
"checks": {
|
|
"visible_in_directory": bool,
|
|
"registered": bool,
|
|
"has_guardian": bool,
|
|
"has_steward": bool,
|
|
"heartbeat_fresh": bool
|
|
},
|
|
"actions_taken": []
|
|
}
|
|
"""
|
|
result = {
|
|
"healthy": True,
|
|
"checks": {},
|
|
"actions_taken": [],
|
|
"timestamp": datetime.now(timezone.utc).isoformat()
|
|
}
|
|
|
|
# 1. Check visibility in directory
|
|
visible = await self.check_visibility()
|
|
result["checks"]["visible_in_directory"] = visible
|
|
|
|
if not visible:
|
|
result["healthy"] = False
|
|
logger.warning("⚠️ Node not visible in directory, attempting self-registration...")
|
|
|
|
registered = await self.self_register()
|
|
result["actions_taken"].append({
|
|
"action": "self_register",
|
|
"success": registered
|
|
})
|
|
|
|
if registered:
|
|
# Re-check visibility
|
|
visible = await self.check_visibility()
|
|
result["checks"]["visible_in_directory_after_heal"] = visible
|
|
|
|
# 2. Get detailed status
|
|
status = await self.get_self_healing_status()
|
|
result["checks"]["registered"] = status.get("registered", False)
|
|
result["checks"]["has_guardian"] = status.get("has_guardian", False)
|
|
result["checks"]["has_steward"] = status.get("has_steward", False)
|
|
result["checks"]["agent_count_router"] = status.get("agent_count_router", 0)
|
|
result["checks"]["agent_count_system"] = status.get("agent_count_system", 0)
|
|
|
|
# 3. Check if healing needed based on status
|
|
if status.get("self_healing_status") == "error":
|
|
result["healthy"] = False
|
|
logger.warning("⚠️ Node in error state, triggering healing...")
|
|
|
|
heal_result = await self.trigger_healing()
|
|
result["actions_taken"].append({
|
|
"action": "trigger_healing",
|
|
"result": heal_result
|
|
})
|
|
|
|
# 4. Send heartbeat with metrics
|
|
metrics = await self.collect_metrics()
|
|
heartbeat_ok = await self.send_heartbeat(metrics)
|
|
result["checks"]["heartbeat_sent"] = heartbeat_ok
|
|
|
|
if heartbeat_ok:
|
|
self.last_successful_check = datetime.now(timezone.utc)
|
|
|
|
# Update healthy status
|
|
if result["actions_taken"]:
|
|
# If we took actions, check if any failed
|
|
failed_actions = [a for a in result["actions_taken"] if not a.get("success", True)]
|
|
if failed_actions:
|
|
result["healthy"] = False
|
|
|
|
return result
|
|
|
|
|
|
# ==============================================================================
|
|
# Main Loop
|
|
# ==============================================================================
|
|
|
|
async def run_guardian_loop(
|
|
node_id: str,
|
|
node_name: str,
|
|
city_url: str,
|
|
environment: str,
|
|
roles: list,
|
|
hostname: str,
|
|
interval: int
|
|
):
|
|
"""Run the Node Guardian self-healing loop"""
|
|
|
|
guardian = NodeGuardian(
|
|
node_id=node_id,
|
|
node_name=node_name,
|
|
city_url=city_url,
|
|
environment=environment,
|
|
roles=roles,
|
|
hostname=hostname
|
|
)
|
|
|
|
logger.info("=" * 60)
|
|
logger.info("DAARION Node Guardian Starting")
|
|
logger.info("=" * 60)
|
|
logger.info(f" Node ID: {node_id}")
|
|
logger.info(f" Node Name: {node_name}")
|
|
logger.info(f" Environment: {environment}")
|
|
logger.info(f" City Service: {city_url}")
|
|
logger.info(f" Interval: {interval}s")
|
|
logger.info("=" * 60)
|
|
|
|
# Initial check
|
|
logger.info("Running initial health check...")
|
|
result = await guardian.run_health_check()
|
|
|
|
if result["healthy"]:
|
|
logger.info("✅ Initial check passed")
|
|
else:
|
|
logger.warning("⚠️ Initial check found issues:")
|
|
for action in result.get("actions_taken", []):
|
|
logger.warning(f" - {action}")
|
|
|
|
# Main loop
|
|
try:
|
|
while True:
|
|
await asyncio.sleep(interval)
|
|
|
|
logger.info(f"Running periodic health check...")
|
|
result = await guardian.run_health_check()
|
|
|
|
if result["healthy"]:
|
|
logger.info(f"✅ Health check passed")
|
|
else:
|
|
logger.warning(f"⚠️ Health check found issues")
|
|
for action in result.get("actions_taken", []):
|
|
logger.info(f" Action: {action['action']} - {action.get('success', 'done')}")
|
|
|
|
except KeyboardInterrupt:
|
|
logger.info("Shutting down Node Guardian...")
|
|
except Exception as e:
|
|
logger.error(f"Guardian loop error: {e}")
|
|
raise
|
|
finally:
|
|
await guardian.close()
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="DAARION Node Guardian Self-Healing Loop"
|
|
)
|
|
parser.add_argument(
|
|
"--node-id",
|
|
default=os.getenv("NODE_ID"),
|
|
help="Node ID (default: $NODE_ID)"
|
|
)
|
|
parser.add_argument(
|
|
"--node-name",
|
|
default=os.getenv("NODE_NAME"),
|
|
help="Node name (default: $NODE_NAME)"
|
|
)
|
|
parser.add_argument(
|
|
"--city-url",
|
|
default=os.getenv("CITY_SERVICE_URL", DEFAULT_CITY_URL),
|
|
help="City service URL"
|
|
)
|
|
parser.add_argument(
|
|
"--environment",
|
|
default=os.getenv("NODE_ENVIRONMENT", "development"),
|
|
help="Node environment"
|
|
)
|
|
parser.add_argument(
|
|
"--roles",
|
|
default=os.getenv("NODE_ROLES", "gpu,ai_runtime"),
|
|
help="Node roles (comma-separated)"
|
|
)
|
|
parser.add_argument(
|
|
"--hostname",
|
|
default=os.getenv("NODE_HOSTNAME"),
|
|
help="Node hostname"
|
|
)
|
|
parser.add_argument(
|
|
"--interval",
|
|
type=int,
|
|
default=int(os.getenv("GUARDIAN_INTERVAL", DEFAULT_INTERVAL)),
|
|
help=f"Check interval in seconds (default: {DEFAULT_INTERVAL})"
|
|
)
|
|
parser.add_argument(
|
|
"--once",
|
|
action="store_true",
|
|
help="Run single check and exit"
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Validate required params
|
|
if not args.node_id:
|
|
logger.error("NODE_ID is required (--node-id or $NODE_ID)")
|
|
sys.exit(1)
|
|
|
|
if not args.node_name:
|
|
args.node_name = f"Node {args.node_id}"
|
|
|
|
roles = [r.strip() for r in args.roles.split(",") if r.strip()]
|
|
|
|
if args.once:
|
|
# Single check mode
|
|
async def single_check():
|
|
guardian = NodeGuardian(
|
|
node_id=args.node_id,
|
|
node_name=args.node_name,
|
|
city_url=args.city_url,
|
|
environment=args.environment,
|
|
roles=roles,
|
|
hostname=args.hostname
|
|
)
|
|
|
|
result = await guardian.run_health_check()
|
|
await guardian.close()
|
|
|
|
print(json.dumps(result, indent=2))
|
|
|
|
if not result["healthy"]:
|
|
sys.exit(1)
|
|
|
|
asyncio.run(single_check())
|
|
else:
|
|
# Loop mode
|
|
asyncio.run(run_guardian_loop(
|
|
node_id=args.node_id,
|
|
node_name=args.node_name,
|
|
city_url=args.city_url,
|
|
environment=args.environment,
|
|
roles=roles,
|
|
hostname=args.hostname,
|
|
interval=args.interval
|
|
))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
|