feat: Node Self-Healing, DAGI Audit, Agent Prompts, Infra Invariants

### Backend (city-service)
- Node Registry + Self-Healing API (migration 039)
- Improved get_all_nodes() with robust fallback for node_registry/node_cache
- Agent Prompts Runtime API for DAGI Router integration
- DAGI Router Audit endpoints (phantom/stale detection)
- Node Agents API (Guardian/Steward)
- Node metrics extended (CPU/GPU/RAM/Disk)

### Frontend (apps/web)
- Node Directory with improved error handling
- Node Cabinet with metrics cards
- DAGI Router Card component
- Node Metrics Card component
- useDAGIAudit hook

### Scripts
- check-invariants.py - deploy verification
- node-bootstrap.sh - node self-registration
- node-guardian-loop.py - continuous self-healing
- dagi_agent_audit.py - DAGI audit utility

### Migrations
- 034: Agent prompts seed
- 035: Agent DAGI audit
- 036: Node metrics extended
- 037: Node agents complete
- 038: Agent prompts full coverage
- 039: Node registry self-healing

### Tests
- test_infra_smoke.py
- test_agent_prompts_runtime.py
- test_dagi_router_api.py

### Documentation
- DEPLOY_CHECKLIST_2024_11_30.md
- Multiple TASK_PHASE docs
This commit is contained in:
Apple
2025-11-30 13:52:01 -08:00
parent 0c7836af5a
commit bca81dc719
36 changed files with 10630 additions and 55 deletions

View File

@@ -4,7 +4,8 @@ City Backend API Routes
from fastapi import APIRouter, HTTPException, Depends, Body, Header, Query, Request, UploadFile, File, Form
from pydantic import BaseModel
from typing import List, Optional
from typing import List, Optional, Dict
from datetime import datetime, timezone
import logging
import httpx
import os
@@ -512,12 +513,52 @@ docker compose up -d
@public_router.get("/nodes")
async def list_nodes():
"""Список всіх нод мережі"""
"""Список всіх нод мережі з метриками"""
try:
from models_city import NodeMetrics, NodeAgentSummary
nodes = await repo_city.get_all_nodes()
items: List[NodeProfile] = []
for node in nodes:
# Build guardian agent
guardian_agent = None
if node.get("guardian_agent"):
guardian_agent = NodeAgentSummary(
id=node["guardian_agent"]["id"],
name=node["guardian_agent"].get("name"),
slug=node["guardian_agent"].get("slug")
)
# Build steward agent
steward_agent = None
if node.get("steward_agent"):
steward_agent = NodeAgentSummary(
id=node["steward_agent"]["id"],
name=node["steward_agent"].get("name"),
slug=node["steward_agent"].get("slug")
)
# Build metrics
metrics = None
if node.get("metrics"):
m = node["metrics"]
metrics = NodeMetrics(
cpu_model=m.get("cpu_model"),
cpu_cores=m.get("cpu_cores", 0),
cpu_usage=m.get("cpu_usage", 0.0),
gpu_model=m.get("gpu_model"),
gpu_vram_total=m.get("gpu_vram_total", 0),
gpu_vram_used=m.get("gpu_vram_used", 0),
ram_total=m.get("ram_total", 0),
ram_used=m.get("ram_used", 0),
disk_total=m.get("disk_total", 0),
disk_used=m.get("disk_used", 0),
agent_count_router=m.get("agent_count_router", 0),
agent_count_system=m.get("agent_count_system", 0),
dagi_router_url=m.get("dagi_router_url")
)
items.append(NodeProfile(
node_id=node["node_id"],
name=node["name"],
@@ -528,12 +569,17 @@ async def list_nodes():
gpu_info=node.get("gpu"),
agents_total=node.get("agents_total", 0),
agents_online=node.get("agents_online", 0),
last_heartbeat=str(node["last_heartbeat"]) if node.get("last_heartbeat") else None
last_heartbeat=str(node["last_heartbeat"]) if node.get("last_heartbeat") else None,
guardian_agent=guardian_agent,
steward_agent=steward_agent,
metrics=metrics
))
return {"items": items, "total": len(items)}
except Exception as e:
logger.error(f"Failed to list nodes: {e}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail="Failed to list nodes")
@@ -3210,3 +3256,907 @@ async def ensure_orchestrator_room(
except Exception as e:
logger.error(f"Error ensuring orchestrator room for {slug}: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
# =============================================================================
# DAGI Agent Audit API
# =============================================================================
class DAGIAuditSummary(BaseModel):
    """Summary counters of one DAGI audit run for a node."""
    node_id: str
    timestamp: str
    router_total: int
    db_total: int
    active_count: int
    # NOTE(review): "phantom" / "stale" presumably mean router-only and DB-only
    # agents respectively — confirm against the repository layer.
    phantom_count: int
    stale_count: int
    triggered_by: Optional[str] = None
class DAGIAgentStatus(BaseModel):
    """Status of a single agent as seen by the DAGI audit."""
    id: str
    name: str
    external_id: Optional[str] = None
    kind: Optional[str] = None
    status: str  # active, stale, phantom
    dagi_status: Optional[str] = None
    last_seen_at: Optional[str] = None
    router_id: Optional[str] = None
    reason: Optional[str] = None
class DAGIAuditResponse(BaseModel):
    """Full DAGI audit report: the summary plus per-status agent lists."""
    summary: DAGIAuditSummary
    active_agents: List[DAGIAgentStatus]
    phantom_agents: List[DAGIAgentStatus]
    stale_agents: List[DAGIAgentStatus]
@router.get("/internal/node/{node_id}/dagi-audit", response_model=Optional[DAGIAuditSummary])
async def get_node_dagi_audit(node_id: str):
    """
    Return the summary of the latest DAGI audit report for a node.

    Responds with ``null`` when the repository has no audit for the node.
    Any failure (including a missing required key) is logged and turned
    into an HTTP 500.
    """
    required_keys = (
        "node_id",
        "timestamp",
        "router_total",
        "db_total",
        "active_count",
        "phantom_count",
        "stale_count",
    )
    try:
        latest = await repo_city.get_latest_dagi_audit(node_id)
        if latest:
            fields = {key: latest[key] for key in required_keys}
            return DAGIAuditSummary(triggered_by=latest.get("triggered_by"), **fields)
        return None
    except Exception as exc:
        logger.error(f"Error getting DAGI audit for {node_id}: {exc}")
        raise HTTPException(status_code=500, detail="Failed to get DAGI audit")
@router.get("/internal/node/{node_id}/dagi-audit/full")
async def get_node_dagi_audit_full(node_id: str):
    """
    Return the full (detailed) latest DAGI audit report for a node.

    Raises HTTP 404 when no audit exists and HTTP 500 when the repository
    call fails.
    """
    try:
        report = await repo_city.get_latest_dagi_audit(node_id)
    except HTTPException:
        # Pass through HTTP errors untouched.
        raise
    except Exception as exc:
        logger.error(f"Error getting full DAGI audit for {node_id}: {exc}")
        raise HTTPException(status_code=500, detail="Failed to get DAGI audit")

    if report:
        return report
    raise HTTPException(status_code=404, detail="No audit found for this node")
@router.get("/internal/node/{node_id}/dagi-audit/history")
async def get_node_dagi_audit_history(
    node_id: str,
    # ge=1 rejects zero/negative limits at validation time (HTTP 422)
    # instead of forwarding them to the repository query.
    limit: int = Query(default=10, ge=1, le=100)
):
    """
    Return the history of DAGI audit reports for a node.

    Args:
        node_id: Identifier of the node.
        limit: Maximum number of reports to return (1..100, default 10).

    Returns:
        A dict with ``node_id`` and the ``history`` returned by the repository.

    Raises:
        HTTPException: 500 when the repository call fails.
    """
    try:
        history = await repo_city.get_dagi_audit_history(node_id, limit)
        return {"node_id": node_id, "history": history}
    except Exception as e:
        logger.error(f"Error getting DAGI audit history for {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to get audit history")
@router.get("/internal/node/{node_id}/agents/system")
async def get_node_system_agents(node_id: str):
    """
    Return the agents stored in the DB for a node (used by the DAGI audit).

    Raises HTTP 500 when the repository call fails.
    """
    try:
        rows = await repo_city.get_agents_by_node_for_audit(node_id)
        return dict(node_id=node_id, total=len(rows), agents=rows)
    except Exception as exc:
        logger.error(f"Error getting system agents for {node_id}: {exc}")
        raise HTTPException(status_code=500, detail="Failed to get system agents")
def _dagi_audit_router_agents(router_config) -> list:
    """Extract agent entries from a parsed router-config document."""
    # yaml.safe_load returns None for an empty file; treat any non-mapping
    # document (or a non-mapping "agents" section) as "no agents".
    if not isinstance(router_config, dict):
        return []
    section = router_config.get("agents")
    if not isinstance(section, dict):
        return []
    return [
        {
            # YAML keys are not guaranteed to be strings.
            "id": str(agent_id),
            "name": str(agent_id),
            "description": agent_data.get("description", "") if isinstance(agent_data, dict) else "",
        }
        for agent_id, agent_data in section.items()
    ]


def _dagi_audit_normalize(name) -> str:
    """Lower-case a name and drop spaces, dashes and underscores."""
    return str(name or "").lower().replace(" ", "").replace("-", "").replace("_", "")


def _dagi_audit_classify(router_agents: list, db_agents: list) -> tuple:
    """Match router agents against DB agents; return (active, phantom, stale)."""
    db_by_ext_id = {}
    db_by_name = {}
    for db_agent in db_agents:
        external_id = db_agent.get("external_id")
        if external_id:
            # Only the part after the last ":" is compared with the router id.
            db_by_ext_id[str(external_id).split(":")[-1].lower()] = db_agent
        if db_agent.get("name"):
            db_by_name[_dagi_audit_normalize(db_agent["name"])] = db_agent

    active = []
    phantom = []
    matched_db_ids = set()
    for r_agent in router_agents:
        # External-id match wins; normalized-name match is the fallback.
        db_match = (
            db_by_ext_id.get(r_agent["id"].lower())
            or db_by_name.get(_dagi_audit_normalize(r_agent["name"]))
        )
        if db_match:
            active.append({
                "router_id": r_agent["id"],
                "router_name": r_agent["name"],
                "db_id": db_match["id"],
                "db_name": db_match.get("name"),
                "db_external_id": db_match.get("external_id"),
                "kind": db_match.get("kind"),
                "status": db_match.get("status", "unknown")
            })
            matched_db_ids.add(db_match["id"])
        else:
            phantom.append({
                "router_id": r_agent["id"],
                "router_name": r_agent["name"],
                "description": r_agent.get("description", ""),
                "reason": "In Router config but not in DB"
            })

    stale = [
        {
            "db_id": db_agent["id"],
            "db_name": db_agent.get("name"),
            "db_external_id": db_agent.get("external_id"),
            "kind": db_agent.get("kind"),
            "reason": "In DB but not in Router config"
        }
        for db_agent in db_agents
        if db_agent["id"] not in matched_db_ids
    ]
    return active, phantom, stale


@router.post("/internal/node/{node_id}/dagi-audit/run")
async def run_node_dagi_audit(
    node_id: str,
    request: Request
):
    """
    Run a DAGI audit for a node.

    Compares the agents declared in router-config.yml with the agents in the
    DB, stores the resulting report and updates the agents' DAGI status.
    The audit logic runs inline in this endpoint (MVP); in production it is
    better delegated to a worker/celery.

    Args:
        node_id: Identifier of the node the report is stored for.
        request: Incoming request (currently unused).

    Returns:
        A dict with ``status``, ``report_id``, ``summary`` and ``message``.

    Raises:
        HTTPException: 404 when router-config.yml is missing, 500 on any
            other failure.
    """
    import yaml
    from pathlib import Path
    from datetime import datetime, timezone

    try:
        # Locate router-config.yml three directories above this file.
        project_root = Path(__file__).parent.parent.parent
        config_path = project_root / "router-config.yml"

        if not config_path.exists():
            raise HTTPException(status_code=404, detail="router-config.yml not found")

        with open(config_path, 'r', encoding='utf-8') as f:
            router_config = yaml.safe_load(f)

        router_agents = _dagi_audit_router_agents(router_config)

        # NOTE(review): this fetches all DB agents, not only those of node_id —
        # confirm this is intended.
        db_agents = await repo_city.get_all_agents_for_audit() or []

        active, phantom, stale = _dagi_audit_classify(router_agents, db_agents)

        report = {
            "node_id": node_id,
            # Same "naive ISO + Z" format as before, without the deprecated utcnow().
            "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
            "summary": {
                "router_total": len(router_agents),
                "db_total": len(db_agents),
                "active_count": len(active),
                "phantom_count": len(phantom),
                "stale_count": len(stale)
            },
            "active_agents": active,
            "phantom_agents": phantom,
            "stale_agents": stale
        }

        # Persist the report.
        saved = await repo_city.save_dagi_audit_report(node_id, report, triggered_by="api")

        # Update agent statuses.
        if active:
            active_ids = [a["db_id"] for a in active]
            await repo_city.update_agents_dagi_status(active_ids, "active", update_last_seen=True)

        if stale:
            stale_ids = [a["db_id"] for a in stale]
            await repo_city.update_agents_dagi_status(stale_ids, "stale")

        return {
            "status": "completed",
            "report_id": saved["id"],
            "summary": report["summary"],
            "message": f"Audit completed: {len(active)} active, {len(phantom)} phantom, {len(stale)} stale"
        }

    except HTTPException:
        raise
    except Exception as e:
        # logger.exception records the traceback through the logging system.
        logger.exception(f"Error running DAGI audit for {node_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to run DAGI audit: {str(e)}")
# =============================================================================
# DAGI Router Agents API (for Node Cabinet Table)
# =============================================================================
class DAGIRouterAgentItem(BaseModel):
    """Agent row for the DAGI Router table."""
    id: str
    name: str
    role: Optional[str] = None
    status: str  # active, phantom, stale, error
    node_id: Optional[str] = None
    models: List[str] = []
    gpu: Optional[str] = None
    cpu: Optional[str] = None
    last_seen_at: Optional[str] = None
    has_cabinet: bool = False
    cabinet_slug: Optional[str] = None
    description: Optional[str] = None
    has_prompts: bool = False  # Whether system prompts exist in the DB
class DAGIRouterAgentsSummary(BaseModel):
    """Summary counters for DAGI Router agents; every counter defaults to 0."""
    active: int = 0
    phantom: int = 0
    stale: int = 0
    router_total: int = 0
    system_total: int = 0
class DAGIRouterAgentsResponse(BaseModel):
    """Response of the DAGI Router Agents API."""
    node_id: str
    last_audit_at: Optional[str] = None
    summary: DAGIRouterAgentsSummary
    agents: List[DAGIRouterAgentItem]
@router.get("/internal/node/{node_id}/dagi-router/agents", response_model=DAGIRouterAgentsResponse)
async def get_dagi_router_agents(node_id: str):
    """
    Return DAGI Router agents for the Node Cabinet table.

    Produces a unified list of agents with their statuses. On any failure
    the error is logged and an empty response for the node is returned
    instead of an HTTP error.
    """
    try:
        payload = await repo_city.get_dagi_router_agents_for_node(node_id)
        counters = DAGIRouterAgentsSummary(**payload["summary"])
        rows = [DAGIRouterAgentItem(**row) for row in payload["agents"]]
        return DAGIRouterAgentsResponse(
            node_id=payload["node_id"],
            last_audit_at=payload.get("last_audit_at"),
            summary=counters,
            agents=rows,
        )
    except Exception as exc:
        logger.error(f"Error getting DAGI router agents for {node_id}: {exc}")

    # Best-effort fallback: empty response on error.
    return DAGIRouterAgentsResponse(
        node_id=node_id,
        last_audit_at=None,
        summary=DAGIRouterAgentsSummary(),
        agents=[],
    )
# =============================================================================
# Node Metrics API
# =============================================================================
class NodeMetricsResponse(BaseModel):
    """Current metrics of a node; only node_id is required."""
    node_id: str
    node_name: Optional[str] = None
    hostname: Optional[str] = None
    status: Optional[str] = "unknown"
    environment: Optional[str] = None
    cpu_model: Optional[str] = None
    cpu_cores: int = 0
    cpu_usage: float = 0.0
    gpu_model: Optional[str] = None
    # NOTE(review): NodeMetricsUpdateRequest names the GPU field gpu_vram_used
    # while this model uses gpu_memory_* — confirm the repository returns the
    # keys expected here.
    gpu_memory_total: int = 0
    gpu_memory_used: int = 0
    ram_total: int = 0
    ram_used: int = 0
    disk_total: int = 0
    disk_used: int = 0
    agent_count_router: int = 0
    agent_count_system: int = 0
    last_heartbeat: Optional[str] = None
@router.get("/internal/node/{node_id}/metrics/current", response_model=NodeMetricsResponse)
async def get_node_metrics_current(node_id: str):
    """
    Return the current metrics of a node.

    Single source of truth for the Node Cabinet indicators. An unknown node
    or a failing lookup yields a minimal response carrying only the node id.
    """
    try:
        current = await repo_city.get_node_metrics_current(node_id)
        if current:
            return NodeMetricsResponse(**current)
    except Exception as exc:
        logger.error(f"Error getting metrics for {node_id}: {exc}")

    # Minimal response: unknown node or lookup error.
    return NodeMetricsResponse(node_id=node_id)
class NodeMetricsUpdateRequest(BaseModel):
    """Request body for a metrics update; every field is optional."""
    cpu_usage: Optional[float] = None
    gpu_vram_used: Optional[int] = None
    ram_used: Optional[int] = None
    disk_used: Optional[int] = None
    agent_count_router: Optional[int] = None
    agent_count_system: Optional[int] = None
@router.post("/internal/node/{node_id}/metrics/update")
async def update_node_metrics(
    node_id: str,
    metrics: NodeMetricsUpdateRequest
):
    """
    Update node metrics (heartbeat).

    Called by the agent running on the node. Only the fields explicitly set
    in the request body are forwarded to the repository.
    """
    try:
        provided = metrics.dict(exclude_unset=True)
        found = await repo_city.update_node_metrics(node_id, provided)
        outcome = "updated" if found else "not_found"
        return {"status": outcome, "node_id": node_id}
    except Exception as exc:
        logger.error(f"Error updating metrics for {node_id}: {exc}")
        raise HTTPException(status_code=500, detail="Failed to update metrics")
# =============================================================================
# Phantom / Stale Autosync API
# =============================================================================
class PhantomSyncRequest(BaseModel):
    """Request to synchronize phantom agents."""
    agent_ids: List[str]
@router.post("/internal/node/{node_id}/dagi-router/phantom/sync")
async def sync_phantom_agents(
    node_id: str,
    request: PhantomSyncRequest
):
    """
    Synchronize phantom agents: create them in the DB from router-config.

    Raises HTTP 404 when router-config.yml is missing and HTTP 500 on any
    other failure.
    """
    import yaml
    from pathlib import Path

    try:
        # router-config.yml lives three directories above this file.
        config_file = Path(__file__).parent.parent.parent / "router-config.yml"
        if not config_file.exists():
            raise HTTPException(status_code=404, detail="router-config.yml not found")

        with open(config_file, 'r', encoding='utf-8') as stream:
            parsed_config = yaml.safe_load(stream)

        # Create the requested agents from the router config.
        new_agents = await repo_city.sync_phantom_agents(
            node_id,
            request.agent_ids,
            parsed_config
        )
        return dict(
            status="completed",
            node_id=node_id,
            created_count=len(new_agents),
            created_agents=new_agents,
        )
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error syncing phantom agents for {node_id}: {exc}")
        raise HTTPException(status_code=500, detail=f"Failed to sync phantom agents: {str(exc)}")
class StaleSyncRequest(BaseModel):
    """Request to mark agents as stale."""
    agent_ids: List[str]
@router.post("/internal/node/{node_id}/dagi-router/stale/mark")
async def mark_stale_agents(
    node_id: str,
    request: StaleSyncRequest
):
    """
    Mark agents as stale (present in the DB but not in the Router).

    The node id is only echoed back in the response; the repository call
    receives just the agent ids.
    """
    try:
        marked = await repo_city.mark_stale_agents(request.agent_ids)
    except Exception as exc:
        logger.error(f"Error marking stale agents for {node_id}: {exc}")
        raise HTTPException(status_code=500, detail=f"Failed to mark stale agents: {str(exc)}")
    return dict(status="completed", node_id=node_id, marked_count=marked)
# =============================================================================
# Node Agents API (for Node Cabinet)
# =============================================================================
class NodeAgentItem(BaseModel):
    """Node agent entry for the Node Cabinet."""
    id: str
    name: str
    slug: Optional[str] = None
    kind: Optional[str] = None
    role: Optional[str] = None  # node_guardian, node_steward, etc.
    status: str = "unknown"
    dagi_status: Optional[str] = None
    last_seen_at: Optional[str] = None
    is_guardian: bool = False
    is_steward: bool = False
class NodeAgentsResponse(BaseModel):
    """List of a node's agents, with optional guardian and steward entries."""
    node_id: str
    total: int
    guardian: Optional[NodeAgentItem] = None
    steward: Optional[NodeAgentItem] = None
    agents: List[NodeAgentItem]
@router.get("/internal/node/{node_id}/agents", response_model=NodeAgentsResponse)
async def get_node_agents(node_id: str):
    """
    Return all agents of a node (Guardian, Steward, runtime agents).

    The first agent flagged as guardian / steward is exposed separately.
    On any failure the error is logged and an empty list is returned.
    """
    try:
        rows = await repo_city.get_node_agents(node_id)

        collected = []
        guardian = None
        steward = None

        for row in rows:
            kind = row.get("kind")
            seen = row.get("last_seen_at")
            entry = NodeAgentItem(
                id=row["id"],
                name=row.get("display_name") or row.get("name") or row["id"],
                slug=row.get("public_slug") or row["id"],
                kind=kind,
                role=kind,  # kind doubles as role for now
                status=row.get("status", "unknown"),
                dagi_status=row.get("dagi_status"),
                last_seen_at=seen.isoformat() if seen else None,
                is_guardian=row.get("is_node_guardian", False) or kind == "node_guardian",
                is_steward=row.get("is_node_steward", False) or kind == "node_steward",
            )
            collected.append(entry)

            # Keep only the first guardian / steward encountered.
            if not guardian and entry.is_guardian:
                guardian = entry
            if not steward and entry.is_steward:
                steward = entry

        return NodeAgentsResponse(
            node_id=node_id,
            total=len(collected),
            guardian=guardian,
            steward=steward,
            agents=collected,
        )
    except Exception as exc:
        logger.error(f"Error getting node agents for {node_id}: {exc}")
        return NodeAgentsResponse(node_id=node_id, total=0, agents=[])
# =============================================================================
# Agent Runtime Prompts API (for DAGI Router integration)
# =============================================================================
class RuntimePromptsResponse(BaseModel):
    """Runtime prompts for DAGI Router"""
    agent_id: str
    has_prompts: bool
    # Maps a prompt name to its content; the fallbacks in this module use
    # the keys core, safety, governance and tools.
    prompts: Dict[str, Optional[str]]
class RuntimeSystemPromptResponse(BaseModel):
    """Full runtime system prompt for DAGI Router"""
    agent_id: str
    agent_name: Optional[str] = None
    agent_kind: Optional[str] = None
    has_prompts: bool
    system_prompt: str  # the assembled prompt text
    prompts: Dict[str, Optional[str]]  # individual prompt parts by name
class AgentPromptsStatusRequest(BaseModel):
    """Request to check prompts status for multiple agents"""
    agent_ids: List[str]  # ids of the agents to check
class AgentPromptsStatusResponse(BaseModel):
    """Response with prompts status for multiple agents"""
    status: Dict[str, bool]  # agent id -> whether prompts exist
@router.get("/internal/agents/{agent_id}/prompts/runtime", response_model=RuntimePromptsResponse)
async def get_agent_runtime_prompts(agent_id: str):
    """
    Return an agent's runtime prompts for DAGI Router.

    Only the prompt contents are returned, without metadata; DAGI Router
    uses them to build the system prompt. On failure the error is logged
    and a response with ``has_prompts=False`` and empty prompts is returned.
    """
    try:
        payload = await repo_city.get_runtime_prompts(agent_id)
        return RuntimePromptsResponse(**payload)
    except Exception as exc:
        logger.error(f"Error getting runtime prompts for {agent_id}: {exc}")

    blank_prompts = {"core": None, "safety": None, "governance": None, "tools": None}
    return RuntimePromptsResponse(agent_id=agent_id, has_prompts=False, prompts=blank_prompts)
@router.get("/internal/agents/{agent_id}/system-prompt", response_model=RuntimeSystemPromptResponse)
async def get_agent_system_prompt(agent_id: str):
    """
    Return the assembled system prompt for an agent.

    DAGI Router calls this endpoint to obtain the full system prompt, which
    includes core, safety, governance, tools and context. An unknown agent
    or a failing lookup yields a generic fallback prompt instead of an error.
    """
    try:
        record = await repo_city.get_agent_with_runtime_prompt(agent_id)
        if record:
            return RuntimeSystemPromptResponse(**record)

        # Unknown agent: generic prompt that mentions the requested id.
        return RuntimeSystemPromptResponse(
            agent_id=agent_id,
            agent_name=None,
            agent_kind=None,
            has_prompts=False,
            system_prompt=f"You are an AI agent (ID: {agent_id}) in DAARION.city. Be helpful and accurate.",
            prompts={"core": None, "safety": None, "governance": None, "tools": None},
        )
    except Exception as exc:
        logger.error(f"Error getting system prompt for {agent_id}: {exc}")
        import traceback
        traceback.print_exc()
        return RuntimeSystemPromptResponse(
            agent_id=agent_id,
            has_prompts=False,
            system_prompt=f"You are an AI agent in DAARION.city. Be helpful and accurate.",
            prompts={"core": None, "safety": None, "governance": None, "tools": None},
        )
@router.post("/internal/agents/prompts/status", response_model=AgentPromptsStatusResponse)
async def check_agents_prompts_status(request: AgentPromptsStatusRequest):
    """
    Check whether prompts exist for a list of agents.

    Used by the UI to show ``has_prompts`` indicators in agent tables.
    On failure every requested agent is reported as having no prompts.
    """
    try:
        found = await repo_city.check_agents_prompts_status(request.agent_ids)
        return AgentPromptsStatusResponse(status=found)
    except Exception as exc:
        logger.error(f"Error checking prompts status: {exc}")
        all_missing = dict.fromkeys(request.agent_ids, False)
        return AgentPromptsStatusResponse(status=all_missing)
# =============================================================================
# Node Self-Registration & Self-Healing API
# =============================================================================
class NodeSelfRegisterRequest(BaseModel):
    """Request body for node self-registration"""
    id: str  # forwarded to the repository as node_id
    name: str
    hostname: Optional[str] = None
    environment: str = "development"
    roles: List[str] = []
    description: Optional[str] = None
class NodeSelfRegisterResponse(BaseModel):
    """Response for node self-registration"""
    success: bool
    node_id: str
    is_new: bool = False
    message: str = ""
    error: Optional[str] = None  # error text when registration failed
class NodeHeartbeatRequest(BaseModel):
    """Request body for node heartbeat"""
    metrics: Optional[Dict] = None  # passed to the repository unchanged
class NodeHeartbeatResponse(BaseModel):
    """Response for node heartbeat"""
    success: bool
    node_id: Optional[str] = None
    heartbeat_at: Optional[str] = None
    error: Optional[str] = None
    # Signal for the Node Guardian to perform self-registration.
    should_self_register: bool = False
class NodeSelfHealingStatusResponse(BaseModel):
    """Response for node self-healing status"""
    node_id: str
    registered: bool
    is_active: Optional[bool] = None
    name: Optional[str] = None
    self_healing_status: str = "unknown"
    last_heartbeat: Optional[str] = None
    last_self_registration: Optional[str] = None
    self_registration_count: int = 0
    agent_count_router: int = 0
    agent_count_system: int = 0
    has_guardian: bool = False
    has_steward: bool = False
    errors: List[Dict] = []
    # The status endpoint sets status="error" and fills error when the
    # lookup itself fails.
    status: Optional[str] = None
    error: Optional[str] = None
class NodesNeedingHealingResponse(BaseModel):
    """Response listing nodes that need healing"""
    nodes: List[Dict]
    total: int  # the endpoint sets this to len(nodes)
@router.post("/internal/nodes/register-or-update", response_model=NodeSelfRegisterResponse)
async def node_self_register(request: NodeSelfRegisterRequest):
    """
    Node self-registration.

    This endpoint is called by:
    - the Node Bootstrap script when a node starts
    - the Node Guardian when it detects the node vanished from the Directory

    An already registered node gets its data updated; a new node gets a
    record created in node_registry. Failures are reported in the response
    body (``success=False``) rather than as an HTTP error.
    """
    try:
        outcome = await repo_city.node_self_register(
            node_id=request.id,
            name=request.name,
            hostname=request.hostname,
            environment=request.environment,
            roles=request.roles,
            description=request.description,
        )
        response_fields = {
            "success": outcome.get("success", False),
            "node_id": outcome.get("node_id", request.id),
            "is_new": outcome.get("is_new", False),
            "message": outcome.get("message", ""),
            "error": outcome.get("error"),
        }
        return NodeSelfRegisterResponse(**response_fields)
    except Exception as exc:
        logger.error(f"Node self-registration failed for {request.id}: {exc}")
        return NodeSelfRegisterResponse(
            success=False,
            node_id=request.id,
            message="Registration failed",
            error=str(exc),
        )
@router.post("/internal/node/{node_id}/heartbeat", response_model=NodeHeartbeatResponse)
async def node_heartbeat(node_id: str, request: NodeHeartbeatRequest = NodeHeartbeatRequest()):
    """
    Node heartbeat with a metrics update.

    Returns ``should_self_register=True`` when the node is not registered,
    which signals the Node Guardian to perform self-registration. Failures
    are reported in the response body rather than as an HTTP error.
    """
    try:
        outcome = await repo_city.node_heartbeat(node_id=node_id, metrics=request.metrics)
        response_fields = {
            "success": outcome.get("success", False),
            "node_id": outcome.get("node_id"),
            "heartbeat_at": outcome.get("heartbeat_at"),
            "error": outcome.get("error"),
            "should_self_register": outcome.get("should_self_register", False),
        }
        return NodeHeartbeatResponse(**response_fields)
    except Exception as exc:
        logger.error(f"Heartbeat failed for {node_id}: {exc}")
        return NodeHeartbeatResponse(success=False, node_id=node_id, error=str(exc))
@router.get("/internal/node/{node_id}/self-healing/status", response_model=NodeSelfHealingStatusResponse)
async def get_node_self_healing_status(node_id: str):
    """
    Return the self-healing status of a node.

    Used by the Node Guardian to monitor the node's state. When the lookup
    fails, an unregistered/"error" status carrying the error text is
    returned instead of an HTTP error.
    """
    try:
        state = await repo_city.get_node_self_healing_status(node_id)
        return NodeSelfHealingStatusResponse(**state)
    except Exception as exc:
        logger.error(f"Failed to get self-healing status for {node_id}: {exc}")
        failure = dict(node_id=node_id, registered=False, status="error", error=str(exc))
        return NodeSelfHealingStatusResponse(**failure)
@router.get("/internal/node/{node_id}/directory-check")
async def check_node_in_directory(node_id: str):
    """
    Check whether a node is visible in the Node Directory.

    Simple endpoint for the Node Guardian self-healing loop.

    Returns:
        A dict with ``node_id``, ``visible_in_directory`` and ``checked_at``
        (UTC, ISO 8601). When the check fails, ``visible_in_directory`` is
        False and an ``error`` key carries the error text; ``checked_at`` is
        included in that case too so both responses share the same shape.
    """
    try:
        visible = await repo_city.check_node_in_directory(node_id)
        return {
            "node_id": node_id,
            "visible_in_directory": visible,
            "checked_at": datetime.now(timezone.utc).isoformat()
        }
    except Exception as e:
        logger.error(f"Directory check failed for {node_id}: {e}")
        return {
            "node_id": node_id,
            "visible_in_directory": False,
            "checked_at": datetime.now(timezone.utc).isoformat(),
            "error": str(e)
        }
@router.get("/internal/nodes/needing-healing", response_model=NodesNeedingHealingResponse)
async def get_nodes_needing_healing():
    """
    Return the list of nodes that need self-healing.

    Used for monitoring and automatic healing. On failure the error is
    logged and an empty list is returned.
    """
    try:
        pending = await repo_city.get_nodes_needing_healing()
        return NodesNeedingHealingResponse(nodes=pending, total=len(pending))
    except Exception as exc:
        logger.error(f"Failed to get nodes needing healing: {exc}")
        return NodesNeedingHealingResponse(nodes=[], total=0)
@router.post("/internal/node/{node_id}/self-healing/trigger")
async def trigger_node_self_healing(node_id: str):
    """
    Trigger self-healing for a node.

    The operation:
    1. Checks the node's current state
    2. Performs self-registration when the node is not registered
    3. Checks Directory visibility and updates the self-healing status

    Returns:
        A dict with ``node_id``, ``triggered_at``, ``actions_taken``,
        ``final_status`` and ``visible_in_directory``.

    Raises:
        HTTPException: 500 when any step fails. The failure is also recorded
            as the node's self-healing status on a best-effort basis.
    """
    try:
        # Check current state
        status = await repo_city.get_node_self_healing_status(node_id)

        actions_taken = []

        if not status.get("registered"):
            # Need to register
            # NOTE(review): the substring test also matches ids such as
            # "node-10" — confirm this environment heuristic is intended.
            result = await repo_city.node_self_register(
                node_id=node_id,
                name=f"Auto-healed node {node_id}",
                environment="production" if "node-1" in node_id else "development"
            )
            actions_taken.append({
                "action": "self_register",
                "result": result
            })

        # Check if visible in directory
        visible = await repo_city.check_node_in_directory(node_id)

        if not visible:
            actions_taken.append({
                "action": "visibility_check",
                "result": {"visible": False, "needs_manual_intervention": True}
            })

        # Update healing status
        final_status = "healthy" if visible else "needs_attention"
        await repo_city.update_node_self_healing_status(
            node_id=node_id,
            status=final_status
        )

        return {
            "node_id": node_id,
            "triggered_at": datetime.now(timezone.utc).isoformat(),
            "actions_taken": actions_taken,
            "final_status": final_status,
            "visible_in_directory": visible
        }

    except Exception as e:
        logger.error(f"Self-healing trigger failed for {node_id}: {e}")

        # Record the error, but never let a failure of the bookkeeping call
        # mask the original error or bypass the HTTP 500 below.
        try:
            await repo_city.update_node_self_healing_status(
                node_id=node_id,
                status="error",
                error=str(e)
            )
        except Exception as record_error:
            logger.error(f"Failed to record self-healing error for {node_id}: {record_error}")

        raise HTTPException(status_code=500, detail=f"Self-healing failed: {e}")