feat: Node Self-Healing, DAGI Audit, Agent Prompts, Infra Invariants

### Backend (city-service)
- Node Registry + Self-Healing API (migration 039)
- Improved get_all_nodes() with robust fallback for node_registry/node_cache
- Agent Prompts Runtime API for DAGI Router integration
- DAGI Router Audit endpoints (phantom/stale detection)
- Node Agents API (Guardian/Steward)
- Node metrics extended (CPU/GPU/RAM/Disk)

### Frontend (apps/web)
- Node Directory with improved error handling
- Node Cabinet with metrics cards
- DAGI Router Card component
- Node Metrics Card component
- useDAGIAudit hook

### Scripts
- check-invariants.py - deploy verification
- node-bootstrap.sh - node self-registration
- node-guardian-loop.py - continuous self-healing
- dagi_agent_audit.py - DAGI audit utility

### Migrations
- 034: Agent prompts seed
- 035: Agent DAGI audit
- 036: Node metrics extended
- 037: Node agents complete
- 038: Agent prompts full coverage
- 039: Node registry self-healing

### Tests
- test_infra_smoke.py
- test_agent_prompts_runtime.py
- test_dagi_router_api.py

### Documentation
- DEPLOY_CHECKLIST_2024_11_30.md
- Multiple TASK_PHASE docs
This commit is contained in:
Apple
2025-11-30 13:52:01 -08:00
parent 0c7836af5a
commit bca81dc719
36 changed files with 10630 additions and 55 deletions

View File

@@ -239,6 +239,23 @@ class NodeMicrodaoSummary(BaseModel):
rooms_count: int = 0
class NodeMetrics(BaseModel):
    """Hardware and agent-count metrics attached to a Node Directory card."""

    # Processor
    cpu_model: Optional[str] = None
    cpu_cores: int = 0
    cpu_usage: float = 0.0

    # Graphics card
    gpu_model: Optional[str] = None
    gpu_vram_total: int = 0
    gpu_vram_used: int = 0

    # Memory and storage (units are not stated in this model — TODO confirm)
    ram_total: int = 0
    ram_used: int = 0
    disk_total: int = 0
    disk_used: int = 0

    # Agent counts as seen by the router config and by the system database
    agent_count_router: int = 0
    agent_count_system: int = 0

    dagi_router_url: Optional[str] = None
class NodeProfile(BaseModel):
"""Node profile for Node Directory"""
node_id: str
@@ -256,6 +273,7 @@ class NodeProfile(BaseModel):
guardian_agent: Optional[NodeAgentSummary] = None
steward_agent: Optional[NodeAgentSummary] = None
microdaos: List[NodeMicrodaoSummary] = []
metrics: Optional[NodeMetrics] = None
class ModelBindings(BaseModel):

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,8 @@ City Backend API Routes
from fastapi import APIRouter, HTTPException, Depends, Body, Header, Query, Request, UploadFile, File, Form
from pydantic import BaseModel
from typing import List, Optional
from typing import List, Optional, Dict
from datetime import datetime, timezone
import logging
import httpx
import os
@@ -512,12 +513,52 @@ docker compose up -d
@public_router.get("/nodes")
async def list_nodes():
"""Список всіх нод мережі"""
"""Список всіх нод мережі з метриками"""
try:
from models_city import NodeMetrics, NodeAgentSummary
nodes = await repo_city.get_all_nodes()
items: List[NodeProfile] = []
for node in nodes:
# Build guardian agent
guardian_agent = None
if node.get("guardian_agent"):
guardian_agent = NodeAgentSummary(
id=node["guardian_agent"]["id"],
name=node["guardian_agent"].get("name"),
slug=node["guardian_agent"].get("slug")
)
# Build steward agent
steward_agent = None
if node.get("steward_agent"):
steward_agent = NodeAgentSummary(
id=node["steward_agent"]["id"],
name=node["steward_agent"].get("name"),
slug=node["steward_agent"].get("slug")
)
# Build metrics
metrics = None
if node.get("metrics"):
m = node["metrics"]
metrics = NodeMetrics(
cpu_model=m.get("cpu_model"),
cpu_cores=m.get("cpu_cores", 0),
cpu_usage=m.get("cpu_usage", 0.0),
gpu_model=m.get("gpu_model"),
gpu_vram_total=m.get("gpu_vram_total", 0),
gpu_vram_used=m.get("gpu_vram_used", 0),
ram_total=m.get("ram_total", 0),
ram_used=m.get("ram_used", 0),
disk_total=m.get("disk_total", 0),
disk_used=m.get("disk_used", 0),
agent_count_router=m.get("agent_count_router", 0),
agent_count_system=m.get("agent_count_system", 0),
dagi_router_url=m.get("dagi_router_url")
)
items.append(NodeProfile(
node_id=node["node_id"],
name=node["name"],
@@ -528,12 +569,17 @@ async def list_nodes():
gpu_info=node.get("gpu"),
agents_total=node.get("agents_total", 0),
agents_online=node.get("agents_online", 0),
last_heartbeat=str(node["last_heartbeat"]) if node.get("last_heartbeat") else None
last_heartbeat=str(node["last_heartbeat"]) if node.get("last_heartbeat") else None,
guardian_agent=guardian_agent,
steward_agent=steward_agent,
metrics=metrics
))
return {"items": items, "total": len(items)}
except Exception as e:
logger.error(f"Failed to list nodes: {e}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail="Failed to list nodes")
@@ -3210,3 +3256,907 @@ async def ensure_orchestrator_room(
except Exception as e:
logger.error(f"Error ensuring orchestrator room for {slug}: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
# =============================================================================
# DAGI Agent Audit API
# =============================================================================
class DAGIAuditSummary(BaseModel):
    """Summary counters of a single DAGI audit run."""

    node_id: str
    timestamp: str

    # Totals on each side of the comparison
    router_total: int
    db_total: int

    # Classification counts
    active_count: int
    phantom_count: int
    stale_count: int

    triggered_by: Optional[str] = None
class DAGIAgentStatus(BaseModel):
    """Status of one agent with respect to DAGI."""

    id: str
    name: str
    external_id: Optional[str] = None
    kind: Optional[str] = None
    status: str  # one of: active, stale, phantom
    dagi_status: Optional[str] = None
    last_seen_at: Optional[str] = None
    router_id: Optional[str] = None
    reason: Optional[str] = None
class DAGIAuditResponse(BaseModel):
    """Full DAGI audit report: the summary plus per-category agent lists."""

    summary: DAGIAuditSummary
    active_agents: List[DAGIAgentStatus]
    phantom_agents: List[DAGIAgentStatus]
    stale_agents: List[DAGIAgentStatus]
@router.get("/internal/node/{node_id}/dagi-audit", response_model=Optional[DAGIAuditSummary])
async def get_node_dagi_audit(node_id: str):
    """
    Return the summary of the latest DAGI audit report for a node.

    Responds with ``null`` when the repository has no audit for the node and
    with HTTP 500 when the lookup or the summary construction fails.
    """
    required_fields = (
        "node_id",
        "timestamp",
        "router_total",
        "db_total",
        "active_count",
        "phantom_count",
        "stale_count",
    )
    try:
        latest = await repo_city.get_latest_dagi_audit(node_id)
        if latest:
            # Required fields are read with [] so a missing key is an error.
            values = {field: latest[field] for field in required_fields}
            return DAGIAuditSummary(triggered_by=latest.get("triggered_by"), **values)
        return None
    except Exception as e:
        logger.error(f"Error getting DAGI audit for {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to get DAGI audit")
@router.get("/internal/node/{node_id}/dagi-audit/full")
async def get_node_dagi_audit_full(node_id: str):
    """
    Return the latest DAGI audit report for a node, including details.

    Raises HTTP 404 when no audit exists for the node and HTTP 500 when the
    repository lookup fails.
    """
    try:
        report = await repo_city.get_latest_dagi_audit(node_id)
    except HTTPException:
        # Pass HTTP errors through untouched.
        raise
    except Exception as e:
        logger.error(f"Error getting full DAGI audit for {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to get DAGI audit")

    if report:
        return report
    raise HTTPException(status_code=404, detail="No audit found for this node")
@router.get("/internal/node/{node_id}/dagi-audit/history")
async def get_node_dagi_audit_history(
    node_id: str,
    # ge=1 rejects zero/negative limits at validation time instead of
    # forwarding a meaningless value to the repository query.
    limit: int = Query(default=10, ge=1, le=100)
):
    """
    Return the history of DAGI audit reports for a node.

    Args:
        node_id: Identifier of the node whose audit history is requested.
        limit: Maximum number of reports to return (1..100, default 10).

    Returns:
        A dict with ``node_id`` and ``history`` (whatever the repository
        returns for this node and limit).

    Raises:
        HTTPException: 500 when the repository lookup fails.
    """
    try:
        history = await repo_city.get_dagi_audit_history(node_id, limit)
        return {"node_id": node_id, "history": history}
    except Exception as e:
        logger.error(f"Error getting DAGI audit history for {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to get audit history")
@router.get("/internal/node/{node_id}/agents/system")
async def get_node_system_agents(node_id: str):
    """
    Return the agents stored in the database for a node (used by DAGI audit).

    Raises HTTP 500 when the repository lookup fails.
    """
    try:
        db_agents = await repo_city.get_agents_by_node_for_audit(node_id)
        payload = {"node_id": node_id}
        payload["total"] = len(db_agents)
        payload["agents"] = db_agents
        return payload
    except Exception as e:
        logger.error(f"Error getting system agents for {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to get system agents")
def _normalize_agent_name(name: Optional[str]) -> str:
    """Lower-case a name and drop spaces, hyphens and underscores; None becomes ''."""
    return (name or "").lower().replace(" ", "").replace("-", "").replace("_", "")


def _load_router_agents(config_path) -> List[dict]:
    """Parse router-config.yml and return its agents as id/name/description dicts."""
    import yaml

    with open(config_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat that as "no agents".
        router_config = yaml.safe_load(f) or {}

    router_agents = []
    for agent_id, agent_data in (router_config.get("agents") or {}).items():
        # An agent declared with an empty body parses to None, not a dict.
        description = agent_data.get("description", "") if isinstance(agent_data, dict) else ""
        router_agents.append({
            "id": str(agent_id),
            "name": str(agent_id),
            "description": description
        })
    return router_agents


def _match_router_and_db_agents(router_agents: List[dict], db_agents: List[dict]):
    """Split agents into (active, phantom, stale) lists by matching router ids to DB rows."""
    # Index DB agents by the last ":"-separated segment of external_id (lower-cased).
    db_by_ext_id = {}
    for a in db_agents:
        external_id = a.get("external_id")
        if external_id:
            db_by_ext_id[external_id.split(":")[-1].lower()] = a
    # Agents without a name cannot be matched by name, so they are not indexed.
    db_by_name = {_normalize_agent_name(a["name"]): a for a in db_agents if a.get("name")}

    active = []
    phantom = []
    matched_db_ids = set()

    for r_agent in router_agents:
        r_id = r_agent["id"].lower()
        # external_id match takes precedence over the fuzzy name match.
        db_match = db_by_ext_id.get(r_id) or db_by_name.get(_normalize_agent_name(r_agent["name"]))
        if db_match:
            active.append({
                "router_id": r_agent["id"],
                "router_name": r_agent["name"],
                "db_id": db_match["id"],
                "db_name": db_match["name"],
                "db_external_id": db_match.get("external_id"),
                "kind": db_match.get("kind"),
                "status": db_match.get("status", "unknown")
            })
            matched_db_ids.add(db_match["id"])
        else:
            phantom.append({
                "router_id": r_agent["id"],
                "router_name": r_agent["name"],
                "description": r_agent.get("description", ""),
                "reason": "In Router config but not in DB"
            })

    stale = [
        {
            "db_id": db_agent["id"],
            "db_name": db_agent.get("name"),
            "db_external_id": db_agent.get("external_id"),
            "kind": db_agent.get("kind"),
            "reason": "In DB but not in Router config"
        }
        for db_agent in db_agents
        if db_agent["id"] not in matched_db_ids
    ]
    return active, phantom, stale


@router.post("/internal/node/{node_id}/dagi-audit/run")
async def run_node_dagi_audit(
    node_id: str,
    request: Request
):
    """
    Run a DAGI audit for a node.

    Compares the agents declared in router-config.yml with the agents in the
    database, stores the resulting report and updates the agents' DAGI status.
    The audit logic runs inline in this endpoint (MVP); in production it would
    be better delegated to a worker/celery.

    Args:
        node_id: Node the report is stored under.
        request: Incoming request (currently unused by the audit logic).

    Returns:
        A dict with ``status``, ``report_id``, ``summary`` and ``message``.

    Raises:
        HTTPException: 404 when router-config.yml is missing, 500 on any
            other failure.
    """
    from pathlib import Path
    from datetime import datetime, timezone

    try:
        # Locate router-config.yml three directory levels above this file.
        project_root = Path(__file__).parent.parent.parent
        config_path = project_root / "router-config.yml"

        if not config_path.exists():
            raise HTTPException(status_code=404, detail="router-config.yml not found")

        # NOTE: synchronous file read inside an async handler (acceptable for MVP).
        router_agents = _load_router_agents(config_path)

        # NOTE(review): the DB side is not filtered by node_id here — confirm intended.
        db_agents = await repo_city.get_all_agents_for_audit()

        active, phantom, stale = _match_router_and_db_agents(router_agents, db_agents)

        report = {
            "node_id": node_id,
            # Same "...Z" format as before, without the deprecated utcnow().
            "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "summary": {
                "router_total": len(router_agents),
                "db_total": len(db_agents),
                "active_count": len(active),
                "phantom_count": len(phantom),
                "stale_count": len(stale)
            },
            "active_agents": active,
            "phantom_agents": phantom,
            "stale_agents": stale
        }

        # Persist the report.
        saved = await repo_city.save_dagi_audit_report(node_id, report, triggered_by="api")

        # Propagate the classification to the agents' DAGI status.
        if active:
            active_ids = [a["db_id"] for a in active]
            await repo_city.update_agents_dagi_status(active_ids, "active", update_last_seen=True)

        if stale:
            stale_ids = [a["db_id"] for a in stale]
            await repo_city.update_agents_dagi_status(stale_ids, "stale")

        return {
            "status": "completed",
            "report_id": saved["id"],
            "summary": report["summary"],
            "message": f"Audit completed: {len(active)} active, {len(phantom)} phantom, {len(stale)} stale"
        }

    except HTTPException:
        raise
    except Exception as e:
        # logger.exception records the traceback through the logging system.
        logger.exception(f"Error running DAGI audit for {node_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to run DAGI audit: {str(e)}") from e
# =============================================================================
# DAGI Router Agents API (for Node Cabinet Table)
# =============================================================================
class DAGIRouterAgentItem(BaseModel):
    """One agent row of the DAGI Router table."""

    id: str
    name: str
    role: Optional[str] = None
    status: str  # one of: active, phantom, stale, error
    node_id: Optional[str] = None
    models: List[str] = []
    gpu: Optional[str] = None
    cpu: Optional[str] = None
    last_seen_at: Optional[str] = None

    # Link to the agent's cabinet, when it has one
    has_cabinet: bool = False
    cabinet_slug: Optional[str] = None

    description: Optional[str] = None
    has_prompts: bool = False  # whether system prompts exist in the DB
class DAGIRouterAgentsSummary(BaseModel):
    """Counters shown alongside the DAGI Router agents list; all default to 0."""

    # Per-status counts
    active: int = 0
    phantom: int = 0
    stale: int = 0

    # Totals on the router side and the system side
    router_total: int = 0
    system_total: int = 0
class DAGIRouterAgentsResponse(BaseModel):
    """Response body of the DAGI Router Agents API."""

    node_id: str
    last_audit_at: Optional[str] = None
    summary: DAGIRouterAgentsSummary
    agents: List[DAGIRouterAgentItem]
@router.get("/internal/node/{node_id}/dagi-router/agents", response_model=DAGIRouterAgentsResponse)
async def get_dagi_router_agents(node_id: str):
    """
    Return the DAGI Router agents for the Node Cabinet table.

    Produces a unified list of agents with their statuses. Any failure is
    logged and answered with an empty response for the node instead of an
    HTTP error.
    """
    try:
        payload = await repo_city.get_dagi_router_agents_for_node(node_id)
        resolved_node_id = payload["node_id"]
        audited_at = payload.get("last_audit_at")
        counters = DAGIRouterAgentsSummary(**payload["summary"])
        rows = [DAGIRouterAgentItem(**raw) for raw in payload["agents"]]
        return DAGIRouterAgentsResponse(
            node_id=resolved_node_id,
            last_audit_at=audited_at,
            summary=counters,
            agents=rows
        )
    except Exception as e:
        logger.error(f"Error getting DAGI router agents for {node_id}: {e}")
        # Best effort: the caller gets an empty table rather than an error.
        return DAGIRouterAgentsResponse(
            node_id=node_id,
            last_audit_at=None,
            summary=DAGIRouterAgentsSummary(),
            agents=[]
        )
# =============================================================================
# Node Metrics API
# =============================================================================
class NodeMetricsResponse(BaseModel):
    """Current metrics of a node; every field except node_id has a default."""

    # Identity
    node_id: str
    node_name: Optional[str] = None
    hostname: Optional[str] = None
    status: Optional[str] = "unknown"
    environment: Optional[str] = None

    # Processor
    cpu_model: Optional[str] = None
    cpu_cores: int = 0
    cpu_usage: float = 0.0

    # Graphics card
    # NOTE(review): NodeMetricsUpdateRequest names the GPU counter
    # gpu_vram_used — confirm the gpu_memory_* naming here is intended.
    gpu_model: Optional[str] = None
    gpu_memory_total: int = 0
    gpu_memory_used: int = 0

    # Memory and storage
    ram_total: int = 0
    ram_used: int = 0
    disk_total: int = 0
    disk_used: int = 0

    # Agent counts
    agent_count_router: int = 0
    agent_count_system: int = 0

    last_heartbeat: Optional[str] = None
@router.get("/internal/node/{node_id}/metrics/current", response_model=NodeMetricsResponse)
async def get_node_metrics_current(node_id: str):
    """
    Return the current metrics of a node.

    Single source of truth for the Node Cabinet indicators. An unknown node
    or a failed lookup yields a response carrying only ``node_id`` and the
    model defaults.
    """
    try:
        current = await repo_city.get_node_metrics_current(node_id)
        if current:
            return NodeMetricsResponse(**current)
        # Nothing stored for this node: answer with the defaults.
        return NodeMetricsResponse(node_id=node_id)
    except Exception as e:
        logger.error(f"Error getting metrics for {node_id}: {e}")
        return NodeMetricsResponse(node_id=node_id)
class NodeMetricsUpdateRequest(BaseModel):
    """Request body for a metrics update; every field is optional."""

    # Usage counters
    cpu_usage: Optional[float] = None
    gpu_vram_used: Optional[int] = None
    ram_used: Optional[int] = None
    disk_used: Optional[int] = None

    # Agent counts
    agent_count_router: Optional[int] = None
    agent_count_system: Optional[int] = None
@router.post("/internal/node/{node_id}/metrics/update")
async def update_node_metrics(
    node_id: str,
    metrics: NodeMetricsUpdateRequest
):
    """
    Update the metrics of a node (heartbeat).

    Called by the agent running on the node. Only the fields explicitly
    present in the request are forwarded to the repository. Raises HTTP 500
    when the update fails.
    """
    try:
        changed_fields = metrics.dict(exclude_unset=True)
        was_updated = await repo_city.update_node_metrics(node_id, changed_fields)
        outcome = "updated" if was_updated else "not_found"
        return {"status": outcome, "node_id": node_id}
    except Exception as e:
        logger.error(f"Error updating metrics for {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to update metrics")
# =============================================================================
# Phantom / Stale Autosync API
# =============================================================================
class PhantomSyncRequest(BaseModel):
    """Request body for synchronising phantom agents."""

    # Identifiers of the agents to synchronise
    agent_ids: List[str]
@router.post("/internal/node/{node_id}/dagi-router/phantom/sync")
async def sync_phantom_agents(
    node_id: str,
    request: PhantomSyncRequest
):
    """
    Synchronise phantom agents: create them in the DB based on router-config.

    Args:
        node_id: Node the agents are synchronised for.
        request: Body carrying the ``agent_ids`` to synchronise.

    Returns:
        A dict with ``status``, ``node_id``, ``created_count`` and
        ``created_agents``.

    Raises:
        HTTPException: 404 when router-config.yml is missing, 500 on any
            other failure.
    """
    import yaml
    from pathlib import Path

    try:
        # router-config.yml is expected three directory levels above this file.
        project_root = Path(__file__).parent.parent.parent
        config_path = project_root / "router-config.yml"

        if not config_path.exists():
            raise HTTPException(status_code=404, detail="router-config.yml not found")

        with open(config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; hand the
            # repository an empty mapping instead of None.
            router_config = yaml.safe_load(f) or {}

        created = await repo_city.sync_phantom_agents(
            node_id,
            request.agent_ids,
            router_config
        )

        return {
            "status": "completed",
            "node_id": node_id,
            "created_count": len(created),
            "created_agents": created
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error syncing phantom agents for {node_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to sync phantom agents: {str(e)}") from e
class StaleSyncRequest(BaseModel):
    """Request body for marking agents as stale."""

    # Identifiers of the agents to mark
    agent_ids: List[str]
@router.post("/internal/node/{node_id}/dagi-router/stale/mark")
async def mark_stale_agents(
    node_id: str,
    request: StaleSyncRequest
):
    """
    Mark agents as stale (present in the DB but not in the Router).

    Returns the number of agents reported as marked by the repository.
    Raises HTTP 500 when the update fails.
    """
    try:
        marked = await repo_city.mark_stale_agents(request.agent_ids)
        response = {"status": "completed", "node_id": node_id}
        response["marked_count"] = marked
        return response
    except Exception as e:
        logger.error(f"Error marking stale agents for {node_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to mark stale agents: {str(e)}")
# =============================================================================
# Node Agents API (для Node Cabinet)
# =============================================================================
class NodeAgentItem(BaseModel):
    """One agent of a node, as shown in the Node Cabinet."""

    id: str
    name: str
    slug: Optional[str] = None
    kind: Optional[str] = None
    role: Optional[str] = None  # e.g. node_guardian, node_steward
    status: str = "unknown"
    dagi_status: Optional[str] = None
    last_seen_at: Optional[str] = None

    # Special node roles
    is_guardian: bool = False
    is_steward: bool = False
class NodeAgentsResponse(BaseModel):
    """List of a node's agents, with guardian and steward singled out."""

    node_id: str
    total: int
    guardian: Optional[NodeAgentItem] = None
    steward: Optional[NodeAgentItem] = None
    agents: List[NodeAgentItem]
@router.get("/internal/node/{node_id}/agents", response_model=NodeAgentsResponse)
async def get_node_agents(node_id: str):
    """
    Return all agents of a node (Guardian, Steward, runtime agents).

    The first agent flagged as guardian and the first flagged as steward are
    also exposed separately. Any failure is logged and answered with an empty
    list instead of an HTTP error.
    """
    try:
        rows = await repo_city.get_node_agents(node_id)

        items = []
        guardian = None
        steward = None

        for row in rows:
            entry = NodeAgentItem(
                id=row["id"],
                # Prefer the display name, then the plain name, then the id.
                name=row.get("display_name") or row.get("name") or row["id"],
                slug=row.get("public_slug") or row["id"],
                kind=row.get("kind"),
                role=row.get("kind"),  # kind doubles as role for now
                status=row.get("status", "unknown"),
                dagi_status=row.get("dagi_status"),
                last_seen_at=row.get("last_seen_at").isoformat() if row.get("last_seen_at") else None,
                # Either the explicit flag or the agent kind marks the role.
                is_guardian=row.get("is_node_guardian", False) or row.get("kind") == "node_guardian",
                is_steward=row.get("is_node_steward", False) or row.get("kind") == "node_steward"
            )
            items.append(entry)

            # Keep only the first guardian / steward encountered.
            if not guardian and entry.is_guardian:
                guardian = entry
            if not steward and entry.is_steward:
                steward = entry

        return NodeAgentsResponse(
            node_id=node_id,
            total=len(items),
            guardian=guardian,
            steward=steward,
            agents=items
        )
    except Exception as e:
        logger.error(f"Error getting node agents for {node_id}: {e}")
        return NodeAgentsResponse(node_id=node_id, total=0, agents=[])
# =============================================================================
# Agent Runtime Prompts API (for DAGI Router integration)
# =============================================================================
class RuntimePromptsResponse(BaseModel):
    """Runtime prompts of an agent, as consumed by the DAGI Router."""

    agent_id: str
    has_prompts: bool
    # Prompt text by name; a value may be None
    prompts: Dict[str, Optional[str]]
class RuntimeSystemPromptResponse(BaseModel):
    """Assembled runtime system prompt of an agent, for the DAGI Router."""

    # Agent identity
    agent_id: str
    agent_name: Optional[str] = None
    agent_kind: Optional[str] = None

    has_prompts: bool
    system_prompt: str
    # Individual prompt parts by name; a value may be None
    prompts: Dict[str, Optional[str]]
class AgentPromptsStatusRequest(BaseModel):
    """Request body for checking prompt availability of several agents."""

    # Identifiers of the agents to check
    agent_ids: List[str]
class AgentPromptsStatusResponse(BaseModel):
    """Prompt availability for several agents."""

    # Maps a string key (the agent id in the visible caller) to a boolean flag
    status: Dict[str, bool]
@router.get("/internal/agents/{agent_id}/prompts/runtime", response_model=RuntimePromptsResponse)
async def get_agent_runtime_prompts(agent_id: str):
    """
    Return an agent's runtime prompts for the DAGI Router.

    Only the prompt content is returned, without metadata; the DAGI Router
    uses it to build a system prompt. On failure the error is logged and a
    response with ``has_prompts=False`` and empty prompt slots is returned.
    """
    try:
        stored = await repo_city.get_runtime_prompts(agent_id)
        return RuntimePromptsResponse(**stored)
    except Exception as e:
        logger.error(f"Error getting runtime prompts for {agent_id}: {e}")
        empty_slots = {"core": None, "safety": None, "governance": None, "tools": None}
        return RuntimePromptsResponse(
            agent_id=agent_id,
            has_prompts=False,
            prompts=empty_slots
        )
@router.get("/internal/agents/{agent_id}/system-prompt", response_model=RuntimeSystemPromptResponse)
async def get_agent_system_prompt(agent_id: str):
    """
    Return the assembled system prompt for an agent.

    The DAGI Router calls this endpoint to obtain the full system prompt,
    which includes core, safety, governance, tools and context. An unknown
    agent or a failed lookup yields a generic fallback prompt with
    ``has_prompts=False`` rather than an HTTP error.
    """
    try:
        record = await repo_city.get_agent_with_runtime_prompt(agent_id)

        if record:
            return RuntimeSystemPromptResponse(**record)

        # Unknown agent: generic prompt that still mentions the requested id.
        return RuntimeSystemPromptResponse(
            agent_id=agent_id,
            agent_name=None,
            agent_kind=None,
            has_prompts=False,
            system_prompt=f"You are an AI agent (ID: {agent_id}) in DAARION.city. Be helpful and accurate.",
            prompts={"core": None, "safety": None, "governance": None, "tools": None}
        )
    except Exception as e:
        logger.error(f"Error getting system prompt for {agent_id}: {e}")
        import traceback
        traceback.print_exc()
        # Lookup failed: generic prompt without the id.
        return RuntimeSystemPromptResponse(
            agent_id=agent_id,
            has_prompts=False,
            system_prompt="You are an AI agent in DAARION.city. Be helpful and accurate.",
            prompts={"core": None, "safety": None, "governance": None, "tools": None}
        )
@router.post("/internal/agents/prompts/status", response_model=AgentPromptsStatusResponse)
async def check_agents_prompts_status(request: AgentPromptsStatusRequest):
    """
    Check whether prompts exist for a list of agents.

    Used by the UI to show ``has_prompts`` indicators in agent tables. When
    the lookup fails, every requested agent is reported as ``False``.
    """
    try:
        found = await repo_city.check_agents_prompts_status(request.agent_ids)
        return AgentPromptsStatusResponse(status=found)
    except Exception as e:
        logger.error(f"Error checking prompts status: {e}")
        all_missing = dict.fromkeys(request.agent_ids, False)
        return AgentPromptsStatusResponse(status=all_missing)
# =============================================================================
# Node Self-Registration & Self-Healing API
# =============================================================================
class NodeSelfRegisterRequest(BaseModel):
    """Payload a node sends to register itself."""

    # Required identity
    id: str
    name: str

    # Optional details; environment defaults to "development"
    hostname: Optional[str] = None
    environment: str = "development"
    roles: List[str] = []
    description: Optional[str] = None
class NodeSelfRegisterResponse(BaseModel):
    """Outcome of a node self-registration attempt."""

    success: bool
    node_id: str
    is_new: bool = False
    message: str = ""
    error: Optional[str] = None
class NodeHeartbeatRequest(BaseModel):
    """Payload of a node heartbeat."""

    # Free-form metrics mapping; may be omitted
    metrics: Optional[Dict] = None
class NodeHeartbeatResponse(BaseModel):
    """Outcome of a node heartbeat."""

    success: bool
    node_id: Optional[str] = None
    heartbeat_at: Optional[str] = None
    error: Optional[str] = None
    # Asks the caller to perform self-registration
    should_self_register: bool = False
class NodeSelfHealingStatusResponse(BaseModel):
    """Self-healing state of a node."""

    # Registration
    node_id: str
    registered: bool
    is_active: Optional[bool] = None
    name: Optional[str] = None

    # Self-healing bookkeeping
    self_healing_status: str = "unknown"
    last_heartbeat: Optional[str] = None
    last_self_registration: Optional[str] = None
    self_registration_count: int = 0

    # Agents
    agent_count_router: int = 0
    agent_count_system: int = 0
    has_guardian: bool = False
    has_steward: bool = False

    # Error reporting
    errors: List[Dict] = []
    status: Optional[str] = None
    error: Optional[str] = None
class NodesNeedingHealingResponse(BaseModel):
    """Nodes that currently need healing, with their count."""

    nodes: List[Dict]
    total: int
@router.post("/internal/nodes/register-or-update", response_model=NodeSelfRegisterResponse)
async def node_self_register(request: NodeSelfRegisterRequest):
    """
    Node self-registration.

    This endpoint is called by:
    - the Node Bootstrap script when a node starts
    - the Node Guardian when it detects the node vanished from the Directory

    An already registered node gets its data updated; a new node gets a
    record created in node_registry. Failures are logged and reported in the
    response body (``success=False``) rather than as an HTTP error.
    """
    try:
        outcome = await repo_city.node_self_register(
            node_id=request.id,
            name=request.name,
            hostname=request.hostname,
            environment=request.environment,
            roles=request.roles,
            description=request.description
        )
    except Exception as e:
        logger.error(f"Node self-registration failed for {request.id}: {e}")
        return NodeSelfRegisterResponse(
            success=False,
            node_id=request.id,
            message="Registration failed",
            error=str(e)
        )

    # Fields missing from the repository result fall back to safe defaults.
    return NodeSelfRegisterResponse(
        success=outcome.get("success", False),
        node_id=outcome.get("node_id", request.id),
        is_new=outcome.get("is_new", False),
        message=outcome.get("message", ""),
        error=outcome.get("error")
    )
@router.post("/internal/node/{node_id}/heartbeat", response_model=NodeHeartbeatResponse)
async def node_heartbeat(node_id: str, request: NodeHeartbeatRequest = NodeHeartbeatRequest()):
    """
    Node heartbeat with a metrics update.

    Returns ``should_self_register=True`` when the node is not registered,
    which signals the Node Guardian to perform self-registration. Failures
    are logged and reported in the response body (``success=False``).
    """
    try:
        outcome = await repo_city.node_heartbeat(
            node_id=node_id,
            metrics=request.metrics
        )
        reply_fields = {
            "success": outcome.get("success", False),
            "node_id": outcome.get("node_id"),
            "heartbeat_at": outcome.get("heartbeat_at"),
            "error": outcome.get("error"),
            "should_self_register": outcome.get("should_self_register", False),
        }
        return NodeHeartbeatResponse(**reply_fields)
    except Exception as e:
        logger.error(f"Heartbeat failed for {node_id}: {e}")
        return NodeHeartbeatResponse(success=False, node_id=node_id, error=str(e))
@router.get("/internal/node/{node_id}/self-healing/status", response_model=NodeSelfHealingStatusResponse)
async def get_node_self_healing_status(node_id: str):
    """
    Return the self-healing status of a node.

    Used by the Node Guardian to monitor the node's state. On failure the
    error is logged and a response with ``registered=False`` and
    ``status="error"`` is returned.
    """
    try:
        state = await repo_city.get_node_self_healing_status(node_id)
        return NodeSelfHealingStatusResponse(**state)
    except Exception as e:
        logger.error(f"Failed to get self-healing status for {node_id}: {e}")
        failure = {
            "node_id": node_id,
            "registered": False,
            "status": "error",
            "error": str(e),
        }
        return NodeSelfHealingStatusResponse(**failure)
@router.get("/internal/node/{node_id}/directory-check")
async def check_node_in_directory(node_id: str):
    """
    Check whether a node is visible in the Node Directory.

    Simple endpoint for the Node Guardian self-healing loop.

    Returns:
        A dict with ``node_id``, ``visible_in_directory`` and ``checked_at``
        (UTC, ISO 8601). When the check itself fails, the node is reported as
        not visible and an ``error`` message is added; ``checked_at`` is
        present in both cases so the payload shape stays consistent.
    """
    try:
        visible = await repo_city.check_node_in_directory(node_id)
        return {
            "node_id": node_id,
            "visible_in_directory": visible,
            "checked_at": datetime.now(timezone.utc).isoformat()
        }
    except Exception as e:
        logger.error(f"Directory check failed for {node_id}: {e}")
        return {
            "node_id": node_id,
            "visible_in_directory": False,
            "checked_at": datetime.now(timezone.utc).isoformat(),
            "error": str(e)
        }
@router.get("/internal/nodes/needing-healing", response_model=NodesNeedingHealingResponse)
async def get_nodes_needing_healing():
    """
    Return the nodes that need self-healing.

    Used for monitoring and automatic healing. When the lookup fails, the
    error is logged and an empty list is returned.
    """
    try:
        pending = await repo_city.get_nodes_needing_healing()
        count = len(pending)
        return NodesNeedingHealingResponse(nodes=pending, total=count)
    except Exception as e:
        logger.error(f"Failed to get nodes needing healing: {e}")
        return NodesNeedingHealingResponse(nodes=[], total=0)
@router.post("/internal/node/{node_id}/self-healing/trigger")
async def trigger_node_self_healing(node_id: str):
    """
    Trigger self-healing for a node.

    The operation:
    1. Checks the node's current state.
    2. Performs self-registration when the node is not registered.
    3. Checks directory visibility and updates the self-healing status.

    Returns:
        A dict with ``node_id``, ``triggered_at`` (UTC, ISO 8601),
        ``actions_taken``, ``final_status`` and ``visible_in_directory``.

    Raises:
        HTTPException: 500 when any step fails. The failure is also recorded
            as the node's self-healing status on a best-effort basis.
    """
    try:
        # Check current state
        status = await repo_city.get_node_self_healing_status(node_id)

        actions_taken = []

        if not status.get("registered"):
            # NOTE(review): the environment is guessed from a substring test,
            # which also matches ids such as "node-10" — confirm intended.
            result = await repo_city.node_self_register(
                node_id=node_id,
                name=f"Auto-healed node {node_id}",
                environment="production" if "node-1" in node_id else "development"
            )
            actions_taken.append({
                "action": "self_register",
                "result": result
            })

        # Check if visible in directory
        visible = await repo_city.check_node_in_directory(node_id)

        if not visible:
            actions_taken.append({
                "action": "visibility_check",
                "result": {"visible": False, "needs_manual_intervention": True}
            })

        # Update healing status
        final_status = "healthy" if visible else "needs_attention"
        await repo_city.update_node_self_healing_status(
            node_id=node_id,
            status=final_status
        )

        return {
            "node_id": node_id,
            "triggered_at": datetime.now(timezone.utc).isoformat(),
            "actions_taken": actions_taken,
            "final_status": final_status,
            "visible_in_directory": visible
        }

    except Exception as e:
        logger.error(f"Self-healing trigger failed for {node_id}: {e}")

        # Recording the error is best effort: if the repository is what
        # failed, this call is likely to fail too and must not replace the
        # original error or bypass the HTTP 500 below.
        try:
            await repo_city.update_node_self_healing_status(
                node_id=node_id,
                status="error",
                error=str(e)
            )
        except Exception as record_error:
            logger.error(f"Failed to record self-healing error for {node_id}: {record_error}")

        raise HTTPException(status_code=500, detail=f"Self-healing failed: {e}") from e

View File

@@ -19,6 +19,7 @@ SWAPPER_URL = os.getenv("SWAPPER_URL", "http://192.168.1.33:8890")
STT_URL = os.getenv("STT_URL", "http://192.168.1.33:8895")
VISION_URL = os.getenv("VISION_URL", "http://192.168.1.33:11434")
OCR_URL = os.getenv("OCR_URL", "http://192.168.1.33:8896")
CITY_SERVICE_URL = os.getenv("CITY_SERVICE_URL", "http://daarion-city-service:7001")
# HTTP client for backend services
http_client: Optional[httpx.AsyncClient] = None
@@ -56,7 +57,27 @@ def load_config():
}
}
def load_router_config():
    """
    Load the main router-config.yml with agents and LLM profiles.

    Several candidate locations are tried in order (relative ones are
    resolved against the current working directory); the first existing file
    wins.

    Returns:
        The parsed configuration, or ``{"agents": {}}`` when no file is found
        or the file found is empty.
    """
    # Try multiple locations
    paths = [
        "router-config.yml",
        "/app/router-config.yml",
        "../router-config.yml",
        "../../router-config.yml"
    ]

    for path in paths:
        if os.path.exists(path):
            # Explicit encoding: the platform default may not be UTF-8.
            with open(path, 'r', encoding='utf-8') as f:
                loaded = yaml.safe_load(f)
            # Log only after parsing actually succeeded.
            logger.info(f"✅ Loaded router config from {path}")
            # An empty YAML document parses to None, but callers call .get()
            # on the result, so fall back to the empty config.
            return loaded or {"agents": {}}

    logger.warning("⚠️ router-config.yml not found, using empty config")
    return {"agents": {}}
config = load_config()
router_config = load_router_config()
@app.on_event("startup")
async def startup_event():
@@ -363,10 +384,30 @@ async def agent_infer(agent_id: str, request: InferRequest):
- Agent configuration (model, capabilities)
- Request type (text, vision, audio)
- Backend availability
System prompt is fetched from database via city-service API.
"""
logger.info(f"🔀 Inference request for agent: {agent_id}")
logger.info(f"📝 Prompt: {request.prompt[:100]}...")
# Get system prompt from database or config
system_prompt = request.system_prompt
if not system_prompt:
try:
from prompt_builder import get_agent_system_prompt
system_prompt = await get_agent_system_prompt(
agent_id,
city_service_url=CITY_SERVICE_URL,
router_config=router_config
)
logger.info(f"✅ Loaded system prompt from database for {agent_id}")
except Exception as e:
logger.warning(f"⚠️ Could not load prompt from database: {e}")
# Fallback to config
agent_config = router_config.get("agents", {}).get(agent_id, {})
system_prompt = agent_config.get("system_prompt")
# Determine which backend to use
model = request.model or "gpt-oss:latest"
@@ -389,7 +430,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
json={
"model": model,
"prompt": request.prompt,
"system": request.system_prompt,
"system": system_prompt,
"stream": False,
"options": {
"num_predict": request.max_tokens,

View File

@@ -0,0 +1,278 @@
"""
Prompt Builder for DAGI Router
Цей модуль відповідає за побудову system prompts для агентів,
використовуючи дані з БД через city-service API.
Частина Agent System Prompts MVP v2
"""
import httpx
import logging
from typing import Dict, Any, Optional
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class AgentSystemPrompt:
    """Outcome of building a system prompt for one agent."""

    agent_id: str
    agent_name: Optional[str]
    has_prompts: bool
    system_prompt: str
    # Origin of the prompt: "database", "fallback" or "config"
    source: str
class PromptBuilder:
    """
    Build system prompts for agents.

    Sources are tried in priority order:
    1. Prompts from the database (through the city-service API)
    2. Prompts from router-config.yml
    3. A generic fallback prompt

    Args:
        city_service_url: Base URL of city-service; a trailing slash is removed.
        router_config: Parsed router configuration. Only its "agents" mapping
            is read; ``None`` is treated as an empty configuration.
        timeout: Timeout in seconds for the lazily created HTTP client.
    """

    def __init__(
        self,
        city_service_url: str = "http://daarion-city-service:7001",
        router_config: Optional[Dict[str, Any]] = None,
        timeout: float = 10.0,
    ):
        self.city_service_url = city_service_url.rstrip("/")
        self.router_config = router_config or {}
        self._timeout = timeout
        self._http_client: Optional[httpx.AsyncClient] = None

    async def _get_http_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use."""
        if self._http_client is None:
            self._http_client = httpx.AsyncClient(timeout=self._timeout)
        return self._http_client

    async def close(self):
        """Close the HTTP client if one was created."""
        if self._http_client:
            await self._http_client.aclose()
            self._http_client = None

    async def get_system_prompt(self, agent_id: str) -> AgentSystemPrompt:
        """
        Get the system prompt for an agent.

        Tries the database first, then the router config, then the fallback.
        A database result is used only when it reports prompts AND carries
        non-blank text, so an empty record never shadows a configured prompt.

        Args:
            agent_id: Identifier of the agent.

        Returns:
            The resolved prompt together with its source.
        """
        # Try database first
        db_prompt = await self._fetch_from_database(agent_id)
        if db_prompt and db_prompt.has_prompts:
            if db_prompt.system_prompt.strip():
                logger.info(f"Using database prompt for agent {agent_id}")
                return db_prompt
            logger.warning(
                f"Database prompt for agent {agent_id} is empty, trying config"
            )

        # Try config
        config_prompt = self._get_from_config(agent_id)
        if config_prompt:
            logger.info(f"Using config prompt for agent {agent_id}")
            return config_prompt

        # Fallback
        logger.warning(f"No prompts found for agent {agent_id}, using fallback")
        return self._get_fallback_prompt(agent_id)

    async def _fetch_from_database(self, agent_id: str) -> Optional[AgentSystemPrompt]:
        """
        Fetch the system prompt from the city-service API.

        Returns:
            The prompt reported by city-service, or ``None`` on a non-200
            response or on any request/parsing error (errors are logged).
        """
        from urllib.parse import quote

        try:
            client = await self._get_http_client()
            # Escape the id so characters such as "/" or "?" cannot alter the
            # request path.
            agent_path = quote(agent_id, safe="")
            url = f"{self.city_service_url}/internal/agents/{agent_path}/system-prompt"
            response = await client.get(url)

            if response.status_code != 200:
                logger.warning(f"City service returned {response.status_code} for agent {agent_id}")
                return None

            data = response.json()
            # JSON nulls would otherwise end up as None in str-typed fields.
            return AgentSystemPrompt(
                agent_id=data.get("agent_id") or agent_id,
                agent_name=data.get("agent_name"),
                has_prompts=bool(data.get("has_prompts", False)),
                system_prompt=data.get("system_prompt") or "",
                source="database"
            )
        except httpx.RequestError as e:
            logger.error(f"Error fetching prompt from city-service: {e}")
            return None
        except Exception as e:
            # Best effort: malformed payloads must not break inference.
            logger.error(f"Unexpected error fetching prompt: {e}")
            return None

    def _get_from_config(self, agent_id: str) -> Optional[AgentSystemPrompt]:
        """
        Get the system prompt from the router config.

        Returns:
            The configured prompt, or ``None`` when the agent is not configured
            or has no usable (non-blank string) ``system_prompt``.
        """
        # "agents" may be present but null in the parsed config.
        agents = self.router_config.get("agents") or {}
        agent_config = agents.get(agent_id) if isinstance(agents, dict) else None
        if not isinstance(agent_config, dict):
            return None

        system_prompt = agent_config.get("system_prompt")
        if not isinstance(system_prompt, str) or not system_prompt.strip():
            return None

        return AgentSystemPrompt(
            agent_id=agent_id,
            agent_name=agent_config.get("description"),
            has_prompts=True,
            system_prompt=system_prompt.strip(),
            source="config"
        )

    def _get_fallback_prompt(self, agent_id: str) -> AgentSystemPrompt:
        """Generate the generic fallback prompt for an agent with no prompts."""
        fallback_prompt = (
            f"You are an AI agent (ID: {agent_id}) in the DAARION.city ecosystem.\n\n"
            "Guidelines:\n"
            "- Be helpful, accurate, and professional\n"
            "- Follow ethical guidelines and safety protocols\n"
            "- Respect user privacy and data protection\n"
            "- Ask for clarification when uncertain\n"
            "- Never execute harmful or unauthorized actions\n"
        )
        return AgentSystemPrompt(
            agent_id=agent_id,
            agent_name=None,
            has_prompts=False,
            system_prompt=fallback_prompt,
            source="fallback"
        )

    async def check_prompts_available(self, agent_ids: list[str]) -> Dict[str, bool]:
        """
        Check whether prompts are available for multiple agents.

        Asks city-service for the status of all ids in one request; any id the
        service did not report (including when the request fails) is resolved
        from the router config instead.

        Returns:
            Dict mapping each agent_id to its has_prompts flag.
        """
        result: Dict[str, bool] = {}
        try:
            client = await self._get_http_client()
            url = f"{self.city_service_url}/internal/agents/prompts/status"
            response = await client.post(url, json={"agent_ids": agent_ids})
            if response.status_code == 200:
                status = response.json().get("status")
                # Ignore a null or non-mapping "status" instead of failing later.
                if isinstance(status, dict):
                    result = dict(status)
        except Exception as e:
            logger.error(f"Error checking prompts status: {e}")

        # Fill missing with config check
        for agent_id in agent_ids:
            if agent_id not in result:
                result[agent_id] = self._get_from_config(agent_id) is not None
        return result
def build_system_prompt_from_parts(
    prompts: Dict[str, Optional[str]],
    agent_info: Optional[Dict[str, Any]] = None,
    context: Optional[Dict[str, Any]] = None
) -> str:
    """
    Assemble a system prompt from its individual sections.

    Standalone helper; it does not need a PromptBuilder instance.

    Args:
        prompts: Section texts keyed by "core", "safety", "governance", "tools".
        agent_info: Optional agent metadata (display_name, name, kind) used to
            generate an opening line when no "core" text is given.
        context: Optional runtime context (node, district, microdao, user_role).

    Returns:
        The sections joined with newlines, in the order: core, governance,
        safety, tools, current context.
    """
    # Opening section: explicit core text, else a line derived from metadata,
    # else a generic one.
    core_text = prompts.get("core")
    if core_text:
        sections = [core_text]
    elif agent_info:
        display = agent_info.get("display_name") or agent_info.get("name") or "Agent"
        role = agent_info.get("kind") or "assistant"
        sections = [
            f"You are {display}, an AI {role} in DAARION.city ecosystem. "
            f"Be helpful, accurate, and follow ethical guidelines."
        ]
    else:
        sections = ["You are an AI assistant. Be helpful and accurate."]

    # Optional titled sections, emitted in this fixed order.
    titled_sections = (
        ("governance", "\n\n## Governance\n"),
        ("safety", "\n\n## Safety Guidelines\n"),
        ("tools", "\n\n## Tools & Capabilities\n"),
    )
    for key, heading in titled_sections:
        text = prompts.get(key)
        if text:
            sections.append(heading + text)

    # Runtime context rendered as a bullet list.
    if context:
        named_entities = (
            ("node", "- **Node**: "),
            ("district", "- **District**: "),
            ("microdao", "- **MicroDAO**: "),
        )
        bullets = []
        for key, prefix in named_entities:
            entity = context.get(key)
            if entity:
                bullets.append(f"{prefix}{entity.get('name', 'Unknown')}")
        user_role = context.get("user_role")
        if user_role:
            bullets.append(f"- **User Role**: {user_role}")
        if bullets:
            sections.append("\n\n## Current Context\n" + "\n".join(bullets))

    return "\n".join(sections)
# Lazily created PromptBuilder instance shared across calls
_prompt_builder: Optional[PromptBuilder] = None


async def get_prompt_builder(
    city_service_url: str = "http://daarion-city-service:7001",
    router_config: Optional[Dict[str, Any]] = None
) -> PromptBuilder:
    """
    Get or create the singleton PromptBuilder instance.

    The first call creates the builder from the given arguments. Later calls
    return the same instance: ``city_service_url`` stays as set at creation,
    but an explicitly supplied ``router_config`` replaces the one the builder
    holds, so a config passed (or reloaded) after the first call is not
    silently ignored.

    Args:
        city_service_url: Base URL of city-service (used on first call only).
        router_config: Router configuration; ``None`` leaves the current one.

    Returns:
        The shared PromptBuilder.
    """
    global _prompt_builder
    if _prompt_builder is None:
        _prompt_builder = PromptBuilder(city_service_url, router_config)
    elif router_config is not None and router_config is not _prompt_builder.router_config:
        # Keep the shared builder in step with the caller's current config.
        _prompt_builder.router_config = router_config
    return _prompt_builder
async def get_agent_system_prompt(
    agent_id: str,
    city_service_url: str = "http://daarion-city-service:7001",
    router_config: Optional[Dict[str, Any]] = None
) -> str:
    """
    Resolve the system prompt text for an agent via the shared PromptBuilder.

    Usage in DAGI Router:
        system_prompt = await get_agent_system_prompt("daarwizz")

    Args:
        agent_id: Identifier of the agent.
        city_service_url: Base URL of city-service, forwarded to the builder.
        router_config: Router configuration, forwarded to the builder.

    Returns:
        Only the prompt text of the resolved result.
    """
    shared_builder = await get_prompt_builder(
        city_service_url,
        router_config,
    )
    resolved = await shared_builder.get_system_prompt(agent_id)
    return resolved.system_prompt