feat: Node Self-Healing, DAGI Audit, Agent Prompts, Infra Invariants

### Backend (city-service)
- Node Registry + Self-Healing API (migration 039)
- Improved get_all_nodes() with robust fallback for node_registry/node_cache
- Agent Prompts Runtime API for DAGI Router integration
- DAGI Router Audit endpoints (phantom/stale detection)
- Node Agents API (Guardian/Steward)
- Node metrics extended (CPU/GPU/RAM/Disk)

### Frontend (apps/web)
- Node Directory with improved error handling
- Node Cabinet with metrics cards
- DAGI Router Card component
- Node Metrics Card component
- useDAGIAudit hook

### Scripts
- check-invariants.py - deploy verification
- node-bootstrap.sh - node self-registration
- node-guardian-loop.py - continuous self-healing
- dagi_agent_audit.py - DAGI audit utility

### Migrations
- 034: Agent prompts seed
- 035: Agent DAGI audit
- 036: Node metrics extended
- 037: Node agents complete
- 038: Agent prompts full coverage
- 039: Node registry self-healing

### Tests
- test_infra_smoke.py
- test_agent_prompts_runtime.py
- test_dagi_router_api.py

### Documentation
- DEPLOY_CHECKLIST_2024_11_30.md
- Multiple TASK_PHASE docs
This commit is contained in:
Apple
2025-11-30 13:52:01 -08:00
parent 0c7836af5a
commit bca81dc719
36 changed files with 10630 additions and 55 deletions

View File

@@ -0,0 +1,280 @@
"""
DAGI Router API Tests

Tests for the following endpoints (requests are sent under the /city prefix):
- GET /internal/node/{node_id}/dagi-router/agents
- GET /internal/node/{node_id}/metrics/current
- GET /internal/node/{node_id}/dagi-audit
- POST /internal/node/{node_id}/dagi-audit/run
- POST /internal/node/{node_id}/dagi-router/phantom/sync
- POST /internal/node/{node_id}/dagi-router/stale/mark
"""
import pytest
import httpx
from typing import Any, Dict
# Test configuration
# Base URL of the city-service instance the tests send requests to.
CITY_SERVICE_URL: str = "http://localhost:7001"
# Identifiers of the two nodes exercised by the tests.
NODE1_ID: str = "node-1-hetzner-gex44"
NODE2_ID: str = "node-2-macbook-m4max"
# ============================================================================
# Fixtures
# ============================================================================
@pytest.fixture
def client():
    """Yield an HTTP client bound to the city-service base URL.

    The client is created inside a ``with`` block so that its connection
    pool is closed once the test using the fixture has finished, instead of
    being left open for the garbage collector.

    Yields:
        httpx.Client: client with ``base_url=CITY_SERVICE_URL`` and a
        30 second timeout.
    """
    with httpx.Client(base_url=CITY_SERVICE_URL, timeout=30.0) as http_client:
        yield http_client
@pytest.fixture
def node_ids():
    """Provide the list of node IDs used by multi-node tests."""
    known_nodes = [NODE1_ID, NODE2_ID]
    return known_nodes
# ============================================================================
# DAGI Router Agents Tests
# ============================================================================
class TestDAGIRouterAgents:
    """Tests for GET /internal/node/{node_id}/dagi-router/agents."""

    def test_get_agents_returns_valid_response(self, client):
        """The endpoint responds with the expected structure."""
        resp = client.get(f"/city/internal/node/{NODE1_ID}/dagi-router/agents")
        assert resp.status_code == 200

        payload = resp.json()

        # Top-level structure
        for key in ("node_id", "summary", "agents"):
            assert key in payload

        # Summary counters
        counters = payload["summary"]
        for key in ("active", "phantom", "stale", "router_total", "system_total"):
            assert key in counters

        # Types
        for key in ("active", "phantom"):
            assert isinstance(counters[key], int)
        assert isinstance(payload["agents"], list)

    def test_get_agents_for_unknown_node(self, client):
        """The endpoint responds with an empty result for an unknown node."""
        resp = client.get("/city/internal/node/unknown-node-id/dagi-router/agents")

        # Expected to be 200 with an empty list, not 404
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["agents"] == []
        assert payload["summary"]["active"] == 0

    def test_agents_have_required_fields(self, client):
        """Agents carry all required fields."""
        resp = client.get(f"/city/internal/node/{NODE1_ID}/dagi-router/agents")
        assert resp.status_code == 200

        agents = resp.json()["agents"]
        if not agents:
            # Nothing to inspect when the node reports no agents.
            return

        first_agent = agents[0]

        # Required fields
        for key in ("id", "name", "status"):
            assert key in first_agent

        # Status must be valid
        assert first_agent["status"] in ("active", "phantom", "stale", "error")
# ============================================================================
# Node Metrics Tests
# ============================================================================
class TestNodeMetrics:
    """Tests for GET /internal/node/{node_id}/metrics/current."""

    def test_get_metrics_returns_valid_response(self, client):
        """The endpoint responds with the expected structure."""
        resp = client.get(f"/city/internal/node/{NODE1_ID}/metrics/current")
        assert resp.status_code == 200

        payload = resp.json()

        # Required fields
        assert "node_id" in payload
        assert payload["node_id"] == NODE1_ID

        # Metric fields
        metric_keys = (
            "cpu_cores",
            "cpu_usage",
            "gpu_model",
            "gpu_memory_total",
            "gpu_memory_used",
            "ram_total",
            "ram_used",
            "disk_total",
            "disk_used",
            "agent_count_router",
            "agent_count_system",
        )
        for key in metric_keys:
            assert key in payload

    def test_get_metrics_for_unknown_node(self, client):
        """The endpoint responds with a minimal payload for an unknown node."""
        resp = client.get("/city/internal/node/unknown-node-id/metrics/current")

        # Expected to be 200 with a minimal response
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["node_id"] == "unknown-node-id"

    def test_metrics_have_numeric_values(self, client):
        """Metric fields hold numeric values."""
        resp = client.get(f"/city/internal/node/{NODE1_ID}/metrics/current")
        assert resp.status_code == 200

        payload = resp.json()

        # All numeric fields should be numbers
        numeric_fields = (
            "cpu_cores",
            "cpu_usage",
            "gpu_memory_total",
            "gpu_memory_used",
            "ram_total",
            "ram_used",
            "disk_total",
            "disk_used",
            "agent_count_router",
            "agent_count_system",
        )
        for field in numeric_fields:
            value = payload[field]
            assert isinstance(value, (int, float)), f"{field} should be numeric"
# ============================================================================
# DAGI Audit Tests
# ============================================================================
class TestDAGIAudit:
    """Tests for POST /internal/node/{node_id}/dagi-audit/run."""

    def test_run_audit_returns_valid_response(self, client):
        """POSTing an audit run responds with the expected structure."""
        resp = client.post(f"/city/internal/node/{NODE1_ID}/dagi-audit/run")
        assert resp.status_code == 200

        payload = resp.json()

        assert "status" in payload
        assert payload["status"] == "completed"
        for key in ("summary", "message"):
            assert key in payload

        # Summary fields
        audit_summary = payload["summary"]
        expected_keys = (
            "router_total",
            "db_total",
            "active_count",
            "phantom_count",
            "stale_count",
        )
        for key in expected_keys:
            assert key in audit_summary

    def test_get_audit_summary(self, client):
        """GETting the audit summary responds with data."""
        resp = client.get(f"/city/internal/node/{NODE1_ID}/dagi-audit")

        # May be 200 with data or with null
        assert resp.status_code == 200

        payload = resp.json()
        if not payload:
            # A null/empty body is accepted; there is nothing more to check.
            return

        for key in ("node_id", "timestamp", "active_count"):
            assert key in payload
# ============================================================================
# Phantom/Stale Sync Tests
# ============================================================================
class TestPhantomStaleSync:
    """Tests for the phantom/stale sync endpoints."""

    def test_phantom_sync_empty_list(self, client):
        """Syncing with an empty list does not fail."""
        url = f"/city/internal/node/{NODE1_ID}/dagi-router/phantom/sync"
        request_body = {"agent_ids": []}

        resp = client.post(url, json=request_body)
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["status"] == "completed"
        assert payload["created_count"] == 0

    def test_stale_mark_empty_list(self, client):
        """Marking stale with an empty list does not fail."""
        url = f"/city/internal/node/{NODE1_ID}/dagi-router/stale/mark"
        request_body = {"agent_ids": []}

        resp = client.post(url, json=request_body)
        assert resp.status_code == 200

        payload = resp.json()
        assert payload["status"] == "completed"
        assert payload["marked_count"] == 0
# ============================================================================
# Integration Tests
# ============================================================================
class TestIntegration:
    """Integration tests that chain several endpoints together."""

    def test_full_audit_flow(self, client):
        """Full cycle: run audit -> get agents -> get metrics."""
        # 1. Run audit
        audit_response = client.post(f"/city/internal/node/{NODE1_ID}/dagi-audit/run")
        assert audit_response.status_code == 200
        audit_data = audit_response.json()

        # 2. Get agents
        agents_response = client.get(f"/city/internal/node/{NODE1_ID}/dagi-router/agents")
        assert agents_response.status_code == 200
        agents_data = agents_response.json()

        # 3. Get metrics
        metrics_response = client.get(f"/city/internal/node/{NODE1_ID}/metrics/current")
        assert metrics_response.status_code == 200

        # 4. Verify consistency
        # The audit that preceded the other calls must have finished and
        # produced a summary.
        assert audit_data["status"] == "completed"
        assert "summary" in audit_data

        # Each counter is checked on its own: a check on the sum alone would
        # let a negative counter be masked by the others.
        # NOTE(review): equality between the audit counts and the agent
        # summary counts is not asserted because that contract is not
        # visible from this file - confirm and tighten if they must match.
        agents_summary = agents_data["summary"]
        for key in ("active", "phantom", "stale"):
            assert agents_summary[key] >= 0, f"{key} count must be non-negative"

    def test_both_nodes_accessible(self, client, node_ids):
        """Both nodes are reachable through the API."""
        for node_id in node_ids:
            response = client.get(f"/city/internal/node/{node_id}/metrics/current")
            assert response.status_code == 200
            data = response.json()
            assert data["node_id"] == node_id
# ============================================================================
# Run tests
# ============================================================================
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; propagate it so that running
    # this file as a script reports failures to the calling shell or CI job.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))