### Backend (city-service)
- Node Registry + Self-Healing API (migration 039)
- Improved get_all_nodes() with robust fallback for node_registry/node_cache
- Agent Prompts Runtime API for DAGI Router integration
- DAGI Router Audit endpoints (phantom/stale detection)
- Node Agents API (Guardian/Steward)
- Node metrics extended (CPU/GPU/RAM/Disk)

### Frontend (apps/web)
- Node Directory with improved error handling
- Node Cabinet with metrics cards
- DAGI Router Card component
- Node Metrics Card component
- useDAGIAudit hook

### Scripts
- check-invariants.py - deploy verification
- node-bootstrap.sh - node self-registration
- node-guardian-loop.py - continuous self-healing
- dagi_agent_audit.py - DAGI audit utility

### Migrations
- 034: Agent prompts seed
- 035: Agent DAGI audit
- 036: Node metrics extended
- 037: Node agents complete
- 038: Agent prompts full coverage
- 039: Node registry self-healing

### Tests
- test_infra_smoke.py
- test_agent_prompts_runtime.py
- test_dagi_router_api.py

### Documentation
- DEPLOY_CHECKLIST_2024_11_30.md
- Multiple TASK_PHASE docs
337 lines
12 KiB
Python
337 lines
12 KiB
Python
"""
|
||
Infrastructure Smoke Tests

Basic API tests for post-deploy verification.
Run as part of the deploy pipeline or manually.

Usage:
    pytest tests/test_infra_smoke.py -v
    pytest tests/test_infra_smoke.py -v --base-url http://localhost:7001
|
||
"""
|
||
|
||
import os
|
||
import pytest
|
||
import requests
|
||
from datetime import datetime, timezone, timedelta
|
||
from typing import Optional
|
||
|
||
# Service endpoint and request settings.
# The CITY_SERVICE_URL environment variable overrides the default address.
BASE_URL = os.environ.get("CITY_SERVICE_URL", "http://daarion-city-service:7001")
# Passed as the ``timeout`` argument of every request issued by the test client.
TIMEOUT = 10

# Identifiers of the two nodes the tests below query.
NODE1_ID = "node-1-hetzner-gex44"
NODE2_ID = "node-2-macbook-m4max"
|
||
|
||
|
||
def pytest_addoption(parser):
    """Register the ``--base-url`` command line option on ``parser``.

    The option stores a string and defaults to the module-level ``BASE_URL``.

    NOTE(review): pytest documents this hook as being picked up from
    conftest.py files and plugins; confirm that this module is actually
    loaded in a way that lets pytest call it.
    """
    option_settings = {
        "action": "store",
        "default": BASE_URL,
        "help": "Base URL of city-service API",
    }
    parser.addoption("--base-url", **option_settings)
|
||
|
||
|
||
@pytest.fixture
def base_url(request):
    """Return the base URL of the service under test.

    Resolution order: the ``--base-url`` command line option when it is
    registered and non-empty, otherwise the module-level ``BASE_URL``.

    Args:
        request: pytest's built-in ``request`` fixture.

    Returns:
        The base URL as a string.
    """
    # ``default=None`` stops getoption() from raising ValueError when the
    # ``--base-url`` option was never registered, so the BASE_URL fallback
    # below is actually reachable.
    cli_value = request.config.getoption("--base-url", default=None)
    return cli_value or BASE_URL
|
||
|
||
|
||
@pytest.fixture
def api_client(base_url):
    """Yield a small HTTP client bound to ``base_url`` for one test.

    The client exposes ``get(path)`` and ``post(path, json)``; both prefix
    ``path`` with the base URL (trailing slash stripped) and pass
    ``timeout=TIMEOUT``. The underlying ``requests.Session`` is closed when
    the test finishes.

    Args:
        base_url: Base URL of the service, supplied by the ``base_url`` fixture.

    Yields:
        A ``Client`` instance wrapping a shared ``requests.Session``.
    """

    class Client:
        """Prefix request paths with the base URL and apply the timeout."""

        def __init__(self, base_url: str, session: requests.Session):
            # Strip the trailing slash so "<base>/" + "/path" does not
            # produce a double slash.
            self.base_url = base_url.rstrip("/")
            self.session = session

        def get(self, path: str) -> requests.Response:
            """Send a GET request to ``path`` relative to the base URL."""
            return self.session.get(f"{self.base_url}{path}", timeout=TIMEOUT)

        def post(self, path: str, json: dict) -> requests.Response:
            """Send a POST request with a JSON body to ``path``."""
            return self.session.post(
                f"{self.base_url}{path}", json=json, timeout=TIMEOUT
            )

    # The timeout is passed on every call above: Requests has no
    # session-wide timeout setting, so assigning ``session.timeout`` would
    # have no effect. The context manager closes the session (and its
    # pooled connections) on fixture teardown.
    with requests.Session() as session:
        yield Client(base_url, session)
|
||
|
||
|
||
# ==============================================================================
|
||
# Health Checks
|
||
# ==============================================================================
|
||
|
||
class TestHealthChecks:
    """Smoke checks for the health endpoint and the public node listing."""

    def test_healthz_endpoint(self, api_client):
        """GET /healthz must answer 200 with a JSON ``status`` of "ok"."""
        resp = api_client.get("/healthz")
        assert resp.status_code == 200, f"Health check failed: {resp.text}"

        payload = resp.json()
        assert payload.get("status") == "ok", f"Unhealthy status: {payload}"

    def test_public_nodes_endpoint(self, api_client):
        """GET /public/nodes must answer 200 with ``items`` and ``total`` keys."""
        resp = api_client.get("/public/nodes")
        assert resp.status_code == 200, f"Nodes endpoint failed: {resp.text}"

        payload = resp.json()
        assert "items" in payload, "Response missing 'items' key"
        assert "total" in payload, "Response missing 'total' key"
|
||
|
||
|
||
# ==============================================================================
|
||
# Node Metrics Tests
|
||
# ==============================================================================
|
||
|
||
class TestNodeMetrics:
    """Checks against the per-node current-metrics endpoint."""

    @staticmethod
    def _fetch_current(api_client, node_id):
        """Request the current-metrics document for ``node_id``."""
        return api_client.get(f"/internal/node/{node_id}/metrics/current")

    @pytest.mark.parametrize("node_id", [NODE1_ID, NODE2_ID])
    def test_node_metrics_endpoint(self, api_client, node_id):
        """The metrics endpoint answers 200 and carries the required fields."""
        resp = self._fetch_current(api_client, node_id)
        assert resp.status_code == 200, f"Node metrics failed for {node_id}: {resp.text}"

        payload = resp.json()

        # Required fields
        assert "node_id" in payload, "Missing node_id"
        assert "agent_count_router" in payload, "Missing agent_count_router"
        assert "agent_count_system" in payload, "Missing agent_count_system"

    def test_node1_has_agents(self, api_client):
        """NODE1 reports at least one agent in the router (skipped if unavailable)."""
        resp = self._fetch_current(api_client, NODE1_ID)
        if resp.status_code != 200:
            pytest.skip(f"NODE1 metrics not available: {resp.status_code}")

        router_agents = resp.json().get("agent_count_router", 0)
        assert router_agents >= 1, f"NODE1 has {router_agents} agents in router, expected >= 1"

    def test_node2_has_agents(self, api_client):
        """NODE2 reports at least one agent in the system (skipped if unavailable)."""
        resp = self._fetch_current(api_client, NODE2_ID)
        if resp.status_code != 200:
            pytest.skip(f"NODE2 metrics not available: {resp.status_code}")

        system_agents = resp.json().get("agent_count_system", 0)
        assert system_agents >= 1, f"NODE2 has {system_agents} agents in system, expected >= 1"
|
||
|
||
|
||
# ==============================================================================
|
||
# Node Agents Tests
|
||
# ==============================================================================
|
||
|
||
class TestNodeAgents:
    """Checks for the node agents endpoint (Guardian / Steward entries)."""

    @staticmethod
    def _fetch_agents(api_client, node_id):
        """Request the agents document for ``node_id``."""
        return api_client.get(f"/internal/node/{node_id}/agents")

    @pytest.mark.parametrize("node_id", [NODE1_ID, NODE2_ID])
    def test_node_agents_endpoint(self, api_client, node_id):
        """The agents endpoint answers 200 with node_id, total and agents keys."""
        resp = self._fetch_agents(api_client, node_id)
        assert resp.status_code == 200, f"Node agents failed for {node_id}: {resp.text}"

        payload = resp.json()
        assert "node_id" in payload, "Missing node_id"
        assert "total" in payload, "Missing total"
        assert "agents" in payload, "Missing agents list"

    def test_node1_has_guardian(self, api_client):
        """NODE1 exposes a ``guardian`` entry with a truthy ``id``."""
        resp = self._fetch_agents(api_client, NODE1_ID)
        if resp.status_code != 200:
            pytest.skip(f"NODE1 agents not available: {resp.status_code}")

        guardian_entry = resp.json().get("guardian")
        assert guardian_entry is not None, "NODE1 missing Node Guardian"
        assert guardian_entry.get("id"), "Guardian has no ID"

    def test_node1_has_steward(self, api_client):
        """NODE1 exposes a ``steward`` entry with a truthy ``id``."""
        resp = self._fetch_agents(api_client, NODE1_ID)
        if resp.status_code != 200:
            pytest.skip(f"NODE1 agents not available: {resp.status_code}")

        steward_entry = resp.json().get("steward")
        assert steward_entry is not None, "NODE1 missing Node Steward"
        assert steward_entry.get("id"), "Steward has no ID"

    def test_node2_has_guardian(self, api_client):
        """NODE2 exposes a ``guardian`` entry."""
        resp = self._fetch_agents(api_client, NODE2_ID)
        if resp.status_code != 200:
            pytest.skip(f"NODE2 agents not available: {resp.status_code}")

        guardian_entry = resp.json().get("guardian")
        assert guardian_entry is not None, "NODE2 missing Node Guardian"
|
||
|
||
|
||
# ==============================================================================
|
||
# DAGI Router Tests
|
||
# ==============================================================================
|
||
|
||
class TestDAGIRouter:
    """Checks for the per-node DAGI Router agents endpoint."""

    @pytest.mark.parametrize("node_id", [NODE1_ID, NODE2_ID])
    def test_dagi_router_agents_endpoint(self, api_client, node_id):
        """The router endpoint answers 200 with node_id, summary and agents keys."""
        resp = api_client.get(f"/internal/node/{node_id}/dagi-router/agents")

        # A 404 is treated as "not configured" rather than as a failure.
        if resp.status_code == 404:
            pytest.skip(f"DAGI Router not configured for {node_id}")

        assert resp.status_code == 200, f"DAGI Router failed for {node_id}: {resp.text}"

        payload = resp.json()
        assert "node_id" in payload, "Missing node_id"
        assert "summary" in payload, "Missing summary"
        assert "agents" in payload, "Missing agents list"

    def test_node1_router_has_agents(self, api_client):
        """NODE1's router summary reports at least one agent."""
        resp = api_client.get(f"/internal/node/{NODE1_ID}/dagi-router/agents")
        if resp.status_code != 200:
            pytest.skip(f"NODE1 DAGI Router not available: {resp.status_code}")

        router_total = resp.json().get("summary", {}).get("router_total", 0)

        # An empty router is skipped, not failed: it may simply be unconfigured.
        if router_total == 0:
            pytest.skip("NODE1 DAGI Router has 0 agents (may not be configured)")

        assert router_total >= 1, f"DAGI Router has {router_total} agents, expected >= 1"
|
||
|
||
|
||
# ==============================================================================
|
||
# Core Agents Tests
|
||
# ==============================================================================
|
||
|
||
class TestCoreAgents:
    """Checks for the agent prompt endpoints (status, runtime, system prompt)."""

    def test_prompts_status_endpoint(self, api_client):
        """The batch prompts-status endpoint answers 200 with a dict ``status``."""
        agent_ids = ["agent-daarwizz", "agent-devtools", "agent-soul"]

        response = api_client.post("/internal/agents/prompts/status", {"agent_ids": agent_ids})

        assert response.status_code == 200, f"Prompts status failed: {response.text}"
        data = response.json()

        assert "status" in data, "Missing status in response"
        assert isinstance(data["status"], dict), "Status should be a dict"

    def test_daarwizz_runtime_prompt(self, api_client):
        """DAARWIZZ has a ``core`` runtime prompt under one of its known slugs.

        The test is skipped when neither slug answers 200 with ``has_prompts`` set.
        """
        # Try both possible slugs; the first one that reports prompts decides.
        for agent_id in ["agent-daarwizz", "daarwizz"]:
            response = api_client.get(f"/internal/agents/{agent_id}/prompts/runtime")

            if response.status_code == 200:
                data = response.json()
                if data.get("has_prompts"):
                    assert data.get("prompts", {}).get("core"), "DAARWIZZ missing core prompt"
                    return

        pytest.skip("DAARWIZZ agent not found or no prompts configured")

    def test_runtime_system_prompt_endpoint(self, api_client):
        """The system-prompt endpoint returns an agent_id and a non-trivial prompt.

        A 404 is treated as "agent not present" and skips the test.
        """
        response = api_client.get("/internal/agents/agent-daarwizz/system-prompt")

        if response.status_code == 404:
            pytest.skip("DAARWIZZ agent not found")

        assert response.status_code == 200, f"System prompt failed: {response.text}"
        data = response.json()

        assert "agent_id" in data, "Missing agent_id"
        assert "system_prompt" in data, "Missing system_prompt"

        # ``or ""`` covers a JSON null value: len(None) would raise TypeError
        # and hide the intended "too short" assertion message.
        system_prompt = data.get("system_prompt") or ""
        assert len(system_prompt) > 10, "System prompt too short"
|
||
|
||
|
||
# ==============================================================================
|
||
# Integration Tests
|
||
# ==============================================================================
|
||
|
||
class TestIntegration:
    """Cross-endpoint checks that chain several API calls together."""

    def test_node_to_agents_flow(self, api_client):
        """Follow node -> agents -> prompts for the first agent listed on NODE1.

        The test is skipped when NODE1's agents endpoint is not answering 200
        or lists no agents.
        """
        # Get node
        response = api_client.get(f"/internal/node/{NODE1_ID}/agents")

        if response.status_code != 200:
            pytest.skip(f"NODE1 not available: {response.status_code}")

        agents = response.json().get("agents", [])

        if not agents:
            pytest.skip("No agents found for NODE1")

        # Get first agent's prompts
        agent = agents[0]
        agent_id = agent.get("id")

        # Fail here with a clear message instead of requesting
        # "/internal/agents/None/prompts/runtime" and reporting a misleading
        # prompts failure.
        assert agent_id, f"First NODE1 agent has no id: {agent}"

        response = api_client.get(f"/internal/agents/{agent_id}/prompts/runtime")

        # Should return successfully even if no prompts
        assert response.status_code == 200, f"Agent prompts failed for {agent_id}: {response.text}"

    def test_public_nodes_have_metrics(self, api_client):
        """When the first public node carries ``metrics``, it must not be empty.

        Nodes without a ``metrics`` key are tolerated; an empty node list
        skips the test.
        """
        response = api_client.get("/public/nodes")

        assert response.status_code == 200, f"Nodes endpoint failed: {response.text}"
        items = response.json().get("items", [])

        if not items:
            pytest.skip("No nodes in system")

        # Check first node has metrics
        node = items[0]

        # Should have metrics object after our changes
        if "metrics" in node:
            metrics = node["metrics"]
            # A null or non-object value would otherwise raise TypeError on
            # the membership test below.
            assert isinstance(metrics, dict), f"Metrics should be an object, got: {metrics!r}"
            assert "cpu_cores" in metrics or "ram_total" in metrics, "Metrics object empty"
|
||
|
||
|
||
# ==============================================================================
|
||
# Run as script
|
||
# ==============================================================================
|
||
|
||
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly (for
    # example from a deploy pipeline) exits non-zero when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||
|