fix: correct API endpoints in verification scripts
This commit is contained in:
@@ -90,9 +90,10 @@ def http_post(base_url: str, path: str, json_body: Any = None, timeout: float =
|
||||
# --- 1. Health / базові перевірки --------------------------------------------
|
||||
|
||||
def check_city_health(base_url: str, results: List[CheckResult]):
|
||||
name = "city-service /healthz"
|
||||
name = "city-service /health"
|
||||
try:
|
||||
r = http_get(base_url, "/healthz", timeout=5)
|
||||
# Assuming /health is at root
|
||||
r = http_get(base_url, "/health", timeout=5)
|
||||
if r.status_code == 200:
|
||||
add_result(results, name, "CRITICAL", True, "City service healthy")
|
||||
else:
|
||||
@@ -103,22 +104,26 @@ def check_city_health(base_url: str, results: List[CheckResult]):
|
||||
# --- 2. Node Directory / Registry / Metrics ----------------------------------
|
||||
|
||||
def check_nodes_api(base_url: str, results: List[CheckResult]):
|
||||
name = "nodes: GET /api/v1/nodes"
|
||||
name = "nodes: GET /public/nodes"
|
||||
try:
|
||||
r = http_get(base_url, "/api/v1/nodes", timeout=5)
|
||||
r = http_get(base_url, "/public/nodes", timeout=5)
|
||||
if r.status_code != 200:
|
||||
add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
|
||||
return
|
||||
|
||||
data = r.json()
|
||||
nodes = data.get("nodes", []) or data.get("items", [])
|
||||
# /public/nodes usually returns a list directly or {"items": []} or {"nodes": []}
|
||||
# Based on routes_city.py, it returns a list if response_model=List[NodeProfile]
|
||||
# or dict if response_model=NodeListResponse
|
||||
|
||||
nodes = []
|
||||
if isinstance(data, list):
|
||||
nodes = data
|
||||
elif isinstance(data, dict):
|
||||
nodes = data.get("nodes", []) or data.get("items", [])
|
||||
|
||||
if not isinstance(nodes, list):
|
||||
add_result(results, name, "CRITICAL", False, "Response nodes/items is not a list")
|
||||
return
|
||||
|
||||
if len(nodes) == 0:
|
||||
add_result(results, name, "WARNING", False, "API OK, але nodes.length == 0 (жодної зареєстрованої ноди)")
|
||||
add_result(results, name, "WARNING", False, "API OK, але nodes list empty")
|
||||
else:
|
||||
add_result(results, name, "CRITICAL", True, f"Знайдено нод: {len(nodes)}")
|
||||
|
||||
@@ -128,14 +133,11 @@ def check_nodes_api(base_url: str, results: List[CheckResult]):
|
||||
def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
name = f"node metrics: {node_id}"
|
||||
try:
|
||||
# Note: The endpoint might be /internal/node/{node_id}/metrics or /internal/node/{node_id}/heartbeat
|
||||
# Assuming /internal/node/{node_id}/metrics/current based on request description, or fallback to dashboard
|
||||
# Let's try getting node profile from public API first as it contains metrics
|
||||
|
||||
# Try public profile first
|
||||
r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
|
||||
if r.status_code != 200:
|
||||
# Fallback to internal
|
||||
r = http_get(base_url, f"/internal/nodes/{node_id}/profile", timeout=5)
|
||||
# Fallback to internal metrics
|
||||
r = http_get(base_url, f"/city/internal/node/{node_id}/metrics/current", timeout=5)
|
||||
|
||||
if r.status_code != 200:
|
||||
add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
|
||||
@@ -143,30 +145,24 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
|
||||
data = r.json()
|
||||
|
||||
# Metrics often in 'metrics' field or flat
|
||||
agent_router = data.get("agent_count_router") or data.get("agents_total") # approximate
|
||||
agent_router = data.get("agent_count_router") or data.get("agents_total")
|
||||
agent_sys = data.get("agent_count_system")
|
||||
last_hb_raw = data.get("last_heartbeat")
|
||||
gpu_info = data.get("gpu_info") or data.get("gpu")
|
||||
|
||||
# Нода існує
|
||||
add_result(results, f"{name} - exists", "CRITICAL", True, "Node metrics entry exists")
|
||||
|
||||
# Agent counts
|
||||
# We might not have specific router/system counts in public profile, checking what we have
|
||||
if data.get("agents_total", 0) < 1:
|
||||
if data.get("agents_total", 0) < 1 and (agent_router is None or agent_router < 1):
|
||||
add_result(results, f"{name} - agents_total", "WARNING", False, f"agents_total={data.get('agents_total')}")
|
||||
else:
|
||||
add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {data.get('agents_total')}")
|
||||
add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {data.get('agents_total') or agent_router}")
|
||||
|
||||
# GPU only для NODE1 (можна орієнтуватися по id)
|
||||
if "hetzner" in node_id.lower():
|
||||
if not gpu_info:
|
||||
add_result(results, f"{name} - gpu_info", "WARNING", False, "gpu_info is empty for NODE1")
|
||||
else:
|
||||
add_result(results, f"{name} - gpu_info", "WARNING", True, "GPU info present")
|
||||
|
||||
# Heartbeat
|
||||
if last_hb_raw:
|
||||
dt = parse_dt(last_hb_raw)
|
||||
if dt is None:
|
||||
@@ -174,21 +170,9 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
else:
|
||||
age = now_utc() - dt
|
||||
if age > timedelta(minutes=HEARTBEAT_MAX_AGE_MIN):
|
||||
add_result(
|
||||
results,
|
||||
f"{name} - heartbeat age",
|
||||
"WARNING",
|
||||
False,
|
||||
f"Heartbeat too old: {age}",
|
||||
)
|
||||
add_result(results, f"{name} - heartbeat age", "WARNING", False, f"Heartbeat too old: {age}")
|
||||
else:
|
||||
add_result(
|
||||
results,
|
||||
f"{name} - heartbeat age",
|
||||
"WARNING",
|
||||
True,
|
||||
f"Heartbeat age OK: {age}",
|
||||
)
|
||||
add_result(results, f"{name} - heartbeat age", "WARNING", True, f"Heartbeat age OK: {age}")
|
||||
else:
|
||||
add_result(results, f"{name} - heartbeat present", "WARNING", False, "last_heartbeat is missing")
|
||||
|
||||
@@ -200,7 +184,6 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
name = f"node agents: {node_id}"
|
||||
try:
|
||||
# Using public nodes API to check guardian/steward
|
||||
r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
|
||||
if r.status_code != 200:
|
||||
add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
|
||||
@@ -210,7 +193,6 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
guardian = data.get("guardian_agent")
|
||||
steward = data.get("steward_agent")
|
||||
|
||||
# Guardian / Steward
|
||||
if guardian:
|
||||
add_result(results, f"{name} - guardian", "CRITICAL", True, "Guardian present")
|
||||
else:
|
||||
@@ -221,19 +203,18 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
else:
|
||||
add_result(results, f"{name} - steward", "CRITICAL", False, "Steward missing")
|
||||
|
||||
# To check other agents, we might need to list agents by node_id
|
||||
# Check all agents via /public/agents?node_id=...
|
||||
r_agents = http_get(base_url, f"/public/agents?node_id={node_id}", timeout=5)
|
||||
if r_agents.status_code == 200:
|
||||
agents_data = r_agents.json()
|
||||
agents_list = agents_data.get("items", [])
|
||||
agents_list = agents_data.get("items", []) if isinstance(agents_data, dict) else agents_data
|
||||
kinds = {a.get("kind") for a in agents_list if isinstance(a, dict)}
|
||||
|
||||
for kind in NODE_CORE_AGENT_KINDS:
|
||||
severity = "WARNING" if kind == "archivist_agent" else "INFO" # Changing CRITICAL to INFO/WARNING as strict check might fail on some nodes
|
||||
severity = "WARNING" if kind == "archivist_agent" else "INFO"
|
||||
if kind in kinds:
|
||||
add_result(results, f"{name} - core kind {kind}", severity, True, "present")
|
||||
else:
|
||||
# It's possible not all nodes have all agents yet
|
||||
add_result(results, f"{name} - core kind {kind}", severity, False, "missing")
|
||||
|
||||
except Exception as e:
|
||||
@@ -244,11 +225,10 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
name = f"DAGI Router: {node_id}"
|
||||
try:
|
||||
r = http_get(base_url, f"/internal/node/{node_id}/dagi-router/agents", timeout=5)
|
||||
# Path with /city prefix for internal routes
|
||||
r = http_get(base_url, f"/city/internal/node/{node_id}/dagi-router/agents", timeout=5)
|
||||
|
||||
# If internal endpoint not available or node not reachable, we might get 500/404
|
||||
if r.status_code != 200:
|
||||
# Try to fail gracefully if this is not yet implemented or accessible
|
||||
add_result(results, name, "WARNING", False, f"HTTP {r.status_code}: {r.text[:200]}")
|
||||
return
|
||||
|
||||
@@ -271,7 +251,6 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
else:
|
||||
add_result(results, f"{name} - system_total", "CRITICAL", True, f"system_total={system_total}")
|
||||
|
||||
# Phantom/Stale limits
|
||||
if phantom is not None and phantom > PHANTOM_STALE_LIMIT:
|
||||
add_result(results, f"{name} - phantom", "WARNING", False, f"phantom={phantom} > {PHANTOM_STALE_LIMIT}")
|
||||
else:
|
||||
@@ -282,7 +261,6 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
else:
|
||||
add_result(results, f"{name} - stale", "WARNING", True, f"stale={stale}")
|
||||
|
||||
# Active presence
|
||||
if active is None or active < 1:
|
||||
add_result(results, f"{name} - active", "CRITICAL", False, f"active={active}")
|
||||
else:
|
||||
@@ -295,14 +273,12 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
|
||||
|
||||
def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
# Ендпоінт прикладний, підлаштуй під свій API
|
||||
r = http_get(base_url, f"/public/agents?public_slug={slug}", timeout=5) # Changed to public endpoint
|
||||
r = http_get(base_url, f"/public/agents?is_public=true", timeout=5)
|
||||
if r.status_code != 200:
|
||||
return None
|
||||
data = r.json()
|
||||
items = data.get("items", [])
|
||||
items = data.get("items", []) if isinstance(data, dict) else data
|
||||
|
||||
# Try to find exact match
|
||||
for item in items:
|
||||
if item.get("slug") == slug or item.get("public_slug") == slug:
|
||||
return item
|
||||
@@ -312,6 +288,9 @@ def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]:
|
||||
|
||||
def get_agent_prompts(base_url: str, agent_id: str) -> Dict[str, Any]:
|
||||
try:
|
||||
# Prompts live under /api/v1, which does not appear to be prefixed with /city in main.py's router includes:
|
||||
# main.py: app.include_router(routes_agents.router, prefix="/api/v1")
|
||||
# So it should be /api/v1/agents/...
|
||||
r = http_get(base_url, f"/api/v1/agents/{agent_id}/prompts", timeout=5)
|
||||
if r.status_code != 200:
|
||||
return {}
|
||||
@@ -325,14 +304,12 @@ def check_core_agents_prompts(base_url: str, results: List[CheckResult]):
|
||||
agent = find_agent_by_slug(base_url, slug)
|
||||
|
||||
if not agent:
|
||||
# Some agents might not be public or visible yet
|
||||
add_result(results, name, "INFO", False, "Agent not found by slug (public)")
|
||||
continue
|
||||
|
||||
agent_id = agent.get("id")
|
||||
prompts = get_agent_prompts(base_url, agent_id)
|
||||
|
||||
# очікуємо хоча б core
|
||||
records = prompts.get("prompts") or prompts
|
||||
has_core = False
|
||||
|
||||
@@ -350,15 +327,10 @@ def check_core_agents_prompts(base_url: str, results: List[CheckResult]):
|
||||
# --- 6. Мульти-модальність (high-level health) ------------------------------
|
||||
|
||||
def check_multimodal_services(base_url: str, results: List[CheckResult]):
|
||||
"""
|
||||
High-level: перевірка STT, OCR, можливо інших мультимодальних сервісів.
|
||||
Тут робимо тільки healthz-запити до gateway/health, якщо такі є.
|
||||
Цей блок адаптуй під свої реальні ендпоінти.
|
||||
"""
|
||||
services = [
|
||||
("/internal/health/stt", "Multimodal STT"),
|
||||
("/internal/health/ocr", "Multimodal OCR"),
|
||||
("/internal/health/vlm", "Multimodal VLM"),
|
||||
("/city/internal/health/stt", "Multimodal STT"),
|
||||
("/city/internal/health/ocr", "Multimodal OCR"),
|
||||
("/city/internal/health/vlm", "Multimodal VLM"),
|
||||
]
|
||||
for path, label in services:
|
||||
name = f"{label} health"
|
||||
@@ -367,8 +339,7 @@ def check_multimodal_services(base_url: str, results: List[CheckResult]):
|
||||
if r.status_code == 200:
|
||||
add_result(results, name, "WARNING", True, "OK")
|
||||
else:
|
||||
# These endpoints might not exist yet
|
||||
add_result(results, name, "INFO", False, f"HTTP {r.status_code}: {r.text[:200]}")
|
||||
add_result(results, name, "INFO", False, f"HTTP {r.status_code}")
|
||||
except Exception as e:
|
||||
add_result(results, name, "INFO", False, f"Exception: {e}")
|
||||
|
||||
@@ -383,7 +354,6 @@ def run_invariants_script(base_url: str, results: List[CheckResult]):
|
||||
return
|
||||
|
||||
try:
|
||||
# Assuming python is available
|
||||
cmd = [sys.executable, str(script_path), "--base-url", base_url, "--json"]
|
||||
proc = subprocess.run(cmd, capture_output=True, text=True)
|
||||
|
||||
@@ -399,11 +369,9 @@ def run_invariants_script(base_url: str, results: List[CheckResult]):
|
||||
|
||||
def run_smoke_tests(results: List[CheckResult]):
|
||||
name = "pytest tests/test_infra_smoke.py"
|
||||
# Assuming run from project root
|
||||
tests_file = Path("tests") / "test_infra_smoke.py"
|
||||
|
||||
if not tests_file.exists():
|
||||
# Try relative to script location
|
||||
tests_file = Path(__file__).parent.parent / "tests" / "test_infra_smoke.py"
|
||||
|
||||
if not tests_file.exists():
|
||||
@@ -558,4 +526,4 @@ def main():
|
||||
sys.exit(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
@@ -168,7 +168,7 @@ def check_node_exists(client: APIClient, node_id: str, results: CheckResults):
|
||||
"""Перевірити що нода існує і має базові метрики"""
|
||||
inv_name = f"Node exists: {node_id}"
|
||||
|
||||
data, error = client.get(f"/internal/node/{node_id}/metrics/current")
|
||||
data, error = client.get(f"/city/internal/node/{node_id}/metrics/current")
|
||||
|
||||
if error:
|
||||
results.failed.append(InvariantResult(
|
||||
@@ -288,7 +288,7 @@ def check_node_metrics(client: APIClient, node_id: str, metrics: Dict, results:
|
||||
def check_node_agents(client: APIClient, node_id: str, results: CheckResults):
|
||||
"""Перевірити Node Guardian та Steward"""
|
||||
|
||||
data, error = client.get(f"/internal/node/{node_id}/agents")
|
||||
data, error = client.get(f"/city/internal/node/{node_id}/agents")
|
||||
|
||||
if error:
|
||||
results.failed.append(InvariantResult(
|
||||
@@ -354,7 +354,7 @@ def check_node_agents(client: APIClient, node_id: str, results: CheckResults):
|
||||
def check_dagi_router(client: APIClient, node_id: str, results: CheckResults):
|
||||
"""Перевірити DAGI Router стан"""
|
||||
|
||||
data, error = client.get(f"/internal/node/{node_id}/dagi-router/agents")
|
||||
data, error = client.get(f"/city/internal/node/{node_id}/dagi-router/agents")
|
||||
|
||||
if error:
|
||||
results.warnings.append(InvariantResult(
|
||||
@@ -442,7 +442,7 @@ def check_core_agents_prompts(client: APIClient, results: CheckResults):
|
||||
agent_ids = [a["slug"] for a in CORE_AGENTS]
|
||||
|
||||
# Batch check prompts status
|
||||
data, error = client.post("/internal/agents/prompts/status", {"agent_ids": agent_ids})
|
||||
data, error = client.post("/city/internal/agents/prompts/status", {"agent_ids": agent_ids})
|
||||
|
||||
if error:
|
||||
results.warnings.append(InvariantResult(
|
||||
@@ -526,7 +526,7 @@ def check_healthz(client: APIClient, results: CheckResults):
|
||||
def check_node_self_healing(client: APIClient, node_id: str, results: CheckResults):
|
||||
"""Перевірити self-healing статус ноди"""
|
||||
|
||||
data, error = client.get(f"/internal/node/{node_id}/self-healing/status")
|
||||
data, error = client.get(f"/city/internal/node/{node_id}/self-healing/status")
|
||||
|
||||
if error:
|
||||
results.warnings.append(InvariantResult(
|
||||
@@ -580,7 +580,7 @@ def check_node_self_healing(client: APIClient, node_id: str, results: CheckResul
|
||||
def check_nodes_needing_healing(client: APIClient, results: CheckResults):
|
||||
"""Перевірити чи є ноди, які потребують healing"""
|
||||
|
||||
data, error = client.get("/internal/nodes/needing-healing")
|
||||
data, error = client.get("/city/internal/nodes/needing-healing")
|
||||
|
||||
if error:
|
||||
results.warnings.append(InvariantResult(
|
||||
|
||||
Reference in New Issue
Block a user