fix: correct API endpoints in verification scripts

This commit is contained in:
Apple
2025-11-30 14:51:59 -08:00
parent 6d4f9ec7c5
commit 5c1d7d15f9
2 changed files with 44 additions and 76 deletions

View File

@@ -90,9 +90,10 @@ def http_post(base_url: str, path: str, json_body: Any = None, timeout: float =
# --- 1. Health / базові перевірки --------------------------------------------
def check_city_health(base_url: str, results: List[CheckResult]):
name = "city-service /healthz"
name = "city-service /health"
try:
r = http_get(base_url, "/healthz", timeout=5)
# Assuming /health is at root
r = http_get(base_url, "/health", timeout=5)
if r.status_code == 200:
add_result(results, name, "CRITICAL", True, "City service healthy")
else:
@@ -103,22 +104,26 @@ def check_city_health(base_url: str, results: List[CheckResult]):
# --- 2. Node Directory / Registry / Metrics ----------------------------------
def check_nodes_api(base_url: str, results: List[CheckResult]):
name = "nodes: GET /api/v1/nodes"
name = "nodes: GET /public/nodes"
try:
r = http_get(base_url, "/api/v1/nodes", timeout=5)
r = http_get(base_url, "/public/nodes", timeout=5)
if r.status_code != 200:
add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
return
data = r.json()
nodes = data.get("nodes", []) or data.get("items", [])
# /public/nodes may return either a bare list or a dict that wraps the list
# under "items" or "nodes". Per routes_city.py: a list when
# response_model=List[NodeProfile], a dict when response_model=NodeListResponse.
nodes = []
if isinstance(data, list):
nodes = data
elif isinstance(data, dict):
nodes = data.get("nodes", []) or data.get("items", [])
if not isinstance(nodes, list):
add_result(results, name, "CRITICAL", False, "Response nodes/items is not a list")
return
if len(nodes) == 0:
add_result(results, name, "WARNING", False, "API OK, але nodes.length == 0 (жодної зареєстрованої ноди)")
add_result(results, name, "WARNING", False, "API OK, але nodes list empty")
else:
add_result(results, name, "CRITICAL", True, f"Знайдено нод: {len(nodes)}")
@@ -128,14 +133,11 @@ def check_nodes_api(base_url: str, results: List[CheckResult]):
def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
name = f"node metrics: {node_id}"
try:
# Note: The endpoint might be /internal/node/{node_id}/metrics or /internal/node/{node_id}/heartbeat
# Assuming /internal/node/{node_id}/metrics/current based on request description, or fallback to dashboard
# Let's try getting node profile from public API first as it contains metrics
# Try public profile first
r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
if r.status_code != 200:
# Fallback to internal
r = http_get(base_url, f"/internal/nodes/{node_id}/profile", timeout=5)
# Fallback to internal metrics
r = http_get(base_url, f"/city/internal/node/{node_id}/metrics/current", timeout=5)
if r.status_code != 200:
add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
@@ -143,30 +145,24 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
data = r.json()
# Metrics often in 'metrics' field or flat
agent_router = data.get("agent_count_router") or data.get("agents_total") # approximate
agent_router = data.get("agent_count_router") or data.get("agents_total")
agent_sys = data.get("agent_count_system")
last_hb_raw = data.get("last_heartbeat")
gpu_info = data.get("gpu_info") or data.get("gpu")
# Нода існує
add_result(results, f"{name} - exists", "CRITICAL", True, "Node metrics entry exists")
# Agent counts
# We might not have specific router/system counts in public profile, checking what we have
if data.get("agents_total", 0) < 1:
if data.get("agents_total", 0) < 1 and (agent_router is None or agent_router < 1):
add_result(results, f"{name} - agents_total", "WARNING", False, f"agents_total={data.get('agents_total')}")
else:
add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {data.get('agents_total')}")
add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {data.get('agents_total') or agent_router}")
# GPU only для NODE1 (можна орієнтуватися по id)
if "hetzner" in node_id.lower():
if not gpu_info:
add_result(results, f"{name} - gpu_info", "WARNING", False, "gpu_info is empty for NODE1")
else:
add_result(results, f"{name} - gpu_info", "WARNING", True, "GPU info present")
# Heartbeat
if last_hb_raw:
dt = parse_dt(last_hb_raw)
if dt is None:
@@ -174,21 +170,9 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
else:
age = now_utc() - dt
if age > timedelta(minutes=HEARTBEAT_MAX_AGE_MIN):
add_result(
results,
f"{name} - heartbeat age",
"WARNING",
False,
f"Heartbeat too old: {age}",
)
add_result(results, f"{name} - heartbeat age", "WARNING", False, f"Heartbeat too old: {age}")
else:
add_result(
results,
f"{name} - heartbeat age",
"WARNING",
True,
f"Heartbeat age OK: {age}",
)
add_result(results, f"{name} - heartbeat age", "WARNING", True, f"Heartbeat age OK: {age}")
else:
add_result(results, f"{name} - heartbeat present", "WARNING", False, "last_heartbeat is missing")
@@ -200,7 +184,6 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
name = f"node agents: {node_id}"
try:
# Using public nodes API to check guardian/steward
r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
if r.status_code != 200:
add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
@@ -210,7 +193,6 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
guardian = data.get("guardian_agent")
steward = data.get("steward_agent")
# Guardian / Steward
if guardian:
add_result(results, f"{name} - guardian", "CRITICAL", True, "Guardian present")
else:
@@ -221,19 +203,18 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
else:
add_result(results, f"{name} - steward", "CRITICAL", False, "Steward missing")
# To check other agents, we might need to list agents by node_id
# Check all agents via /public/agents?node_id=...
r_agents = http_get(base_url, f"/public/agents?node_id={node_id}", timeout=5)
if r_agents.status_code == 200:
agents_data = r_agents.json()
agents_list = agents_data.get("items", [])
agents_list = agents_data.get("items", []) if isinstance(agents_data, dict) else agents_data
kinds = {a.get("kind") for a in agents_list if isinstance(a, dict)}
for kind in NODE_CORE_AGENT_KINDS:
severity = "WARNING" if kind == "archivist_agent" else "INFO" # Changing CRITICAL to INFO/WARNING as strict check might fail on some nodes
severity = "WARNING" if kind == "archivist_agent" else "INFO"
if kind in kinds:
add_result(results, f"{name} - core kind {kind}", severity, True, "present")
else:
# It's possible not all nodes have all agents yet
add_result(results, f"{name} - core kind {kind}", severity, False, "missing")
except Exception as e:
@@ -244,11 +225,10 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
name = f"DAGI Router: {node_id}"
try:
r = http_get(base_url, f"/internal/node/{node_id}/dagi-router/agents", timeout=5)
# Path with /city prefix for internal routes
r = http_get(base_url, f"/city/internal/node/{node_id}/dagi-router/agents", timeout=5)
# If internal endpoint not available or node not reachable, we might get 500/404
if r.status_code != 200:
# Try to fail gracefully if this is not yet implemented or accessible
add_result(results, name, "WARNING", False, f"HTTP {r.status_code}: {r.text[:200]}")
return
@@ -271,7 +251,6 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
else:
add_result(results, f"{name} - system_total", "CRITICAL", True, f"system_total={system_total}")
# Phantom/Stale limits
if phantom is not None and phantom > PHANTOM_STALE_LIMIT:
add_result(results, f"{name} - phantom", "WARNING", False, f"phantom={phantom} > {PHANTOM_STALE_LIMIT}")
else:
@@ -282,7 +261,6 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
else:
add_result(results, f"{name} - stale", "WARNING", True, f"stale={stale}")
# Active presence
if active is None or active < 1:
add_result(results, f"{name} - active", "CRITICAL", False, f"active={active}")
else:
@@ -295,14 +273,12 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]:
try:
# Ендпоінт прикладний, підлаштуй під свій API
r = http_get(base_url, f"/public/agents?public_slug={slug}", timeout=5) # Changed to public endpoint
r = http_get(base_url, f"/public/agents?is_public=true", timeout=5)
if r.status_code != 200:
return None
data = r.json()
items = data.get("items", [])
items = data.get("items", []) if isinstance(data, dict) else data
# Try to find exact match
for item in items:
if item.get("slug") == slug or item.get("public_slug") == slug:
return item
@@ -312,6 +288,9 @@ def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]:
def get_agent_prompts(base_url: str, agent_id: str) -> Dict[str, Any]:
try:
# NOTE: prompts appear to live under /api/v1, which is not prefixed with /city.
# main.py: app.include_router(routes_agents.router, prefix="/api/v1")
# So the path should be /api/v1/agents/{agent_id}/prompts (confirm against main.py).
r = http_get(base_url, f"/api/v1/agents/{agent_id}/prompts", timeout=5)
if r.status_code != 200:
return {}
@@ -325,14 +304,12 @@ def check_core_agents_prompts(base_url: str, results: List[CheckResult]):
agent = find_agent_by_slug(base_url, slug)
if not agent:
# Some agents might not be public or visible yet
add_result(results, name, "INFO", False, "Agent not found by slug (public)")
continue
agent_id = agent.get("id")
prompts = get_agent_prompts(base_url, agent_id)
# очікуємо хоча б core
records = prompts.get("prompts") or prompts
has_core = False
@@ -350,15 +327,10 @@ def check_core_agents_prompts(base_url: str, results: List[CheckResult]):
# --- 6. Мульти-модальність (high-level health) ------------------------------
def check_multimodal_services(base_url: str, results: List[CheckResult]):
"""
High-level: перевірка STT, OCR, можливо інших мультимодальних сервісів.
Тут робимо тільки healthz-запити до gateway/health, якщо такі є.
Цей блок адаптуй під свої реальні ендпоінти.
"""
services = [
("/internal/health/stt", "Multimodal STT"),
("/internal/health/ocr", "Multimodal OCR"),
("/internal/health/vlm", "Multimodal VLM"),
("/city/internal/health/stt", "Multimodal STT"),
("/city/internal/health/ocr", "Multimodal OCR"),
("/city/internal/health/vlm", "Multimodal VLM"),
]
for path, label in services:
name = f"{label} health"
@@ -367,8 +339,7 @@ def check_multimodal_services(base_url: str, results: List[CheckResult]):
if r.status_code == 200:
add_result(results, name, "WARNING", True, "OK")
else:
# These endpoints might not exist yet
add_result(results, name, "INFO", False, f"HTTP {r.status_code}: {r.text[:200]}")
add_result(results, name, "INFO", False, f"HTTP {r.status_code}")
except Exception as e:
add_result(results, name, "INFO", False, f"Exception: {e}")
@@ -383,7 +354,6 @@ def run_invariants_script(base_url: str, results: List[CheckResult]):
return
try:
# Assuming python is available
cmd = [sys.executable, str(script_path), "--base-url", base_url, "--json"]
proc = subprocess.run(cmd, capture_output=True, text=True)
@@ -399,11 +369,9 @@ def run_invariants_script(base_url: str, results: List[CheckResult]):
def run_smoke_tests(results: List[CheckResult]):
name = "pytest tests/test_infra_smoke.py"
# Assuming run from project root
tests_file = Path("tests") / "test_infra_smoke.py"
if not tests_file.exists():
# Try relative to script location
tests_file = Path(__file__).parent.parent / "tests" / "test_infra_smoke.py"
if not tests_file.exists():
@@ -558,4 +526,4 @@ def main():
sys.exit(0)
if __name__ == "__main__":
main()
main()