diff --git a/scripts/check-deploy-post.py b/scripts/check-deploy-post.py index 7791b6e1..ec97f74b 100644 --- a/scripts/check-deploy-post.py +++ b/scripts/check-deploy-post.py @@ -90,9 +90,10 @@ def http_post(base_url: str, path: str, json_body: Any = None, timeout: float = # --- 1. Health / базові перевірки -------------------------------------------- def check_city_health(base_url: str, results: List[CheckResult]): - name = "city-service /healthz" + name = "city-service /health" try: - r = http_get(base_url, "/healthz", timeout=5) + # Assuming /health is at root + r = http_get(base_url, "/health", timeout=5) if r.status_code == 200: add_result(results, name, "CRITICAL", True, "City service healthy") else: @@ -103,22 +104,26 @@ def check_city_health(base_url: str, results: List[CheckResult]): # --- 2. Node Directory / Registry / Metrics ---------------------------------- def check_nodes_api(base_url: str, results: List[CheckResult]): - name = "nodes: GET /api/v1/nodes" + name = "nodes: GET /public/nodes" try: - r = http_get(base_url, "/api/v1/nodes", timeout=5) + r = http_get(base_url, "/public/nodes", timeout=5) if r.status_code != 200: add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}") return data = r.json() - nodes = data.get("nodes", []) or data.get("items", []) + # /public/nodes usually returns a list directly or {"items": []} or {"nodes": []} + # Based on routes_city.py it returns list if response_model=List[NodeProfile] + # or dict if response_model=NodeListResponse + + nodes = [] + if isinstance(data, list): + nodes = data + elif isinstance(data, dict): + nodes = data.get("nodes", []) or data.get("items", []) - if not isinstance(nodes, list): - add_result(results, name, "CRITICAL", False, "Response nodes/items is not a list") - return - if len(nodes) == 0: - add_result(results, name, "WARNING", False, "API OK, але nodes.length == 0 (жодної зареєстрованої ноди)") + add_result(results, name, "WARNING", False, "API OK, але nodes list empty") else: add_result(results, name, "CRITICAL", True, f"Знайдено нод: {len(nodes)}") @@ -128,14 +133,11 @@ def check_nodes_api(base_url: str, results: List[CheckResult]): def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]): name = f"node metrics: {node_id}" try: - # Note: The endpoint might be /internal/node/{node_id}/metrics or /internal/node/{node_id}/heartbeat - # Assuming /internal/node/{node_id}/metrics/current based on request description, or fallback to dashboard - # Let's try getting node profile from public API first as it contains metrics - + # Try public profile first r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5) if r.status_code != 200: - # Fallback to internal - r = http_get(base_url, f"/internal/nodes/{node_id}/profile", timeout=5) + # Fallback to internal metrics + r = http_get(base_url, f"/city/internal/node/{node_id}/metrics/current", timeout=5) if r.status_code != 200: add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}") @@ -143,30 +145,24 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]): data = r.json() - # Metrics often in 'metrics' field or flat - agent_router = data.get("agent_count_router") or data.get("agents_total") # approximate + agent_router = data.get("agent_count_router") or data.get("agents_total") agent_sys = data.get("agent_count_system") last_hb_raw = data.get("last_heartbeat") gpu_info = data.get("gpu_info") or data.get("gpu") - # Нода існує add_result(results, f"{name} - exists", "CRITICAL", True, "Node metrics entry exists") - # Agent counts - # We might not have specific router/system counts in public profile, checking what we have - if data.get("agents_total", 0) < 1: + if data.get("agents_total", 0) < 1 and (agent_router is None or agent_router < 1): add_result(results, f"{name} - agents_total", "WARNING", False, f"agents_total={data.get('agents_total')}") else: - add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {data.get('agents_total')}") + add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {data.get('agents_total') or agent_router}") - # GPU only для NODE1 (можна орієнтуватися по id) if "hetzner" in node_id.lower(): if not gpu_info: add_result(results, f"{name} - gpu_info", "WARNING", False, "gpu_info is empty for NODE1") else: add_result(results, f"{name} - gpu_info", "WARNING", True, "GPU info present") - # Heartbeat if last_hb_raw: dt = parse_dt(last_hb_raw) if dt is None: @@ -174,21 +170,9 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]): else: age = now_utc() - dt if age > timedelta(minutes=HEARTBEAT_MAX_AGE_MIN): - add_result( - results, - f"{name} - heartbeat age", - "WARNING", - False, - f"Heartbeat too old: {age}", - ) + add_result(results, f"{name} - heartbeat age", "WARNING", False, f"Heartbeat too old: {age}") else: - add_result( - results, - f"{name} - heartbeat age", - "WARNING", - True, - f"Heartbeat age OK: {age}", - ) + add_result(results, f"{name} - heartbeat age", "WARNING", True, f"Heartbeat age OK: {age}") else: add_result(results, f"{name} - heartbeat present", "WARNING", False, "last_heartbeat is missing") @@ -200,7 +184,6 @@ def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]): def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]): name = f"node agents: {node_id}" try: - # Using public nodes API to check guardian/steward r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5) if r.status_code != 200: add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}") @@ -210,7 +193,6 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]): guardian = data.get("guardian_agent") steward = data.get("steward_agent") - # Guardian / Steward if guardian: add_result(results, f"{name} - guardian", "CRITICAL", True, "Guardian present") else: @@ -221,19 +203,18 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]): else: add_result(results, f"{name} - steward", "CRITICAL", False, "Steward missing") - # To check other agents, we might need to list agents by node_id + # Check all agents via /public/agents?node_id=... r_agents = http_get(base_url, f"/public/agents?node_id={node_id}", timeout=5) if r_agents.status_code == 200: agents_data = r_agents.json() - agents_list = agents_data.get("items", []) + agents_list = agents_data.get("items", []) if isinstance(agents_data, dict) else agents_data kinds = {a.get("kind") for a in agents_list if isinstance(a, dict)} for kind in NODE_CORE_AGENT_KINDS: - severity = "WARNING" if kind == "archivist_agent" else "INFO" # Changing CRITICAL to INFO/WARNING as strict check might fail on some nodes + severity = "WARNING" if kind == "archivist_agent" else "INFO" if kind in kinds: add_result(results, f"{name} - core kind {kind}", severity, True, "present") else: - # It's possible not all nodes have all agents yet add_result(results, f"{name} - core kind {kind}", severity, False, "missing") except Exception as e: @@ -244,11 +225,10 @@ def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]): def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]): name = f"DAGI Router: {node_id}" try: - r = http_get(base_url, f"/internal/node/{node_id}/dagi-router/agents", timeout=5) + # Path with /city prefix for internal routes + r = http_get(base_url, f"/city/internal/node/{node_id}/dagi-router/agents", timeout=5) - # If internal endpoint not available or node not reachable, we might get 500/404 if r.status_code != 200: - # Try to fail gracefully if this is not yet implemented or accessible add_result(results, name, "WARNING", False, f"HTTP {r.status_code}: {r.text[:200]}") return @@ -271,7 +251,6 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]): else: add_result(results, f"{name} - system_total", "CRITICAL", True, f"system_total={system_total}") - # Phantom/Stale limits if phantom is not None and phantom > PHANTOM_STALE_LIMIT: add_result(results, f"{name} - phantom", "WARNING", False, f"phantom={phantom} > {PHANTOM_STALE_LIMIT}") else: @@ -282,7 +261,6 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]): else: add_result(results, f"{name} - stale", "WARNING", True, f"stale={stale}") - # Active presence if active is None or active < 1: add_result(results, f"{name} - active", "CRITICAL", False, f"active={active}") else: @@ -295,14 +273,12 @@ def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]): def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]: try: - # Ендпоінт прикладний, підлаштуй під свій API - r = http_get(base_url, f"/public/agents?public_slug={slug}", timeout=5) # Changed to public endpoint + r = http_get(base_url, f"/public/agents?is_public=true", timeout=5) if r.status_code != 200: return None data = r.json() - items = data.get("items", []) + items = data.get("items", []) if isinstance(data, dict) else data - # Try to find exact match for item in items: if item.get("slug") == slug or item.get("public_slug") == slug: return item @@ -312,6 +288,9 @@ def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]: def get_agent_prompts(base_url: str, agent_id: str) -> Dict[str, Any]: try: + # Prompts are under /api/v1 which is NOT prefixed with /city in main.py includes? + # main.py: app.include_router(routes_agents.router, prefix="/api/v1") + # So it should be /api/v1/agents/... r = http_get(base_url, f"/api/v1/agents/{agent_id}/prompts", timeout=5) if r.status_code != 200: return {} @@ -325,14 +304,12 @@ def check_core_agents_prompts(base_url: str, results: List[CheckResult]): agent = find_agent_by_slug(base_url, slug) if not agent: - # Some agents might not be public or visible yet add_result(results, name, "INFO", False, "Agent not found by slug (public)") continue agent_id = agent.get("id") prompts = get_agent_prompts(base_url, agent_id) - # очікуємо хоча б core records = prompts.get("prompts") or prompts has_core = False @@ -350,15 +327,10 @@ def check_core_agents_prompts(base_url: str, results: List[CheckResult]): # --- 6. Мульти-модальність (high-level health) ------------------------------ def check_multimodal_services(base_url: str, results: List[CheckResult]): - """ - High-level: перевірка STT, OCR, можливо інших мультимодальних сервісів. - Тут робимо тільки healthz-запити до gateway/health, якщо такі є. - Цей блок адаптуй під свої реальні ендпоінти. - """ services = [ - ("/internal/health/stt", "Multimodal STT"), - ("/internal/health/ocr", "Multimodal OCR"), - ("/internal/health/vlm", "Multimodal VLM"), + ("/city/internal/health/stt", "Multimodal STT"), + ("/city/internal/health/ocr", "Multimodal OCR"), + ("/city/internal/health/vlm", "Multimodal VLM"), ] for path, label in services: name = f"{label} health" @@ -367,8 +339,7 @@ def check_multimodal_services(base_url: str, results: List[CheckResult]): if r.status_code == 200: add_result(results, name, "WARNING", True, "OK") else: - # These endpoints might not exist yet - add_result(results, name, "INFO", False, f"HTTP {r.status_code}: {r.text[:200]}") + add_result(results, name, "INFO", False, f"HTTP {r.status_code}") except Exception as e: add_result(results, name, "INFO", False, f"Exception: {e}") @@ -383,7 +354,6 @@ def run_invariants_script(base_url: str, results: List[CheckResult]): return try: - # Assuming python is available cmd = [sys.executable, str(script_path), "--base-url", base_url, "--json"] proc = subprocess.run(cmd, capture_output=True, text=True) @@ -399,11 +369,9 @@ def run_invariants_script(base_url: str, results: List[CheckResult]): def run_smoke_tests(results: List[CheckResult]): name = "pytest tests/test_infra_smoke.py" - # Assuming run from project root tests_file = Path("tests") / "test_infra_smoke.py" if not tests_file.exists(): - # Try relative to script location tests_file = Path(__file__).parent.parent / "tests" / "test_infra_smoke.py" if not tests_file.exists(): @@ -558,4 +526,4 @@ def main(): sys.exit(0) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/scripts/check-invariants.py b/scripts/check-invariants.py index e5abc5a2..481f22ff 100755 --- a/scripts/check-invariants.py +++ b/scripts/check-invariants.py @@ -168,7 +168,7 @@ def check_node_exists(client: APIClient, node_id: str, results: CheckResults): """Перевірити що нода існує і має базові метрики""" inv_name = f"Node exists: {node_id}" - data, error = client.get(f"/internal/node/{node_id}/metrics/current") + data, error = client.get(f"/city/internal/node/{node_id}/metrics/current") if error: results.failed.append(InvariantResult( @@ -288,7 +288,7 @@ def check_node_metrics(client: APIClient, node_id: str, metrics: Dict, results: def check_node_agents(client: APIClient, node_id: str, results: CheckResults): """Перевірити Node Guardian та Steward""" - data, error = client.get(f"/internal/node/{node_id}/agents") + data, error = client.get(f"/city/internal/node/{node_id}/agents") if error: results.failed.append(InvariantResult( @@ -354,7 +354,7 @@ def check_node_agents(client: APIClient, node_id: str, results: CheckResults): def check_dagi_router(client: APIClient, node_id: str, results: CheckResults): """Перевірити DAGI Router стан""" - data, error = client.get(f"/internal/node/{node_id}/dagi-router/agents") + data, error = client.get(f"/city/internal/node/{node_id}/dagi-router/agents") if error: results.warnings.append(InvariantResult( @@ -442,7 +442,7 @@ def check_core_agents_prompts(client: APIClient, results: CheckResults): agent_ids = [a["slug"] for a in CORE_AGENTS] # Batch check prompts status - data, error = client.post("/internal/agents/prompts/status", {"agent_ids": agent_ids}) + data, error = client.post("/city/internal/agents/prompts/status", {"agent_ids": agent_ids}) if error: results.warnings.append(InvariantResult( @@ -526,7 +526,7 @@ def check_healthz(client: APIClient, results: CheckResults): def check_node_self_healing(client: APIClient, node_id: str, results: CheckResults): """Перевірити self-healing статус ноди""" - data, error = client.get(f"/internal/node/{node_id}/self-healing/status") + data, error = client.get(f"/city/internal/node/{node_id}/self-healing/status") if error: results.warnings.append(InvariantResult( @@ -580,7 +580,7 @@ def check_node_self_healing(client: APIClient, node_id: str, results: CheckResul def check_nodes_needing_healing(client: APIClient, results: CheckResults): """Перевірити чи є ноди, які потребують healing""" - data, error = client.get("/internal/nodes/needing-healing") + data, error = client.get("/city/internal/nodes/needing-healing") if error: results.warnings.append(InvariantResult(