#!/usr/bin/env python
"""
scripts/check-deploy-post.py

Extended post-deploy verification (60+ checks) for DAARION.city.

Levels:
- CRITICAL: a violation makes the script return exit code 1
- WARNING: a violation still returns exit code 0 (but the problem is flagged)
"""

import argparse
import json
import sys
import subprocess
from dataclasses import dataclass, asdict
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import List, Dict, Any, Optional

import requests

# --- Default configuration ----------------------------------------------------

DEFAULT_BASE_URL = "http://localhost:7001"

# node_id values for NODE1 / NODE2 — substitute your own if they differ
DEFAULT_NODES = [
    "node-1-hetzner-gex44",
    "node-2-macbook-m4max",
]

CORE_AGENT_SLUGS = [
    "daarwizz",
    "daria",
    "dario",
    "soul",
    "spirit",
    "logic",
    "greenfood-erp",
    "helion",
]

NODE_CORE_AGENT_KINDS = [
    "node_guardian",
    "node_steward",
    "dagi_router_agent",
    "swapper_agent",
    "multimodal_agent",
    "tools_planner_agent",
    "security_agent",
    "archivist_agent",  # not critical if it is not in production yet
]

# Upper bound used for both the "phantom" and the "stale" router counters.
PHANTOM_STALE_LIMIT = 20
# A heartbeat older than this many minutes is flagged.
HEARTBEAT_MAX_AGE_MIN = 10


# --- Result model -------------------------------------------------------------

@dataclass
class CheckResult:
    """Outcome of a single check."""

    name: str
    severity: str  # "CRITICAL" | "WARNING" | "INFO"
    ok: bool
    message: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict (used for the JSON report)."""
        return asdict(self)


# --- Utilities ----------------------------------------------------------------

def now_utc() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


def parse_dt(value: str) -> Optional[datetime]:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    A trailing ``Z`` is accepted as a UTC designator. Returns ``None`` when
    *value* is not a string or cannot be parsed.

    Timestamps without any offset are treated as UTC so that the result can
    always be subtracted from ``now_utc()``; mixing naive and aware datetimes
    raises ``TypeError``.
    NOTE(review): assumes the API emits offset-less timestamps in UTC — confirm.
    """
    try:
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def add_result(results: List[CheckResult], name: str, severity: str, ok: bool, message: str):
    """Append a new ``CheckResult`` to *results* (mutated in place)."""
    results.append(CheckResult(name=name, severity=severity, ok=ok, message=message))


def http_get(base_url: str, path: str, timeout: float = 5.0) -> requests.Response:
    """Issue a GET to ``base_url + path``; a trailing slash on *base_url* is dropped."""
    url = base_url.rstrip("/") + path
    return requests.get(url, timeout=timeout)


def http_post(base_url: str, path: str, json_body: Any = None, timeout: float = 10.0) -> requests.Response:
    """Issue a POST with a JSON body to ``base_url + path``."""
    url = base_url.rstrip("/") + path
    return requests.post(url, json=json_body, timeout=timeout)


# --- 1. Health / basic checks -------------------------------------------------

def check_city_health(base_url: str, results: List[CheckResult]):
    """Record a CRITICAL check for ``GET /health`` returning HTTP 200."""
    name = "city-service /health"
    try:
        # Assuming /health is at root
        r = http_get(base_url, "/health", timeout=5)
        if r.status_code == 200:
            add_result(results, name, "CRITICAL", True, "City service healthy")
        else:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")


# --- 2. Node Directory / Registry / Metrics -----------------------------------

def check_nodes_api(base_url: str, results: List[CheckResult]):
    """Check that ``GET /public/nodes`` answers and lists at least one node.

    The payload may be a bare list or a dict carrying the list under
    ``"nodes"`` or ``"items"``. An empty list is only a WARNING; a non-200
    response or an exception is CRITICAL.
    """
    name = "nodes: GET /public/nodes"
    try:
        r = http_get(base_url, "/public/nodes", timeout=5)
        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()
        nodes = []
        if isinstance(data, list):
            nodes = data
        elif isinstance(data, dict):
            # "or []" keeps a null "items"/"nodes" value from reaching len().
            nodes = data.get("nodes") or data.get("items") or []

        if not nodes:
            add_result(results, name, "WARNING", False, "API OK, але nodes list empty")
        else:
            add_result(results, name, "CRITICAL", True, f"Знайдено нод: {len(nodes)}")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")


def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
    """Check agent counts, GPU info and heartbeat freshness for one node.

    The public node profile is tried first; on a non-200 answer the internal
    metrics endpoint is used instead. If neither returns 200 a single
    CRITICAL failure is recorded and the remaining sub-checks are skipped.
    """
    name = f"node metrics: {node_id}"
    try:
        # Try public profile first
        r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
        if r.status_code != 200:
            # Fallback to internal metrics
            r = http_get(base_url, f"/city/internal/node/{node_id}/metrics/current", timeout=5)

        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()
        agents_total = data.get("agents_total")
        agent_router = data.get("agent_count_router") or agents_total
        last_hb_raw = data.get("last_heartbeat")
        gpu_info = data.get("gpu_info") or data.get("gpu")

        add_result(results, f"{name} - exists", "CRITICAL", True, "Node metrics entry exists")

        # A null agents_total counts as zero instead of raising on the comparison.
        total_missing = (agents_total or 0) < 1
        router_missing = agent_router is None or agent_router < 1
        if total_missing and router_missing:
            add_result(results, f"{name} - agents_total", "WARNING", False,
                       f"agents_total={agents_total}")
        else:
            add_result(results, f"{name} - agents_total", "CRITICAL", True,
                       f"Total agents: {agents_total or agent_router}")

        # GPU info is only checked for node ids containing "hetzner".
        if "hetzner" in node_id.lower():
            if not gpu_info:
                add_result(results, f"{name} - gpu_info", "WARNING", False, "gpu_info is empty for NODE1")
            else:
                add_result(results, f"{name} - gpu_info", "WARNING", True, "GPU info present")

        if last_hb_raw:
            dt = parse_dt(last_hb_raw)
            if dt is None:
                add_result(results, f"{name} - heartbeat parse", "WARNING", False,
                           f"Cannot parse last_heartbeat: {last_hb_raw}")
            else:
                age = now_utc() - dt
                if age > timedelta(minutes=HEARTBEAT_MAX_AGE_MIN):
                    add_result(results, f"{name} - heartbeat age", "WARNING", False,
                               f"Heartbeat too old: {age}")
                else:
                    add_result(results, f"{name} - heartbeat age", "WARNING", True,
                               f"Heartbeat age OK: {age}")
        else:
            add_result(results, f"{name} - heartbeat present", "WARNING", False, "last_heartbeat is missing")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")
# --- 3. Node Agents / Core DAOS Node Core ------------------------------------

def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
    """Check that a node has its guardian/steward and the core agent kinds.

    Reads ``/public/nodes/{node_id}`` for the ``guardian_agent`` and
    ``steward_agent`` fields (both CRITICAL), then lists the node's agents via
    ``/public/agents?node_id=...`` and records one result per entry of
    ``NODE_CORE_AGENT_KINDS``.

    If the agent listing does not return HTTP 200 a single WARNING failure is
    recorded, so the skipped core-kind checks stay visible in the report.
    """
    name = f"node agents: {node_id}"
    try:
        r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()
        guardian = data.get("guardian_agent")
        steward = data.get("steward_agent")
        for label, value in (("Guardian", guardian), ("Steward", steward)):
            present = bool(value)
            add_result(
                results,
                f"{name} - {label.lower()}",
                "CRITICAL",
                present,
                f"{label} present" if present else f"{label} missing",
            )

        # Check all agents via /public/agents?node_id=...
        r_agents = http_get(base_url, f"/public/agents?node_id={node_id}", timeout=5)
        if r_agents.status_code != 200:
            add_result(results, f"{name} - core kinds", "WARNING", False,
                       f"HTTP {r_agents.status_code}: {r_agents.text[:200]}")
            return

        agents_data = r_agents.json()
        # The listing is either {"items": [...]} or a bare list.
        agents_list = agents_data.get("items", []) if isinstance(agents_data, dict) else agents_data
        kinds = {a.get("kind") for a in agents_list if isinstance(a, dict)}

        for kind in NODE_CORE_AGENT_KINDS:
            # NOTE(review): archivist_agent gets the higher severity here although
            # the constant's comment calls it non-critical — confirm this is intended.
            severity = "WARNING" if kind == "archivist_agent" else "INFO"
            present = kind in kinds
            add_result(results, f"{name} - core kind {kind}", severity, present,
                       "present" if present else "missing")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")
# --- 4. DAGI Router / Audit --------------------------------------------------

def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
    """Validate the DAGI router summary counters reported for one node.

    ``router_total``, ``system_total`` and ``active`` must each be at least 1
    (CRITICAL); ``phantom`` and ``stale`` must not exceed
    ``PHANTOM_STALE_LIMIT`` (WARNING). A non-200 response or any exception is
    recorded as a WARNING only.
    """
    name = f"DAGI Router: {node_id}"

    def require_at_least_one(key: str, value: Any) -> None:
        # Passing and failing results carry the same "key=value" message.
        failed = value is None or value < 1
        add_result(results, f"{name} - {key}", "CRITICAL", not failed, f"{key}={value}")

    def require_within_limit(key: str, value: Any) -> None:
        # A missing counter is treated as being within the limit.
        if value is not None and value > PHANTOM_STALE_LIMIT:
            add_result(results, f"{name} - {key}", "WARNING", False,
                       f"{key}={value} > {PHANTOM_STALE_LIMIT}")
            return
        add_result(results, f"{name} - {key}", "WARNING", True, f"{key}={value}")

    try:
        # Path with /city prefix for internal routes
        response = http_get(base_url, f"/city/internal/node/{node_id}/dagi-router/agents", timeout=5)
        if response.status_code != 200:
            add_result(results, name, "WARNING", False,
                       f"HTTP {response.status_code}: {response.text[:200]}")
            return

        summary = response.json().get("summary", {})
        counters = {
            key: summary.get(key)
            for key in ("router_total", "system_total", "active", "phantom", "stale")
        }

        # Results are emitted in a fixed order: totals, phantom/stale, then active.
        for key in ("router_total", "system_total"):
            require_at_least_one(key, counters[key])
        for key in ("phantom", "stale"):
            require_within_limit(key, counters[key])
        require_at_least_one("active", counters["active"])
    except Exception as e:
        add_result(results, name, "WARNING", False, f"Exception (skipping): {e}")
# --- 5. Core Agents & System Prompts ----------------------------------------

def _fetch_public_agents(base_url: str) -> Optional[List[Any]]:
    """Return the list from ``/public/agents?is_public=true``, or None on any failure."""
    try:
        r = http_get(base_url, "/public/agents?is_public=true", timeout=5)
        if r.status_code != 200:
            return None
        data = r.json()
    except Exception:
        return None
    # The listing is either {"items": [...]} or a bare list.
    items = data.get("items", []) if isinstance(data, dict) else data
    return items if isinstance(items, list) else None


def _has_core_prompt(payload: Any) -> bool:
    """Tell whether a prompts payload contains a record with ``kind == "core"``.

    Accepts a bare list of records or a dict holding the list under
    ``"prompts"``; anything else counts as "no core prompt".
    """
    records = payload
    if isinstance(payload, dict):
        records = payload.get("prompts") or payload
    if not isinstance(records, list):
        return False
    return any(isinstance(p, dict) and p.get("kind") == "core" for p in records)


def find_agent_by_slug(
    base_url: str,
    slug: str,
    agents: Optional[List[Any]] = None,
) -> Optional[Dict[str, Any]]:
    """Find a public agent whose ``slug`` or ``public_slug`` equals *slug*.

    Args:
        base_url: Base URL of the city service.
        slug: Slug to look for.
        agents: Optional pre-fetched agent list; when omitted the list is
            fetched from the service.

    Returns:
        The matching agent record, or ``None`` when the list cannot be
        fetched or contains no match.
    """
    if agents is None:
        agents = _fetch_public_agents(base_url)
    if not agents:
        return None
    for item in agents:
        if not isinstance(item, dict):
            continue
        if slug in (item.get("slug"), item.get("public_slug")):
            return item
    return None


def get_agent_prompts(base_url: str, agent_id: str) -> Dict[str, Any]:
    """Fetch the prompts payload of an agent from ``/api/v1/agents/{id}/prompts``.

    Returns ``{}`` on a non-200 answer or on any exception. The decoded JSON
    body is returned untouched, so it is whatever JSON type the endpoint sent.
    """
    try:
        # Prompts are under /api/v1 which is NOT prefixed with /city in main.py includes?
        # main.py: app.include_router(routes_agents.router, prefix="/api/v1")
        # So it should be /api/v1/agents/...
        r = http_get(base_url, f"/api/v1/agents/{agent_id}/prompts", timeout=5)
        if r.status_code != 200:
            return {}
        return r.json()
    except Exception:
        return {}


def check_core_agents_prompts(base_url: str, results: List[CheckResult]):
    """Record, for every slug in ``CORE_AGENT_SLUGS``, whether a core prompt exists.

    An agent that cannot be found is an INFO failure; a found agent without a
    ``core`` prompt is a WARNING failure.
    """
    # Fetch the public listing once instead of once per slug; an empty list
    # stops find_agent_by_slug from re-fetching after a failed request.
    agents = _fetch_public_agents(base_url) or []

    for slug in CORE_AGENT_SLUGS:
        name = f"core agent prompts: {slug}"
        agent = find_agent_by_slug(base_url, slug, agents)
        if not agent:
            add_result(results, name, "INFO", False, "Agent not found by slug (public)")
            continue

        prompts = get_agent_prompts(base_url, agent.get("id"))
        # _has_core_prompt tolerates a list payload, which would otherwise crash
        # this unguarded function on prompts.get(...).
        if _has_core_prompt(prompts):
            add_result(results, name, "WARNING", True, "core prompt present")
        else:
            add_result(results, name, "WARNING", False, "core prompt missing")
# --- 6. Multimodality (high-level health) ------------------------------------

def check_multimodal_services(base_url: str, results: List[CheckResult]):
    """Probe the STT / OCR / VLM internal health endpoints.

    An HTTP 200 is recorded as a passing WARNING-level check; any other
    status or an exception is recorded as a failing INFO-level entry.
    """
    endpoints = {
        "Multimodal STT": "/city/internal/health/stt",
        "Multimodal OCR": "/city/internal/health/ocr",
        "Multimodal VLM": "/city/internal/health/vlm",
    }
    for label, path in endpoints.items():
        name = f"{label} health"
        try:
            status = http_get(base_url, path, timeout=5).status_code
        except Exception as e:
            add_result(results, name, "INFO", False, f"Exception: {e}")
            continue

        if status != 200:
            add_result(results, name, "INFO", False, f"HTTP {status}")
        else:
            add_result(results, name, "WARNING", True, "OK")


# --- 7. Invoking scripts/check-invariants.py ---------------------------------

def run_invariants_script(base_url: str, results: List[CheckResult]):
    """Run the sibling ``check-invariants.py`` script and record its outcome.

    The script is looked up next to this file and executed with the current
    interpreter, passing ``--base-url`` and ``--json``. A missing script is a
    WARNING; a non-zero exit code or a failure to launch it is CRITICAL.
    """
    name = "check-invariants.py"
    script_path = Path(__file__).with_name("check-invariants.py")

    if not script_path.exists():
        add_result(results, name, "WARNING", False, f"{script_path} not found (skip)")
        return

    try:
        proc = subprocess.run(
            [sys.executable, str(script_path), "--base-url", base_url, "--json"],
            capture_output=True,
            text=True,
        )
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")
        return

    if proc.returncode == 0:
        add_result(results, name, "INFO", True, "check-invariants.py passed")
    else:
        # stdout is preferred for the message; stderr is used only when stdout is empty.
        add_result(results, name, "CRITICAL", False,
                   f"Failed (exit {proc.returncode}): {proc.stdout or proc.stderr}")
# --- 8. (Optional) smoke tests -----------------------------------------------

def run_smoke_tests(results: List[CheckResult]):
    """Run ``tests/test_infra_smoke.py`` with pytest and record the outcome.

    The test file is looked up relative to the current directory first and
    then relative to the parent of this script's directory. A missing file
    is a WARNING; a non-zero pytest exit code or a launch failure is CRITICAL.
    """
    name = "pytest tests/test_infra_smoke.py"
    tests_file = Path("tests") / "test_infra_smoke.py"
    if not tests_file.exists():
        tests_file = Path(__file__).parent.parent / "tests" / "test_infra_smoke.py"

    if not tests_file.exists():
        add_result(results, name, "WARNING", False, f"{tests_file} not found (skip)")
        return

    try:
        cmd = ["pytest", str(tests_file), "-q"]
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            add_result(results, name, "CRITICAL", False, f"Smoke tests failed (exit {proc.returncode})")
        else:
            add_result(results, name, "INFO", True, "Smoke tests passed")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")


# --- Output formatting --------------------------------------------------------

def summarize_results(results: List[CheckResult]) -> Dict[str, Any]:
    """Aggregate *results* into counters plus a UTC timestamp.

    ``failed_critical`` and ``warnings`` count failing checks of the
    respective severity; ``info`` counts passing INFO-level checks.
    """
    total = len(results)
    passed = sum(1 for r in results if r.ok)
    failed = [r for r in results if not r.ok and r.severity == "CRITICAL"]
    warnings = [r for r in results if not r.ok and r.severity == "WARNING"]
    info = [r for r in results if r.ok and r.severity == "INFO"]

    return {
        "total_checks": total,
        "passed": passed,
        "failed_critical": len(failed),
        "warnings": len(warnings),
        "info": len(info),
        "timestamp": now_utc().isoformat(),
    }


def _build_report(results: List[CheckResult]) -> Dict[str, Any]:
    """Build the JSON-serialisable report: the summary plus every result as a dict."""
    return {
        "summary": summarize_results(results),
        "results": [r.to_dict() for r in results],
    }


def print_human(results: List[CheckResult]):
    """Print a human-readable list of results followed by a summary to stdout."""
    summary = summarize_results(results)

    print("=" * 60)
    print("DAARION Post-Deploy Check")
    print("=" * 60)
    print(f"Time: {summary['timestamp']}")
    print()
    print("RESULTS:")
    for r in results:
        status = "✅" if r.ok else ("⚠️ " if r.severity == "WARNING" else ("ℹ️ " if r.severity == "INFO" else "❌"))
        print(f" {status} [{r.severity}] {r.name}: {r.message}")
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f" Total checks: {summary['total_checks']}")
    print(f" Passed: {summary['passed']}")
    print(f" Warnings: {summary['warnings']}")
    print(f" Failed (crit): {summary['failed_critical']}")


def print_json(results: List[CheckResult]):
    """Print the full report (summary and results) as indented JSON to stdout."""
    print(json.dumps(_build_report(results), indent=2, ensure_ascii=False))


# --- main ---------------------------------------------------------------------

def main():
    """Parse CLI arguments, run every check, print the report and exit.

    Exits with status 1 when at least one CRITICAL check failed, otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Post-deploy extended checks for DAARION.city")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL, help="Base URL of city-service")
    parser.add_argument(
        "--nodes",
        nargs="*",
        default=DEFAULT_NODES,
        help="Node IDs to check (default: NODE1 + NODE2)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output JSON instead of human-readable text",
    )
    parser.add_argument(
        "--skip-smoke",
        action="store_true",
        help="Skip pytest smoke tests",
    )
    parser.add_argument(
        "--output-file",
        help="Write JSON report to this file",
    )
    args = parser.parse_args()

    results: List[CheckResult] = []
    base_url = args.base_url
    nodes = args.nodes

    # 1. Basic health
    check_city_health(base_url, results)

    # 2. Node Directory / Registry
    check_nodes_api(base_url, results)

    # 3. Nodes
    for node_id in nodes:
        check_node_metrics(base_url, node_id, results)
        check_node_agents(base_url, node_id, results)
        check_dagi_router(base_url, node_id, results)

    # 4. Core agents & prompts
    check_core_agents_prompts(base_url, results)

    # 5. Multimodal
    check_multimodal_services(base_url, results)

    # 6. check-invariants.py
    run_invariants_script(base_url, results)

    # 7. Smoke tests
    if not args.skip_smoke:
        run_smoke_tests(results)

    # Output
    if args.json:
        print_json(results)
    else:
        print_human(results)

    # Save to file
    report = _build_report(results)
    if args.output_file:
        # In --json mode stdout must stay a single parseable JSON document,
        # so the status line is routed to stderr instead.
        status_stream = sys.stderr if args.json else sys.stdout
        try:
            path = Path(args.output_file)
            path.parent.mkdir(parents=True, exist_ok=True)
            # The report holds non-ASCII text (ensure_ascii=False), so the
            # encoding is pinned rather than left to the platform default.
            path.write_text(
                json.dumps(report, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            print(f"\nReport saved to: {path.absolute()}", file=status_stream)
        except Exception as e:
            # Saving is best-effort: a failure must not change the exit code.
            print(f"\nError saving report: {e}", file=sys.stderr)

    # Any failed CRITICAL check → exit 1
    failed_crit = report["summary"]["failed_critical"]
    sys.exit(1 if failed_crit > 0 else 0)


if __name__ == "__main__":
    main()