Files
microdao-daarion/scripts/check-deploy-post.py

529 lines
20 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
"""
scripts/check-deploy-post.py
Розширена перевірка після деплою (60+ чеків) для DAARION.city.
Рівні:
- CRITICAL: порушення → скрипт повертає exit code 1
- WARNING: порушення → скрипт повертає exit code 0 (але позначає проблему)
"""
import argparse
import json
import sys
import subprocess
from dataclasses import dataclass, asdict
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import List, Dict, Any, Optional
import requests
# --- Конфіг за замовчуванням -------------------------------------------------
# Base URL used when --base-url is not given on the command line.
DEFAULT_BASE_URL = "http://localhost:7001"

# Node IDs for NODE1 / NODE2 -- substitute your own if they differ.
DEFAULT_NODES = ["node-1-hetzner-gex44", "node-2-macbook-m4max"]

# Slugs of the agents whose prompts are inspected by check_core_agents_prompts.
CORE_AGENT_SLUGS = [
    "daarwizz",
    "daria",
    "dario",
    "soul",
    "spirit",
    "logic",
    "greenfood-erp",
    "helion",
]

# Agent kinds looked up per node by check_node_agents.
NODE_CORE_AGENT_KINDS = [
    "node_guardian",
    "node_steward",
    "dagi_router_agent",
    "swapper_agent",
    "multimodal_agent",
    "tools_planner_agent",
    "security_agent",
    "archivist_agent",  # not critical if it is not in production yet
]

# Upper bound applied to both the "phantom" and the "stale" router counters.
PHANTOM_STALE_LIMIT = 20

# Maximum accepted heartbeat age, in minutes.
HEARTBEAT_MAX_AGE_MIN = 10
# --- Модель результату -------------------------------------------------------
@dataclass
class CheckResult:
    """Outcome of one post-deploy check."""

    name: str  # label of the check as shown in the report
    severity: str  # "CRITICAL" | "WARNING" | "INFO"
    ok: bool  # True when the check passed
    message: str  # free-form detail text

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields as a plain dictionary (used for JSON output)."""
        as_mapping: Dict[str, Any] = asdict(self)
        return as_mapping
# --- Утиліти -----------------------------------------------------------------
def now_utc() -> datetime:
    """Return the current time as a timezone-aware datetime in UTC."""
    current = datetime.now(tz=timezone.utc)
    return current
def parse_dt(value: str) -> Optional[datetime]:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    Every ``Z`` in *value* is rewritten to ``+00:00`` before the string is
    handed to ``datetime.fromisoformat``. A timestamp that carries no UTC
    offset is assumed to be UTC, so the result can always be subtracted
    from an aware datetime without raising ``TypeError``.

    Returns:
        The parsed aware datetime, or ``None`` when *value* is not a string
        or cannot be parsed.
    """
    try:
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a str; ValueError: bad format.
        return None
    if parsed.tzinfo is None:
        # Naive timestamps would break aware/naive arithmetic downstream.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
def add_result(results: List[CheckResult], name: str, severity: str, ok: bool, message: str):
    """Build a CheckResult from the given fields and append it to *results*."""
    entry = CheckResult(name=name, severity=severity, ok=ok, message=message)
    results.append(entry)
def http_get(base_url: str, path: str, timeout: float = 5.0) -> requests.Response:
    """Send a GET request to *base_url* joined with *path*.

    Trailing slashes are stripped from *base_url* before *path* is appended,
    so *path* is expected to begin with ``/``.
    """
    endpoint = f"{base_url.rstrip('/')}{path}"
    return requests.get(endpoint, timeout=timeout)
def http_post(base_url: str, path: str, json_body: Any = None, timeout: float = 10.0) -> requests.Response:
    """Send a POST request with a JSON body to *base_url* joined with *path*.

    Trailing slashes are stripped from *base_url* before *path* is appended.
    """
    endpoint = f"{base_url.rstrip('/')}{path}"
    return requests.post(endpoint, json=json_body, timeout=timeout)
# --- 1. Health / базові перевірки --------------------------------------------
def check_city_health(base_url: str, results: List[CheckResult]):
    """Probe ``GET /health`` and record a single CRITICAL-level result.

    The check passes only on HTTP 200; any other status or any exception is
    recorded as a failed CRITICAL result.
    """
    name = "city-service /health"
    try:
        # Assumes /health is served at the root of base_url.
        resp = http_get(base_url, "/health", timeout=5)
        healthy = resp.status_code == 200
        if healthy:
            detail = "City service healthy"
        else:
            detail = f"HTTP {resp.status_code}: {resp.text[:200]}"
        add_result(results, name, "CRITICAL", healthy, detail)
    except Exception as exc:
        add_result(results, name, "CRITICAL", False, f"Exception: {exc}")
# --- 2. Node Directory / Registry / Metrics ----------------------------------
def check_nodes_api(base_url: str, results: List[CheckResult]):
    """Check that ``GET /public/nodes`` answers and lists at least one node.

    A non-200 status or an exception is a failed CRITICAL result; an empty
    node list is a failed WARNING result.
    """
    name = "nodes: GET /public/nodes"
    try:
        resp = http_get(base_url, "/public/nodes", timeout=5)
        if resp.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {resp.status_code}: {resp.text[:200]}")
            return

        payload = resp.json()
        # The body is accepted either as a bare list or as a dict that wraps
        # the list under "nodes" or "items".
        if isinstance(payload, list):
            nodes = payload
        elif isinstance(payload, dict):
            nodes = payload.get("nodes", []) or payload.get("items", [])
        else:
            nodes = []

        node_count = len(nodes)
        if node_count == 0:
            add_result(results, name, "WARNING", False, "API OK, але nodes list empty")
            return
        add_result(results, name, "CRITICAL", True, f"Знайдено нод: {node_count}")
    except Exception as exc:
        add_result(results, name, "CRITICAL", False, f"Exception: {exc}")
def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
    """Validate the metrics record of one node.

    The public node profile is queried first; if it does not answer with
    HTTP 200 the internal ``metrics/current`` endpoint is used instead.
    Recorded results:

    * ``exists`` -- the record could be fetched (CRITICAL).
    * ``agents_total`` -- at least one agent is reported through either
      ``agents_total`` or ``agent_count_router`` (WARNING on failure).
    * ``gpu_info`` -- only for node IDs containing "hetzner" (WARNING).
    * heartbeat presence / parseability / age against
      ``HEARTBEAT_MAX_AGE_MIN`` (WARNING).

    Any exception is recorded as one failed CRITICAL result.
    """
    name = f"node metrics: {node_id}"
    try:
        # Try the public profile first, then fall back to internal metrics.
        r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
        if r.status_code != 200:
            r = http_get(base_url, f"/city/internal/node/{node_id}/metrics/current", timeout=5)
        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()
        agents_total = data.get("agents_total")
        router_count = data.get("agent_count_router")
        last_hb_raw = data.get("last_heartbeat")
        gpu_info = data.get("gpu_info") or data.get("gpu")
        add_result(results, f"{name} - exists", "CRITICAL", True, "Node metrics entry exists")

        # Either counter may be absent or null; treat both cases as zero so
        # a null value is not compared against an int.
        if max(agents_total or 0, router_count or 0) < 1:
            add_result(results, f"{name} - agents_total", "WARNING", False, f"agents_total={agents_total}")
        else:
            add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {agents_total or router_count}")

        if "hetzner" in node_id.lower():
            if not gpu_info:
                add_result(results, f"{name} - gpu_info", "WARNING", False, "gpu_info is empty for NODE1")
            else:
                add_result(results, f"{name} - gpu_info", "WARNING", True, "GPU info present")

        if not last_hb_raw:
            add_result(results, f"{name} - heartbeat present", "WARNING", False, "last_heartbeat is missing")
            return

        dt = parse_dt(last_hb_raw)
        if dt is None:
            add_result(results, f"{name} - heartbeat parse", "WARNING", False, f"Cannot parse last_heartbeat: {last_hb_raw}")
            return

        if dt.tzinfo is None:
            # A timestamp without offset is assumed to be UTC; subtracting a
            # naive datetime from an aware one would raise TypeError.
            dt = dt.replace(tzinfo=timezone.utc)
        age = now_utc() - dt
        if age > timedelta(minutes=HEARTBEAT_MAX_AGE_MIN):
            add_result(results, f"{name} - heartbeat age", "WARNING", False, f"Heartbeat too old: {age}")
        else:
            add_result(results, f"{name} - heartbeat age", "WARNING", True, f"Heartbeat age OK: {age}")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")
# --- 3. Node Agents / Core DAOS Node Core ------------------------------------
def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
    """Check the guardian/steward agents and the core agent kinds of a node.

    The node profile (``/public/nodes/{node_id}``) must contain truthy
    ``guardian_agent`` and ``steward_agent`` entries (both CRITICAL). The
    agents listing (``/public/agents?node_id=...``) is then compared against
    ``NODE_CORE_AGENT_KINDS``. If that listing does not answer with HTTP 200
    a failed WARNING result is recorded rather than skipping the core-kind
    checks silently.

    Any exception is recorded as one failed CRITICAL result.
    """
    name = f"node agents: {node_id}"
    try:
        r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()
        for label, key in (("guardian", "guardian_agent"), ("steward", "steward_agent")):
            title = label.capitalize()
            if data.get(key):
                add_result(results, f"{name} - {label}", "CRITICAL", True, f"{title} present")
            else:
                add_result(results, f"{name} - {label}", "CRITICAL", False, f"{title} missing")

        # Check all agents via /public/agents?node_id=...
        r_agents = http_get(base_url, f"/public/agents?node_id={node_id}", timeout=5)
        if r_agents.status_code != 200:
            add_result(
                results,
                f"{name} - core kinds",
                "WARNING",
                False,
                f"HTTP {r_agents.status_code}: {r_agents.text[:200]}",
            )
            return

        agents_data = r_agents.json()
        agents_list = agents_data.get("items", []) if isinstance(agents_data, dict) else agents_data
        # A null "items" value is treated as an empty listing.
        kinds = {a.get("kind") for a in (agents_list or []) if isinstance(a, dict)}
        for kind in NODE_CORE_AGENT_KINDS:
            # NOTE(review): archivist_agent is the only kind reported at
            # WARNING level, all others at INFO -- confirm this is intended.
            severity = "WARNING" if kind == "archivist_agent" else "INFO"
            if kind in kinds:
                add_result(results, f"{name} - core kind {kind}", severity, True, "present")
            else:
                add_result(results, f"{name} - core kind {kind}", severity, False, "missing")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")
# --- 4. DAGI Router / Audit --------------------------------------------------
def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
    """Inspect the DAGI router summary counters of one node.

    ``router_total``, ``system_total`` and ``active`` must each be present
    and at least 1 (CRITICAL). ``phantom`` and ``stale`` fail at WARNING
    level when they exceed ``PHANTOM_STALE_LIMIT``; a missing value passes.
    A non-200 status or an exception is recorded as a failed WARNING result.
    """
    name = f"DAGI Router: {node_id}"

    def require_positive(label: str, value: Any) -> None:
        # Fails when the counter is missing or below 1.
        passed = not (value is None or value < 1)
        add_result(results, f"{name} - {label}", "CRITICAL", passed, f"{label}={value}")

    def require_within_limit(label: str, value: Any) -> None:
        # Fails only when the counter is present and above the limit.
        if value is not None and value > PHANTOM_STALE_LIMIT:
            add_result(results, f"{name} - {label}", "WARNING", False, f"{label}={value} > {PHANTOM_STALE_LIMIT}")
            return
        add_result(results, f"{name} - {label}", "WARNING", True, f"{label}={value}")

    try:
        # Internal routes are addressed with the /city prefix.
        resp = http_get(base_url, f"/city/internal/node/{node_id}/dagi-router/agents", timeout=5)
        if resp.status_code != 200:
            add_result(results, name, "WARNING", False, f"HTTP {resp.status_code}: {resp.text[:200]}")
            return

        counters = resp.json().get("summary", {})
        router_total = counters.get("router_total")
        system_total = counters.get("system_total")
        active = counters.get("active")
        phantom = counters.get("phantom")
        stale = counters.get("stale")

        require_positive("router_total", router_total)
        require_positive("system_total", system_total)
        require_within_limit("phantom", phantom)
        require_within_limit("stale", stale)
        require_positive("active", active)
    except Exception as exc:
        add_result(results, name, "WARNING", False, f"Exception (skipping): {exc}")
# --- 5. Core Agents & System Prompts ----------------------------------------
def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]:
    """Return the first public agent whose ``slug`` or ``public_slug`` equals *slug*.

    The listing is read from ``/public/agents?is_public=true`` and may be a
    bare list or a dict with an ``items`` list. Returns ``None`` on a
    non-200 status, when nothing matches, or when any exception occurs.
    """
    try:
        resp = http_get(base_url, "/public/agents?is_public=true", timeout=5)
        if resp.status_code != 200:
            return None
        payload = resp.json()
        candidates = payload.get("items", []) if isinstance(payload, dict) else payload
        return next(
            (
                entry
                for entry in candidates
                if entry.get("slug") == slug or entry.get("public_slug") == slug
            ),
            None,
        )
    except Exception:
        return None
def get_agent_prompts(base_url: str, agent_id: str) -> Dict[str, Any]:
    """Fetch the prompts of an agent from ``/api/v1/agents/{agent_id}/prompts``.

    Returns the decoded JSON body on HTTP 200, otherwise an empty dict.
    Exceptions are swallowed and also yield an empty dict.
    """
    try:
        # Per the original author's note the agents router is mounted with
        # prefix "/api/v1" (no /city prefix) -- TODO confirm against main.py.
        resp = http_get(base_url, f"/api/v1/agents/{agent_id}/prompts", timeout=5)
        return resp.json() if resp.status_code == 200 else {}
    except Exception:
        return {}
def check_core_agents_prompts(base_url: str, results: List[CheckResult]):
    """Verify that every agent in ``CORE_AGENT_SLUGS`` has a ``core`` prompt.

    An agent that cannot be found by slug is recorded as a failed INFO
    result. Otherwise the prompts payload is inspected: it may be a list of
    prompt records or a dict holding that list under ``prompts``. The check
    passes (WARNING level) when a record with ``kind == "core"`` exists.

    Payloads of an unexpected shape are treated as "core prompt missing"
    instead of raising, because this function has no exception handler of
    its own and an error here would abort the whole run.
    """
    for slug in CORE_AGENT_SLUGS:
        name = f"core agent prompts: {slug}"
        agent = find_agent_by_slug(base_url, slug)
        if not agent:
            add_result(results, name, "INFO", False, "Agent not found by slug (public)")
            continue

        prompts = get_agent_prompts(base_url, agent.get("id"))
        if isinstance(prompts, dict):
            records = prompts.get("prompts") or prompts
        else:
            # The endpoint may return the record list directly.
            records = prompts

        # NOTE(review): only a list of records is understood; if the API
        # returns prompts keyed by kind this always reports "missing".
        has_core = isinstance(records, list) and any(
            isinstance(p, dict) and p.get("kind") == "core" for p in records
        )
        if has_core:
            add_result(results, name, "WARNING", True, "core prompt present")
        else:
            add_result(results, name, "WARNING", False, "core prompt missing")
# --- 6. Мульти-модальність (high-level health) ------------------------------
def check_multimodal_services(base_url: str, results: List[CheckResult]):
    """Probe the STT, OCR and VLM internal health endpoints.

    HTTP 200 is recorded as a passed WARNING-level result; any other status
    or an exception is recorded as a failed INFO-level result.
    """
    probes = (
        ("Multimodal STT", "/city/internal/health/stt"),
        ("Multimodal OCR", "/city/internal/health/ocr"),
        ("Multimodal VLM", "/city/internal/health/vlm"),
    )
    for label, path in probes:
        name = f"{label} health"
        try:
            resp = http_get(base_url, path, timeout=5)
            if resp.status_code != 200:
                add_result(results, name, "INFO", False, f"HTTP {resp.status_code}")
                continue
            add_result(results, name, "WARNING", True, "OK")
        except Exception as exc:
            add_result(results, name, "INFO", False, f"Exception: {exc}")
# --- 7. Виклик scripts/check-invariants.py ----------------------------------
def run_invariants_script(base_url: str, results: List[CheckResult]):
    """Run the sibling ``check-invariants.py`` script and record its outcome.

    The script is looked up next to this file and started with the current
    interpreter, passing ``--base-url`` and ``--json``. A missing script is a
    failed WARNING result; a non-zero exit code or an exception is a failed
    CRITICAL result; exit code 0 is a passed INFO result.
    """
    name = "check-invariants.py"
    script_path = Path(__file__).parent / name
    if not script_path.exists():
        add_result(results, name, "WARNING", False, f"{script_path} not found (skip)")
        return
    try:
        proc = subprocess.run(
            [sys.executable, str(script_path), "--base-url", base_url, "--json"],
            capture_output=True,
            text=True,
        )
        if proc.returncode == 0:
            add_result(results, name, "INFO", True, "check-invariants.py passed")
        else:
            # Prefer stdout; fall back to stderr when stdout is empty.
            output = proc.stdout or proc.stderr
            add_result(results, name, "CRITICAL", False, f"Failed (exit {proc.returncode}): {output}")
    except Exception as exc:
        add_result(results, name, "CRITICAL", False, f"Exception: {exc}")
# --- 8. (Опційно) smoke-тести ------------------------------------------------
def run_smoke_tests(results: List[CheckResult]):
    """Run ``tests/test_infra_smoke.py`` with pytest and record the outcome.

    The test file is searched relative to the current working directory
    first and then relative to the parent directory of this script's folder.
    A missing file is a failed WARNING result. pytest is started through the
    current interpreter (``python -m pytest``), matching how
    ``check-invariants.py`` is launched, so the tests run in the same
    environment as this script. A non-zero exit code or an exception is a
    failed CRITICAL result; the tail of the pytest output is attached to
    the failure message.
    """
    name = "pytest tests/test_infra_smoke.py"
    candidates = (
        Path("tests") / "test_infra_smoke.py",
        Path(__file__).parent.parent / "tests" / "test_infra_smoke.py",
    )
    tests_file = next((p for p in candidates if p.exists()), None)
    if tests_file is None:
        add_result(results, name, "WARNING", False, f"{candidates[-1]} not found (skip)")
        return
    try:
        cmd = [sys.executable, "-m", "pytest", str(tests_file), "-q"]
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            message = f"Smoke tests failed (exit {proc.returncode})"
            # Keep only the end of the output, where pytest prints its summary.
            tail = (proc.stdout or proc.stderr or "").strip()[-500:]
            if tail:
                message = f"{message}: {tail}"
            add_result(results, name, "CRITICAL", False, message)
        else:
            add_result(results, name, "INFO", True, "Smoke tests passed")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")
# --- Формат виводу -----------------------------------------------------------
def summarize_results(results: List[CheckResult]) -> Dict[str, Any]:
    """Aggregate check results into summary counters.

    Returns a dict with the total number of checks, the number that passed,
    the number of failed CRITICAL checks, the number of failed WARNING
    checks, the number of *passed* INFO checks, and a UTC ISO timestamp.
    """
    passed = 0
    failed_critical = 0
    failed_warning = 0
    passed_info = 0
    for item in results:
        if item.ok:
            passed += 1
            if item.severity == "INFO":
                passed_info += 1
        elif item.severity == "CRITICAL":
            failed_critical += 1
        elif item.severity == "WARNING":
            failed_warning += 1
    return {
        "total_checks": len(results),
        "passed": passed,
        "failed_critical": failed_critical,
        "warnings": failed_warning,
        "info": passed_info,
        "timestamp": now_utc().isoformat(),
    }
def print_human(results: List[CheckResult]):
    """Print a human-readable report: one line per check plus a summary.

    Each result line starts with a status marker so that passed and failed
    checks can be told apart at a glance: a check mark for passed checks, a
    warning sign for failed WARNING checks, an info sign for failed INFO
    checks and a cross for every other failure (i.e. CRITICAL).
    """
    failure_markers = {"WARNING": "⚠️ ", "INFO": "ℹ️ "}
    summary = summarize_results(results)
    print("=" * 60)
    print("DAARION Post-Deploy Check")
    print("=" * 60)
    print(f"Time: {summary['timestamp']}")
    print()
    print("RESULTS:")
    for r in results:
        status = "✅" if r.ok else failure_markers.get(r.severity, "❌")
        print(f" {status} [{r.severity}] {r.name}: {r.message}")
    print()
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f" Total checks: {summary['total_checks']}")
    print(f" Passed: {summary['passed']}")
    print(f" Warnings: {summary['warnings']}")
    print(f" Failed (crit): {summary['failed_critical']}")
def print_json(results: List[CheckResult]):
    """Print the summary and all results to stdout as indented JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``).
    """
    serialised = [item.to_dict() for item in results]
    document = {"summary": summarize_results(results), "results": serialised}
    print(json.dumps(document, indent=2, ensure_ascii=False))
# --- main --------------------------------------------------------------------
def main():
    """Command-line entry point: run every check, report, and set the exit code.

    The report goes to stdout, as JSON with ``--json`` and as text otherwise.
    With ``--output-file`` the JSON report is also written to that path,
    UTF-8 encoded. Status lines about the saved file go to stderr so that
    ``--json`` output on stdout stays valid JSON.

    Exits with status 1 when at least one CRITICAL check failed, else 0.
    """
    parser = argparse.ArgumentParser(description="Post-deploy extended checks for DAARION.city")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL, help="Base URL of city-service")
    parser.add_argument(
        "--nodes",
        nargs="*",
        default=DEFAULT_NODES,
        help="Node IDs to check (default: NODE1 + NODE2)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output JSON instead of human-readable text",
    )
    parser.add_argument(
        "--skip-smoke",
        action="store_true",
        help="Skip pytest smoke tests",
    )
    parser.add_argument(
        "--output-file",
        help="Write JSON report to this file",
    )
    args = parser.parse_args()

    results: List[CheckResult] = []
    base_url = args.base_url
    nodes = args.nodes

    # 1. Basic health
    check_city_health(base_url, results)
    # 2. Node directory / registry
    check_nodes_api(base_url, results)
    # 3. Per-node checks
    for node_id in nodes:
        check_node_metrics(base_url, node_id, results)
        check_node_agents(base_url, node_id, results)
        check_dagi_router(base_url, node_id, results)
    # 4. Core agents & prompts
    check_core_agents_prompts(base_url, results)
    # 5. Multimodal
    check_multimodal_services(base_url, results)
    # 6. check-invariants.py
    run_invariants_script(base_url, results)
    # 7. Smoke tests
    if not args.skip_smoke:
        run_smoke_tests(results)

    # Output
    if args.json:
        print_json(results)
    else:
        print_human(results)

    # Save to file
    summary = summarize_results(results)
    if args.output_file:
        try:
            report = {
                "summary": summary,
                "results": [r.to_dict() for r in results],
            }
            path = Path(args.output_file)
            path.parent.mkdir(parents=True, exist_ok=True)
            # The report contains non-ASCII text (ensure_ascii=False), so the
            # encoding must not depend on the platform default.
            path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"\nReport saved to: {path.absolute()}", file=sys.stderr)
        except Exception as e:
            # Saving the report is best-effort and does not affect the exit code.
            print(f"\nError saving report: {e}", file=sys.stderr)

    # Any failed CRITICAL check -> exit 1
    sys.exit(1 if summary["failed_critical"] > 0 else 0)


if __name__ == "__main__":
    main()