#!/usr/bin/env python
"""
scripts/check-deploy-post.py

Extended post-deploy verification (60+ checks) for DAARION.city.

Levels:
- CRITICAL: a violation makes the script return exit code 1
- WARNING: a violation leaves the exit code at 0 (but the problem is flagged)
"""

import argparse
import json
import subprocess
import sys
from dataclasses import dataclass, asdict
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import List, Dict, Any, Optional

import requests

# --- Default configuration ---------------------------------------------------

DEFAULT_BASE_URL = "http://localhost:7001"

# node_id values for NODE1 / NODE2 — substitute your own if they differ.
DEFAULT_NODES = [
    "node-1-hetzner-gex44",
    "node-2-macbook-m4max",
]

# Slugs of the agents whose prompts are inspected by check_core_agents_prompts().
CORE_AGENT_SLUGS = [
    "daarwizz",
    "daria",
    "dario",
    "soul",
    "spirit",
    "logic",
    "greenfood-erp",
    "helion",
]

# Agent kinds looked up for every node by check_node_agents().
NODE_CORE_AGENT_KINDS = [
    "node_guardian",
    "node_steward",
    "dagi_router_agent",
    "swapper_agent",
    "multimodal_agent",
    "tools_planner_agent",
    "security_agent",
    "archivist_agent",  # not critical if it is not in production yet
]

# Threshold applied to both the "phantom" and the "stale" router counters.
PHANTOM_STALE_LIMIT = 20

# Maximum accepted heartbeat age, in minutes.
HEARTBEAT_MAX_AGE_MIN = 10


# --- Result model ------------------------------------------------------------

@dataclass
class CheckResult:
    """Outcome of a single post-deploy check."""

    # Label of the check as shown in reports.
    name: str
    # One of "CRITICAL", "WARNING" or "INFO".
    severity: str
    # True when the check passed.
    ok: bool
    # Free-form detail printed next to the check.
    message: str

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields of this result as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping


# --- Utilities ---------------------------------------------------------------

def now_utc() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    return datetime.now(tz=timezone.utc)


def parse_dt(value: str) -> Optional[datetime]:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    "Z" is rewritten to "+00:00" before parsing because
    datetime.fromisoformat() only accepts the "Z" suffix from Python 3.11 on.
    A timestamp without any UTC offset is assumed to be UTC, so the result can
    always be subtracted from an aware "now" without raising TypeError.

    Returns:
        The parsed aware datetime, or None when value is not a string or is
        not a valid ISO-8601 timestamp.
    """
    if not isinstance(value, str):
        return None
    try:
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None
    if parsed.tzinfo is None:
        # Naive and aware datetimes cannot be mixed in arithmetic.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def add_result(results: List[CheckResult], name: str, severity: str, ok: bool, message: str):
    """Build a CheckResult from the given fields and append it to results in place."""
    entry = CheckResult(name=name, severity=severity, ok=ok, message=message)
    results.append(entry)


def http_get(base_url: str, path: str, timeout: float = 5.0) -> requests.Response:
    """Send a GET request to base_url + path and return the response.

    Trailing slashes are stripped from base_url before path is appended;
    callers in this file always pass a path that starts with "/".
    """
    endpoint = f"{base_url.rstrip('/')}{path}"
    return requests.get(endpoint, timeout=timeout)


def http_post(base_url: str, path: str, json_body: Any = None, timeout: float = 10.0) -> requests.Response:
    """Send a POST request with json_body as the JSON payload to base_url + path.

    Trailing slashes are stripped from base_url before path is appended.
    """
    endpoint = f"{base_url.rstrip('/')}{path}"
    return requests.post(endpoint, json=json_body, timeout=timeout)


# --- 1. Health / basic checks ------------------------------------------------

def check_city_health(base_url: str, results: List[CheckResult]):
    """Record a CRITICAL check that passes only when GET /health answers HTTP 200.

    Any exception raised while performing the request is recorded as a failed
    CRITICAL check instead of propagating.
    """
    name = "city-service /health"
    try:
        # /health is assumed to be served from the service root.
        response = http_get(base_url, "/health", timeout=5)
        healthy = response.status_code == 200
        if healthy:
            detail = "City service healthy"
        else:
            detail = f"HTTP {response.status_code}: {response.text[:200]}"
        add_result(results, name, "CRITICAL", healthy, detail)
    except Exception as exc:
        add_result(results, name, "CRITICAL", False, f"Exception: {exc}")


# --- 2. Node Directory / Registry / Metrics ----------------------------------

def check_nodes_api(base_url: str, results: List[CheckResult]):
    """Check that GET /public/nodes answers and lists at least one node.

    The payload may be a bare list, or a dict carrying the list under "nodes"
    or "items". Outcomes recorded:
      - non-200 status or any exception: CRITICAL failure;
      - empty node list: WARNING failure;
      - otherwise: CRITICAL pass with the node count.
    """
    name = "nodes: GET /public/nodes"
    try:
        r = http_get(base_url, "/public/nodes", timeout=5)
        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()
        if isinstance(data, list):
            nodes = data
        elif isinstance(data, dict):
            # Chain the fallbacks so a null under either key still ends up as
            # an empty list rather than None, which would make len() raise.
            nodes = data.get("nodes") or data.get("items") or []
        else:
            nodes = []

        if not nodes:
            add_result(results, name, "WARNING", False, "API OK, але nodes list empty")
        else:
            add_result(results, name, "CRITICAL", True, f"Знайдено нод: {len(nodes)}")
    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")


def check_node_metrics(base_url: str, node_id: str, results: List[CheckResult]):
    """Validate the metrics entry of a single node.

    The public node profile is requested first; if it does not answer 200 the
    internal metrics endpoint is used instead. Checks recorded:
      - "exists": a metrics entry was returned;
      - "agents_total": at least one agent is reported;
      - "gpu_info": only for node ids containing "hetzner";
      - heartbeat: present, parseable, and not older than
        HEARTBEAT_MAX_AGE_MIN minutes.

    A non-200 status from both endpoints, or any exception, is recorded as a
    CRITICAL failure.
    """
    name = f"node metrics: {node_id}"
    try:
        # Try the public profile first, then fall back to internal metrics.
        r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
        if r.status_code != 200:
            r = http_get(base_url, f"/city/internal/node/{node_id}/metrics/current", timeout=5)

        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()

        agents_total = data.get("agents_total")
        agent_router = data.get("agent_count_router") or agents_total
        last_hb_raw = data.get("last_heartbeat")
        gpu_info = data.get("gpu_info") or data.get("gpu")

        add_result(results, f"{name} - exists", "CRITICAL", True, "Node metrics entry exists")

        # A missing or null counter is treated as zero so that the comparison
        # cannot raise TypeError on a None value.
        if (agents_total or 0) < 1 and (agent_router or 0) < 1:
            add_result(results, f"{name} - agents_total", "WARNING", False, f"agents_total={agents_total}")
        else:
            add_result(results, f"{name} - agents_total", "CRITICAL", True, f"Total agents: {agents_total or agent_router}")

        if "hetzner" in node_id.lower():
            if not gpu_info:
                add_result(results, f"{name} - gpu_info", "WARNING", False, "gpu_info is empty for NODE1")
            else:
                add_result(results, f"{name} - gpu_info", "WARNING", True, "GPU info present")

        if last_hb_raw:
            dt = parse_dt(last_hb_raw)
            if dt is None:
                add_result(results, f"{name} - heartbeat parse", "WARNING", False, f"Cannot parse last_heartbeat: {last_hb_raw}")
            else:
                if dt.tzinfo is None:
                    # A timestamp without an offset is assumed to be UTC; a
                    # naive value cannot be subtracted from an aware "now".
                    dt = dt.replace(tzinfo=timezone.utc)
                age = now_utc() - dt
                if age > timedelta(minutes=HEARTBEAT_MAX_AGE_MIN):
                    add_result(results, f"{name} - heartbeat age", "WARNING", False, f"Heartbeat too old: {age}")
                else:
                    add_result(results, f"{name} - heartbeat age", "WARNING", True, f"Heartbeat age OK: {age}")
        else:
            add_result(results, f"{name} - heartbeat present", "WARNING", False, "last_heartbeat is missing")

    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")


# --- 3. Node Agents / Core DAOS Node Core ------------------------------------

def check_node_agents(base_url: str, node_id: str, results: List[CheckResult]):
    """Check the guardian/steward agents and the core agent kinds of a node.

    Steps:
      1. GET /public/nodes/{node_id}; the "guardian_agent" and
         "steward_agent" fields must both be non-empty (CRITICAL each).
      2. GET /public/agents?node_id=...; every kind in NODE_CORE_AGENT_KINDS
         is reported as present or missing.

    A non-200 answer in step 2 is recorded as a failed INFO check so the
    skipped verification is visible in the report. A non-200 answer in step 1
    or any exception is recorded as a CRITICAL failure.
    """
    name = f"node agents: {node_id}"
    try:
        r = http_get(base_url, f"/public/nodes/{node_id}", timeout=5)
        if r.status_code != 200:
            add_result(results, name, "CRITICAL", False, f"HTTP {r.status_code}: {r.text[:200]}")
            return

        data = r.json()
        guardian = data.get("guardian_agent")
        steward = data.get("steward_agent")

        for label, agent in (("guardian", guardian), ("steward", steward)):
            present = bool(agent)
            state = "present" if present else "missing"
            add_result(results, f"{name} - {label}", "CRITICAL", present, f"{label.capitalize()} {state}")

        # Check all agents via /public/agents?node_id=...
        r_agents = http_get(base_url, f"/public/agents?node_id={node_id}", timeout=5)
        if r_agents.status_code != 200:
            add_result(
                results,
                f"{name} - core kinds",
                "INFO",
                False,
                f"HTTP {r_agents.status_code}: cannot verify core agent kinds",
            )
            return

        agents_data = r_agents.json()
        if isinstance(agents_data, dict):
            # "items" may be absent or null; both mean "no agents listed".
            agents_list = agents_data.get("items") or []
        else:
            agents_list = agents_data or []
        kinds = {a.get("kind") for a in agents_list if isinstance(a, dict)}

        for kind in NODE_CORE_AGENT_KINDS:
            # NOTE(review): archivist_agent is the only kind reported as
            # WARNING, yet the comment on NODE_CORE_AGENT_KINDS calls it
            # non-critical — confirm the intended severities.
            severity = "WARNING" if kind == "archivist_agent" else "INFO"
            found = kind in kinds
            add_result(results, f"{name} - core kind {kind}", severity, found, "present" if found else "missing")

    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")


# --- 4. DAGI Router / Audit --------------------------------------------------

def check_dagi_router(base_url: str, node_id: str, results: List[CheckResult]):
    """Check the DAGI Router summary counters reported for a node.

    Reads the "summary" object of the internal dagi-router/agents endpoint
    and records, in this order:
      - router_total, system_total: CRITICAL, must be present and >= 1;
      - phantom, stale: WARNING, must not exceed PHANTOM_STALE_LIMIT
        (a missing value passes);
      - active: CRITICAL, must be present and >= 1.

    A non-200 status or any exception is recorded as a WARNING failure.
    """
    name = f"DAGI Router: {node_id}"
    try:
        # Internal routes are served under the /city prefix.
        response = http_get(base_url, f"/city/internal/node/{node_id}/dagi-router/agents", timeout=5)
        if response.status_code != 200:
            add_result(results, name, "WARNING", False, f"HTTP {response.status_code}: {response.text[:200]}")
            return

        summary = response.json().get("summary", {})

        def require_positive(field):
            # Passes only when the counter exists and is at least 1.
            value = summary.get(field)
            passed = not (value is None or value < 1)
            add_result(results, f"{name} - {field}", "CRITICAL", passed, f"{field}={value}")

        def require_within_limit(field):
            # Fails only when the counter exists and exceeds the limit.
            value = summary.get(field)
            if value is not None and value > PHANTOM_STALE_LIMIT:
                add_result(results, f"{name} - {field}", "WARNING", False, f"{field}={value} > {PHANTOM_STALE_LIMIT}")
            else:
                add_result(results, f"{name} - {field}", "WARNING", True, f"{field}={value}")

        require_positive("router_total")
        require_positive("system_total")
        require_within_limit("phantom")
        require_within_limit("stale")
        require_positive("active")

    except Exception as exc:
        add_result(results, name, "WARNING", False, f"Exception (skipping): {exc}")


# --- 5. Core Agents & System Prompts -----------------------------------------

def find_agent_by_slug(base_url: str, slug: str) -> Optional[Dict[str, Any]]:
    """Return the public agent whose "slug" or "public_slug" equals slug.

    The agent list is read from GET /public/agents?is_public=true and may be
    either a bare list or a dict holding the list under "items". The lookup is
    best-effort: None is returned when no agent matches, when the request
    fails or answers with a non-200 status, or when the payload is not a list.
    Entries that are not dicts are skipped instead of aborting the search.
    """
    try:
        r = http_get(base_url, "/public/agents?is_public=true", timeout=5)
        if r.status_code != 200:
            return None
        data = r.json()
    except Exception:
        return None

    items = data.get("items") if isinstance(data, dict) else data
    if not isinstance(items, list):
        return None

    for item in items:
        if isinstance(item, dict) and slug in (item.get("slug"), item.get("public_slug")):
            return item
    return None


def get_agent_prompts(base_url: str, agent_id: str) -> Dict[str, Any]:
    """Fetch the prompts payload of an agent, or {} when it cannot be obtained.

    An empty dict is returned for a non-200 status and for any exception.
    """
    try:
        # Per the original author's note, main.py mounts routes_agents under
        # "/api/v1" (without the "/city" prefix), hence this path. That
        # mounting is not visible from this file.
        response = http_get(base_url, f"/api/v1/agents/{agent_id}/prompts", timeout=5)
        return response.json() if response.status_code == 200 else {}
    except Exception:
        return {}


def check_core_agents_prompts(base_url: str, results: List[CheckResult]):
    """Check that every agent in CORE_AGENT_SLUGS has a prompt of kind "core".

    For each slug:
      - agent not found among public agents: failed INFO check;
      - a prompt record with kind == "core" exists: passed WARNING check;
      - no such record: failed WARNING check;
      - unexpected exception: failed WARNING check, so one malformed payload
        cannot abort the whole script.
    """
    for slug in CORE_AGENT_SLUGS:
        name = f"core agent prompts: {slug}"
        try:
            agent = find_agent_by_slug(base_url, slug)
            if not agent:
                add_result(results, name, "INFO", False, "Agent not found by slug (public)")
                continue

            prompts = get_agent_prompts(base_url, agent.get("id"))

            # The payload is either a dict wrapping the records under
            # "prompts" or the records list itself; .get() only exists on the
            # dict form, so check the type before calling it.
            if isinstance(prompts, dict):
                records = prompts.get("prompts") or prompts
            else:
                records = prompts

            # NOTE(review): if the API returns prompts as a dict keyed by
            # kind, this reports "missing" — confirm the payload shape.
            has_core = isinstance(records, list) and any(
                isinstance(p, dict) and p.get("kind") == "core" for p in records
            )

            if has_core:
                add_result(results, name, "WARNING", True, "core prompt present")
            else:
                add_result(results, name, "WARNING", False, "core prompt missing")
        except Exception as e:
            add_result(results, name, "WARNING", False, f"Exception: {e}")


# --- 6. Multimodality (high-level health) ------------------------------------

def check_multimodal_services(base_url: str, results: List[CheckResult]):
    """Probe the STT, OCR and VLM internal health endpoints.

    An HTTP 200 answer is recorded as a passed check with WARNING severity;
    any other status or an exception is recorded as a failed check with INFO
    severity.
    """
    endpoints = {
        "Multimodal STT": "/city/internal/health/stt",
        "Multimodal OCR": "/city/internal/health/ocr",
        "Multimodal VLM": "/city/internal/health/vlm",
    }
    for label, path in endpoints.items():
        name = f"{label} health"
        try:
            response = http_get(base_url, path, timeout=5)
            if response.status_code != 200:
                add_result(results, name, "INFO", False, f"HTTP {response.status_code}")
                continue
            add_result(results, name, "WARNING", True, "OK")
        except Exception as exc:
            add_result(results, name, "INFO", False, f"Exception: {exc}")


# --- 7. Invoke scripts/check-invariants.py -----------------------------------

def run_invariants_script(base_url: str, results: List[CheckResult]):
    """Run check-invariants.py (located next to this script) and record its outcome.

    The script is executed with the current interpreter and the arguments
    "--base-url <base_url> --json". Outcomes recorded:
      - script file missing: failed WARNING check (skipped);
      - non-zero exit code: failed CRITICAL check carrying stdout, or stderr
        when stdout is empty;
      - exit code 0: passed INFO check;
      - exception while launching: failed CRITICAL check.
    """
    name = "check-invariants.py"
    script_path = Path(__file__).parent / "check-invariants.py"

    if not script_path.exists():
        add_result(results, name, "WARNING", False, f"{script_path} not found (skip)")
        return

    try:
        completed = subprocess.run(
            [sys.executable, str(script_path), "--base-url", base_url, "--json"],
            capture_output=True,
            text=True,
        )
        if completed.returncode == 0:
            add_result(results, name, "INFO", True, "check-invariants.py passed")
        else:
            add_result(
                results,
                name,
                "CRITICAL",
                False,
                f"Failed (exit {completed.returncode}): {completed.stdout or completed.stderr}",
            )
    except Exception as exc:
        add_result(results, name, "CRITICAL", False, f"Exception: {exc}")


# --- 8. (Optional) smoke tests -----------------------------------------------

def run_smoke_tests(results: List[CheckResult]):
    """Run tests/test_infra_smoke.py with pytest and record the outcome.

    The test file is looked up relative to the current working directory
    first, then relative to the parent of this script's directory. Outcomes:
      - file not found in either place: failed WARNING check (skipped);
      - non-zero pytest exit code: failed CRITICAL check;
      - exit code 0: passed INFO check;
      - exception while launching: failed CRITICAL check.

    pytest is started as "<current interpreter> -m pytest" so it runs in the
    same environment as this script, rather than whichever "pytest"
    executable happens to be first on PATH.
    """
    name = "pytest tests/test_infra_smoke.py"
    candidates = (
        Path("tests") / "test_infra_smoke.py",
        Path(__file__).parent.parent / "tests" / "test_infra_smoke.py",
    )
    # Use the first existing candidate; otherwise report the last one tried.
    tests_file = next((p for p in candidates if p.exists()), candidates[-1])

    if not tests_file.exists():
        add_result(results, name, "WARNING", False, f"{tests_file} not found (skip)")
        return

    try:
        cmd = [sys.executable, "-m", "pytest", str(tests_file), "-q"]
        proc = subprocess.run(cmd, capture_output=True, text=True)

        if proc.returncode != 0:
            add_result(results, name, "CRITICAL", False, f"Smoke tests failed (exit {proc.returncode})")
        else:
            add_result(results, name, "INFO", True, "Smoke tests passed")

    except Exception as e:
        add_result(results, name, "CRITICAL", False, f"Exception: {e}")


# --- Output formatting -------------------------------------------------------

def summarize_results(results: List[CheckResult]) -> Dict[str, Any]:
    """Aggregate check results into summary counters.

    Returned keys:
      - "total_checks": number of results;
      - "passed": results with a truthy ok;
      - "failed_critical": failed results with severity "CRITICAL";
      - "warnings": failed results with severity "WARNING";
      - "info": passed results with severity "INFO" (failed INFO results are
        counted only in "total_checks");
      - "timestamp": current UTC time in ISO format.
    """
    passed = 0
    failed_critical = 0
    warnings = 0
    info = 0

    for item in results:
        if item.ok:
            passed += 1
            if item.severity == "INFO":
                info += 1
        elif item.severity == "CRITICAL":
            failed_critical += 1
        elif item.severity == "WARNING":
            warnings += 1

    return {
        "total_checks": len(results),
        "passed": passed,
        "failed_critical": failed_critical,
        "warnings": warnings,
        "info": info,
        "timestamp": now_utc().isoformat(),
    }


def print_human(results: List[CheckResult]):
    """Print a human-readable report: one line per check, then the summary counters."""
    summary = summarize_results(results)
    rule = "=" * 60
    # Icon used for a failed check, by severity; anything else gets the cross.
    failure_icons = {"WARNING": "⚠️ ", "INFO": "ℹ️ "}

    print(rule)
    print("DAARION Post-Deploy Check")
    print(rule)
    print(f"Time: {summary['timestamp']}")
    print()
    print("RESULTS:")

    for r in results:
        if r.ok:
            status = "✅"
        else:
            status = failure_icons.get(r.severity, "❌")
        print(f" {status} [{r.severity}] {r.name}: {r.message}")

    print()
    print(rule)
    print("SUMMARY")
    print(rule)
    print(f" Total checks: {summary['total_checks']}")
    print(f" Passed: {summary['passed']}")
    print(f" Warnings: {summary['warnings']}")
    print(f" Failed (crit): {summary['failed_critical']}")


def print_json(results: List[CheckResult]):
    """Print the summary and every result as one indented JSON document.

    Non-ASCII characters are emitted as-is (ensure_ascii=False).
    """
    summary = summarize_results(results)
    serialized = [item.to_dict() for item in results]
    document = {"summary": summary, "results": serialized}
    print(json.dumps(document, indent=2, ensure_ascii=False))


# --- main --------------------------------------------------------------------

def main():
    """CLI entry point: run all checks, print the report, set the exit code.

    Options: --base-url, --nodes, --json, --skip-smoke, --output-file.
    The process exits with 1 when at least one CRITICAL check failed and with
    0 otherwise.

    When --output-file is given the JSON report is written as UTF-8, because
    it is serialized with ensure_ascii=False and check messages contain
    non-ASCII text. With --json, the "report saved" / "error saving" status
    line goes to stderr so that stdout stays a single valid JSON document.
    """
    parser = argparse.ArgumentParser(description="Post-deploy extended checks for DAARION.city")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL, help="Base URL of city-service")
    parser.add_argument(
        "--nodes",
        nargs="*",
        default=DEFAULT_NODES,
        help="Node IDs to check (default: NODE1 + NODE2)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output JSON instead of human-readable text",
    )
    parser.add_argument(
        "--skip-smoke",
        action="store_true",
        help="Skip pytest smoke tests",
    )
    parser.add_argument(
        "--output-file",
        help="Write JSON report to this file",
    )

    args = parser.parse_args()

    results: List[CheckResult] = []
    base_url = args.base_url
    nodes = args.nodes

    # 1. Basic health
    check_city_health(base_url, results)

    # 2. Node Directory / Registry
    check_nodes_api(base_url, results)

    # 3. Nodes
    for node_id in nodes:
        check_node_metrics(base_url, node_id, results)
        check_node_agents(base_url, node_id, results)
        check_dagi_router(base_url, node_id, results)

    # 4. Core agents & prompts
    check_core_agents_prompts(base_url, results)

    # 5. Multimodal
    check_multimodal_services(base_url, results)

    # 6. check-invariants.py
    run_invariants_script(base_url, results)

    # 7. Smoke tests
    if not args.skip_smoke:
        run_smoke_tests(results)

    # Output
    if args.json:
        print_json(results)
    else:
        print_human(results)

    # Save to file
    summary = summarize_results(results)
    if args.output_file:
        # Keep stdout machine-readable in JSON mode.
        status_stream = sys.stderr if args.json else sys.stdout
        try:
            report = {
                "summary": summary,
                "results": [r.to_dict() for r in results],
            }
            path = Path(args.output_file)
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"\nReport saved to: {path.absolute()}", file=status_stream)
        except Exception as e:
            print(f"\nError saving report: {e}", file=status_stream)

    # Any failed CRITICAL check -> exit 1
    sys.exit(1 if summary["failed_critical"] > 0 else 0)


if __name__ == "__main__":
    main()