"""Nodes dashboard: aggregate telemetry from all configured nodes.""" import logging from typing import Any, Dict from .config import load_nodes_registry from .monitor import collect_all_nodes logger = logging.getLogger(__name__) async def get_nodes_dashboard(router_api_key: str = "") -> Dict[str, Any]: """ GET /api/nodes/dashboard For each node in nodes_registry.yml, collects: - router health (ok, latency) - gateway health (ok, latency) — optional - monitor agent telemetry (heartbeat, SLO, incidents, backends, artifacts) All probes run in parallel with per-node timeout. Non-fatal: unreachable nodes appear with online=false. """ reg = load_nodes_registry() nodes_cfg = reg.get("nodes", {}) defaults = reg.get("defaults", {}) timeout = float(defaults.get("health_timeout_sec", 10)) nodes = await collect_all_nodes( nodes_cfg, router_api_key=router_api_key, timeout_per_node=timeout, ) online_count = sum(1 for n in nodes if n.get("online")) router_ok_count = sum(1 for n in nodes if n.get("router_ok")) return { "nodes": nodes, "summary": { "total": len(nodes), "online": online_count, "router_ok": router_ok_count, }, "defaults": defaults, }