# microdao-daarion/services/sofiia-console/app/nodes.py

"""Nodes dashboard: aggregate telemetry from all configured nodes."""
import logging
from typing import Any, Dict

from .config import load_nodes_registry
from .monitor import collect_all_nodes

logger = logging.getLogger(__name__)


async def get_nodes_dashboard(router_api_key: str = "") -> Dict[str, Any]:
    """
    Build the payload for GET /api/nodes/dashboard.

    Loads the nodes registry, hands the configured nodes to
    ``collect_all_nodes`` and wraps the per-node results with a small
    summary.

    Per the registry/monitor contract (implemented in ``collect_all_nodes``,
    not here), each node entry is expected to carry:
      - router health (ok, latency)
      - gateway health (ok, latency) — optional
      - monitor agent telemetry (heartbeat, SLO, incidents, backends, artifacts)
    and unreachable nodes are expected to appear with online=false rather
    than raising.

    Args:
        router_api_key: Forwarded unchanged to ``collect_all_nodes``.

    Returns:
        A dict with:
          - "nodes": the list returned by ``collect_all_nodes``
          - "summary": {"total", "online", "router_ok"} counts over that list
          - "defaults": the registry's ``defaults`` section ({} when absent
            or empty)
    """
    default_timeout_sec = 10.0

    # A section that is present but empty (e.g. a bare `defaults:` key)
    # comes back as None rather than missing, so `.get(key, {})` would not
    # apply its default. Normalise to {} to keep this endpoint non-fatal.
    reg = load_nodes_registry() or {}
    nodes_cfg = reg.get("nodes") or {}
    defaults = reg.get("defaults") or {}

    # Test against None explicitly: an explicit 0 must be preserved, while a
    # null value falls back to the default instead of crashing float().
    raw_timeout = defaults.get("health_timeout_sec")
    timeout = default_timeout_sec if raw_timeout is None else float(raw_timeout)

    nodes = await collect_all_nodes(
        nodes_cfg,
        router_api_key=router_api_key,
        timeout_per_node=timeout,
    )

    online_count = sum(1 for n in nodes if n.get("online"))
    router_ok_count = sum(1 for n in nodes if n.get("router_ok"))

    return {
        "nodes": nodes,
        "summary": {
            "total": len(nodes),
            "online": online_count,
            "router_ok": router_ok_count,
        },
        "defaults": defaults,
    }