microdao-daarion/services/node-registry/app/dashboard.py

"""
Node Dashboard API - Aggregator for node status and metrics
"""
import asyncio
import logging
import httpx
import psutil
from typing import Dict, Any, Optional, List
from datetime import datetime

# Module-level logger named after this module
logger = logging.getLogger(__name__)

# Default HTTP probe timeout in seconds (also the shared client's default)
PROBE_TIMEOUT = 0.5
# Longer probe timeout in seconds, passed explicitly by the probes that need it
PROBE_TIMEOUT_LONG = 1.0


class DashboardAggregator:
    """Aggregates data from multiple services for node dashboard.

    Each ``probe_*`` / ``get_*`` coroutine returns a plain dict and reports
    failures through ``status`` / ``error`` fields instead of raising.
    All HTTP requests share a single ``httpx.AsyncClient``; call
    :meth:`close` once the aggregator is no longer needed.
    """

    def __init__(self, node_ip: str = "localhost"):
        """Create an aggregator that probes services on ``node_ip``."""
        self.node_ip = node_ip
        self.client = httpx.AsyncClient(timeout=PROBE_TIMEOUT)

    async def close(self):
        """Close the shared HTTP client."""
        await self.client.aclose()

    @staticmethod
    def _dict_or_empty(value: Any) -> Dict[str, Any]:
        """Return ``value`` if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _dict_items(value: Any) -> List[Dict[str, Any]]:
        """Return the dict elements of ``value`` if it is a list, else ``[]``."""
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    async def _probe(self, url: str, timeout: float = PROBE_TIMEOUT) -> Dict[str, Any]:
        """Execute HTTP probe with timeout.

        Returns:
            ``{"status": "up", "data": ..., "latency_ms": ...}`` on HTTP 200,
            ``{"status": "degraded", "error": ...}`` on any other HTTP status,
            ``{"status": "down", "error": ...}`` when the request itself fails.
            ``data`` is the decoded JSON body, or ``None`` when the body is
            not JSON.
        """
        try:
            resp = await self.client.get(url, timeout=timeout)
            if resp.status_code != 200:
                return {"status": "degraded", "error": f"HTTP {resp.status_code}"}

            # A 200 response means the service is up even if its body is not
            # JSON (some readiness endpoints answer with plain text), so a
            # decode failure must not turn the probe into "down".
            try:
                data = resp.json()
            except ValueError:
                data = None

            return {
                "status": "up",
                "data": data,
                "latency_ms": int(resp.elapsed.total_seconds() * 1000),
            }
        except httpx.TimeoutException:
            return {"status": "down", "error": "timeout"}
        except httpx.ConnectError:
            return {"status": "down", "error": "connection refused"}
        except Exception as e:
            return {"status": "down", "error": str(e)}

    @staticmethod
    def _query_gpus() -> List[Dict[str, Any]]:
        """Query ``nvidia-smi`` for per-GPU stats; return ``[]`` if unavailable."""
        import subprocess

        try:
            proc = subprocess.run(
                ['nvidia-smi', '--query-gpu=name,memory.total,memory.used,utilization.gpu', '--format=csv,noheader,nounits'],
                capture_output=True, text=True, timeout=2
            )
        except (OSError, subprocess.SubprocessError) as e:
            # Binary missing, not executable, or timed out: GPU info is
            # best-effort, so report no GPUs instead of failing the metrics.
            logger.debug("nvidia-smi unavailable: %s", e)
            return []

        if proc.returncode != 0:
            return []

        gpus = []
        for line in proc.stdout.strip().split('\n'):
            parts = [p.strip() for p in line.split(',')]
            if len(parts) < 4:
                continue
            try:
                gpus.append({
                    "name": parts[0],
                    "vram_gb": round(float(parts[1]) / 1024, 1),
                    "used_gb": round(float(parts[2]) / 1024, 1),
                    "sm_util_pct": int(parts[3])
                })
            except ValueError:
                # A non-numeric field on one row must not drop the other GPUs.
                logger.debug("Skipping unparsable nvidia-smi row: %r", line)
        return gpus

    @staticmethod
    def _collect_infra_metrics() -> Dict[str, Any]:
        """Collect CPU, RAM, disk and GPU metrics (blocking)."""
        cpu_pct = psutil.cpu_percent(interval=0.1)
        mem = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        return {
            "cpu_usage_pct": round(cpu_pct, 1),
            "ram": {
                "total_gb": round(mem.total / (1024**3), 1),
                "used_gb": round(mem.used / (1024**3), 1)
            },
            "disk": {
                "total_gb": round(disk.total / (1024**3), 1),
                "used_gb": round(disk.used / (1024**3), 1)
            },
            "gpus": DashboardAggregator._query_gpus()
        }

    async def get_infra_metrics(self) -> Dict[str, Any]:
        """Get infrastructure metrics using psutil.

        The collection blocks (CPU sampling interval plus an ``nvidia-smi``
        subprocess with a 2 s timeout), so it runs in the default executor
        to keep the event loop free for the HTTP probes.

        Returns:
            Dict with ``cpu_usage_pct``, ``ram``, ``disk`` and ``gpus``;
            zeroed values and an empty GPU list if collection fails.
        """
        try:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, self._collect_infra_metrics)
        except Exception as e:
            logger.error("Failed to get infra metrics: %s", e)
            return {
                "cpu_usage_pct": 0,
                "ram": {"total_gb": 0, "used_gb": 0},
                "disk": {"total_gb": 0, "used_gb": 0},
                "gpus": []
            }

    async def probe_swapper(self, port: int = 8890) -> Dict[str, Any]:
        """Probe Swapper service (``/health`` and ``/models``)."""
        base_url = f"http://{self.node_ip}:{port}"

        # The two endpoints are independent, so query them concurrently.
        health_result, models_result = await asyncio.gather(
            self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG),
            self._probe(f"{base_url}/models", PROBE_TIMEOUT_LONG),
        )

        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": health_result.get("latency_ms", 0),
            "storage": {"total_gb": 0, "used_gb": 0, "free_gb": 0},
            "models": []
        }

        if health_result.get("status") == "up":
            data = self._dict_or_empty(health_result.get("data"))
            result["active_model"] = data.get("active_model")
            result["mode"] = data.get("mode")

        if models_result.get("status") == "up":
            data = self._dict_or_empty(models_result.get("data"))
            result["models"] = [
                {
                    "name": m.get("name"),
                    "size_gb": m.get("size_gb", 0),
                    "device": m.get("device", "disk"),
                    "state": m.get("status", "unloaded")
                }
                for m in self._dict_items(data.get("models"))
            ]

        return result

    async def probe_router(self, port: int = 9102) -> Dict[str, Any]:
        """Probe DAGI Router service (``/health`` and ``/backends/status``)."""
        base_url = f"http://{self.node_ip}:{port}"

        health_result, backends_result = await asyncio.gather(
            self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG),
            self._probe(f"{base_url}/backends/status", PROBE_TIMEOUT_LONG),
        )

        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "version": "unknown",
            "backends": [],
            "metrics": {
                "requests_1m": 0,
                "requests_1h": 0,
                "error_rate_1h": 0,
                "avg_latency_ms_1h": 0
            }
        }

        if health_result.get("status") == "up":
            data = self._dict_or_empty(health_result.get("data"))
            result["version"] = data.get("version", "unknown")
            result["nats_connected"] = data.get("nats_connected", False)

        if backends_result.get("status") == "up":
            result["backends"] = [
                {
                    "name": backend.get("name"),
                    "status": backend.get("status"),
                    "latency_ms": backend.get("latency_ms", 0),
                    "error": backend.get("error")
                }
                for backend in self._dict_items(backends_result.get("data"))
            ]

        return result

    async def probe_service(self, name: str, port: int, health_path: str = "/health") -> Dict[str, Any]:
        """Probe generic AI service.

        ``name`` is accepted for the caller's bookkeeping; only ``port`` and
        ``health_path`` influence the request.
        """
        base_url = f"http://{self.node_ip}:{port}"

        result = await self._probe(f"{base_url}{health_path}")

        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "error": result.get("error")
        }

    async def probe_ollama(self, port: int = 11434) -> Dict[str, Any]:
        """Probe Ollama service and list up to 10 model names from ``/api/tags``."""
        base_url = f"http://{self.node_ip}:{port}"

        result = await self._probe(f"{base_url}/api/tags", PROBE_TIMEOUT_LONG)

        models = []
        if result.get("status") == "up":
            data = self._dict_or_empty(result.get("data"))
            models = [m.get("name") for m in self._dict_items(data.get("models"))]

        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "models": models[:10],  # Limit to 10 models
            "error": result.get("error")
        }

    async def probe_matrix(self, synapse_port: int = 8018, presence_port: int = 8085) -> Dict[str, Any]:
        """Probe Matrix services (Synapse and the presence bridge)."""
        synapse_result, presence_result = await asyncio.gather(
            self._probe(f"http://{self.node_ip}:{synapse_port}/_matrix/client/versions"),
            self._probe(f"http://{self.node_ip}:{presence_port}/health"),
        )

        return {
            "enabled": synapse_result.get("status") == "up",
            "homeserver": f"http://{self.node_ip}:{synapse_port}",
            "synapse": {
                "status": synapse_result.get("status", "unknown"),
                "latency_ms": synapse_result.get("latency_ms", 0)
            },
            "presence_bridge": {
                "status": presence_result.get("status", "unknown"),
                "latency_ms": presence_result.get("latency_ms", 0)
            }
        }

    async def probe_monitoring(self, prometheus_port: int = 9090, grafana_port: int = 3001) -> Dict[str, Any]:
        """Probe monitoring services (Prometheus and Grafana).

        Loki is not probed; its status is always reported as ``"unknown"``.
        """
        prometheus_result, grafana_result = await asyncio.gather(
            self._probe(f"http://{self.node_ip}:{prometheus_port}/-/ready"),
            self._probe(f"http://{self.node_ip}:{grafana_port}/api/health"),
        )

        return {
            "prometheus": {
                "url": f"http://{self.node_ip}:{prometheus_port}",
                "status": prometheus_result.get("status", "unknown")
            },
            "grafana": {
                "url": f"http://{self.node_ip}:{grafana_port}",
                "status": grafana_result.get("status", "unknown")
            },
            "logging": {
                "loki": {"status": "unknown"}
            }
        }

    async def get_agents_summary(self, city_service_port: int = 7001) -> Dict[str, Any]:
        """Get agents summary from city service.

        Returns:
            Dict with ``total``, ``running`` (agents whose status is
            ``online`` or ``busy``), ``by_kind`` counts and ``top`` (the
            first five running agents). All-zero/empty if the service is
            not up or its payload is not a list of agent objects.
        """
        # City service uses /city/agents endpoint
        result = await self._probe(f"http://{self.node_ip}:{city_service_port}/city/agents", PROBE_TIMEOUT_LONG)

        summary = {
            "total": 0,
            "running": 0,
            "by_kind": {},
            "top": []
        }

        if result.get("status") != "up":
            return summary

        agents = self._dict_items(result.get("data"))
        summary["total"] = len(agents)

        by_kind = summary["by_kind"]
        for agent in agents:
            kind = agent.get("kind", "unknown")
            by_kind[kind] = by_kind.get(kind, 0) + 1

        running = [a for a in agents if a.get("status") in ("online", "busy")]
        summary["running"] = len(running)

        # Top 5 agents
        summary["top"] = [
            {
                "agent_id": agent.get("id"),
                "display_name": agent.get("display_name"),
                "kind": agent.get("kind"),
                "status": agent.get("status"),
                "node_id": agent.get("node_id")
            }
            for agent in running[:5]
        ]

        return summary


async def build_dashboard(node_profile: Dict[str, Any], node_ip: str = "localhost") -> Dict[str, Any]:
    """
    Build complete dashboard from node profile.

    Probes are selected from the profile's ``modules`` list (entries with
    both ``id`` and ``port``) and executed concurrently; a probe that raises
    is reported as ``{"status": "error", "error": ...}`` without affecting
    the others. The aggregator's HTTP client is always closed on exit.

    Args:
        node_profile: Node profile from registry (with modules, gpu, roles)
        node_ip: IP address to probe services

    Returns:
        Complete dashboard JSON
    """
    aggregator = DashboardAggregator(node_ip)

    try:
        # Build module port map; skip malformed entries (no id / no port)
        # instead of failing the whole dashboard on one bad module record.
        module_ports = {}
        for module in node_profile.get("modules") or []:
            if not isinstance(module, dict):
                continue
            if module.get("port") and module.get("id") is not None:
                module_ports[module["id"]] = module["port"]

        # Probe coroutines keyed by result name
        tasks = {
            "infra": aggregator.get_infra_metrics(),
        }

        # Add probes based on modules
        if "ai.swapper" in module_ports:
            tasks["swapper"] = aggregator.probe_swapper(module_ports["ai.swapper"])

        if "ai.router" in module_ports:
            tasks["router"] = aggregator.probe_router(module_ports["ai.router"])

        if "ai.ollama" in module_ports:
            tasks["ollama"] = aggregator.probe_ollama(module_ports["ai.ollama"])

        # Generic AI services
        ai_services = ["ai.stt", "ai.tts", "ai.ocr", "ai.memory", "ai.crewai"]
        for svc in ai_services:
            if svc in module_ports:
                svc_name = svc.replace("ai.", "")
                tasks[f"svc_{svc_name}"] = aggregator.probe_service(svc_name, module_ports[svc])

        # Matrix
        synapse_port = module_ports.get("matrix.synapse", 8018)
        presence_port = module_ports.get("matrix.presence", 8085)
        if "matrix.synapse" in module_ports or "matrix.presence" in module_ports:
            tasks["matrix"] = aggregator.probe_matrix(synapse_port, presence_port)

        # Monitoring (always probed; Grafana uses the aggregator's default port)
        prometheus_port = module_ports.get("monitoring.prometheus", 9090)
        tasks["monitoring"] = aggregator.probe_monitoring(prometheus_port)

        # Agents
        city_port = module_ports.get("daarion.city", 7001)
        if "daarion.city" in module_ports or "daarion.agents" in module_ports:
            tasks["agents"] = aggregator.get_agents_summary(city_port)

        # Execute all probes in parallel. return_exceptions=True keeps one
        # failing probe from cancelling or hiding the results of the others.
        names = list(tasks)
        outcomes = await asyncio.gather(*tasks.values(), return_exceptions=True)

        results = {}
        for name, outcome in zip(names, outcomes):
            if isinstance(outcome, BaseException):
                logger.error("Probe %s failed: %s", name, outcome)
                results[name] = {"status": "error", "error": str(outcome)}
            else:
                results[name] = outcome

        # Build dashboard response
        dashboard = {
            "node": {
                "node_id": node_profile.get("node_id"),
                "name": node_profile.get("name"),
                "roles": node_profile.get("roles", []),
                "status": node_profile.get("status", "unknown"),
                "public_hostname": node_profile.get("ip_address"),
                "environment": node_profile.get("role", "production"),
                "gpu": node_profile.get("gpu"),
                "modules": node_profile.get("modules", []),
                "version": node_profile.get("version", "1.0.0")
            },
            "infra": results.get("infra", {}),
            "ai": {
                "swapper": results.get("swapper", {"status": "not_installed"}),
                "router": results.get("router", {"status": "not_installed"}),
                "ollama": results.get("ollama", {"status": "not_installed"}),
                "services": {}
            },
            "agents": results.get("agents", {"total": 0, "running": 0, "by_kind": {}, "top": []}),
            "matrix": results.get("matrix", {"enabled": False}),
            "monitoring": results.get("monitoring", {})
        }

        # Add AI services (strip only the leading "svc_" marker)
        prefix = "svc_"
        for key, value in results.items():
            if key.startswith(prefix):
                dashboard["ai"]["services"][key[len(prefix):]] = value

        return dashboard

    finally:
        await aggregator.close()