NCS (services/node-capabilities/metrics.py): - NodeLoad: inflight_jobs, queue_depth, concurrency_limit, estimated_wait_ms, cpu_load_1m, mem_pressure (macOS + Linux), rtt_ms_to_hub - RuntimeLoad: per-runtime healthy, p50_ms, p95_ms from rolling 50-sample window - POST /capabilities/report_latency for node-worker → NCS reporting - NCS fetches worker metrics via NODE_WORKER_URL Node Worker: - GET /metrics endpoint (inflight, concurrency, latency buffers) - Latency tracking per job type (llm/vision) with rolling buffer - Fire-and-forget latency reporting to NCS after each successful job Router (model_select v3): - score_candidate(): wait + model_latency + cross_node_penalty + prefer_bonus - LOCAL_THRESHOLD_MS=250: prefer local if within threshold of remote - ModelSelection.score field for observability - Structured [score] logs with chosen node, model, and score breakdown Tests: 19 new (12 scoring + 7 NCS metrics), 36 total pass Docs: ops/runbook_p3_1.md, ops/CHANGELOG_FABRIC.md No breaking changes to JobRequest/JobResponse or capabilities schema. Made-with: Cursor
165 lines
5.4 KiB
Python
165 lines
5.4 KiB
Python
"""Runtime health and load metrics for NCS capabilities payload."""
|
|
import logging
|
|
import os
|
|
import platform
|
|
import subprocess
|
|
import time
|
|
from collections import deque
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
import httpx
|
|
|
|
# Module-level logger for NCS metrics collection.
logger = logging.getLogger("ncs-metrics")

# Base URL of the co-located node-worker; overridable via environment for tests/deploys.
NODE_WORKER_URL = os.getenv("NODE_WORKER_URL", "http://127.0.0.1:8109")

# Rolling latency samples, bounded per key by deque(maxlen=LATENCY_BUFFER_SIZE)
# in record_latency(); read by get_latency_stats().
_latency_buffer: Dict[str, deque] = {}  # key: "runtime:type" → deque of (latency_ms, ts)

# Maximum samples retained per (runtime, request-type) key.
LATENCY_BUFFER_SIZE = 50
|
|
|
|
|
|
def record_latency(runtime: str, req_type: str, latency_ms: int):
    """Append one latency sample (ms, wall-clock ts) to the rolling window for runtime:req_type."""
    bucket_key = f"{runtime}:{req_type}"
    if bucket_key not in _latency_buffer:
        # Bounded deque: oldest samples fall off automatically once full.
        _latency_buffer[bucket_key] = deque(maxlen=LATENCY_BUFFER_SIZE)
    _latency_buffer[bucket_key].append((latency_ms, time.time()))
|
|
|
|
|
|
def _percentile(values: List[int], p: float) -> int:
|
|
if not values:
|
|
return 0
|
|
s = sorted(values)
|
|
idx = int(len(s) * p / 100)
|
|
return s[min(idx, len(s) - 1)]
|
|
|
|
|
|
def get_latency_stats(runtime: str, req_type: str) -> Dict[str, Optional[int]]:
    """Return rolling p50/p95 latency (ms) and sample count for runtime:req_type.

    Only samples recorded within the last 10 minutes are counted; p50_ms and
    p95_ms are None (with samples=0) when no recent samples exist.
    """
    buf = _latency_buffer.get(f"{runtime}:{req_type}")
    # `not buf` covers both a missing key and an empty deque; the original
    # extra `len(buf) == 0` check was redundant.
    if not buf:
        return {"p50_ms": None, "p95_ms": None, "samples": 0}
    window_start = time.time() - 600  # 10-minute freshness window
    recent = [lat for lat, ts in buf if ts > window_start]
    if not recent:
        return {"p50_ms": None, "p95_ms": None, "samples": 0}
    return {
        "p50_ms": _percentile(recent, 50),
        "p95_ms": _percentile(recent, 95),
        "samples": len(recent),
    }
|
|
|
|
|
|
async def fetch_worker_metrics() -> Dict[str, Any]:
    """Fetch inflight/concurrency from the local node-worker /metrics endpoint.

    Returns safe defaults (idle node, concurrency 1) when the worker is
    unreachable, slow (>2s), or responds non-200 — best-effort by design.
    """
    fallback: Dict[str, Any] = {
        "inflight_jobs": 0,
        "concurrency_limit": 1,
        "queue_depth": 0,
        "last_latencies_llm": [],
        "last_latencies_vision": [],
    }
    try:
        async with httpx.AsyncClient(timeout=2) as client:
            resp = await client.get(f"{NODE_WORKER_URL}/metrics")
            if resp.status_code == 200:
                return resp.json()
    except Exception as e:
        # Degrade quietly: an absent worker is an expected condition.
        logger.debug(f"Node-worker metrics unavailable: {e}")
    return fallback
|
|
|
|
|
|
def get_cpu_load() -> Optional[float]:
    """1-minute load average rounded to 2 decimals; None where unsupported (e.g. Windows)."""
    try:
        one_minute_avg = os.getloadavg()[0]
    except (OSError, AttributeError):
        # getloadavg is missing on some platforms and may fail at runtime.
        return None
    return round(one_minute_avg, 2)
|
|
|
|
|
|
def get_mem_pressure() -> Optional[str]:
    """Coarse memory-pressure level, or None when it cannot be determined.

    macOS: parses `memory_pressure -Q` system-wide level (critical/warn/normal
    → "critical"/"high"/"low"); if that tool fails but `vm_stat` runs, reports
    "low". Linux: buckets MemAvailable/MemTotal from /proc/meminfo into
    "critical"/"high"/"medium"/"low". Other platforms: None.
    """
    system = platform.system()

    if system == "Darwin":
        try:
            report = subprocess.check_output(
                ["memory_pressure", "-Q"], timeout=2, stderr=subprocess.DEVNULL
            ).decode()
        except Exception:
            # memory_pressure unavailable — if vm_stat at least runs, assume "low".
            try:
                subprocess.check_output(
                    ["vm_stat"], timeout=2, stderr=subprocess.DEVNULL
                ).decode()
                return "low"
            except Exception:
                return None
        for raw_line in report.splitlines():
            lowered = raw_line.lower()
            if "system-wide" in lowered and "level" in lowered:
                if "critical" in lowered:
                    return "critical"
                if "warn" in lowered:
                    return "high"
                if "normal" in lowered:
                    return "low"
        # No recognizable level line: treat as unpressured.
        return "low"

    if system == "Linux":
        try:
            meminfo: Dict[str, int] = {}
            with open("/proc/meminfo") as fh:
                for raw_line in fh:
                    pieces = raw_line.split(":")
                    if len(pieces) == 2:
                        # Values look like "16384 kB" — keep the numeric part.
                        meminfo[pieces[0].strip()] = int(pieces[1].strip().split()[0])
            total_kb = meminfo.get("MemTotal", 1)
            avail_kb = meminfo.get("MemAvailable", total_kb)
            available_ratio = avail_kb / total_kb
        except Exception:
            return None
        if available_ratio < 0.05:
            return "critical"
        if available_ratio < 0.15:
            return "high"
        if available_ratio < 0.30:
            return "medium"
        return "low"

    return None
|
|
|
|
|
|
async def build_node_load(worker_metrics: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Build the NodeLoad object for the capabilities payload.

    Uses *worker_metrics* when a truthy dict is supplied; otherwise polls the
    local node-worker. estimated_wait_ms is a queueing heuristic: each job
    beyond the concurrency limit waits roughly one p50 LLM latency
    (1500 ms default when no recent samples exist).
    """
    metrics = worker_metrics or await fetch_worker_metrics()

    active_jobs = metrics.get("inflight_jobs", 0)
    job_limit = metrics.get("concurrency_limit", 1)
    depth = metrics.get("queue_depth", 0)

    # Fall back to 1500 ms when there are no recent "ollama" LLM samples.
    p50_llm = get_latency_stats("ollama", "llm")["p50_ms"] or 1500

    if active_jobs < job_limit:
        wait_ms = 0
    else:
        wait_ms = (active_jobs - job_limit + 1) * p50_llm

    return {
        "ts": int(time.time() * 1000),
        "inflight_jobs": active_jobs,
        "queue_depth": depth,
        "concurrency_limit": job_limit,
        "estimated_wait_ms": wait_ms,
        "cpu_load_1m": get_cpu_load(),
        "mem_pressure": get_mem_pressure(),
        # NOTE(review): RTT to hub is not measured here — presumably filled in
        # by a separate probe; confirm with the caller.
        "rtt_ms_to_hub": None,
    }
|
|
|
|
|
|
async def build_runtime_load(runtimes: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the RuntimeLoad list from collected runtimes.

    For each runtime, reports health (status == "ok") and the p50/p95 of
    whichever request type — llm or vision — has more recent samples.
    """
    loads: List[Dict[str, Any]] = []
    for name, data in runtimes.items():
        llm = get_latency_stats(name, "llm")
        vision = get_latency_stats(name, "vision")
        # Prefer the request type with the larger recent-sample count.
        chosen = vision if vision["samples"] > llm["samples"] else llm

        loads.append({
            "runtime": name,
            "healthy": data.get("status", "unknown") == "ok",
            "last_check_ms": int(time.time() * 1000),
            "p50_ms": chosen["p50_ms"],
            "p95_ms": chosen["p95_ms"],
        })
    return loads
|