"""Background poller that aggregates DAARION district/agent health metrics.

Each cycle probes every registered district's health endpoint, combines the
results with registry data and the memory-service vector count, and publishes
the aggregated dashboard (plus one row per district) to Redis with a TTL so
stale data expires on its own if the poller dies.
"""

import asyncio
import json
import logging
import os
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

import httpx
from redis.asyncio import Redis

from .registry_api import _load_crewai_roles, _load_district_registry, _load_registry

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger("daarion-metrics-poller")

# Environment-driven configuration.
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
POLL_INTERVAL_SECONDS = int(os.getenv("DAARION_METRICS_POLL_INTERVAL_SECONDS", "10"))
METRICS_TTL_SECONDS = int(os.getenv("DAARION_METRICS_TTL_SECONDS", "60"))
HTTP_CONNECT_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS", "2"))
HTTP_TOTAL_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS", "5"))
NODES_TOTAL = int(os.getenv("DAARION_NODE_COUNT", "1"))
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")

# Redis keys for the published metrics.
DASHBOARD_KEY = "daarion:metrics:dashboard"
DISTRICT_KEY_PREFIX = "daarion:metrics:district"

# Lazily-created module-level Redis client, reused across poll cycles.
_redis: Optional[Redis] = None


def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _ensure_url(value: str) -> str:
    """Normalize *value* into an absolute URL, defaulting to the https scheme.

    Returns an empty string for blank input.
    """
    value = (value or "").strip()
    if not value:
        return ""
    if value.startswith("http://") or value.startswith("https://"):
        return value
    return f"https://{value}"


def _health_candidates(district: Dict[str, Any]) -> List[str]:
    """Build an ordered, de-duplicated list of health-check URLs for a district.

    An explicit ``health_url`` takes priority, followed by well-known paths
    derived from the district's ``domain``.
    """
    base = _ensure_url(str(district.get("domain") or ""))
    candidates: List[str] = []
    explicit = str(district.get("health_url") or "").strip()
    if explicit:
        candidates.append(_ensure_url(explicit))
    if base:
        candidates.extend(
            [
                f"{base}/.well-known/daarion-health.json",
                f"{base}/health",
                f"{base}/v1/health",
            ]
        )
    # dict.fromkeys preserves insertion order while dropping duplicates;
    # empty strings are filtered out first.
    return list(dict.fromkeys(url for url in candidates if url))


def _extract_agents_online(payload: Dict[str, Any], agents_total: int) -> Optional[int]:
    """Infer the number of online agents from a health payload.

    Accepts either an explicit ``agents_online`` value (bool meaning all/none,
    or an int clamped to ``[0, agents_total]``) or an ``agents`` list whose
    dict entries carry a ``status`` field. Returns ``None`` when the payload
    provides neither, so the caller can apply its own fallback.
    """
    raw = payload.get("agents_online")
    # bool must be checked before int: bool is an int subclass.
    if isinstance(raw, bool):
        return agents_total if raw else 0
    if isinstance(raw, int):
        return max(0, min(raw, agents_total))
    agents = payload.get("agents")
    if isinstance(agents, list):
        count = sum(
            1
            for agent in agents
            if isinstance(agent, dict)
            and str(agent.get("status", "")).lower() in {"online", "active", "ok"}
        )
        return min(count, agents_total)
    return None


async def redis_client() -> Redis:
    """Return the shared Redis client, creating it on first use."""
    global _redis
    if _redis is None:
        _redis = Redis.from_url(REDIS_URL, decode_responses=True)
    return _redis


async def close_redis() -> None:
    """Close and forget the shared Redis client (no-op when never created)."""
    global _redis
    if _redis is not None:
        await _redis.close()
        _redis = None


async def _fetch_json_with_latency(
    client: httpx.AsyncClient,
    url: str,
) -> Tuple[bool, Optional[Dict[str, Any]], Optional[float], Optional[str]]:
    """GET *url* and return ``(ok, json_dict, latency_ms, error_message)``.

    ``ok`` is True for any status below 400. ``json_dict`` is only populated
    when the body parses to a JSON object; other JSON shapes yield ``None``
    without failing the probe. Network errors are returned as the error
    message, never raised.
    """
    started = time.perf_counter()
    try:
        response = await client.get(url)
        latency_ms = round((time.perf_counter() - started) * 1000, 2)
        if response.status_code >= 400:
            return False, None, latency_ms, f"HTTP {response.status_code}"
        data: Optional[Dict[str, Any]] = None
        try:
            parsed = response.json()
            if isinstance(parsed, dict):
                data = parsed
        except Exception:
            # Non-JSON body on a 2xx/3xx response still counts as "up".
            data = None
        return True, data, latency_ms, None
    except Exception as e:
        latency_ms = round((time.perf_counter() - started) * 1000, 2)
        return False, None, latency_ms, str(e)


async def _read_memory_vectors(client: httpx.AsyncClient) -> int:
    """Best-effort read of the memory-service vector count (0 on any failure)."""
    try:
        ok, payload, _, _ = await _fetch_json_with_latency(client, f"{MEMORY_SERVICE_URL}/health")
        if not ok or not payload:
            return 0
        return int(payload.get("vector_store", {}).get("memories", {}).get("vectors_count", 0) or 0)
    except Exception:
        # Covers malformed payload shapes (e.g. non-dict nesting) as well.
        return 0


async def _registry_snapshot() -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], int, int]:
    """Load registry data for one poll cycle.

    Returns ``(districts, agents_by_district, agents_total, subagents_total)``
    where districts are the registry entries with a ``district_id``, and
    agents without an explicit district default to ``city-core``.
    """
    raw_districts = _load_district_registry().get("districts", [])
    districts = [d for d in raw_districts if isinstance(d, dict) and d.get("district_id")]
    agents_map = _load_registry().get("agents", {})
    role_counts = await _load_crewai_roles()
    by_district: Dict[str, List[Dict[str, Any]]] = {}
    subagents_total = 0
    for aid, cfg in agents_map.items():
        if not isinstance(cfg, dict):
            continue
        aid_str = str(aid)
        district_id = str(cfg.get("district_id") or "city-core")
        subagents_total += int(role_counts.get(aid_str, 0))
        by_district.setdefault(district_id, []).append(
            {
                "agent_id": aid_str,
                "status": str(cfg.get("status", "active")),
            }
        )
    return districts, by_district, len(agents_map), subagents_total


async def _probe_district(
    client: httpx.AsyncClient,
    district: Dict[str, Any],
    members: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Probe one district's health candidates and return its dashboard row.

    Tries each candidate URL in priority order, stopping at the first success.
    On total failure the row carries ``ok=False`` and an ``error`` dict with
    the last failure message.
    """
    district_id = str(district.get("district_id"))
    agents_total_district = len(members)
    sample: Dict[str, Any] = {
        "district_id": district_id,
        "title": district.get("title") or district_id,
        "domain": str(district.get("domain") or ""),
        "status": district.get("status") or "active",
        "ok": False,
        "agents_total": agents_total_district,
        "agents_online": 0,
        "latency_ms": None,
        "last_check_ts": _now_iso(),
        "error": None,
    }
    last_error = "No health endpoint configured"
    for candidate in _health_candidates(district):
        ok, payload, latency_ms, error_message = await _fetch_json_with_latency(client, candidate)
        # Keep the latency of the most recent attempt, even when it failed.
        sample["latency_ms"] = latency_ms
        if ok:
            sample["ok"] = True
            sample["error"] = None
            inferred = _extract_agents_online(payload or {}, agents_total_district)
            # When the payload gives no agent detail, assume all registered
            # agents for this district are up.
            sample["agents_online"] = inferred if inferred is not None else agents_total_district
            break
        last_error = error_message or "health check failed"
    if not sample["ok"]:
        sample["error"] = {"message": last_error}
    return sample


async def build_dashboard() -> Dict[str, Any]:
    """Assemble the full metrics dashboard payload.

    Returns a dict with a ``global`` summary, a ``by_district`` list (one row
    per registered district, in registry order) and an ``updated_at`` stamp.
    """
    districts, agents_by_district, agents_total, subagents_total = await _registry_snapshot()
    timeout = httpx.Timeout(timeout=HTTP_TOTAL_TIMEOUT_SECONDS, connect=HTTP_CONNECT_TIMEOUT_SECONDS)
    by_district: List[Dict[str, Any]] = []
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        memory_vectors = await _read_memory_vectors(client)
        for district in districts:
            district_id = str(district.get("district_id"))
            members = agents_by_district.get(district_id, [])
            by_district.append(await _probe_district(client, district, members))
    districts_online = sum(1 for row in by_district if row["ok"])
    agents_online_total = sum(int(row.get("agents_online") or 0) for row in by_district if row["ok"])
    return {
        "global": {
            "nodes": NODES_TOTAL,
            "districts": len(districts),
            "agents": agents_total,
            "subagents": subagents_total,
            "memory_vectors": memory_vectors,
            "districts_online": districts_online,
            "agents_online": agents_online_total,
        },
        "by_district": by_district,
        "updated_at": _now_iso(),
    }


async def publish_dashboard(dashboard: Dict[str, Any]) -> None:
    """Write the dashboard and per-district rows to Redis with the metrics TTL.

    All SETs are batched into one pipeline so the cycle costs a single round
    trip instead of one per district.
    """
    redis = await redis_client()
    pipe = redis.pipeline()
    pipe.set(DASHBOARD_KEY, json.dumps(dashboard, ensure_ascii=False), ex=METRICS_TTL_SECONDS)
    for row in dashboard.get("by_district", []):
        district_id = row.get("district_id")
        if not district_id:
            continue
        pipe.set(
            f"{DISTRICT_KEY_PREFIX}:{district_id}",
            json.dumps(row, ensure_ascii=False),
            ex=METRICS_TTL_SECONDS,
        )
    await pipe.execute()


async def run_once() -> None:
    """Build one dashboard snapshot, publish it, and log a summary line."""
    dashboard = await build_dashboard()
    await publish_dashboard(dashboard)
    logger.info(
        "dashboard_updated districts=%s districts_online=%s agents=%s agents_online=%s",
        dashboard["global"].get("districts"),
        dashboard["global"].get("districts_online"),
        dashboard["global"].get("agents"),
        dashboard["global"].get("agents_online"),
    )


async def worker_loop() -> None:
    """Run the poll cycle forever, sleeping out the remainder of each interval.

    Individual cycle failures are logged and do not stop the loop; the
    1-second minimum sleep guards against hot-looping when a cycle overruns
    the interval. ``CancelledError`` is re-raised so shutdown works.
    """
    logger.info(
        "metrics_poller_started interval=%ss ttl=%ss redis=%s",
        POLL_INTERVAL_SECONDS,
        METRICS_TTL_SECONDS,
        REDIS_URL,
    )
    while True:
        started = time.perf_counter()
        try:
            await run_once()
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("metrics_poller_cycle_failed")
        elapsed = time.perf_counter() - started
        await asyncio.sleep(max(1.0, POLL_INTERVAL_SECONDS - elapsed))


async def _main() -> None:
    """Entry point: run the worker loop, closing Redis in the SAME event loop.

    The previous version called ``asyncio.run(close_redis())`` in a second,
    fresh event loop after the first one was torn down — the Redis client was
    bound to the dead loop, so the close failed and was silently swallowed.
    Closing inside one loop's ``finally`` releases the connection cleanly.
    """
    try:
        await worker_loop()
    finally:
        await close_redis()


if __name__ == "__main__":
    asyncio.run(_main())