2026-03-05 10:38:18 -08:00
5 changed files with 575 additions and 62 deletions
--- a/docker-compose.node1.yml
+++ b/docker-compose.node1.yml
@@ -27,7 +27,7 @@ services:
      # Provider keys are injected from the deploy environment / .env file.
      # Never commit key values; any key that was committed must be rotated.
      - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
      - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
      - COHERE_API_KEY=${COHERE_API_KEY:-}
-      - GROK_API_KEY=xai-69zEnDse8qRuQyZATs9jVKgfwdyvkHzgEVrTbV0OTAurZqsjHmvGepXG6H9GhVRYEC7E4NFl6iZeG0ww
+      - GROK_API_KEY=${GROK_API_KEY:-}
      - VISION_ENCODER_URL=http://vision-encoder:8001
      - SWAPPER_SERVICE_URL=http://swapper-service:8890
      - IMAGE_GEN_URL=http://swapper-service:8890/image/generate
@@ -35,12 +35,22 @@ services:
      - STT_SERVICE_UPLOAD_URL=http://swapper-service:8890/stt
      - OCR_SERVICE_URL=http://swapper-service:8890
      - WEB_SEARCH_SERVICE_URL=http://swapper-service:8890
      - REDIS_URL=redis://redis:6379/0
      - CREWAI_SERVICE_URL=http://dagi-staging-crewai-service:9010
      - NATURE_ID_URL=http://plant-vision-node1:8085
      - NATURE_ID_MIN_CONFIDENCE=0.65
      - PLANTNET_API_KEY=${PLANTNET_API_KEY}
      - ONEOK_CRM_BASE_URL=http://oneok-crm-adapter:8088
      - ONEOK_CALC_BASE_URL=http://oneok-calc-adapter:8089
      - ONEOK_DOCS_BASE_URL=http://oneok-docs-adapter:8090
      - ONEOK_SCHEDULE_BASE_URL=http://oneok-schedule-adapter:8091
      - ONEOK_ADAPTER_API_KEY=${ONEOK_ADAPTER_API_KEY}
      - ROUTER_TOOL_MAX_ROUNDS=${ROUTER_TOOL_MAX_ROUNDS:-10}
      - AGROMATRIX_REVIEW_AUTH_MODE=${AGROMATRIX_REVIEW_AUTH_MODE:-bearer}
      - AGROMATRIX_REVIEW_BEARER_TOKENS=${AGROMATRIX_REVIEW_BEARER_TOKENS}
      # ── Node Capabilities (multi-node model selection) ──
      - NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
      - ENABLE_GLOBAL_CAPS_NATS=true
    volumes:
      - ${DEPLOY_ROOT:-.}/services/router/router_config.yaml:/app/router_config.yaml:ro
      - ${DEPLOY_ROOT:-.}/services/router/router-config.yml:/app/router-config.yml:ro
@@ -77,7 +87,7 @@ services:
      - CUDA_VISIBLE_DEVICES=0
      - CRAWL4AI_URL=http://crawl4ai:11235
      # Cloud API keys for video/image generation
-      - GROK_API_KEY=xai-69zEnDse8qRuQyZATs9jVKgfwdyvkHzgEVrTbV0OTAurZqsjHmvGepXG6H9GhVRYEC7E4NFl6iZeG0ww
+      - GROK_API_KEY=${GROK_API_KEY:-}
      - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
    volumes:
      - ${DEPLOY_ROOT:-.}/services/swapper-service/config/swapper_config_node1.yaml:/app/config/swapper_config.yaml:ro
@@ -106,6 +116,28 @@ services:
  # Image generation is now integrated into the Swapper Service (lazy loading)
  # Endpoint: POST /image/generate on swapper-service:8890
  # Plant Vision wrapper (local nature-id CLI -> HTTP)
  # Built from ./services/plant-vision-node1. The router reaches it through
  # NATURE_ID_URL=http://plant-vision-node1:8085; no host port is published.
  plant-vision-node1:
    build:
      context: ./services/plant-vision-node1
      dockerfile: Dockerfile
    container_name: plant-vision-node1
    environment:
      # Command template for the nature-id CLI; overridable via NATURE_ID_CMD.
      # NOTE(review): the default value contains `{image_path}` immediately
      # followed by the closing `}` of the interpolation — confirm that the
      # Compose version in use parses the nested braces as intended.
      - NATURE_ID_CMD=${NATURE_ID_CMD:-python /opt/nature-id/nature_id.py -m plants -l -r 5 -s {image_path}}
      # presumably seconds — TODO confirm against the service code
      - NATURE_ID_TIMEOUT=40
      - DOWNLOAD_TIMEOUT=20
    networks:
      - dagi-network
    volumes:
      # nature-id checkout is mounted read-only at the path used by NATURE_ID_CMD
      - ${DEPLOY_ROOT:-.}/third_party/nature-id:/opt/nature-id:ro
    restart: unless-stopped
    healthcheck:
      # Succeeds when GET http://localhost:8085/health can be opened from inside the container
      test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8085/health')\""]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s
  # Crawl4AI - Advanced Web Crawler with JavaScript support
  crawl4ai:
    image: unclecode/crawl4ai@sha256:4d8b065bf185962733cb5f9701f4122d03383fa1ab6b5f6a9873f04fa0416a84
@@ -134,7 +166,11 @@ services:
    ports:
      - "9300:9300"
    environment:
-      - ROUTER_URL=http://router:8000
+      - ROUTER_URL=${ROUTER_URL:-http://dagi-staging-router:8000}
      - GATEWAY_MAX_TOKENS_CONCISE=350
      - GATEWAY_MAX_TOKENS_SENPAI_DEFAULT=700
      - GATEWAY_MAX_TOKENS_DEFAULT=700
      - GATEWAY_MAX_TOKENS_DETAILED=1200
      - SERVICE_ID=gateway
      - SERVICE_ROLE=gateway
      - BRAND_INTAKE_URL=http://brand-intake:9211
@@ -191,12 +227,25 @@ services:
      - STT_SERVICE_UPLOAD_URL=http://swapper-service:8890/stt
      - OCR_SERVICE_URL=http://swapper-service:8890
      - WEB_SEARCH_SERVICE_URL=http://swapper-service:8890
      - REDIS_URL=redis://redis:6379/0
      - CREWAI_SERVICE_URL=http://dagi-staging-crewai-service:9010
      - AGROMATRIX_REVIEW_AUTH_MODE=${AGROMATRIX_REVIEW_AUTH_MODE:-bearer}
      - AGROMATRIX_REVIEW_BEARER_TOKENS=${AGROMATRIX_REVIEW_BEARER_TOKENS}
      # v4.3 FarmOS integration (fail-closed: if empty, the agent reports "not configured")
      - FARMOS_BASE_URL=http://dagi-farmos-node1
      - FARMOS_TOKEN=${FARMOS_TOKEN:-}
      - FARMOS_USER=${FARMOS_USER:-}
      - FARMOS_PASS=${FARMOS_PASS:-}
      - FARMOS_CLIENT_ID=${FARMOS_CLIENT_ID:-farm}
    env_file:
      - .env.stepan.node1
    volumes:
      - ${DEPLOY_ROOT:-.}/gateway-bot:/app/gateway-bot:ro
      - ${DEPLOY_ROOT:-.}/logs:/app/logs
    depends_on:
      - router
      - memory-service
      - redis
    networks:
      - dagi-network
    restart: unless-stopped
@@ -207,6 +256,107 @@ services:
      retries: 3
      start_period: 10s
  # Background worker built from the gateway-bot image; runs
  # `python -m daarion_facade.worker` instead of the image's default command.
  gateway-worker:
    build:
      context: ./gateway-bot
      dockerfile: Dockerfile
    container_name: dagi-gateway-worker-node1
    command: ["python", "-m", "daarion_facade.worker"]
    environment:
      # NOTE(review): the gateway service's ROUTER_URL now defaults to
      # http://dagi-staging-router:8000 — confirm `router` is the intended host here.
      - ROUTER_BASE_URL=http://router:8000
      - REDIS_URL=redis://redis:6379/0
      # presumably seconds — TODO confirm
      - ROUTER_WORKER_TIMEOUT=60
    volumes:
      # Source tree is bind-mounted read-only; logs directory is writable
      - ${DEPLOY_ROOT:-.}/gateway-bot:/app/gateway-bot:ro
      - ${DEPLOY_ROOT:-.}/logs:/app/logs
    depends_on:
      - router
      - redis
    networks:
      - dagi-network
    restart: unless-stopped
    healthcheck:
      # NOTE(review): this only proves the Python interpreter starts; it does
      # not check that the worker process is alive or making progress.
      test: ["CMD", "python", "-c", "print(\"ok\")"]
      interval: 30s
      timeout: 5s
      retries: 3
  # Reminder worker built from the gateway-bot image; runs
  # `python -m daarion_facade.reminder_worker`.
  #
  # SECURITY: Telegram bot tokens must be supplied through the deploy
  # environment / .env file. They are intentionally NOT given default values
  # here: a token written into this file is published with the repository and
  # must be treated as compromised (revoke it via BotFather and issue a new one).
  gateway-reminder-worker:
    build:
      context: ./gateway-bot
      dockerfile: Dockerfile
    container_name: dagi-gateway-reminder-worker-node1
    command: ["python", "-m", "daarion_facade.reminder_worker"]
    environment:
      - REDIS_URL=redis://redis:6379/0
      - DAARION_REMINDER_POLL_SECONDS=${DAARION_REMINDER_POLL_SECONDS:-2}
      - DAARION_REMINDER_TTL_SECONDS=${DAARION_REMINDER_TTL_SECONDS:-2592000}
      - DAARION_REMINDER_DEFAULT_TZ=${DAARION_REMINDER_DEFAULT_TZ:-Europe/Kyiv}
      - GLOBAL_RELAY_ALLOWED_USER_IDS=${GLOBAL_RELAY_ALLOWED_USER_IDS:-}
      # NOTE(review): these defaults embed personal handles/names in the repo;
      # consider moving them to the environment as well.
      - MENTOR_PRIVATE_HANDLES=${MENTOR_PRIVATE_HANDLES:-ivantytar,archenvis,olegarch88}
      - MENTOR_PRIVATE_NAMES=${MENTOR_PRIVATE_NAMES:-Іван Титар,Александр Вертій,Олег Ковальчук}
      - MENTOR_DISCLOSURE_ALLOWED_USER_IDS=${MENTOR_DISCLOSURE_ALLOWED_USER_IDS:-}
      - HELION_MENTOR_CHAT_IDS=${HELION_MENTOR_CHAT_IDS:-}
      - HELION_RELAY_ALLOWED_USER_IDS=${HELION_RELAY_ALLOWED_USER_IDS:-}
      # Per-agent bot tokens: empty when unset, never a baked-in secret.
      - DAARWIZZ_TELEGRAM_BOT_TOKEN=${DAARWIZZ_TELEGRAM_BOT_TOKEN:-}
      - HELION_TELEGRAM_BOT_TOKEN=${HELION_TELEGRAM_BOT_TOKEN:-}
      - GREENFOOD_TELEGRAM_BOT_TOKEN=${GREENFOOD_TELEGRAM_BOT_TOKEN:-}
      - AGROMATRIX_TELEGRAM_BOT_TOKEN=${AGROMATRIX_TELEGRAM_BOT_TOKEN:-}
      - ALATEYA_TELEGRAM_BOT_TOKEN=${ALATEYA_TELEGRAM_BOT_TOKEN:-}
      - NUTRA_TELEGRAM_BOT_TOKEN=${NUTRA_TELEGRAM_BOT_TOKEN:-}
      - DRUID_TELEGRAM_BOT_TOKEN=${DRUID_TELEGRAM_BOT_TOKEN:-}
      - CLAN_TELEGRAM_BOT_TOKEN=${CLAN_TELEGRAM_BOT_TOKEN:-}
      - EONARCH_TELEGRAM_BOT_TOKEN=${EONARCH_TELEGRAM_BOT_TOKEN:-}
      - SENPAI_TELEGRAM_BOT_TOKEN=${SENPAI_TELEGRAM_BOT_TOKEN:-}
      - ONEOK_TELEGRAM_BOT_TOKEN=${ONEOK_TELEGRAM_BOT_TOKEN}
      - SOUL_TELEGRAM_BOT_TOKEN=${SOUL_TELEGRAM_BOT_TOKEN:-}
      - YAROMIR_TELEGRAM_BOT_TOKEN=${YAROMIR_TELEGRAM_BOT_TOKEN:-}
      - SOFIIA_TELEGRAM_BOT_TOKEN=${SOFIIA_TELEGRAM_BOT_TOKEN:-}
    volumes:
      - ${DEPLOY_ROOT:-.}/gateway-bot:/app/gateway-bot:ro
      - ${DEPLOY_ROOT:-.}/logs:/app/logs
    depends_on:
      - redis
    networks:
      - dagi-network
    restart: unless-stopped
    healthcheck:
      # NOTE(review): only proves the interpreter starts, not that the worker is alive.
      test: ["CMD", "python", "-c", "print(\"ok\")"]
      interval: 30s
      timeout: 5s
      retries: 3
  # Metrics poller built from the gateway-bot image; runs
  # `python -m daarion_facade.metrics_poller`. It is given the Redis and
  # memory-service URLs and waits for both services to be started.
  metrics-poller-node1:
    build:
      context: ./gateway-bot
      dockerfile: Dockerfile
    container_name: dagi-metrics-poller-node1
    command: ["python", "-m", "daarion_facade.metrics_poller"]
    environment:
      - REDIS_URL=redis://redis:6379/0
      - MEMORY_SERVICE_URL=http://memory-service:8000
      # Polling cadence, cache TTL and HTTP timeouts; all overridable from the environment
      - DAARION_METRICS_POLL_INTERVAL_SECONDS=${DAARION_METRICS_POLL_INTERVAL_SECONDS:-10}
      - DAARION_METRICS_TTL_SECONDS=${DAARION_METRICS_TTL_SECONDS:-60}
      - DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS=${DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS:-2}
      - DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS=${DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS:-5}
      - DAARION_NODE_COUNT=${DAARION_NODE_COUNT:-1}
    volumes:
      - ${DEPLOY_ROOT:-.}/gateway-bot:/app/gateway-bot:ro
      - ${DEPLOY_ROOT:-.}/logs:/app/logs
    depends_on:
      - redis
      - memory-service
    networks:
      - dagi-network
    restart: unless-stopped
    healthcheck:
      # NOTE(review): only proves the interpreter starts, not that polling is working.
      test: ["CMD", "python", "-c", "print(\"ok\")"]
      interval: 30s
      timeout: 5s
      retries: 3
  # CLAN Consent Outbox Worker (Postgres event-store applier; no execute)
  clan-consent-outbox-worker:
    build:
@@ -340,6 +490,29 @@ services:
      - dagi-network
    restart: unless-stopped
  # Node Capabilities Service (model inventory for router)
  # The router is pointed at it via
  # NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities.
  node-capabilities:
    build:
      context: ./services/node-capabilities
      dockerfile: Dockerfile
    container_name: node-capabilities-node1
    environment:
      # NOTE(review): the router's global caps client reads its own NODE_ID
      # (default "unknown") to decide which models are local — confirm the
      # router service is given the same id as this one.
      - NODE_ID=noda1
      # Ollama is addressed on the Docker host (see extra_hosts below)
      - OLLAMA_BASE_URL=http://host.docker.internal:11434
      - SWAPPER_URL=http://swapper-service:8890
      # presumably seconds — TODO confirm
      - CACHE_TTL_SEC=15
      - ENABLE_NATS_CAPS=true
      - NATS_URL=nats://nats:4222
    extra_hosts:
      # Makes host.docker.internal resolve to the host gateway
      - "host.docker.internal:host-gateway"
    depends_on:
      - nats
    networks:
      dagi-network:
        aliases:
          # Network alias matching the hostname used in NODE_CAPABILITIES_URL
          - node-capabilities
    restart: unless-stopped
  # NATS (JetStream)
  nats:
    image: nats:2.10-alpine
@@ -736,10 +909,11 @@ services:
    ports:
      - "9108:9108"
    environment:
-      - GATEWAY_URL=http://172.18.0.18:9300
+      - GATEWAY_URL=http://gateway:9300
      - PROBE_INTERVAL=60
      - PROBE_TIMEOUT=30
      - METRICS_PORT=9108
      - SEMANTIC_AGENTS=clan,sofiia,monitor,helion,agromatrix,senpai
    networks:
      - dagi-network
    restart: unless-stopped
@@ -819,6 +993,72 @@ services:
      retries: 5
      start_period: 15s
  # Binance bot monitor, built from ./services/binance-bot-monitor.
  # No healthcheck, ports or depends_on are declared for this service.
  binance-bot-monitor:
    build:
      context: ./services/binance-bot-monitor
      dockerfile: Dockerfile
    container_name: dagi-binance-bot-monitor-node1
    restart: unless-stopped
    environment:
      - REDIS_URL=redis://redis:6379/0
      - CRAWL4AI_URL=http://crawl4ai:11235
      - SWAPPER_URL=http://swapper-service:8890
      # presumably seconds — TODO confirm
      - BINANCE_CACHE_TTL=3600
      - BINANCE_REFRESH_INTERVAL=1800
      # Credentials come from the environment and default to empty when unset
      - BINANCE_API_KEY=${BINANCE_API_KEY:-}
      - BINANCE_SECRET_KEY=${BINANCE_SECRET_KEY:-}
    networks:
      - dagi-network
  # ── FarmOS (v4.3 integration) ────────────────────────────────────────────────
  # PostgreSQL for farmOS (separate database; does not touch dagi-postgres)
  dagi-farmos-db-node1:
    image: postgres:16-alpine
    container_name: dagi-farmos-db-node1
    restart: unless-stopped
    environment:
      - POSTGRES_DB=farmos
      - POSTGRES_USER=farmos
      # NOTE(review): no default is given, so an unset FARMOS_DB_PASS
      # interpolates to an empty password — confirm it is always provided.
      - POSTGRES_PASSWORD=${FARMOS_DB_PASS}
    volumes:
      # Named volume declared in the top-level `volumes:` section
      - farmos-db-data-node1:/var/lib/postgresql/data
    networks:
      - dagi-network
    healthcheck:
      # dagi-farmos-node1 waits for this check (condition: service_healthy)
      test: ["CMD-SHELL", "pg_isready -U farmos -d farmos"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 15s
  # farmOS Drupal application (4.x — current stable; amd64 build for the x86_64 server)
  dagi-farmos-node1:
    image: farmos/farmos:4.x-amd64
    container_name: dagi-farmos-node1
    restart: unless-stopped
    depends_on:
      # Start only after the database's pg_isready healthcheck passes
      dagi-farmos-db-node1:
        condition: service_healthy
    environment:
      # Connection settings mirror the POSTGRES_* values of dagi-farmos-db-node1
      - FARMOS_DB_HOST=dagi-farmos-db-node1
      - FARMOS_DB_NAME=farmos
      - FARMOS_DB_USER=farmos
      - FARMOS_DB_PASSWORD=${FARMOS_DB_PASS}
      - FARMOS_DB_DRIVER=pgsql
    volumes:
      - farmos-sites-node1:/opt/drupal/web/sites
    networks:
      - dagi-network
    ports:
      # Reachable only locally; for browser-based setup use an SSH tunnel: ssh -L 8088:localhost:8088
      - "127.0.0.1:8088:80"
    healthcheck:
      # Healthy when an HTTP GET on port 80 inside the container succeeds
      test: ["CMD-SHELL", "curl -fsS http://localhost:80 -o /dev/null || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
 volumes:
  qdrant-data-node1:
@@ -871,6 +1111,14 @@ volumes:
    name: oneok-crm-data-node1
    driver: local
  # farmOS persistent volumes (v4.3)
  farmos-db-data-node1:
    name: farmos-db-data-node1
    driver: local
  farmos-sites-node1:
    name: farmos-sites-node1
    driver: local
 networks:
  dagi-network:
    external: true
--- a/docker-compose.node2-sofiia.yml
+++ b/docker-compose.node2-sofiia.yml
@@ -25,8 +25,9 @@ services:
      - XAI_API_KEY=${XAI_API_KEY}
      - GROK_API_KEY=${XAI_API_KEY}
      - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
-      # ── Node Capabilities ─────────────────────────────────────────────────
+      # ── Node Capabilities (multi-node model selection) ────────────────────
      - NODE_CAPABILITIES_URL=http://node-capabilities:8099/capabilities
      - ENABLE_GLOBAL_CAPS_NATS=true
      # ── Persistence backends ──────────────────────────────────────────────
      - ALERT_BACKEND=postgres
      - ALERT_DATABASE_URL=${ALERT_DATABASE_URL:-${DATABASE_URL}}
--- a/services/router/global_capabilities_client.py
+++ b/services/router/global_capabilities_client.py
@@ -0,0 +1,245 @@
 """Global Capabilities Client — aggregates model capabilities across all nodes.
 Design for 150+ nodes:
 - Local NCS: HTTP (fast, always available)
 - Remote nodes: NATS request/reply with wildcard discovery
  - node.*.capabilities.get → each NCS replies with its capabilities
  - No static node list needed — new nodes auto-register by subscribing
  - scatter-gather pattern: send one request, collect N replies
 - TTL cache per node, stale nodes expire automatically
 """
 import asyncio
 import json
 import logging
 import os
 import time
 from typing import Any, Dict, List, Optional
 import httpx
 # Module logger; all messages from this client go to the "global_caps" logger.
 logger = logging.getLogger("global_caps")
 # HTTP endpoint of the local Node Capabilities Service; empty disables the local fetch.
 LOCAL_NCS_URL = os.getenv("NODE_CAPABILITIES_URL", "")
 # Id of the node this router runs on; used to recognise the local node's entries.
 LOCAL_NODE_ID = os.getenv("NODE_ID", "unknown")
 # NATS server used for remote discovery and offload requests.
 NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
 # Seconds before the cache is considered due for refresh; entries are evicted after 3x this value.
 CACHE_TTL = int(os.getenv("GLOBAL_CAPS_TTL", "30"))
 # How long (milliseconds) discovery waits for replies from remote nodes.
 NATS_DISCOVERY_TIMEOUT_MS = int(os.getenv("NATS_DISCOVERY_TIMEOUT_MS", "500"))
 # Discovery over NATS is on unless ENABLE_GLOBAL_CAPS_NATS is set to something other than "true"/"1".
 NATS_ENABLED = os.getenv("ENABLE_GLOBAL_CAPS_NATS", "true").lower() in ("true", "1")
 # Subject on which the scatter-gather discovery request is published.
 CAPS_DISCOVERY_SUBJECT = "node.*.capabilities.get"
 # NOTE(review): no use of this prefix is visible in this file — confirm it is still needed.
 CAPS_INBOX_PREFIX = "_CAPS_REPLY"
 # node_id -> most recent capabilities payload received for that node.
 _node_cache: Dict[str, Dict[str, Any]] = {}
 # node_id -> time.time() at which that node's payload was stored.
 _node_timestamps: Dict[str, float] = {}
 # Connected NATS client, or None when NATS is disabled or the connection failed.
 _nats_client = None
 # Set to True once initialize() has run, whatever its outcome.
 _initialized = False
 async def initialize():
    """Prepare the NATS connection used for remote capability discovery.

    Intended to be called once while the router starts. When NATS discovery
    is disabled, or the connection attempt fails, the module keeps working
    with ``_nats_client`` left as ``None``; a failure is logged as a warning
    and never raised. ``_initialized`` is set to ``True`` in every case.
    """
    global _nats_client, _initialized
    if NATS_ENABLED:
        try:
            # Imported lazily so the module still loads without the NATS client installed.
            import nats as nats_lib
            _nats_client = await nats_lib.connect(NATS_URL)
            logger.info(f"✅ Global caps NATS connected: {NATS_URL}")
        except Exception as e:
            logger.warning(f"⚠️ Global caps NATS init failed (non-fatal): {e}")
            _nats_client = None
    else:
        logger.info("Global caps NATS discovery disabled")
    _initialized = True
 async def shutdown():
    """Close the NATS connection, if one was opened.

    Best-effort: an error raised while closing is logged at debug level and
    not propagated. The module-level client reference is cleared before the
    close is awaited, so code running concurrently sees "not connected"
    instead of a client that is in the middle of closing.
    """
    global _nats_client
    client, _nats_client = _nats_client, None
    if not client:
        return
    try:
        await client.close()
    except Exception as e:
        # Shutdown must not fail because of a broken connection; keep a trace of why.
        logger.debug(f"Global caps NATS close failed (ignored): {e}")
 async def _fetch_local() -> Optional[Dict[str, Any]]:
    """Fetch capabilities from the local NCS over HTTP and cache them.

    Returns:
        The fresh payload on HTTP 200. On any failure (non-200 status,
        network error, invalid JSON) the previously cached payload for the
        local node, or ``None`` when nothing is cached or LOCAL_NCS_URL is empty.
    """
    if not LOCAL_NCS_URL:
        return None
    try:
        async with httpx.AsyncClient(timeout=3) as c:
            resp = await c.get(LOCAL_NCS_URL)
            if resp.status_code == 200:
                data = resp.json()
                node_id = data.get("node_id", LOCAL_NODE_ID)
                _node_cache[node_id] = data
                _node_timestamps[node_id] = time.time()
                return data
            # A non-200 answer used to be dropped silently; make it visible.
            logger.warning(f"Local NCS fetch failed: HTTP {resp.status_code}")
    except Exception as e:
        logger.warning(f"Local NCS fetch failed: {e}")
    # Entries are stored under the id reported by the NCS, which may differ in
    # case from LOCAL_NODE_ID; match case-insensitively, as the global view does.
    wanted = LOCAL_NODE_ID.lower()
    for cached_id, cached in _node_cache.items():
        if str(cached_id).lower() == wanted:
            return cached
    return None
 async def _discover_remote_nodes() -> List[Dict[str, Any]]:
    """Scatter-gather discovery of remote nodes over NATS.

    Publishes one empty request on CAPS_DISCOVERY_SUBJECT with a private reply
    inbox, then collects every JSON reply that arrives within
    NATS_DISCOVERY_TIMEOUT_MS. Replies from nodes other than the local one are
    stored in the module cache (payload and timestamp) and returned.

    NOTE(review): NATS wildcards are a subscription-side feature. A message
    published to "node.*.capabilities.get" is delivered on that literal
    subject, so each NCS must hold a subscription that matches it (not only
    node.{node_id}.capabilities.get) for any reply to arrive — confirm
    against the NCS implementation.

    Returns:
        Capability payloads received from remote nodes; an empty list when
        NATS is not connected or nobody answered in time.
    """
    if not _nats_client:
        return []
    collected: List[Dict[str, Any]] = []
    local_id = LOCAL_NODE_ID.lower()
    inbox = _nats_client.new_inbox()
    sub = await _nats_client.subscribe(inbox)
    try:
        # The nats-py 2.x client used here (nats.connect / Subscription.next_msg)
        # has no publish_request(); a request with an explicit reply inbox is
        # sent with publish(..., reply=inbox).
        await _nats_client.publish(CAPS_DISCOVERY_SUBJECT, b"", reply=inbox)
        await _nats_client.flush()
        # Monotonic clock: the collection window must not move with wall-clock changes.
        deadline = time.monotonic() + (NATS_DISCOVERY_TIMEOUT_MS / 1000.0)
        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            try:
                # Pass the remaining window down so next_msg's own default
                # timeout cannot end collection early.
                msg = await sub.next_msg(timeout=remaining)
            except asyncio.TimeoutError:
                break
            except Exception as e:
                # Connection-level failure: further waiting cannot succeed.
                logger.debug(f"Discovery receive error: {e}")
                break
            try:
                data = json.loads(msg.data)
                if not isinstance(data, dict):
                    raise ValueError("capabilities reply is not a JSON object")
                node_id = data.get("node_id", "?")
            except (ValueError, TypeError) as e:
                # One malformed reply must not hide the well-formed ones behind it.
                logger.debug(f"Discovery parse error: {e}")
                continue
            # Same case-insensitive rule the global view uses to spot the local node.
            if str(node_id).lower() != local_id:
                _node_cache[node_id] = data
                _node_timestamps[node_id] = time.time()
                collected.append(data)
    finally:
        await sub.unsubscribe()
    if collected:
        logger.info(
            f"Discovered {len(collected)} remote node(s): "
            f"{[c.get('node_id', '?') for c in collected]}"
        )
    return collected
 def _evict_stale():
    """Drop every cached node whose last refresh is older than 3 x CACHE_TTL.

    Both the payload and the timestamp are removed, and each eviction is
    logged at info level.
    """
    threshold = time.time() - (CACHE_TTL * 3)
    # Collect first: the dict must not change size while it is being iterated.
    expired = []
    for nid, seen_at in _node_timestamps.items():
        if seen_at < threshold:
            expired.append(nid)
    for nid in expired:
        _node_cache.pop(nid, None)
        _node_timestamps.pop(nid, None)
        logger.info(f"Evicted stale node: {nid}")
 def _needs_refresh() -> bool:
    """Tell whether the cache should be refreshed.

    True when nothing is cached yet, or when the least recently refreshed
    node is older than CACHE_TTL seconds.
    """
    if len(_node_timestamps) == 0:
        return True
    age_of_oldest = time.time() - min(_node_timestamps.values())
    return age_of_oldest > CACHE_TTL
 async def get_global_capabilities(force: bool = False) -> Dict[str, Any]:
    """Return merged capabilities from all known nodes, refreshing if needed.

    The cached view is returned as-is unless ``force`` is true or the cache
    is due for refresh. A refresh evicts stale nodes, then runs the local
    HTTP fetch and (when NATS is connected) remote discovery concurrently.
    A failure in either source is logged and does not prevent the other
    source's data from being used.

    Args:
        force: refresh even when the cache is still within its TTL.

    Returns:
        {
            "local_node": "noda1",
            "nodes": {"noda1": {...}, "noda2": {...}, ...},
            "served_models": [...],  # all models with "node" field
            "served_count": 5,
            "node_count": 2,
            "updated_at": "...",
        }
    """
    if not force and not _needs_refresh():
        return _build_global_view()
    _evict_stale()
    labels = ["local NCS fetch"]
    tasks = [_fetch_local()]
    if _nats_client:
        labels.append("remote NATS discovery")
        tasks.append(_discover_remote_nodes())
    results = await asyncio.gather(*tasks, return_exceptions=True)
    # return_exceptions=True hands failures back as values; without this loop
    # a broken source would fail silently on every request.
    for label, result in zip(labels, results):
        if isinstance(result, BaseException):
            logger.warning(f"Global caps {label} failed: {result!r}")
    return _build_global_view()
 def _build_global_view() -> Dict[str, Any]:
    """Assemble the merged, multi-node view from the module cache.

    Every served model of every cached node is copied and tagged with the
    owning ``node``, a ``local`` flag (node id equals LOCAL_NODE_ID, ignoring
    case) and ``node_age_s`` (seconds since that node was refreshed). Models
    are ordered local-first, then by name. No network access happens here.
    """
    local_key = LOCAL_NODE_ID.lower()
    merged: List[Dict[str, Any]] = []
    for node_id, caps in _node_cache.items():
        on_this_node = node_id.lower() == local_key
        age = time.time() - _node_timestamps.get(node_id, 0)
        merged.extend(
            {**m, "node": node_id, "local": on_this_node, "node_age_s": round(age, 1)}
            for m in caps.get("served_models", [])
        )
    # Local models first (0 before 1), ties broken alphabetically by name.
    merged.sort(key=lambda m: (0 if m.get("local") else 1, m.get("name", "")))

    per_node: Dict[str, Any] = {}
    for nid, c in _node_cache.items():
        per_node[nid] = {
            "node_id": nid,
            "served_count": len(c.get("served_models", [])),
            "age_s": round(time.time() - _node_timestamps.get(nid, 0), 1),
        }

    return {
        "local_node": LOCAL_NODE_ID,
        "nodes": per_node,
        "served_models": merged,
        "served_count": len(merged),
        "node_count": len(_node_cache),
        "updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
 def get_cached_global() -> Dict[str, Any]:
    """Return the global view built from whatever is cached right now.

    Never triggers a fetch or discovery round.
    """
    snapshot = _build_global_view()
    return snapshot
 async def send_offload_request(
    node_id: str,
    request_type: str,
    payload: Dict[str, Any],
    timeout_s: float = 30.0,
 ) -> Optional[Dict[str, Any]]:
    """Forward an inference request to a remote node through NATS request/reply.

    The request goes to ``node.{node_id lower-cased}.{request_type}.request``
    with the JSON-encoded payload as body.

    Args:
        node_id: target node; lower-cased when building the subject.
        request_type: subject segment naming the kind of request.
        payload: JSON-serialisable request body.
        timeout_s: how long to wait for the reply.

    Returns:
        The decoded JSON reply, or ``None`` when NATS is not connected, the
        request times out, or any other error occurs (each case is logged).
    """
    if not _nats_client:
        logger.warning("Cannot offload: NATS not connected")
        return None
    subject = f"node.{node_id.lower()}.{request_type}.request"
    result: Optional[Dict[str, Any]] = None
    try:
        # Encoding stays inside the try so an unserialisable payload is
        # reported as an offload error rather than raised to the caller.
        body = json.dumps(payload).encode()
        reply = await _nats_client.request(subject, body, timeout=timeout_s)
        result = json.loads(reply.data)
    except asyncio.TimeoutError:
        logger.warning(f"Offload timeout: {subject} ({timeout_s}s)")
    except Exception as e:
        logger.warning(f"Offload error: {subject}: {e}")
    return result
--- a/services/router/main.py
+++ b/services/router/main.py
@@ -46,14 +46,16 @@ except ImportError:
    RUNTIME_GUARD_AVAILABLE = False
    RuntimeGuard = None
-# NCS-first model selection
+# NCS-first model selection (multi-node global)
 try:
    import capabilities_client
    import global_capabilities_client
    from model_select import select_model_for_agent, ModelSelection, CLOUD_PROVIDERS as NCS_CLOUD_PROVIDERS
    NCS_AVAILABLE = True
 except ImportError:
    NCS_AVAILABLE = False
    capabilities_client = None  # type: ignore[assignment]
    global_capabilities_client = None  # type: ignore[assignment]
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -765,7 +767,7 @@ async def startup_event():
    else:
        tool_manager = None
-    # Initialize Node Capabilities client
+    # Initialize Node Capabilities (local + global multi-node)
    if NCS_AVAILABLE and capabilities_client:
        ncs_cfg = router_config.get("node_capabilities", {})
        ncs_url = ncs_cfg.get("url", "") or os.getenv("NODE_CAPABILITIES_URL", "")
@@ -774,11 +776,17 @@ async def startup_event():
            capabilities_client.configure(url=ncs_url, ttl=ncs_ttl)
            caps = await capabilities_client.fetch_capabilities()
            served = caps.get("served_count", 0)
-            logger.info(f"✅ NCS configured: url={ncs_url} ttl={ncs_ttl}s served={served} models")
+            logger.info(f"✅ NCS local configured: url={ncs_url} ttl={ncs_ttl}s served={served} models")
        else:
            logger.warning("⚠️ NCS url not configured; model selection will use static config only")
-    elif NCS_AVAILABLE:
+
-        logger.info("ℹ️ NCS modules loaded but capabilities_client is None")
+        if global_capabilities_client:
            await global_capabilities_client.initialize()
            gcaps = await global_capabilities_client.get_global_capabilities()
            logger.info(
                f"✅ Global caps: {gcaps.get('node_count', 0)} node(s), "
                f"{gcaps.get('served_count', 0)} total models"
            )
    else:
        logger.warning("⚠️ NCS modules not available (model_select / capabilities_client import failed)")
@@ -1629,18 +1637,24 @@ async def agent_infer(agent_id: str, request: InferRequest):
    cloud_provider_names = {"deepseek", "mistral", "grok", "openai", "anthropic"}
-    # ── NCS-first model selection ────────────────────────────────────────
+    # ── Global NCS-first model selection (multi-node) ───────────────────
    ncs_selection = None
-    if NCS_AVAILABLE and capabilities_client:
+    if NCS_AVAILABLE and global_capabilities_client:
        try:
            gcaps = await global_capabilities_client.get_global_capabilities()
            ncs_selection = await select_model_for_agent(
                agent_id, agent_config, router_config, gcaps, request.model,
            )
        except Exception as e:
            logger.warning(f"⚠️ Global NCS selection error: {e}; falling back to static")
    elif NCS_AVAILABLE and capabilities_client:
        try:
            caps = await capabilities_client.fetch_capabilities()
            if caps:
                caps["_fetch_ts"] = capabilities_client._cache_ts
            ncs_selection = await select_model_for_agent(
                agent_id, agent_config, router_config, caps, request.model,
            )
        except Exception as e:
-            logger.warning(f"⚠️ NCS selection error: {e}; falling back to static config")
+            logger.warning(f"⚠️ NCS selection error: {e}; falling back to static")
    llm_profiles = router_config.get("llm_profiles", {})
@@ -1651,9 +1665,10 @@ async def agent_infer(agent_id: str, request: InferRequest):
        if ncs_selection.base_url and provider == "ollama":
            llm_profile = {**llm_profile, "base_url": ncs_selection.base_url}
        logger.info(
-            f"🎯 NCS select: agent={agent_id} profile={default_llm} "
+            f"🎯 Select: agent={agent_id} profile={default_llm} "
-            f"→ runtime={ncs_selection.runtime} model={model} "
+            f"→ node={ncs_selection.node} runtime={ncs_selection.runtime} "
-            f"provider={provider} via_ncs={ncs_selection.via_ncs} "
+            f"model={model} provider={provider} "
            f"local={ncs_selection.local} via_nats={ncs_selection.via_nats} "
            f"caps_age={ncs_selection.caps_age_s}s "
            f"fallback={ncs_selection.fallback_reason or 'none'}"
        )
--- a/services/router/model_select.py
+++ b/services/router/model_select.py
@@ -1,8 +1,10 @@
-"""NCS-first model selection for DAGI Router.
+"""NCS-first model selection for DAGI Router — multi-node aware.
 Resolves an agent's LLM profile into a concrete model+provider using live
-capabilities from the Node Capabilities Service (NCS).  Falls back to static
+capabilities from Node Capabilities Services across all nodes.
-router-config.yml when NCS is unavailable.
+Falls back to static router-config.yml when NCS is unavailable.
 Scaling: works with 1 node or 150+. No static node lists.
 """
 import logging
 import time
@@ -31,7 +33,10 @@ class ModelSelection:
    model_type: str       # llm | vision | code | …
    base_url: str = ""
    provider: str = ""    # cloud provider name if applicable
    node: str = ""        # which node owns this model
    local: bool = True    # is it on the current node?
    via_ncs: bool = False
    via_nats: bool = False
    fallback_reason: str = ""
    caps_age_s: float = 0.0
@@ -44,13 +49,11 @@ def resolve_effective_profile(
    router_cfg: Dict[str, Any],
    request_model: Optional[str] = None,
 ) -> str:
    """Determine the effective LLM profile name for a request."""
    if request_model:
        llm_profiles = router_cfg.get("llm_profiles", {})
        for pname, pcfg in llm_profiles.items():
            if pcfg.get("model") == request_model:
                return pname
    return agent_cfg.get("default_llm", "local_default_coder")
@@ -59,11 +62,6 @@ def profile_requirements(
    agent_cfg: Dict[str, Any],
    router_cfg: Dict[str, Any],
 ) -> ProfileRequirements:
    """Build selection requirements from a profile definition.
    If the profile has `selection_policy` in config, use it directly.
    Otherwise, infer from the legacy `provider`/`model` fields.
    """
    llm_profiles = router_cfg.get("llm_profiles", {})
    selection_policies = router_cfg.get("selection_policies", {})
    profile_cfg = llm_profiles.get(profile_name, {})
@@ -107,22 +105,23 @@ def profile_requirements(
    )
-# ── NCS-based selection ───────────────────────────────────────────────────────
+# ── Multi-node model selection ────────────────────────────────────────────────
 def select_best_model(
    reqs: ProfileRequirements,
    capabilities: Dict[str, Any],
 ) -> Optional[ModelSelection]:
-    """Choose the best served model from NCS capabilities.
+    """Choose the best served model from global (multi-node) capabilities.
-    Returns None if no suitable model found (caller should try static fallback).
+    Selection order:
    1. Prefer list matches (local first, then remote)
    2. Best candidate by size (local first, then remote)
    3. None → caller should try static fallback
    """
    served = capabilities.get("served_models", [])
    if not served:
        return None
    caps_age = time.time() - capabilities.get("_fetch_ts", time.time())
    search_types = [reqs.required_type]
    if reqs.required_type == "code":
        search_types.append("llm")
@@ -133,24 +132,30 @@ def select_best_model(
    if not candidates:
        return None
    local_candidates = [m for m in candidates if m.get("local", False)]
    remote_candidates = [m for m in candidates if not m.get("local", False)]
    prefer = reqs.prefer if reqs.prefer else []
    for pref in prefer:
        if pref == "*":
            break
-        for m in candidates:
+        for m in local_candidates:
            if pref == m.get("name") or pref in m.get("name", ""):
-                return _make_selection(m, capabilities, caps_age, reqs)
+                return _make_selection(m, capabilities)
        for m in remote_candidates:
            if pref == m.get("name") or pref in m.get("name", ""):
                return _make_selection(m, capabilities)
-    if candidates:
+    if local_candidates:
-        best = _pick_best_candidate(candidates)
+        return _make_selection(_pick_best(local_candidates), capabilities)
-        return _make_selection(best, capabilities, caps_age, reqs)
+    if remote_candidates:
        return _make_selection(_pick_best(remote_candidates), capabilities)
    return None
-def _pick_best_candidate(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
+def _pick_best(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Prefer running models, then largest by size_gb."""
    running = [m for m in candidates if m.get("running")]
    pool = running if running else candidates
    return max(pool, key=lambda m: m.get("size_gb", 0))
@@ -159,15 +164,11 @@ def _pick_best_candidate(candidates: List[Dict[str, Any]]) -> Dict[str, Any]:
 def _make_selection(
    model: Dict[str, Any],
    capabilities: Dict[str, Any],
    caps_age: float,
    reqs: ProfileRequirements,
 ) -> ModelSelection:
    runtime = model.get("runtime", "ollama")
    is_local = model.get("local", False)
    node = model.get("node", capabilities.get("local_node", ""))
    base_url = model.get("base_url", "")
    if not base_url:
        runtimes = capabilities.get("runtimes", {})
        rt = runtimes.get(runtime, {})
        base_url = rt.get("base_url", "")
    return ModelSelection(
        runtime=runtime,
@@ -175,18 +176,20 @@ def _make_selection(
        model_type=model.get("type", "llm"),
        base_url=base_url,
        provider="ollama" if runtime in ("ollama", "llama_server") else runtime,
        node=node,
        local=is_local,
        via_ncs=True,
-        caps_age_s=round(caps_age, 1),
+        via_nats=not is_local,
+        caps_age_s=model.get("node_age_s", 0.0),
    )
-# ── Static fallback (from router-config profiles) ────────────────────────────
+# ── Static fallback ──────────────────────────────────────────────────────────
 def static_fallback(
    profile_name: str,
    router_cfg: Dict[str, Any],
 ) -> Optional[ModelSelection]:
    """Build a ModelSelection from the static llm_profiles config."""
    llm_profiles = router_cfg.get("llm_profiles", {})
    cfg = llm_profiles.get(profile_name, {})
    if not cfg:
@@ -200,6 +203,8 @@ def static_fallback(
        model_type="cloud_llm" if provider in CLOUD_PROVIDERS else "llm",
        base_url=cfg.get("base_url", ""),
        provider=provider,
        node="local",
        local=True,
        via_ncs=False,
        fallback_reason="NCS unavailable or no match; using static config",
    )
@@ -214,10 +219,7 @@ async def select_model_for_agent(
    capabilities: Optional[Dict[str, Any]],
    request_model: Optional[str] = None,
 ) -> ModelSelection:
-    """Full selection pipeline: resolve profile → NCS → static fallback.
+    """Full selection pipeline: resolve profile → NCS (multi-node) → static → hard default."""
-    This is the single entry point the router calls for each request.
-    """
    profile = resolve_effective_profile(
        agent_id, agent_cfg, router_cfg, request_model,
    )
@@ -238,36 +240,36 @@ async def select_model_for_agent(
        sel = select_best_model(reqs, capabilities)
        if sel:
            logger.info(
-                f"[select] agent={agent_id} profile={profile} → NCS "
+                f"[select] agent={agent_id} profile={profile} → "
-                f"runtime={sel.runtime} model={sel.name} caps_age={sel.caps_age_s}s"
+                f"{'NCS' if sel.local else 'REMOTE'} "
+                f"node={sel.node} runtime={sel.runtime} "
+                f"model={sel.name} caps_age={sel.caps_age_s}s"
            )
            return sel
        logger.warning(
-            f"[select] agent={agent_id} profile={profile} → NCS had no match "
+            f"[select] agent={agent_id} profile={profile} → no match "
-            f"for type={reqs.required_type}; trying static"
+            f"for type={reqs.required_type} across {capabilities.get('node_count', 0)} node(s)"
        )
    static = static_fallback(profile, router_cfg)
    if static:
        logger.info(
            f"[select] agent={agent_id} profile={profile} → static "
-            f"provider={static.provider} model={static.name} "
+            f"provider={static.provider} model={static.name} "
            f"reason={static.fallback_reason}"
        )
        return static
    if reqs.fallback_profile and reqs.fallback_profile != profile:
        logger.warning(
            f"[select] agent={agent_id} profile={profile} not found → "
-            f"trying fallback_profile={reqs.fallback_profile}"
+            f"fallback_profile={reqs.fallback_profile}"
        )
        return await select_model_for_agent(
            agent_id, agent_cfg, router_cfg, capabilities,
        )
    logger.error(
-        f"[select] agent={agent_id} profile={profile} → ALL selection "
+        f"[select] agent={agent_id} ALL methods failed → hard default"
-        f"methods failed. Using hard default qwen3:14b"
    )
    return ModelSelection(
        runtime="ollama",
@@ -275,6 +277,8 @@ async def select_model_for_agent(
        model_type="llm",
        base_url="http://host.docker.internal:11434",
        provider="ollama",
        node="local",
        local=True,
        via_ncs=False,
        fallback_reason="all methods failed; hard default",
    )