P2.2+P2.3: NATS offload node-worker + router offload integration
Node Worker (services/node-worker/):
- NATS subscriber for node.{NODE_ID}.llm.request / vision.request
- Canonical JobRequest/JobResponse envelope (Pydantic)
- Idempotency cache (10-minute TTL) with in-flight request deduplication
- Deadline enforcement (DEADLINE_EXCEEDED on expired jobs)
- Concurrency limiter (semaphore; returns busy when at capacity)
- Ollama + Swapper vision providers
Router offload (services/router/offload_client.py):
- NATS req/reply with configurable retries
- Circuit breaker per node+type (3 failures within 60s → open for 120s)
- Concurrency semaphore for remote requests
Model selection (services/router/model_select.py):
- exclude_nodes parameter for circuit-broken nodes
- force_local flag for fallback re-selection
- Integrated circuit breaker state awareness
Router /infer pipeline:
- Remote offload path when NCS selects remote node
- Automatic fallback: exclude failed node → force_local re-select
- Deadline propagation from router to node-worker
Tests: 17 unit tests (idempotency, deadline, circuit breaker)
Docs: ops/offload_routing.md (subjects, envelope, verification)
Made-with: Cursor
This commit is contained in:
@@ -9,7 +9,7 @@ Scaling: works with 1 node or 150+. No static node lists.
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
|
||||
logger = logging.getLogger("model_select")
|
||||
|
||||
@@ -110,6 +110,7 @@ def profile_requirements(
|
||||
def select_best_model(
|
||||
reqs: ProfileRequirements,
|
||||
capabilities: Dict[str, Any],
|
||||
exclude_nodes: Optional[Set[str]] = None,
|
||||
) -> Optional[ModelSelection]:
|
||||
"""Choose the best served model from global (multi-node) capabilities.
|
||||
|
||||
@@ -117,18 +118,25 @@ def select_best_model(
|
||||
1. Prefer list matches (local first, then remote)
|
||||
2. Best candidate by size (local first, then remote)
|
||||
3. None → caller should try static fallback
|
||||
|
||||
exclude_nodes: set of node_ids to skip (e.g. circuit-broken nodes).
|
||||
"""
|
||||
served = capabilities.get("served_models", [])
|
||||
if not served:
|
||||
return None
|
||||
|
||||
exclude = exclude_nodes or set()
|
||||
|
||||
search_types = [reqs.required_type]
|
||||
if reqs.required_type == "code":
|
||||
search_types.append("llm")
|
||||
if reqs.required_type == "llm":
|
||||
search_types.append("code")
|
||||
|
||||
candidates = [m for m in served if m.get("type") in search_types]
|
||||
candidates = [
|
||||
m for m in served
|
||||
if m.get("type") in search_types and m.get("node", "") not in exclude
|
||||
]
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
@@ -218,15 +226,21 @@ async def select_model_for_agent(
|
||||
router_cfg: Dict[str, Any],
|
||||
capabilities: Optional[Dict[str, Any]],
|
||||
request_model: Optional[str] = None,
|
||||
exclude_nodes: Optional[Set[str]] = None,
|
||||
force_local: bool = False,
|
||||
) -> ModelSelection:
|
||||
"""Full selection pipeline: resolve profile → NCS (multi-node) → static → hard default."""
|
||||
"""Full selection pipeline: resolve profile → NCS (multi-node) → static → hard default.
|
||||
|
||||
exclude_nodes: skip these nodes (circuit-broken). Used on fallback re-selection.
|
||||
force_local: prefer local-only models (fallback after remote failure).
|
||||
"""
|
||||
profile = resolve_effective_profile(
|
||||
agent_id, agent_cfg, router_cfg, request_model,
|
||||
)
|
||||
|
||||
reqs = profile_requirements(profile, agent_cfg, router_cfg)
|
||||
|
||||
if reqs.required_type == "cloud_llm":
|
||||
if reqs.required_type == "cloud_llm" and not force_local:
|
||||
static = static_fallback(profile, router_cfg)
|
||||
if static:
|
||||
static.fallback_reason = ""
|
||||
@@ -236,14 +250,31 @@ async def select_model_for_agent(
|
||||
)
|
||||
return static
|
||||
|
||||
excl = set(exclude_nodes) if exclude_nodes else set()
|
||||
try:
|
||||
from offload_client import get_unavailable_nodes
|
||||
cb_nodes = get_unavailable_nodes(reqs.required_type)
|
||||
excl |= cb_nodes
|
||||
if cb_nodes:
|
||||
logger.info(f"[select] circuit-broken nodes for {reqs.required_type}: {cb_nodes}")
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
if capabilities and capabilities.get("served_models"):
|
||||
sel = select_best_model(reqs, capabilities)
|
||||
sel = select_best_model(reqs, capabilities, exclude_nodes=excl)
|
||||
if force_local and sel and not sel.local:
|
||||
sel = select_best_model(
|
||||
reqs, capabilities,
|
||||
exclude_nodes=excl | {n.get("node", "") for n in capabilities.get("served_models", []) if not n.get("local")},
|
||||
)
|
||||
if sel:
|
||||
logger.info(
|
||||
f"[select] agent={agent_id} profile={profile} → "
|
||||
f"{'NCS' if sel.local else 'REMOTE'} "
|
||||
f"{'LOCAL' if sel.local else 'REMOTE'} "
|
||||
f"node={sel.node} runtime={sel.runtime} "
|
||||
f"model={sel.name} caps_age={sel.caps_age_s}s"
|
||||
f"{' (force_local)' if force_local else ''}"
|
||||
f"{' (excluded: ' + ','.join(excl) + ')' if excl else ''}"
|
||||
)
|
||||
return sel
|
||||
logger.warning(
|
||||
|
||||
Reference in New Issue
Block a user