P2.2+P2.3: NATS offload node-worker + router offload integration
Node Worker (services/node-worker/):
- NATS subscriber for node.{NODE_ID}.llm.request and node.{NODE_ID}.vision.request
- Canonical JobRequest/JobResponse envelope (Pydantic)
- Idempotency cache (10-minute TTL) with in-flight request dedup
- Deadline enforcement (DEADLINE_EXCEEDED on expired jobs)
- Concurrency limiter (semaphore, returns busy)
- Ollama + Swapper vision providers
Router offload (services/router/offload_client.py):
- NATS req/reply with configurable retries
- Circuit breaker per node+type (3 failures within 60s → open for 120s)
- Concurrency semaphore for remote requests
Model selection (services/router/model_select.py):
- exclude_nodes parameter for circuit-broken nodes
- force_local flag for fallback re-selection
- Integrated circuit breaker state awareness
Router /infer pipeline:
- Remote offload path when NCS selects remote node
- Automatic fallback: exclude failed node → force_local re-select
- Deadline propagation from router to node-worker
Tests: 17 unit tests (idempotency, deadline, circuit breaker)
Docs: ops/offload_routing.md (subjects, envelope, verification)
Made-with: Cursor
This commit is contained in:
@@ -51,11 +51,13 @@ try:
|
||||
import capabilities_client
|
||||
import global_capabilities_client
|
||||
from model_select import select_model_for_agent, ModelSelection, CLOUD_PROVIDERS as NCS_CLOUD_PROVIDERS
|
||||
import offload_client
|
||||
NCS_AVAILABLE = True
|
||||
except ImportError:
|
||||
NCS_AVAILABLE = False
|
||||
capabilities_client = None # type: ignore[assignment]
|
||||
global_capabilities_client = None # type: ignore[assignment]
|
||||
offload_client = None # type: ignore[assignment]
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -1707,6 +1709,76 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
f"provider={provider} model={model}"
|
||||
)
|
||||
|
||||
# =========================================================================
|
||||
# REMOTE OFFLOAD (if model selected on remote node)
|
||||
# =========================================================================
|
||||
nats_client_available = nc is not None and nats_available
|
||||
if (ncs_selection and ncs_selection.via_nats and not ncs_selection.local
|
||||
and nats_client_available and offload_client and nc):
|
||||
infer_timeout = int(os.getenv("ROUTER_INFER_TIMEOUT_MS", "25000"))
|
||||
import uuid as _uuid
|
||||
job_payload = {
|
||||
"job_id": str(_uuid.uuid4()),
|
||||
"trace_id": str(_uuid.uuid4()),
|
||||
"actor_agent_id": request_agent_id or agent_id,
|
||||
"target_agent_id": agent_id,
|
||||
"required_type": ncs_selection.model_type if ncs_selection.model_type != "code" else "llm",
|
||||
"deadline_ts": int(time.time() * 1000) + infer_timeout,
|
||||
"idempotency_key": str(_uuid.uuid4()),
|
||||
"payload": {
|
||||
"prompt": request.prompt,
|
||||
"messages": [{"role": "system", "content": system_prompt}] if system_prompt else [],
|
||||
"model": ncs_selection.name,
|
||||
"max_tokens": request.max_tokens or 2048,
|
||||
"temperature": request.temperature or 0.2,
|
||||
},
|
||||
"hints": {"prefer_models": [ncs_selection.name]},
|
||||
}
|
||||
if request.images:
|
||||
job_payload["payload"]["images"] = request.images
|
||||
job_payload["required_type"] = "vision"
|
||||
job_payload["payload"]["messages"].append({"role": "user", "content": request.prompt})
|
||||
|
||||
offload_resp = await offload_client.offload_infer(
|
||||
nats_client=nc,
|
||||
node_id=ncs_selection.node,
|
||||
required_type=job_payload["required_type"],
|
||||
job_payload=job_payload,
|
||||
timeout_ms=infer_timeout,
|
||||
)
|
||||
if offload_resp and offload_resp.get("status") == "ok":
|
||||
result_text = offload_resp.get("result", {}).get("text", "")
|
||||
return InferResponse(
|
||||
response=result_text,
|
||||
model=f"{offload_resp.get('model', ncs_selection.name)}@{ncs_selection.node}",
|
||||
backend=f"nats-offload:{ncs_selection.node}",
|
||||
tokens_used=offload_resp.get("result", {}).get("eval_count", 0),
|
||||
)
|
||||
else:
|
||||
offload_status = offload_resp.get("status", "none") if offload_resp else "no_reply"
|
||||
logger.warning(
|
||||
f"[fallback] offload to {ncs_selection.node} failed ({offload_status}) "
|
||||
f"→ re-selecting with exclude={ncs_selection.node}, force_local"
|
||||
)
|
||||
try:
|
||||
gcaps = await global_capabilities_client.get_global_capabilities()
|
||||
ncs_selection = await select_model_for_agent(
|
||||
agent_id, agent_config, router_config, gcaps, request.model,
|
||||
exclude_nodes={ncs_selection.node}, force_local=True,
|
||||
)
|
||||
if ncs_selection and ncs_selection.name:
|
||||
provider = ncs_selection.provider
|
||||
model = ncs_selection.name
|
||||
llm_profile = router_config.get("llm_profiles", {}).get(default_llm, {})
|
||||
if ncs_selection.base_url and provider == "ollama":
|
||||
llm_profile = {**llm_profile, "base_url": ncs_selection.base_url}
|
||||
logger.info(
|
||||
f"[fallback.reselect] → local node={ncs_selection.node} "
|
||||
f"model={model} provider={provider}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"[fallback.reselect] error: {e}; proceeding with static")
|
||||
|
||||
# =========================================================================
|
||||
# VISION PROCESSING (if images present)
|
||||
# =========================================================================
|
||||
|
||||
Reference in New Issue
Block a user