P2.2+P2.3: NATS offload node-worker + router offload integration
Node Worker (services/node-worker/):
- NATS subscriber for node.{NODE_ID}.llm.request / vision.request
- Canonical JobRequest/JobResponse envelope (Pydantic)
- Idempotency cache (TTL 10min) with inflight dedup
- Deadline enforcement (DEADLINE_EXCEEDED on expired jobs)
- Concurrency limiter (semaphore, returns busy)
- Ollama + Swapper vision providers
Router offload (services/router/offload_client.py):
- NATS req/reply with configurable retries
- Circuit breaker per node+type (3 fails/60s → open 120s)
- Concurrency semaphore for remote requests
Model selection (services/router/model_select.py):
- exclude_nodes parameter for circuit-broken nodes
- force_local flag for fallback re-selection
- Integrated circuit breaker state awareness
Router /infer pipeline:
- Remote offload path when NCS selects remote node
- Automatic fallback: exclude failed node → force_local re-select
- Deadline propagation from router to node-worker
Tests: 17 unit tests (idempotency, deadline, circuit breaker)
Docs: ops/offload_routing.md (subjects, envelope, verification)
Made-with: Cursor
This commit is contained in:
0
services/node-worker/providers/__init__.py
Normal file
0
services/node-worker/providers/__init__.py
Normal file
81
services/node-worker/providers/ollama.py
Normal file
81
services/node-worker/providers/ollama.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""Ollama LLM provider for node-worker."""
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from config import OLLAMA_BASE_URL, DEFAULT_LLM
|
||||
|
||||
logger = logging.getLogger("provider.ollama")
|
||||
|
||||
|
||||
async def infer(
    messages: Optional[List[Dict[str, str]]] = None,
    prompt: str = "",
    model: str = "",
    system: str = "",
    max_tokens: int = 2048,
    temperature: float = 0.2,
    timeout_s: float = 25.0,
) -> Dict[str, Any]:
    """Run an Ollama inference and return a normalized result dict.

    Dispatches to the chat endpoint when a ``messages`` transcript is
    supplied, otherwise to the plain generate endpoint with ``prompt``
    and optional ``system`` text. An empty ``model`` falls back to the
    configured DEFAULT_LLM.
    """
    chosen_model = model or DEFAULT_LLM

    # A non-empty messages list selects the chat-style API; anything
    # else (None or []) takes the prompt/generate path.
    if messages:
        return await _chat(messages, chosen_model, max_tokens, temperature, timeout_s)
    else:
        return await _generate(prompt, system, chosen_model, max_tokens, temperature, timeout_s)
|
||||
|
||||
|
||||
async def _chat(
    messages: List[Dict[str, str]],
    model: str,
    max_tokens: int,
    temperature: float,
    timeout_s: float,
) -> Dict[str, Any]:
    """POST a chat transcript to Ollama's /api/chat and normalize the reply.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    request_body = {
        "model": model,
        "messages": messages,
        "stream": False,
        "options": {"num_predict": max_tokens, "temperature": temperature},
    }
    async with httpx.AsyncClient(timeout=timeout_s) as client:
        response = await client.post(
            f"{OLLAMA_BASE_URL}/api/chat",
            json=request_body,
        )
        response.raise_for_status()
        body = response.json()
        # Ollama's chat API nests the assistant turn under message.content.
        answer = body.get("message", {}).get("content", "")
        return {
            "text": answer,
            "model": model,
            "provider": "ollama",
            "eval_count": body.get("eval_count", 0),
        }
|
||||
|
||||
|
||||
async def _generate(
    prompt: str,
    system: str,
    model: str,
    max_tokens: int,
    temperature: float,
    timeout_s: float,
) -> Dict[str, Any]:
    """POST a single prompt to Ollama's /api/generate and normalize the reply.

    Raises httpx.HTTPStatusError on a non-2xx response.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "system": system,
        "stream": False,
        "options": {"num_predict": max_tokens, "temperature": temperature},
    }
    async with httpx.AsyncClient(timeout=timeout_s) as client:
        response = await client.post(
            f"{OLLAMA_BASE_URL}/api/generate",
            json=request_body,
        )
        response.raise_for_status()
        body = response.json()
        # The generate API returns the completion as a top-level "response" field.
        return {
            "text": body.get("response", ""),
            "model": model,
            "provider": "ollama",
            "eval_count": body.get("eval_count", 0),
        }
|
||||
42
services/node-worker/providers/swapper_vision.py
Normal file
42
services/node-worker/providers/swapper_vision.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""Swapper vision provider for node-worker."""
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from config import SWAPPER_URL, DEFAULT_VISION
|
||||
|
||||
logger = logging.getLogger("provider.swapper_vision")
|
||||
|
||||
|
||||
async def infer(
    images: Optional[List[str]] = None,
    prompt: str = "",
    model: str = "",
    system: str = "",
    max_tokens: int = 1024,
    temperature: float = 0.2,
    timeout_s: float = 60.0,
) -> Dict[str, Any]:
    """Run a vision inference through the Swapper service's /vision endpoint.

    ``images`` is a list of image payloads forwarded as-is (presumably
    base64-encoded strings — confirm against the Swapper API). An empty
    ``model`` falls back to DEFAULT_VISION; an empty ``prompt`` falls back
    to a generic describe instruction. Raises httpx.HTTPStatusError on a
    non-2xx response.
    """
    chosen_model = model or DEFAULT_VISION

    request_body: Dict[str, Any] = {
        "model": chosen_model,
        "prompt": prompt or "Describe this image.",
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    # Optional fields are omitted entirely when falsy rather than sent empty.
    if images:
        request_body["images"] = images
    if system:
        request_body["system"] = system

    async with httpx.AsyncClient(timeout=timeout_s) as client:
        response = await client.post(f"{SWAPPER_URL}/vision", json=request_body)
        response.raise_for_status()
        body = response.json()
        # Prefer "text"; older responses may use "response" instead.
        fallback = body.get("response", "")
        return {
            "text": body.get("text", fallback),
            "model": chosen_model,
            "provider": "swapper_vision",
        }
|
||||
Reference in New Issue
Block a user