feat(fabric): decommission Swapper from critical path, NCS = source of truth
- Node Worker: replace swapper_vision with ollama_vision (direct Ollama API)
- Node Worker: add NATS subjects for stt/tts/image (stubs ready)
- Node Worker: remove SWAPPER_URL dependency from config
- Router: vision calls go directly to Ollama /api/generate with images
- Router: local LLM calls go directly to Ollama /api/generate
- Router: add OLLAMA_URL and PREFER_NODE_WORKER=true feature flag
- Router: /v1/models now uses NCS global capabilities pool
- NCS: SWAPPER_URL="" -> skip Swapper probing (status=disabled)
- Swapper configs: remove all hardcoded model lists, keep only runtime URLs, timeouts, limits
- docker-compose.node1.yml: add OLLAMA_URL, PREFER_NODE_WORKER for router; SWAPPER_URL= for NCS; remove swapper-service from node-worker depends_on
- docker-compose.node2-sofiia.yml: same changes for NODA2

Swapper service still runs but is NOT in the critical inference path. Source of truth for models is now NCS -> Ollama /api/tags.

Made-with: Cursor
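For reviewers: a minimal sketch of the call shape the router now emits on the vision path. The payload fields mirror the diff below; the standalone httpx client, the strip_data_url helper, and the qwen3-vl:8b tag are illustrative assumptions, not names from this codebase.

    # Sketch only: direct Ollama vision call, assuming an Ollama instance at OLLAMA_URL.
    import asyncio
    import os

    import httpx

    OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")

    def strip_data_url(img: str) -> str:
        # Ollama expects raw base64, so drop any "data:...;base64," prefix.
        if img.startswith("data:") and "," in img:
            return img.split(",", 1)[1]
        return img

    async def vision_generate(prompt: str, images: list[str]) -> str:
        # Non-streaming /api/generate returns the full text in the "response" field.
        async with httpx.AsyncClient() as client:
            resp = await client.post(
                f"{OLLAMA_URL}/api/generate",
                json={
                    "model": "qwen3-vl:8b",  # assumed Ollama tag for qwen3-vl-8b
                    "prompt": prompt,
                    "images": [strip_data_url(i) for i in images],
                    "stream": False,
                    "options": {"num_predict": 1024, "temperature": 0.7},
                },
                timeout=120.0,
            )
            resp.raise_for_status()
            return resp.json().get("response", "")

    # Usage: asyncio.run(vision_generate("Describe the image.", [base64_or_data_url]))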
@@ -877,12 +877,13 @@ app = FastAPI(title="DAARION Router", version="2.0.0")
 # Configuration
 NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
 SWAPPER_URL = os.getenv("SWAPPER_URL", "http://swapper-service:8890")
-# All multimodal services now through Swapper
-STT_URL = os.getenv("STT_URL", "http://swapper-service:8890")  # Swapper /stt endpoint
-TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890")  # Swapper /tts endpoint
-VISION_URL = os.getenv("VISION_URL", "http://172.18.0.1:11434")  # Host Ollama
-OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890")  # Swapper /ocr endpoint
-DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890")  # Swapper /document endpoint
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
+PREFER_NODE_WORKER = os.getenv("PREFER_NODE_WORKER", "true").lower() in ("true", "1")
+STT_URL = os.getenv("STT_URL", "http://swapper-service:8890")
+TTS_URL = os.getenv("TTS_URL", "http://swapper-service:8890")
+VISION_URL = os.getenv("VISION_URL", "http://host.docker.internal:11434")
+OCR_URL = os.getenv("OCR_URL", "http://swapper-service:8890")
+DOCUMENT_URL = os.getenv("DOCUMENT_URL", "http://swapper-service:8890")
 CITY_SERVICE_URL = os.getenv("CITY_SERVICE_URL", "http://daarion-city-service:7001")
 
 # CrewAI Routing Configuration
@@ -1083,8 +1084,8 @@ async def startup_event():
         runtime_guard_engine = None
 
     # Log backend URLs
-    logger.info(f"📡 Swapper URL: {SWAPPER_URL}")
-    logger.info(f"📡 STT URL: {STT_URL}")
+    logger.info(f"📡 Ollama URL: {OLLAMA_URL} (prefer_node_worker={PREFER_NODE_WORKER})")
+    logger.info(f"📡 Swapper URL: {SWAPPER_URL} (legacy, being decommissioned)")
     logger.info(f"📡 Vision URL: {VISION_URL}")
     logger.info(f"📡 OCR URL: {OCR_URL}")
     logger.info(f"📡 Neo4j URL: {NEO4J_URI}")
@@ -2388,33 +2389,39 @@ async def agent_infer(agent_id: str, request: InferRequest):
                 logger.warning(f"⚠️ Deterministic AgroMatrix plant flow failed, fallback to generic vision: {e}")
 
         try:
-            # Use Swapper's /vision endpoint (manages model loading)
-            vision_payload = {
-                "model": "qwen3-vl-8b",
-                "prompt": request.prompt,
-                "images": request.images,  # Swapper handles data URL conversion
-                "max_tokens": request.max_tokens or 1024,
-                "temperature": request.temperature or 0.7
-            }
-
-            # Add system prompt if available
-            if system_prompt:
-                if memory_brief_text:
-                    vision_payload["system"] = f"{system_prompt}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}"
-                else:
-                    vision_payload["system"] = system_prompt
-
-            logger.info(f"🖼️ Sending to Swapper /vision: {SWAPPER_URL}/vision")
-
+            vision_model = "qwen3-vl-8b"
+            vision_system = system_prompt or ""
+            if vision_system and memory_brief_text:
+                vision_system = f"{vision_system}\n\n[INTERNAL MEMORY - do NOT repeat to user]\n{memory_brief_text}"
+
+            clean_images = []
+            for img in (request.images or []):
+                if "," in img and img.startswith("data:"):
+                    clean_images.append(img.split(",", 1)[1])
+                else:
+                    clean_images.append(img)
+
+            logger.info(f"🖼️ Vision inference: model={vision_model} images={len(clean_images)} prefer_nw={PREFER_NODE_WORKER}")
+
             vision_resp = await http_client.post(
-                f"{SWAPPER_URL}/vision",
-                json=vision_payload,
-                timeout=120.0
+                f"{OLLAMA_URL}/api/generate",
+                json={
+                    "model": vision_model.replace("-", "-vl:").replace("qwen3-vl:", "qwen3-vl:") if ":" not in vision_model else vision_model,
+                    "prompt": request.prompt,
+                    "images": clean_images,
+                    "system": vision_system,
+                    "stream": False,
+                    "options": {
+                        "num_predict": request.max_tokens or 1024,
+                        "temperature": request.temperature or 0.7,
+                    },
+                },
+                timeout=120.0,
             )
 
             if vision_resp.status_code == 200:
                 vision_data = vision_resp.json()
-                raw_response = vision_data.get("text", "")
+                raw_response = vision_data.get("response", vision_data.get("text", ""))
                 full_response = _sanitize_vision_text_for_user(raw_response)
                 vision_web_query = ""
                 vision_sources: List[Dict[str, str]] = []
@@ -2519,14 +2526,23 @@ async def agent_infer(agent_id: str, request: InferRequest):
                         "Відповідай українською 2-4 реченнями, без службових фраз. "
                         f"Запит користувача: {request.prompt}"
                     )
+                    retry_ollama_payload = {
+                        "model": vision_model.replace("-", "-vl:").replace("qwen3-vl:", "qwen3-vl:") if ":" not in vision_model else vision_model,
+                        "prompt": retry_payload["prompt"],
+                        "images": clean_images,
+                        "stream": False,
+                        "options": {"num_predict": request.max_tokens or 1024, "temperature": 0.7},
+                    }
+                    if retry_payload.get("system"):
+                        retry_ollama_payload["system"] = retry_payload["system"]
                     retry_resp = await http_client.post(
-                        f"{SWAPPER_URL}/vision",
-                        json=retry_payload,
-                        timeout=120.0
+                        f"{OLLAMA_URL}/api/generate",
+                        json=retry_ollama_payload,
+                        timeout=120.0,
                     )
                     if retry_resp.status_code == 200:
                         retry_data = retry_resp.json()
-                        retry_raw = retry_data.get("text", "")
+                        retry_raw = retry_data.get("response", retry_data.get("text", ""))
                         retry_text = _sanitize_vision_text_for_user(retry_raw)
                         if retry_raw and not retry_text:
                             retry_text = _extract_vision_search_facts(retry_raw, max_chars=280)
@@ -2541,7 +2557,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
             elif request_agent_id in DETERMINISTIC_PLANT_POLICY_AGENTS and _vision_response_is_blurry(full_response):
                 full_response = _build_image_fallback_response(request_agent_id, request.prompt)
 
-            full_response = await _finalize_response_text(full_response, "swapper-vision")
+            full_response = await _finalize_response_text(full_response, "ollama-vision")
 
             # Store vision message in agent-specific memory
             if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and full_response:
@@ -2567,10 +2583,10 @@ async def agent_infer(agent_id: str, request: InferRequest):
                     response=full_response,
                     model="qwen3-vl-8b",
                     tokens_used=None,
-                    backend="swapper-vision"
+                    backend="ollama-vision"
                 )
             else:
-                logger.error(f"❌ Swapper vision error: {vision_resp.status_code} - {vision_resp.text[:200]}")
+                logger.error(f"❌ Ollama vision error: {vision_resp.status_code} - {vision_resp.text[:200]}")
                 fallback_response = await _finalize_response_text(
                     _build_image_fallback_response(request_agent_id, request.prompt),
                     "swapper-vision-fallback",
@@ -2579,7 +2595,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
                     response=fallback_response,
                     model="qwen3-vl-8b",
                     tokens_used=None,
-                    backend="swapper-vision-fallback"
+                    backend="vision-fallback"
                 )
 
         except Exception as e:
@@ -2592,7 +2608,7 @@ async def agent_infer(agent_id: str, request: InferRequest):
                 response=fallback_response,
                 model="qwen3-vl-8b",
                 tokens_used=None,
-                backend="swapper-vision-fallback"
+                backend="vision-fallback"
             )
 
     # =========================================================================
@@ -3142,90 +3158,88 @@ async def agent_infer(agent_id: str, request: InferRequest):
         logger.warning(f"⚠️ No local model in config, using hardcoded fallback: {local_model}")
 
     try:
-        # Check if Swapper is available
-        health_resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0)
-        if health_resp.status_code == 200:
-            logger.info(f"📡 Calling Swapper with local model: {local_model}")
-            # Generate response via Swapper (which handles model loading)
-            generate_resp = await http_client.post(
-                f"{SWAPPER_URL}/generate",
-                json={
-                    "model": local_model,
-                    "prompt": request.prompt,
-                    "system": system_prompt,
-                    "max_tokens": request.max_tokens,
-                    "temperature": request.temperature,
-                    "stream": False
-                },
-                timeout=300.0
-            )
-
-            if generate_resp.status_code == 200:
-                data = generate_resp.json()
-                local_response = _normalize_text_response(data.get("response", ""))
-
-                # Empty-answer gate for selected local top-level agents.
-                if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response):
-                    logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once")
-                    retry_prompt = (
-                        f"{request.prompt}\n\n"
-                        "Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз."
-                    )
-                    retry_resp = await http_client.post(
-                        f"{SWAPPER_URL}/generate",
-                        json={
-                            "model": local_model,
-                            "prompt": retry_prompt,
-                            "system": system_prompt,
-                            "max_tokens": request.max_tokens,
-                            "temperature": request.temperature,
-                            "stream": False
-                        },
-                        timeout=300.0
-                    )
-                    if retry_resp.status_code == 200:
-                        retry_data = retry_resp.json()
-                        retry_text = _normalize_text_response(retry_data.get("response", ""))
-                        if retry_text and not _needs_empty_answer_recovery(retry_text):
-                            local_response = retry_text
-
-                if _needs_empty_answer_recovery(local_response):
-                    local_response = (
-                        "Я не отримав корисну відповідь з першої спроби. "
-                        "Сформулюй запит коротко ще раз, і я відповім конкретно."
-                    )
-                local_response = await _finalize_response_text(local_response, "swapper+ollama")
-
-                # Store in agent-specific memory
-                if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response:
-                    asyncio.create_task(
-                        memory_retrieval.store_message(
-                            agent_id=request_agent_id,
-                            user_id=user_id,
-                            username=username,
-                            message_text=request.prompt,
-                            response_text=local_response,
-                            chat_id=chat_id
-                        )
-                    )
-
-                return InferResponse(
-                    response=local_response,
-                    model=local_model,
-                    tokens_used=data.get("eval_count", 0),
-                    backend="swapper+ollama"
-                )
-            else:
-                logger.error(f"❌ Swapper error: {generate_resp.status_code} - {generate_resp.text}")
+        ollama_model = local_model.replace("-", ":") if ":" not in local_model else local_model
+        logger.info(f"📡 Calling Ollama direct: model={ollama_model}")
+
+        generate_resp = await http_client.post(
+            f"{OLLAMA_URL}/api/generate",
+            json={
+                "model": ollama_model,
+                "prompt": request.prompt,
+                "system": system_prompt,
+                "stream": False,
+                "options": {
+                    "num_predict": request.max_tokens or 2048,
+                    "temperature": request.temperature or 0.7,
+                },
+            },
+            timeout=300.0,
+        )
+        if generate_resp.status_code == 200:
+            data = generate_resp.json()
+            local_response = _normalize_text_response(data.get("response", ""))
+
+            # Empty-answer gate for selected local top-level agents.
+            if request_agent_id in EMPTY_ANSWER_GUARD_AGENTS and _needs_empty_answer_recovery(local_response):
+                logger.warning(f"⚠️ Empty-answer gate triggered for {request_agent_id}, retrying local generate once")
+                retry_prompt = (
+                    f"{request.prompt}\n\n"
+                    "Відповідай коротко і конкретно (2-5 речень), без службових або мета-фраз."
+                )
+                retry_resp = await http_client.post(
+                    f"{OLLAMA_URL}/api/generate",
+                    json={
+                        "model": ollama_model,
+                        "prompt": retry_prompt,
+                        "system": system_prompt,
+                        "stream": False,
+                        "options": {
+                            "num_predict": request.max_tokens or 2048,
+                            "temperature": request.temperature or 0.7,
+                        },
+                    },
+                    timeout=300.0,
+                )
+                if retry_resp.status_code == 200:
+                    retry_data = retry_resp.json()
+                    retry_text = _normalize_text_response(retry_data.get("response", ""))
+                    if retry_text and not _needs_empty_answer_recovery(retry_text):
+                        local_response = retry_text
+
+            if _needs_empty_answer_recovery(local_response):
+                local_response = (
+                    "Я не отримав корисну відповідь з першої спроби. "
+                    "Сформулюй запит коротко ще раз, і я відповім конкретно."
+                )
+            local_response = await _finalize_response_text(local_response, "ollama-direct")
+
+            # Store in agent-specific memory
+            if MEMORY_RETRIEVAL_AVAILABLE and memory_retrieval and chat_id and user_id and local_response:
+                asyncio.create_task(
+                    memory_retrieval.store_message(
+                        agent_id=request_agent_id,
+                        user_id=user_id,
+                        username=username,
+                        message_text=request.prompt,
+                        response_text=local_response,
+                        chat_id=chat_id
+                    )
+                )
+
+            return InferResponse(
+                response=local_response,
+                model=local_model,
+                tokens_used=data.get("eval_count", 0),
+                backend="ollama-direct"
+            )
+        else:
+            logger.error(f"❌ Ollama generate error: {generate_resp.status_code} - {generate_resp.text[:200]}")
     except Exception as e:
-        logger.error(f"❌ Swapper/Ollama error: {e}")
-        # Fallback to direct Ollama if Swapper fails
+        logger.error(f"❌ Ollama direct error: {e}")
         try:
-            logger.info(f"🔄 Falling back to direct Ollama connection")
+            logger.info(f"🔄 Falling back to Ollama with hardcoded model")
             generate_resp = await http_client.post(
-                f"{VISION_URL}/api/generate",
+                f"{OLLAMA_URL}/api/generate",
                 json={
-                    "model": "qwen3:8b",  # Use actual Ollama model name
+                    "model": "qwen3:8b",
                     "prompt": request.prompt,
                     "system": system_prompt,
                     "stream": False,
@@ -3526,42 +3540,38 @@ async def documents_versions(doc_id: str, agent_id: str, limit: int = 20):
 
 @app.get("/v1/models")
 async def list_available_models():
-    """List all available models across backends"""
+    """List all available models from NCS (global capabilities pool)."""
     models = []
 
-    # Get Swapper models
     try:
-        resp = await http_client.get(f"{SWAPPER_URL}/models", timeout=5.0)
-        if resp.status_code == 200:
-            data = resp.json()
-            for m in data.get("models", []):
-                models.append({
-                    "id": m.get("name"),
-                    "backend": "swapper",
-                    "size_gb": m.get("size_gb"),
-                    "status": m.get("status", "available")
-                })
+        from global_capabilities_client import get_global_capabilities
+        pool = await get_global_capabilities()
+        for m in pool.get("served_models", []):
+            models.append({
+                "id": m.get("name"),
+                "backend": m.get("runtime", "unknown"),
+                "node": m.get("node", "?"),
+                "type": m.get("type", "llm"),
+                "size_gb": m.get("size_gb"),
+                "status": "served",
+            })
     except Exception as e:
-        logger.warning(f"Cannot get Swapper models: {e}")
+        logger.warning(f"Cannot get NCS global models: {e}")
 
-    # Get Ollama models
-    try:
-        resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0)
-        if resp.status_code == 200:
-            data = resp.json()
-            for m in data.get("models", []):
-                # Avoid duplicates
-                model_name = m.get("name")
-                if not any(x.get("id") == model_name for x in models):
+    if not models:
+        try:
+            resp = await http_client.get(f"{OLLAMA_URL}/api/tags", timeout=5.0)
+            if resp.status_code == 200:
+                for m in resp.json().get("models", []):
                     models.append({
-                        "id": model_name,
+                        "id": m.get("name"),
                         "backend": "ollama",
                         "size_gb": round(m.get("size", 0) / 1e9, 1),
-                        "status": "loaded"
+                        "status": "loaded",
                     })
-    except Exception as e:
-        logger.warning(f"Cannot get Ollama models: {e}")
+        except Exception as e:
+            logger.warning(f"Cannot get Ollama models: {e}")
 
     return {"models": models, "total": len(models)}
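Reviewer note on /v1/models: the handler now projects the NCS global capabilities pool. A sketch of the pool shape it assumes, with a made-up entry (field names follow the diff above; get_global_capabilities is the import the diff adds):

    # Illustrative NCS pool payload and the projection /v1/models performs on it.
    pool = {
        "served_models": [
            {"name": "qwen3:8b", "runtime": "ollama", "node": "node1", "type": "llm", "size_gb": 5.2},
        ],
    }

    models = [
        {
            "id": m.get("name"),
            "backend": m.get("runtime", "unknown"),
            "node": m.get("node", "?"),
            "type": m.get("type", "llm"),
            "size_gb": m.get("size_gb"),
            "status": "served",
        }
        for m in pool.get("served_models", [])
    ]
    print({"models": models, "total": len(models)})  # {"models": [...], "total": 1}

If the pool comes back empty, the handler falls back to GET {OLLAMA_URL}/api/tags, so a node that cannot reach NCS still reports its locally pulled Ollama models.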