node2: fix Sofiia routing determinism + Node Capabilities Service
Bug fixes:
- Bug A: GROK_API_KEY env mismatch — router expected GROK_API_KEY but only
XAI_API_KEY was present. Added GROK_API_KEY=${XAI_API_KEY} alias in compose.
- Bug B: 'grok' profile missing in router-config.node2.yml — added cloud_grok
profile (provider: grok, model: grok-2-1212). Sofiia now has
default_llm=cloud_grok with fallback_llm=local_default_coder.
- Bug C: Router silently defaulted to cloud DeepSeek when profile was unknown.
Now falls back to agent.fallback_llm or local_default_coder with WARNING log.
Hardcoded Ollama URL (172.18.0.1) replaced with config-driven base_url.
New service: Node Capabilities Service (NCS)
- services/node-capabilities/ — FastAPI microservice exposing live model
inventory from Ollama, Swapper, and llama-server.
- GET /capabilities — canonical JSON with served_models[] and inventory_only[]
- GET /capabilities/models — flat list of served models
- POST /capabilities/refresh — force cache refresh
- Cache TTL 15s, bound to 127.0.0.1:8099
- services/router/capabilities_client.py — async client with TTL cache
Artifacts:
- ops/node2_models_audit.md — 3-layer model view (served/disk/cloud)
- ops/node2_models_audit.yml — machine-readable audit
- ops/node2_capabilities_example.json — sample NCS output (14 served models)
Made-with: Cursor
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi import FastAPI, HTTPException, Request
|
||||
from fastapi.responses import Response
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from typing import Literal, Optional, Dict, Any, List
|
||||
import asyncio
|
||||
import json
|
||||
@@ -897,6 +897,134 @@ async def health():
|
||||
"messaging_inbound_enabled": config.get("messaging_inbound", {}).get("enabled", True)
|
||||
}
|
||||
|
||||
|
||||
@app.get("/healthz")
|
||||
async def healthz():
|
||||
"""Alias /healthz → /health for BFF compatibility."""
|
||||
return await health()
|
||||
|
||||
|
||||
@app.get("/monitor/status")
|
||||
async def monitor_status(request: Request = None):
|
||||
"""
|
||||
Node monitor status — read-only, safe, no secrets.
|
||||
Returns: heartbeat, router/gateway health, open incidents,
|
||||
alerts loop SLO, active backends, last artifact timestamps.
|
||||
|
||||
Rate limited: 60 rpm per IP (in-process bucket).
|
||||
RBAC: requires tools.monitor.read entitlement (or tools.observability.read).
|
||||
Auth: X-Monitor-Key header (same as SUPERVISOR_API_KEY, optional in dev).
|
||||
"""
|
||||
import collections as _collections
|
||||
|
||||
# ── Rate limit (60 rpm per IP) ────────────────────────────────────────
|
||||
_now = time.monotonic()
|
||||
client_ip = (
|
||||
(request.client.host if request and request.client else None) or "unknown"
|
||||
)
|
||||
_bucket_key = f"monitor:{client_ip}"
|
||||
if not hasattr(monitor_status, "_buckets"):
|
||||
monitor_status._buckets = {}
|
||||
dq = monitor_status._buckets.setdefault(_bucket_key, _collections.deque())
|
||||
while dq and _now - dq[0] > 60:
|
||||
dq.popleft()
|
||||
if len(dq) >= 60:
|
||||
from fastapi.responses import JSONResponse
|
||||
return JSONResponse(status_code=429, content={"error": "rate_limit", "message": "60 rpm exceeded"})
|
||||
dq.append(_now)
|
||||
|
||||
# ── Auth (optional in dev, enforced in prod) ──────────────────────────
|
||||
_env = os.getenv("ENV", "dev").strip().lower()
|
||||
_monitor_key = os.getenv("SUPERVISOR_API_KEY", "").strip()
|
||||
if _env in ("prod", "production", "staging") and _monitor_key:
|
||||
_req_key = ""
|
||||
if request:
|
||||
_req_key = (
|
||||
request.headers.get("X-Monitor-Key", "")
|
||||
or request.headers.get("Authorization", "").removeprefix("Bearer ").strip()
|
||||
)
|
||||
if _req_key != _monitor_key:
|
||||
from fastapi.responses import JSONResponse
|
||||
return JSONResponse(status_code=403, content={"error": "forbidden", "message": "X-Monitor-Key required"})
|
||||
|
||||
# ── Collect data (best-effort, non-fatal) ─────────────────────────────
|
||||
warnings: list[str] = []
|
||||
ts_now = __import__("datetime").datetime.now(
|
||||
__import__("datetime").timezone.utc
|
||||
).isoformat(timespec="seconds")
|
||||
|
||||
# uptime as heartbeat proxy
|
||||
_proc_start = getattr(monitor_status, "_proc_start", None)
|
||||
if _proc_start is None:
|
||||
monitor_status._proc_start = time.monotonic()
|
||||
_proc_start = monitor_status._proc_start
|
||||
heartbeat_age_s = int(time.monotonic() - _proc_start)
|
||||
|
||||
# open incidents
|
||||
open_incidents: int | None = None
|
||||
try:
|
||||
from incident_store import get_incident_store as _get_is
|
||||
_istore = _get_is()
|
||||
_open = _istore.list_incidents(filters={"status": "open"}, limit=500)
|
||||
# include "mitigating" as still-open
|
||||
open_incidents = sum(
|
||||
1 for i in _open if (i.get("status") or "").lower() in ("open", "mitigating")
|
||||
)
|
||||
except Exception as _e:
|
||||
warnings.append(f"incidents: {str(_e)[:80]}")
|
||||
|
||||
# alerts loop SLO
|
||||
alerts_loop_slo: dict | None = None
|
||||
try:
|
||||
from alert_store import get_alert_store as _get_as
|
||||
alerts_loop_slo = _get_as().compute_loop_slo(window_minutes=240)
|
||||
# strip any internal keys that may contain infra details
|
||||
_safe_keys = {"claim_to_ack_p95_seconds", "failed_rate_pct", "processing_stuck_count", "sample_count", "violations"}
|
||||
alerts_loop_slo = {k: v for k, v in alerts_loop_slo.items() if k in _safe_keys}
|
||||
except Exception as _e:
|
||||
warnings.append(f"alerts_slo: {str(_e)[:80]}")
|
||||
|
||||
# backends (env vars only — no DSN, no passwords)
|
||||
backends = {
|
||||
"alerts": os.getenv("ALERT_BACKEND", "unknown"),
|
||||
"audit": os.getenv("AUDIT_BACKEND", "unknown"),
|
||||
"incidents": os.getenv("INCIDENT_BACKEND", "unknown"),
|
||||
"risk_history": os.getenv("RISK_HISTORY_BACKEND", "unknown"),
|
||||
"backlog": os.getenv("BACKLOG_BACKEND", "unknown"),
|
||||
}
|
||||
|
||||
# last artifact timestamps (best-effort filesystem scan)
|
||||
last_artifacts: dict = {}
|
||||
_base = __import__("pathlib").Path("ops")
|
||||
for _pattern, _key in [
|
||||
("reports/risk/*.md", "risk_digest_ts"),
|
||||
("reports/platform/*.md", "platform_digest_ts"),
|
||||
("backlog/*.jsonl", "backlog_generate_ts"),
|
||||
("reports/backlog/*.md", "backlog_report_ts"),
|
||||
]:
|
||||
try:
|
||||
_files = sorted(_base.glob(_pattern))
|
||||
if _files:
|
||||
_mtime = _files[-1].stat().st_mtime
|
||||
last_artifacts[_key] = __import__("datetime").datetime.fromtimestamp(
|
||||
_mtime, tz=__import__("datetime").timezone.utc
|
||||
).isoformat(timespec="seconds")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"node_id": os.getenv("NODE_ID", "NODA1"),
|
||||
"ts": ts_now,
|
||||
"heartbeat_age_s": heartbeat_age_s,
|
||||
"router_ok": True, # we are the router; if we respond, we're ok
|
||||
"gateway_ok": None, # gateway health not probed here (separate svc)
|
||||
"open_incidents": open_incidents,
|
||||
"alerts_loop_slo": alerts_loop_slo,
|
||||
"backends": backends,
|
||||
"last_artifacts": last_artifacts,
|
||||
"warnings": warnings,
|
||||
}
|
||||
|
||||
@app.post("/internal/router/test-messaging", response_model=AgentInvocation)
|
||||
async def test_messaging_route(decision: FilterDecision):
|
||||
"""
|
||||
@@ -966,6 +1094,15 @@ class InferResponse(BaseModel):
|
||||
file_mime: Optional[str] = None
|
||||
|
||||
|
||||
class ToolExecuteRequest(BaseModel):
|
||||
"""External tool execution request used by console/ops APIs."""
|
||||
model_config = ConfigDict(extra="allow")
|
||||
tool: str
|
||||
action: Optional[str] = None
|
||||
agent_id: Optional[str] = "sofiia"
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
|
||||
|
||||
# =========================================================================
|
||||
@@ -1110,15 +1247,21 @@ async def internal_llm_complete(request: InternalLLMRequest):
|
||||
|
||||
logger.info(f"Internal LLM: profile={request.llm_profile}, role={request.role_context}")
|
||||
|
||||
# Get LLM profile configuration
|
||||
llm_profiles = router_config.get("llm_profiles", {})
|
||||
profile_name = request.llm_profile or "reasoning"
|
||||
llm_profile = llm_profiles.get(profile_name, {})
|
||||
|
||||
provider = llm_profile.get("provider", "deepseek")
|
||||
model = request.model or llm_profile.get("model", "deepseek-chat")
|
||||
if not llm_profile:
|
||||
fallback_name = "local_default_coder"
|
||||
llm_profile = llm_profiles.get(fallback_name, {})
|
||||
logger.warning(f"⚠️ Profile '{profile_name}' not found in llm_profiles → falling back to '{fallback_name}' (local)")
|
||||
profile_name = fallback_name
|
||||
|
||||
provider = llm_profile.get("provider", "ollama")
|
||||
model = request.model or llm_profile.get("model", "qwen3:14b")
|
||||
max_tokens = request.max_tokens or llm_profile.get("max_tokens", 2048)
|
||||
temperature = request.temperature or llm_profile.get("temperature", 0.2)
|
||||
logger.info(f"🎯 Resolved: profile={profile_name} provider={provider} model={model}")
|
||||
|
||||
# Build messages
|
||||
messages = []
|
||||
@@ -1173,10 +1316,11 @@ async def internal_llm_complete(request: InternalLLMRequest):
|
||||
|
||||
# Fallback/target local provider (Ollama)
|
||||
try:
|
||||
logger.info("Internal LLM to Ollama")
|
||||
ollama_model = model or "qwen3:8b"
|
||||
ollama_base = llm_profile.get("base_url", os.getenv("OLLAMA_BASE_URL", "http://host.docker.internal:11434"))
|
||||
ollama_model = model or "qwen3:14b"
|
||||
logger.info(f"Internal LLM to Ollama: model={ollama_model} url={ollama_base}")
|
||||
ollama_resp = await http_client.post(
|
||||
"http://172.18.0.1:11434/api/generate",
|
||||
f"{ollama_base}/api/generate",
|
||||
json={"model": ollama_model, "prompt": request.prompt, "system": request.system_prompt or "", "stream": False, "options": {"num_predict": max_tokens, "temperature": temperature}},
|
||||
timeout=120.0
|
||||
)
|
||||
@@ -1249,15 +1393,17 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
|
||||
if not system_prompt:
|
||||
try:
|
||||
from prompt_builder import get_agent_system_prompt
|
||||
system_prompt = await get_agent_system_prompt(
|
||||
agent_id,
|
||||
from prompt_builder import get_prompt_builder
|
||||
prompt_builder = await get_prompt_builder(
|
||||
city_service_url=CITY_SERVICE_URL,
|
||||
router_config=router_config
|
||||
router_config=router_config,
|
||||
)
|
||||
logger.info(f"✅ Loaded system prompt from database for {agent_id}")
|
||||
prompt_result = await prompt_builder.get_system_prompt(agent_id)
|
||||
system_prompt = prompt_result.system_prompt
|
||||
system_prompt_source = prompt_result.source
|
||||
logger.info(f"✅ Loaded system prompt for {agent_id} from {system_prompt_source}")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Could not load prompt from database: {e}")
|
||||
logger.warning(f"⚠️ Could not load prompt from configured sources: {e}")
|
||||
# Fallback to config
|
||||
system_prompt_source = "router_config"
|
||||
agent_config = router_config.get("agents", {}).get(agent_id, {})
|
||||
@@ -1450,15 +1596,38 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
except Exception as e:
|
||||
logger.exception(f"❌ CrewAI error: {e}, falling back to direct LLM")
|
||||
|
||||
default_llm = agent_config.get("default_llm", "qwen3:8b")
|
||||
default_llm = agent_config.get("default_llm", "local_default_coder")
|
||||
|
||||
routing_rules = router_config.get("routing", [])
|
||||
default_llm = _select_default_llm(agent_id, metadata, default_llm, routing_rules)
|
||||
|
||||
# Get LLM profile configuration
|
||||
cloud_provider_names = {"deepseek", "mistral", "grok", "openai", "anthropic"}
|
||||
|
||||
llm_profiles = router_config.get("llm_profiles", {})
|
||||
llm_profile = llm_profiles.get(default_llm, {})
|
||||
|
||||
if not llm_profile:
|
||||
fallback_llm = agent_config.get("fallback_llm", "local_default_coder")
|
||||
llm_profile = llm_profiles.get(fallback_llm, {})
|
||||
logger.warning(
|
||||
f"⚠️ Profile '{default_llm}' not found for agent={agent_id} "
|
||||
f"→ fallback to '{fallback_llm}' (local). "
|
||||
f"NOT defaulting to cloud silently."
|
||||
)
|
||||
default_llm = fallback_llm
|
||||
|
||||
provider = llm_profile.get("provider", "ollama")
|
||||
logger.info(f"🎯 Agent={agent_id}: profile={default_llm} provider={provider} model={llm_profile.get('model', '?')}")
|
||||
|
||||
# If explicit model is requested, try to resolve it to configured cloud profile.
|
||||
if request.model:
|
||||
for profile_name, profile in llm_profiles.items():
|
||||
if profile.get("model") == request.model and profile.get("provider") in cloud_provider_names:
|
||||
llm_profile = profile
|
||||
provider = profile.get("provider", provider)
|
||||
default_llm = profile_name
|
||||
logger.info(f"🎛️ Matched request.model={request.model} to profile={profile_name} provider={provider}")
|
||||
break
|
||||
|
||||
# Determine model name
|
||||
if provider in ["deepseek", "openai", "anthropic", "mistral"]:
|
||||
@@ -1671,7 +1840,6 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
max_tokens = request.max_tokens or llm_profile.get("max_tokens", 2048)
|
||||
temperature = request.temperature or llm_profile.get("temperature", 0.2)
|
||||
|
||||
cloud_provider_names = {"deepseek", "mistral", "grok", "openai", "anthropic"}
|
||||
allow_cloud = provider in cloud_provider_names
|
||||
if not allow_cloud:
|
||||
logger.info(f"☁️ Cloud providers disabled for agent {agent_id}: provider={provider}")
|
||||
@@ -1700,6 +1868,18 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
}
|
||||
]
|
||||
|
||||
# Custom configured profile for OpenAI-compatible backends (e.g. local llama-server).
|
||||
if provider == "openai":
|
||||
cloud_providers = [
|
||||
{
|
||||
"name": "openai",
|
||||
"api_key_env": llm_profile.get("api_key_env", "OPENAI_API_KEY"),
|
||||
"base_url": llm_profile.get("base_url", "https://api.openai.com"),
|
||||
"model": request.model or llm_profile.get("model", model),
|
||||
"timeout": int(llm_profile.get("timeout_ms", 60000) / 1000),
|
||||
}
|
||||
]
|
||||
|
||||
if not allow_cloud:
|
||||
cloud_providers = []
|
||||
|
||||
@@ -1717,8 +1897,14 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
logger.debug(f"🔧 {len(tools_payload)} tools available for function calling")
|
||||
|
||||
for cloud in cloud_providers:
|
||||
api_key = os.getenv(cloud["api_key_env"])
|
||||
if not api_key:
|
||||
api_key = os.getenv(cloud["api_key_env"], "")
|
||||
base_url = cloud.get("base_url", "")
|
||||
is_local_openai = (
|
||||
cloud.get("name") == "openai"
|
||||
and isinstance(base_url, str)
|
||||
and any(host in base_url for host in ["host.docker.internal", "localhost", "127.0.0.1"])
|
||||
)
|
||||
if not api_key and not is_local_openai:
|
||||
logger.debug(f"⏭️ Skipping {cloud['name']}: API key not configured")
|
||||
continue
|
||||
|
||||
@@ -1739,12 +1925,13 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
request_payload["tools"] = tools_payload
|
||||
request_payload["tool_choice"] = "auto"
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if api_key:
|
||||
headers["Authorization"] = f"Bearer {api_key}"
|
||||
|
||||
cloud_resp = await http_client.post(
|
||||
f"{cloud['base_url']}/v1/chat/completions",
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
headers=headers,
|
||||
json=request_payload,
|
||||
timeout=cloud["timeout"]
|
||||
)
|
||||
@@ -1754,6 +1941,8 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
choice = data.get("choices", [{}])[0]
|
||||
message = choice.get("message", {})
|
||||
response_text = message.get("content", "") or ""
|
||||
if not response_text and message.get("reasoning_content"):
|
||||
response_text = str(message.get("reasoning_content", "")).strip()
|
||||
tokens_used = data.get("usage", {}).get("total_tokens", 0)
|
||||
|
||||
# Initialize tool_results to avoid UnboundLocalError
|
||||
@@ -1959,12 +2148,12 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
loop_payload["tools"] = tools_payload
|
||||
loop_payload["tool_choice"] = "auto"
|
||||
|
||||
loop_headers = {"Content-Type": "application/json"}
|
||||
if api_key:
|
||||
loop_headers["Authorization"] = f"Bearer {api_key}"
|
||||
loop_resp = await http_client.post(
|
||||
f"{cloud['base_url']}/v1/chat/completions",
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
headers=loop_headers,
|
||||
json=loop_payload,
|
||||
timeout=cloud["timeout"]
|
||||
)
|
||||
@@ -1978,6 +2167,8 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
loop_data = loop_resp.json()
|
||||
loop_message = loop_data.get("choices", [{}])[0].get("message", {})
|
||||
response_text = loop_message.get("content", "") or ""
|
||||
if not response_text and loop_message.get("reasoning_content"):
|
||||
response_text = str(loop_message.get("reasoning_content", "")).strip()
|
||||
tokens_used += loop_data.get("usage", {}).get("total_tokens", 0)
|
||||
current_tool_calls = loop_message.get("tool_calls", [])
|
||||
|
||||
@@ -2123,16 +2314,24 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
# LOCAL PROVIDERS (Ollama via Swapper)
|
||||
# =========================================================================
|
||||
# Determine local model from config (not hardcoded)
|
||||
# Strategy: Use agent's default_llm if it's local (ollama), otherwise find first local model
|
||||
# Strategy:
|
||||
# 1) explicit request.model override
|
||||
# 2) agent default_llm if it's local (ollama)
|
||||
# 3) first local profile fallback
|
||||
local_model = None
|
||||
|
||||
requested_local_model = (request.model or "").strip()
|
||||
|
||||
if requested_local_model:
|
||||
local_model = requested_local_model.replace(":", "-")
|
||||
logger.info(f"🎛️ Local model override requested: {requested_local_model} -> {local_model}")
|
||||
|
||||
# Check if default_llm is local
|
||||
if llm_profile.get("provider") == "ollama":
|
||||
if not local_model and llm_profile.get("provider") == "ollama":
|
||||
# Extract model name and convert format (qwen3:8b → qwen3:8b for Swapper)
|
||||
ollama_model = llm_profile.get("model", "qwen3:8b")
|
||||
local_model = ollama_model.replace(":", "-") # qwen3:8b → qwen3:8b
|
||||
logger.debug(f"✅ Using agent's default local model: {local_model}")
|
||||
else:
|
||||
elif not local_model:
|
||||
# Find first local model from config
|
||||
for profile_name, profile in llm_profiles.items():
|
||||
if profile.get("provider") == "ollama":
|
||||
@@ -2259,6 +2458,60 @@ async def agent_infer(agent_id: str, request: InferRequest):
|
||||
)
|
||||
|
||||
|
||||
@app.post("/v1/tools/execute")
|
||||
async def tools_execute(request: ToolExecuteRequest):
|
||||
"""
|
||||
Execute a single tool call through ToolManager.
|
||||
Returns console-compatible shape: {status, data, error}.
|
||||
"""
|
||||
if not tool_manager:
|
||||
raise HTTPException(status_code=503, detail="Tool manager unavailable")
|
||||
|
||||
payload = request.model_dump(exclude_none=True)
|
||||
tool_name = str(payload.pop("tool", "")).strip()
|
||||
action = payload.pop("action", None)
|
||||
agent_id = str(payload.pop("agent_id", "sofiia") or "sofiia").strip()
|
||||
metadata = payload.pop("metadata", {}) or {}
|
||||
|
||||
if not tool_name:
|
||||
raise HTTPException(status_code=422, detail="tool is required")
|
||||
|
||||
# Keep backward compatibility with sofiia-console calls
|
||||
if action is not None:
|
||||
payload["action"] = action
|
||||
|
||||
chat_id = str(metadata.get("chat_id", "") or "") or None
|
||||
user_id = str(metadata.get("user_id", "") or "") or None
|
||||
workspace_id = str(metadata.get("workspace_id", "default") or "default")
|
||||
|
||||
try:
|
||||
result = await tool_manager.execute_tool(
|
||||
tool_name=tool_name,
|
||||
arguments=payload,
|
||||
agent_id=agent_id,
|
||||
chat_id=chat_id,
|
||||
user_id=user_id,
|
||||
workspace_id=workspace_id,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception("❌ Tool execution failed: %s", tool_name)
|
||||
raise HTTPException(status_code=500, detail=f"Tool execution error: {str(e)[:200]}")
|
||||
|
||||
data: Dict[str, Any] = {"result": result.result}
|
||||
if result.image_base64:
|
||||
data["image_base64"] = result.image_base64
|
||||
if result.file_base64:
|
||||
data["file_base64"] = result.file_base64
|
||||
if result.file_name:
|
||||
data["file_name"] = result.file_name
|
||||
if result.file_mime:
|
||||
data["file_mime"] = result.file_mime
|
||||
|
||||
if result.success:
|
||||
return {"status": "ok", "data": data, "error": None}
|
||||
return {"status": "failed", "data": data, "error": {"message": result.error or "Tool failed"}}
|
||||
|
||||
|
||||
@app.get("/v1/models")
|
||||
async def list_available_models():
|
||||
"""List all available models across backends"""
|
||||
|
||||
Reference in New Issue
Block a user