New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
234 lines
8.7 KiB
Python
234 lines
8.7 KiB
Python
"""
|
|
Sofiia Supervisor — Gateway Client
|
|
|
|
Thin HTTP wrapper around the DAARION router's /v1/tools/execute endpoint.
|
|
|
|
Security rules:
|
|
- Only allowed destination: GATEWAY_BASE_URL (single allowlisted origin)
|
|
- Payload is NOT logged; only hash + sizes in audit
|
|
- Correlation: graph_run_id + graph_node injected into every request metadata
|
|
- Retries only on 5xx (retryable=True) — max 2 attempts
|
|
- Timeouts enforced per call
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
import httpx
|
|
|
|
from .config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── Result ──────────────────────────────────────────────────────────────────
|
|
|
|
class ToolCallResult:
    """Outcome of a single gateway tool call.

    Attributes:
        success: True when the gateway reported status "succeeded".
        data: Tool result payload on success (None otherwise).
        error_code: Short machine-readable failure code ("" on success).
        error_message: Human-readable failure detail ("" on success).
        retryable: Whether the caller may reasonably retry the call.
        elapsed_ms: Wall-clock duration of the HTTP round trip.
    """

    __slots__ = ("success", "data", "error_code", "error_message", "retryable", "elapsed_ms")

    def __init__(
        self,
        success: bool,
        data: Any = None,
        error_code: str = "",
        error_message: str = "",
        retryable: bool = False,
        elapsed_ms: float = 0.0,
    ):
        # Assign in slot order; zip keeps the pairing explicit and compact.
        values = (success, data, error_code, error_message, retryable, elapsed_ms)
        for slot, value in zip(self.__slots__, values):
            setattr(self, slot, value)

    def __repr__(self) -> str:
        return "ToolCallResult(success={}, error={})".format(self.success, self.error_code)
|
|
|
|
|
|
# ─── Audit helpers (no payload) ──────────────────────────────────────────────
|
|
|
|
def _payload_hash(payload: Dict) -> str:
    """Return a 16-hex-char SHA-256 prefix of the payload's canonical JSON.

    Lets audit logs correlate requests without ever exposing content.
    Returns the sentinel "hash_error" if the payload cannot be serialized.
    """
    try:
        canon_bytes = json.dumps(payload, sort_keys=True, ensure_ascii=False).encode()
    except Exception:
        # Best-effort: a non-serializable payload must not break the call path.
        return "hash_error"
    return hashlib.sha256(canon_bytes).hexdigest()[:16]
|
|
|
|
|
|
def _payload_size(payload: Dict) -> int:
    """Return the payload's JSON-encoded size in bytes (0 if not serializable)."""
    try:
        encoded = json.dumps(payload).encode()
    except Exception:
        # Best-effort: size is audit metadata only, never worth failing over.
        return 0
    return len(encoded)
|
|
|
|
|
|
# ─── Gateway Client ──────────────────────────────────────────────────────────
|
|
|
|
class GatewayClient:
    """
    HTTP client for calling the DAARION router tool execution endpoint.

    Must be used as an async context manager so the underlying
    httpx.AsyncClient is created and closed deterministically.

    Usage:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="job_orchestrator_tool",
                action="start_task",
                params={"task_id": "release_check", "inputs": {...}},
                agent_id="sofiia",
                workspace_id="daarion",
                user_id="system",
                graph_run_id="gr_abc123",
                graph_node="start_job",
            )
    """

    _ENDPOINT = "/v1/tools/execute"

    def __init__(self):
        # Created in __aenter__; None means "not inside an async context".
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "GatewayClient":
        # Single allowlisted origin: every request is relative to GATEWAY_BASE_URL.
        self._client = httpx.AsyncClient(
            base_url=settings.GATEWAY_BASE_URL,
            timeout=settings.TOOL_CALL_TIMEOUT_SEC,
            headers=self._base_headers(),
        )
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()

    def _base_headers(self) -> Dict[str, str]:
        """Build default headers; attaches the supervisor bearer token if configured."""
        headers = {"Content-Type": "application/json"}
        if settings.SUPERVISOR_API_KEY:
            headers["Authorization"] = f"Bearer {settings.SUPERVISOR_API_KEY}"
        return headers

    async def call_tool(
        self,
        tool: str,
        action: str,
        params: Optional[Dict[str, Any]] = None,
        agent_id: str = "",
        workspace_id: str = "",
        user_id: str = "",
        graph_run_id: str = "",
        graph_node: str = "",
        trace_id: str = "",
    ) -> ToolCallResult:
        """
        Execute a tool via the gateway's /v1/tools/execute endpoint.

        Args:
            tool: Registered tool name on the router.
            action: Action within the tool.
            params: Action parameters (defaults to {}).
            agent_id / workspace_id: Fall back to settings defaults when empty.
            user_id: Propagated as-is.
            graph_run_id / graph_node: Injected into metadata for correlation.
            trace_id: Optional; added to metadata only when non-empty.

        Returns:
            ToolCallResult — never raises for transport/tool failures; errors
            are folded into the result. Retries up to TOOL_CALL_MAX_RETRIES
            times on retryable tool errors, 502/503/504, and timeouts.
            Does NOT log the payload — only hash + size.
        """
        # Guard: give a clear message (with the same error_code the generic
        # handler would produce) instead of an opaque AttributeError when the
        # client is used outside `async with`.
        if self._client is None:
            return ToolCallResult(
                success=False,
                error_code="client_error",
                error_message="GatewayClient used outside 'async with' context",
                retryable=False,
            )

        payload: Dict[str, Any] = {
            "tool": tool,
            "action": action,
            "params": params or {},
            "agent_id": agent_id or settings.DEFAULT_AGENT_ID,
            "workspace_id": workspace_id or settings.DEFAULT_WORKSPACE_ID,
            "user_id": user_id,
            "metadata": {
                "graph_run_id": graph_run_id,
                "graph_node": graph_node,
                # trace_id is optional; omit the key entirely when empty.
                **({"trace_id": trace_id} if trace_id else {}),
            },
        }

        # Audit identifiers only — payload content is never logged.
        p_hash = _payload_hash(payload)
        p_size = _payload_size(payload)

        # attempt runs 1..MAX_RETRIES+1 (first try plus MAX_RETRIES retries).
        for attempt in range(1, settings.TOOL_CALL_MAX_RETRIES + 2):
            t0 = time.monotonic()
            try:
                logger.info(
                    "gateway_call tool=%s action=%s node=%s run=%s "
                    "hash=%s size=%d attempt=%d",
                    tool, action, graph_node, graph_run_id, p_hash, p_size, attempt,
                )
                resp = await self._client.post(self._ENDPOINT, json=payload)
                elapsed_ms = (time.monotonic() - t0) * 1000

                if resp.status_code == 200:
                    body = resp.json()
                    if body.get("status") == "succeeded":
                        logger.info(
                            "gateway_ok tool=%s node=%s run=%s elapsed_ms=%.0f",
                            tool, graph_node, graph_run_id, elapsed_ms,
                        )
                        return ToolCallResult(
                            success=True, data=body.get("data"), elapsed_ms=elapsed_ms
                        )
                    else:
                        # Tool-level failure reported inside a 200 response.
                        err = body.get("error") or {}
                        retryable = err.get("retryable", False)
                        logger.warning(
                            "gateway_tool_fail tool=%s code=%s msg=%s retryable=%s",
                            tool, err.get("code"), err.get("message", "")[:120], retryable,
                        )
                        if retryable and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                            # Linear backoff: 1.5s, 3.0s, ...
                            await asyncio.sleep(1.5 * attempt)
                            continue
                        return ToolCallResult(
                            success=False,
                            error_code=err.get("code", "tool_error"),
                            error_message=err.get("message", "tool failed"),
                            retryable=retryable,
                            elapsed_ms=elapsed_ms,
                        )
                elif resp.status_code in (502, 503, 504) and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    # Transient gateway/upstream errors — retry with backoff.
                    logger.warning("gateway_http_%d tool=%s attempt=%d, retrying", resp.status_code, tool, attempt)
                    await asyncio.sleep(2.0 * attempt)
                    continue
                else:
                    # Non-retryable HTTP status (or retries exhausted for 5xx).
                    return ToolCallResult(
                        success=False,
                        error_code=f"http_{resp.status_code}",
                        error_message=f"HTTP {resp.status_code}",
                        retryable=resp.status_code >= 500,
                        elapsed_ms=elapsed_ms,
                    )

            except httpx.TimeoutException:
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.warning("gateway_timeout tool=%s node=%s elapsed_ms=%.0f", tool, graph_node, elapsed_ms)
                if attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    await asyncio.sleep(2.0 * attempt)
                    continue
                return ToolCallResult(
                    success=False,
                    error_code="timeout",
                    error_message=f"Timeout after {settings.TOOL_CALL_TIMEOUT_SEC}s",
                    retryable=True,
                    elapsed_ms=elapsed_ms,
                )
            except Exception as e:
                # Any other transport/parse failure: fail fast, no retry.
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.error("gateway_error tool=%s: %s", tool, str(e)[:200])
                return ToolCallResult(
                    success=False,
                    error_code="client_error",
                    error_message=str(e)[:200],
                    retryable=False,
                    elapsed_ms=elapsed_ms,
                )

        # Exhausted retries
        return ToolCallResult(
            success=False,
            error_code="max_retries",
            error_message=f"Failed after {settings.TOOL_CALL_MAX_RETRIES + 1} attempts",
            retryable=False,
        )
|