""" Sofiia Supervisor — Gateway Client Thin HTTP wrapper around the DAARION router's /v1/tools/execute endpoint. Security rules: - Only allowed destination: GATEWAY_BASE_URL (single allowlisted origin) - Payload is NOT logged; only hash + sizes in audit - Correlation: graph_run_id + graph_node injected into every request metadata - Retries only on 5xx (retryable=True) — max 2 attempts - Timeouts enforced per call """ from __future__ import annotations import asyncio import hashlib import json import logging import time from typing import Any, Dict, Optional, Tuple import httpx from .config import settings logger = logging.getLogger(__name__) # ─── Result ────────────────────────────────────────────────────────────────── class ToolCallResult: __slots__ = ("success", "data", "error_code", "error_message", "retryable", "elapsed_ms") def __init__( self, success: bool, data: Any = None, error_code: str = "", error_message: str = "", retryable: bool = False, elapsed_ms: float = 0.0, ): self.success = success self.data = data self.error_code = error_code self.error_message = error_message self.retryable = retryable self.elapsed_ms = elapsed_ms def __repr__(self) -> str: return f"ToolCallResult(success={self.success}, error={self.error_code})" # ─── Audit helpers (no payload) ────────────────────────────────────────────── def _payload_hash(payload: Dict) -> str: """SHA-256 of canonical JSON — for audit log without exposing content.""" try: canon = json.dumps(payload, sort_keys=True, ensure_ascii=False) return hashlib.sha256(canon.encode()).hexdigest()[:16] except Exception: return "hash_error" def _payload_size(payload: Dict) -> int: try: return len(json.dumps(payload).encode()) except Exception: return 0 # ─── Gateway Client ────────────────────────────────────────────────────────── class GatewayClient: """ HTTP client for calling DAARION router tool execution endpoint. Usage: async with GatewayClient() as gw: result = await gw.call_tool( tool="job_orchestrator_tool", action="start_task", params={"task_id": "release_check", "inputs": {...}}, agent_id="sofiia", workspace_id="daarion", user_id="system", graph_run_id="gr_abc123", graph_node="start_job", ) """ _ENDPOINT = "/v1/tools/execute" def __init__(self): self._client: Optional[httpx.AsyncClient] = None async def __aenter__(self) -> "GatewayClient": self._client = httpx.AsyncClient( base_url=settings.GATEWAY_BASE_URL, timeout=settings.TOOL_CALL_TIMEOUT_SEC, headers=self._base_headers(), ) return self async def __aexit__(self, *args): if self._client: await self._client.aclose() def _base_headers(self) -> Dict[str, str]: headers = {"Content-Type": "application/json"} if settings.SUPERVISOR_API_KEY: headers["Authorization"] = f"Bearer {settings.SUPERVISOR_API_KEY}" return headers async def call_tool( self, tool: str, action: str, params: Optional[Dict[str, Any]] = None, agent_id: str = "", workspace_id: str = "", user_id: str = "", graph_run_id: str = "", graph_node: str = "", trace_id: str = "", ) -> ToolCallResult: """ Execute a tool via the gateway's /v1/tools/execute endpoint. Injects graph_run_id + graph_node into metadata for correlation. Retries up to MAX_RETRIES times on retryable (5xx) errors. Does NOT log payload — only hash + sizes. """ payload: Dict[str, Any] = { "tool": tool, "action": action, "params": params or {}, "agent_id": agent_id or settings.DEFAULT_AGENT_ID, "workspace_id": workspace_id or settings.DEFAULT_WORKSPACE_ID, "user_id": user_id, "metadata": { "graph_run_id": graph_run_id, "graph_node": graph_node, **({"trace_id": trace_id} if trace_id else {}), }, } p_hash = _payload_hash(payload) p_size = _payload_size(payload) for attempt in range(1, settings.TOOL_CALL_MAX_RETRIES + 2): t0 = time.monotonic() try: logger.info( "gateway_call tool=%s action=%s node=%s run=%s " "hash=%s size=%d attempt=%d", tool, action, graph_node, graph_run_id, p_hash, p_size, attempt, ) resp = await self._client.post(self._ENDPOINT, json=payload) elapsed_ms = (time.monotonic() - t0) * 1000 if resp.status_code == 200: body = resp.json() if body.get("status") == "succeeded": logger.info( "gateway_ok tool=%s node=%s run=%s elapsed_ms=%.0f", tool, graph_node, graph_run_id, elapsed_ms, ) return ToolCallResult( success=True, data=body.get("data"), elapsed_ms=elapsed_ms ) else: err = body.get("error") or {} retryable = err.get("retryable", False) logger.warning( "gateway_tool_fail tool=%s code=%s msg=%s retryable=%s", tool, err.get("code"), err.get("message", "")[:120], retryable, ) if retryable and attempt <= settings.TOOL_CALL_MAX_RETRIES: await asyncio.sleep(1.5 * attempt) continue return ToolCallResult( success=False, error_code=err.get("code", "tool_error"), error_message=err.get("message", "tool failed"), retryable=retryable, elapsed_ms=elapsed_ms, ) elif resp.status_code in (502, 503, 504) and attempt <= settings.TOOL_CALL_MAX_RETRIES: logger.warning("gateway_http_%d tool=%s attempt=%d, retrying", resp.status_code, tool, attempt) await asyncio.sleep(2.0 * attempt) continue else: elapsed_ms = (time.monotonic() - t0) * 1000 return ToolCallResult( success=False, error_code=f"http_{resp.status_code}", error_message=f"HTTP {resp.status_code}", retryable=resp.status_code >= 500, elapsed_ms=elapsed_ms, ) except httpx.TimeoutException as e: elapsed_ms = (time.monotonic() - t0) * 1000 logger.warning("gateway_timeout tool=%s node=%s elapsed_ms=%.0f", tool, graph_node, elapsed_ms) if attempt <= settings.TOOL_CALL_MAX_RETRIES: await asyncio.sleep(2.0 * attempt) continue return ToolCallResult( success=False, error_code="timeout", error_message=f"Timeout after {settings.TOOL_CALL_TIMEOUT_SEC}s", retryable=True, elapsed_ms=elapsed_ms, ) except Exception as e: elapsed_ms = (time.monotonic() - t0) * 1000 logger.error("gateway_error tool=%s: %s", tool, str(e)[:200]) return ToolCallResult( success=False, error_code="client_error", error_message=str(e)[:200], retryable=False, elapsed_ms=elapsed_ms, ) # Exhausted retries return ToolCallResult( success=False, error_code="max_retries", error_message=f"Failed after {settings.TOOL_CALL_MAX_RETRIES + 1} attempts", retryable=False, )