New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
234 lines
8.7 KiB
Python
234 lines
8.7 KiB
Python
"""
|
|
Sofiia Supervisor — Gateway Client
|
|
|
|
Thin HTTP wrapper around the DAARION router's /v1/tools/execute endpoint.
|
|
|
|
Security rules:
|
|
- Only allowed destination: GATEWAY_BASE_URL (single allowlisted origin)
|
|
- Payload is NOT logged; only hash + sizes in audit
|
|
- Correlation: graph_run_id + graph_node injected into every request metadata
|
|
- Retries only on 5xx (retryable=True) — max 2 attempts
|
|
- Timeouts enforced per call
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import time
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
import httpx
|
|
|
|
from .config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ─── Result ──────────────────────────────────────────────────────────────────
|
|
|
|
class ToolCallResult:
    """Outcome of a single gateway tool call.

    Attributes:
        success: True when the gateway reported status "succeeded".
        data: Tool result payload on success (None otherwise).
        error_code: Short machine-readable failure code ("" on success).
        error_message: Human-readable failure detail ("" on success).
        retryable: Whether the caller may reasonably retry the call.
        elapsed_ms: Wall-clock duration of the HTTP round trip.
    """

    __slots__ = ("success", "data", "error_code", "error_message", "retryable", "elapsed_ms")

    def __init__(
        self,
        success: bool,
        data: Any = None,
        error_code: str = "",
        error_message: str = "",
        retryable: bool = False,
        elapsed_ms: float = 0.0,
    ):
        # Assign in slot order; zip keeps the pairing explicit and compact.
        values = (success, data, error_code, error_message, retryable, elapsed_ms)
        for slot, value in zip(self.__slots__, values):
            setattr(self, slot, value)

    def __repr__(self) -> str:
        return "ToolCallResult(success={}, error={})".format(self.success, self.error_code)
|
|
|
|
|
|
# ─── Audit helpers (no payload) ──────────────────────────────────────────────
|
|
|
|
def _payload_hash(payload: Dict) -> str:
    """Return a 16-hex-char SHA-256 prefix of the payload's canonical JSON.

    Lets audit logs correlate requests without ever exposing content.
    Returns the sentinel "hash_error" if the payload cannot be serialized.
    """
    try:
        canon_bytes = json.dumps(payload, sort_keys=True, ensure_ascii=False).encode()
    except Exception:
        # Best-effort: a non-serializable payload must not break the call path.
        return "hash_error"
    return hashlib.sha256(canon_bytes).hexdigest()[:16]
|
|
|
|
|
|
def _payload_size(payload: Dict) -> int:
    """Return the payload's JSON-encoded size in bytes (0 if not serializable)."""
    try:
        encoded = json.dumps(payload).encode()
    except Exception:
        # Best-effort: size is audit metadata only, never worth failing over.
        return 0
    return len(encoded)
|
|
|
|
|
|
# ─── Gateway Client ──────────────────────────────────────────────────────────
|
|
|
|
class GatewayClient:
    """
    HTTP client for calling the DAARION router tool execution endpoint.

    Must be used as an async context manager so the underlying
    httpx.AsyncClient is created and closed deterministically.

    Usage:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="job_orchestrator_tool",
                action="start_task",
                params={"task_id": "release_check", "inputs": {...}},
                agent_id="sofiia",
                workspace_id="daarion",
                user_id="system",
                graph_run_id="gr_abc123",
                graph_node="start_job",
            )
    """

    _ENDPOINT = "/v1/tools/execute"

    def __init__(self):
        # Created in __aenter__; None means "not inside an async context".
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "GatewayClient":
        # Single allowlisted origin: every request is relative to GATEWAY_BASE_URL.
        self._client = httpx.AsyncClient(
            base_url=settings.GATEWAY_BASE_URL,
            timeout=settings.TOOL_CALL_TIMEOUT_SEC,
            headers=self._base_headers(),
        )
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()

    def _base_headers(self) -> Dict[str, str]:
        """Build default headers; attaches the supervisor bearer token if configured."""
        headers = {"Content-Type": "application/json"}
        if settings.SUPERVISOR_API_KEY:
            headers["Authorization"] = f"Bearer {settings.SUPERVISOR_API_KEY}"
        return headers

    async def call_tool(
        self,
        tool: str,
        action: str,
        params: Optional[Dict[str, Any]] = None,
        agent_id: str = "",
        workspace_id: str = "",
        user_id: str = "",
        graph_run_id: str = "",
        graph_node: str = "",
        trace_id: str = "",
    ) -> ToolCallResult:
        """
        Execute a tool via the gateway's /v1/tools/execute endpoint.

        Args:
            tool: Registered tool name on the router.
            action: Action within the tool.
            params: Action parameters (defaults to {}).
            agent_id / workspace_id: Fall back to settings defaults when empty.
            user_id: Propagated as-is.
            graph_run_id / graph_node: Injected into metadata for correlation.
            trace_id: Optional; added to metadata only when non-empty.

        Returns:
            ToolCallResult — never raises for transport/tool failures; errors
            are folded into the result. Retries up to TOOL_CALL_MAX_RETRIES
            times on retryable tool errors, 502/503/504, and timeouts.
            Does NOT log the payload — only hash + size.
        """
        # Guard: give a clear message (with the same error_code the generic
        # handler would produce) instead of an opaque AttributeError when the
        # client is used outside `async with`.
        if self._client is None:
            return ToolCallResult(
                success=False,
                error_code="client_error",
                error_message="GatewayClient used outside 'async with' context",
                retryable=False,
            )

        payload: Dict[str, Any] = {
            "tool": tool,
            "action": action,
            "params": params or {},
            "agent_id": agent_id or settings.DEFAULT_AGENT_ID,
            "workspace_id": workspace_id or settings.DEFAULT_WORKSPACE_ID,
            "user_id": user_id,
            "metadata": {
                "graph_run_id": graph_run_id,
                "graph_node": graph_node,
                # trace_id is optional; omit the key entirely when empty.
                **({"trace_id": trace_id} if trace_id else {}),
            },
        }

        # Audit identifiers only — payload content is never logged.
        p_hash = _payload_hash(payload)
        p_size = _payload_size(payload)

        # attempt runs 1..MAX_RETRIES+1 (first try plus MAX_RETRIES retries).
        for attempt in range(1, settings.TOOL_CALL_MAX_RETRIES + 2):
            t0 = time.monotonic()
            try:
                logger.info(
                    "gateway_call tool=%s action=%s node=%s run=%s "
                    "hash=%s size=%d attempt=%d",
                    tool, action, graph_node, graph_run_id, p_hash, p_size, attempt,
                )
                resp = await self._client.post(self._ENDPOINT, json=payload)
                elapsed_ms = (time.monotonic() - t0) * 1000

                if resp.status_code == 200:
                    body = resp.json()
                    if body.get("status") == "succeeded":
                        logger.info(
                            "gateway_ok tool=%s node=%s run=%s elapsed_ms=%.0f",
                            tool, graph_node, graph_run_id, elapsed_ms,
                        )
                        return ToolCallResult(
                            success=True, data=body.get("data"), elapsed_ms=elapsed_ms
                        )
                    else:
                        # Tool-level failure reported inside a 200 response.
                        err = body.get("error") or {}
                        retryable = err.get("retryable", False)
                        logger.warning(
                            "gateway_tool_fail tool=%s code=%s msg=%s retryable=%s",
                            tool, err.get("code"), err.get("message", "")[:120], retryable,
                        )
                        if retryable and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                            # Linear backoff: 1.5s, 3.0s, ...
                            await asyncio.sleep(1.5 * attempt)
                            continue
                        return ToolCallResult(
                            success=False,
                            error_code=err.get("code", "tool_error"),
                            error_message=err.get("message", "tool failed"),
                            retryable=retryable,
                            elapsed_ms=elapsed_ms,
                        )
                elif resp.status_code in (502, 503, 504) and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    # Transient gateway/upstream errors — retry with backoff.
                    logger.warning("gateway_http_%d tool=%s attempt=%d, retrying", resp.status_code, tool, attempt)
                    await asyncio.sleep(2.0 * attempt)
                    continue
                else:
                    # Non-retryable HTTP status (or retries exhausted for 5xx).
                    return ToolCallResult(
                        success=False,
                        error_code=f"http_{resp.status_code}",
                        error_message=f"HTTP {resp.status_code}",
                        retryable=resp.status_code >= 500,
                        elapsed_ms=elapsed_ms,
                    )

            except httpx.TimeoutException:
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.warning("gateway_timeout tool=%s node=%s elapsed_ms=%.0f", tool, graph_node, elapsed_ms)
                if attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    await asyncio.sleep(2.0 * attempt)
                    continue
                return ToolCallResult(
                    success=False,
                    error_code="timeout",
                    error_message=f"Timeout after {settings.TOOL_CALL_TIMEOUT_SEC}s",
                    retryable=True,
                    elapsed_ms=elapsed_ms,
                )
            except Exception as e:
                # Any other transport/parse failure: fail fast, no retry.
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.error("gateway_error tool=%s: %s", tool, str(e)[:200])
                return ToolCallResult(
                    success=False,
                    error_code="client_error",
                    error_message=str(e)[:200],
                    retryable=False,
                    elapsed_ms=elapsed_ms,
                )

        # Exhausted retries
        return ToolCallResult(
            success=False,
            error_code="max_retries",
            error_message=f"Failed after {settings.TOOL_CALL_MAX_RETRIES + 1} attempts",
            retryable=False,
        )
|