Files
microdao-daarion/services/sofiia-supervisor/app/gateway_client.py
Apple 129e4ea1fc feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
2026-03-03 07:14:14 -08:00

234 lines
8.7 KiB
Python

"""
Sofiia Supervisor — Gateway Client
Thin HTTP wrapper around the DAARION router's /v1/tools/execute endpoint.
Security rules:
- Only allowed destination: GATEWAY_BASE_URL (single allowlisted origin)
- Payload is NOT logged; only hash + sizes in audit
- Correlation: graph_run_id + graph_node injected into every request metadata
- Retries only on transient failures (HTTP 502/503/504, gateway-flagged retryable errors, timeouts) — bounded by TOOL_CALL_MAX_RETRIES
- Timeouts enforced per call
"""
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import time
from typing import Any, Dict, Optional, Tuple
import httpx
from .config import settings
logger = logging.getLogger(__name__)
# ─── Result ──────────────────────────────────────────────────────────────────
class ToolCallResult:
    """Outcome of a single gateway tool invocation.

    Carries ``success`` plus either ``data`` (on success) or the
    ``error_code`` / ``error_message`` / ``retryable`` triple (on failure).
    ``elapsed_ms`` records the wall-clock duration of the HTTP call.
    """

    __slots__ = ("success", "data", "error_code", "error_message", "retryable", "elapsed_ms")

    def __init__(
        self,
        success: bool,
        data: Any = None,
        error_code: str = "",
        error_message: str = "",
        retryable: bool = False,
        elapsed_ms: float = 0.0,
    ):
        # Assign field-by-field via the slot tuple so the name/value pairing
        # stays visibly in sync with __slots__.
        values = (success, data, error_code, error_message, retryable, elapsed_ms)
        for slot, value in zip(self.__slots__, values):
            setattr(self, slot, value)

    def __repr__(self) -> str:
        return "ToolCallResult(success={}, error={})".format(self.success, self.error_code)
# ─── Audit helpers (no payload) ──────────────────────────────────────────────
def _payload_hash(payload: Dict) -> str:
"""SHA-256 of canonical JSON — for audit log without exposing content."""
try:
canon = json.dumps(payload, sort_keys=True, ensure_ascii=False)
return hashlib.sha256(canon.encode()).hexdigest()[:16]
except Exception:
return "hash_error"
def _payload_size(payload: Dict) -> int:
try:
return len(json.dumps(payload).encode())
except Exception:
return 0
# ─── Gateway Client ──────────────────────────────────────────────────────────
class GatewayClient:
    """
    HTTP client for the DAARION router's tool-execution endpoint.

    Must be used as an async context manager so the underlying
    ``httpx.AsyncClient`` is opened and closed deterministically:

        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="job_orchestrator_tool",
                action="start_task",
                params={"task_id": "release_check", "inputs": {...}},
                agent_id="sofiia",
                workspace_id="daarion",
                user_id="system",
                graph_run_id="gr_abc123",
                graph_node="start_job",
            )

    Security: requests go only to ``settings.GATEWAY_BASE_URL`` (single
    allowlisted origin); payloads are never logged — only their hash and
    byte size appear in the audit log.
    """

    _ENDPOINT = "/v1/tools/execute"

    def __init__(self):
        # Created in __aenter__; stays None until the context is entered.
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "GatewayClient":
        self._client = httpx.AsyncClient(
            base_url=settings.GATEWAY_BASE_URL,
            timeout=settings.TOOL_CALL_TIMEOUT_SEC,
            headers=self._base_headers(),
        )
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()
            # Drop the closed client so use-after-exit is caught by the
            # explicit guard in call_tool rather than a closed-client error.
            self._client = None

    def _base_headers(self) -> Dict[str, str]:
        """Static headers for every request; adds Bearer auth when configured."""
        headers = {"Content-Type": "application/json"}
        if settings.SUPERVISOR_API_KEY:
            headers["Authorization"] = f"Bearer {settings.SUPERVISOR_API_KEY}"
        return headers

    async def call_tool(
        self,
        tool: str,
        action: str,
        params: Optional[Dict[str, Any]] = None,
        agent_id: str = "",
        workspace_id: str = "",
        user_id: str = "",
        graph_run_id: str = "",
        graph_node: str = "",
        trace_id: str = "",
    ) -> ToolCallResult:
        """
        Execute a tool via the gateway's /v1/tools/execute endpoint.

        ``graph_run_id`` and ``graph_node`` are injected into request
        metadata for correlation; ``trace_id`` is added only when non-empty.
        Empty ``agent_id`` / ``workspace_id`` fall back to the configured
        defaults.

        Retry policy (up to ``settings.TOOL_CALL_MAX_RETRIES`` extra
        attempts, with linear backoff):
        - transient HTTP 502/503/504 responses;
        - tool failures the gateway flags as ``retryable``;
        - timeouts.
        A plain HTTP 500 is returned with ``retryable=True`` but is NOT
        retried here — the caller decides whether re-running is safe.

        Never logs the payload — only its hash and byte sizes.

        Returns a ToolCallResult in every case; never raises for
        transport-level failures.
        """
        if self._client is None:
            # Fail fast with a precise error code instead of letting the
            # broad except below mask an AttributeError as "client_error".
            return ToolCallResult(
                success=False,
                error_code="client_not_initialized",
                error_message="GatewayClient must be entered via 'async with' before calling tools",
                retryable=False,
            )

        payload: Dict[str, Any] = {
            "tool": tool,
            "action": action,
            "params": params or {},
            "agent_id": agent_id or settings.DEFAULT_AGENT_ID,
            "workspace_id": workspace_id or settings.DEFAULT_WORKSPACE_ID,
            "user_id": user_id,
            "metadata": {
                "graph_run_id": graph_run_id,
                "graph_node": graph_node,
                **({"trace_id": trace_id} if trace_id else {}),
            },
        }
        # Audit identifiers only — the payload itself is never logged.
        p_hash = _payload_hash(payload)
        p_size = _payload_size(payload)

        # 1 initial attempt + TOOL_CALL_MAX_RETRIES retries.
        for attempt in range(1, settings.TOOL_CALL_MAX_RETRIES + 2):
            t0 = time.monotonic()
            try:
                logger.info(
                    "gateway_call tool=%s action=%s node=%s run=%s "
                    "hash=%s size=%d attempt=%d",
                    tool, action, graph_node, graph_run_id, p_hash, p_size, attempt,
                )
                resp = await self._client.post(self._ENDPOINT, json=payload)
                elapsed_ms = (time.monotonic() - t0) * 1000
                if resp.status_code == 200:
                    body = resp.json()
                    if body.get("status") == "succeeded":
                        logger.info(
                            "gateway_ok tool=%s node=%s run=%s elapsed_ms=%.0f",
                            tool, graph_node, graph_run_id, elapsed_ms,
                        )
                        return ToolCallResult(
                            success=True, data=body.get("data"), elapsed_ms=elapsed_ms
                        )
                    else:
                        # Gateway-level failure envelope; retry only when the
                        # gateway itself marks the error as retryable.
                        err = body.get("error") or {}
                        retryable = err.get("retryable", False)
                        logger.warning(
                            "gateway_tool_fail tool=%s code=%s msg=%s retryable=%s",
                            tool, err.get("code"), err.get("message", "")[:120], retryable,
                        )
                        if retryable and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                            await asyncio.sleep(1.5 * attempt)
                            continue
                        return ToolCallResult(
                            success=False,
                            error_code=err.get("code", "tool_error"),
                            error_message=err.get("message", "tool failed"),
                            retryable=retryable,
                            elapsed_ms=elapsed_ms,
                        )
                elif resp.status_code in (502, 503, 504) and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    # Transient gateway/proxy errors — safe to retry blindly.
                    logger.warning("gateway_http_%d tool=%s attempt=%d, retrying", resp.status_code, tool, attempt)
                    await asyncio.sleep(2.0 * attempt)
                    continue
                else:
                    elapsed_ms = (time.monotonic() - t0) * 1000
                    return ToolCallResult(
                        success=False,
                        error_code=f"http_{resp.status_code}",
                        error_message=f"HTTP {resp.status_code}",
                        retryable=resp.status_code >= 500,
                        elapsed_ms=elapsed_ms,
                    )
            except httpx.TimeoutException:
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.warning("gateway_timeout tool=%s node=%s elapsed_ms=%.0f", tool, graph_node, elapsed_ms)
                if attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    await asyncio.sleep(2.0 * attempt)
                    continue
                return ToolCallResult(
                    success=False,
                    error_code="timeout",
                    error_message=f"Timeout after {settings.TOOL_CALL_TIMEOUT_SEC}s",
                    retryable=True,
                    elapsed_ms=elapsed_ms,
                )
            except Exception as e:
                # Transport/parse errors (DNS, TLS, connection reset, bad JSON).
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.error("gateway_error tool=%s: %s", tool, str(e)[:200])
                return ToolCallResult(
                    success=False,
                    error_code="client_error",
                    error_message=str(e)[:200],
                    retryable=False,
                    elapsed_ms=elapsed_ms,
                )

        # Defensive: every branch above returns on the final attempt, so this
        # is only reachable if the retry accounting changes.
        return ToolCallResult(
            success=False,
            error_code="max_retries",
            error_message=f"Failed after {settings.TOOL_CALL_MAX_RETRIES + 1} attempts",
            retryable=False,
        )