feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
233
services/sofiia-supervisor/app/gateway_client.py
Normal file
233
services/sofiia-supervisor/app/gateway_client.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""
|
||||
Sofiia Supervisor — Gateway Client
|
||||
|
||||
Thin HTTP wrapper around the DAARION router's /v1/tools/execute endpoint.
|
||||
|
||||
Security rules:
|
||||
- Only allowed destination: GATEWAY_BASE_URL (single allowlisted origin)
|
||||
- Payload is NOT logged; only hash + sizes in audit
|
||||
- Correlation: graph_run_id + graph_node injected into every request metadata
|
||||
- Retries only on 5xx (retryable=True) — max 2 attempts
|
||||
- Timeouts enforced per call
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import asyncio
import hashlib
import json
import logging
import time
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple

import httpx

from .config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Result ──────────────────────────────────────────────────────────────────
|
||||
|
||||
class ToolCallResult:
|
||||
__slots__ = ("success", "data", "error_code", "error_message", "retryable", "elapsed_ms")
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
success: bool,
|
||||
data: Any = None,
|
||||
error_code: str = "",
|
||||
error_message: str = "",
|
||||
retryable: bool = False,
|
||||
elapsed_ms: float = 0.0,
|
||||
):
|
||||
self.success = success
|
||||
self.data = data
|
||||
self.error_code = error_code
|
||||
self.error_message = error_message
|
||||
self.retryable = retryable
|
||||
self.elapsed_ms = elapsed_ms
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"ToolCallResult(success={self.success}, error={self.error_code})"
|
||||
|
||||
|
||||
# ─── Audit helpers (no payload) ──────────────────────────────────────────────
|
||||
|
||||
def _payload_hash(payload: Dict) -> str:
|
||||
"""SHA-256 of canonical JSON — for audit log without exposing content."""
|
||||
try:
|
||||
canon = json.dumps(payload, sort_keys=True, ensure_ascii=False)
|
||||
return hashlib.sha256(canon.encode()).hexdigest()[:16]
|
||||
except Exception:
|
||||
return "hash_error"
|
||||
|
||||
|
||||
def _payload_size(payload: Dict) -> int:
|
||||
try:
|
||||
return len(json.dumps(payload).encode())
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
# ─── Gateway Client ──────────────────────────────────────────────────────────
|
||||
|
||||
class GatewayClient:
    """
    HTTP client for calling DAARION router tool execution endpoint.

    Usage:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="job_orchestrator_tool",
                action="start_task",
                params={"task_id": "release_check", "inputs": {...}},
                agent_id="sofiia",
                workspace_id="daarion",
                user_id="system",
                graph_run_id="gr_abc123",
                graph_node="start_job",
            )
    """

    _ENDPOINT = "/v1/tools/execute"

    def __init__(self):
        # Client is created lazily in __aenter__ so settings are read at use time.
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "GatewayClient":
        self._client = httpx.AsyncClient(
            base_url=settings.GATEWAY_BASE_URL,
            timeout=settings.TOOL_CALL_TIMEOUT_SEC,
            headers=self._base_headers(),
        )
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()

    def _base_headers(self) -> Dict[str, str]:
        """Static headers for every request; bearer auth only if a key is configured."""
        headers = {"Content-Type": "application/json"}
        if settings.SUPERVISOR_API_KEY:
            headers["Authorization"] = f"Bearer {settings.SUPERVISOR_API_KEY}"
        return headers

    async def call_tool(
        self,
        tool: str,
        action: str,
        params: Optional[Dict[str, Any]] = None,
        agent_id: str = "",
        workspace_id: str = "",
        user_id: str = "",
        graph_run_id: str = "",
        graph_node: str = "",
        trace_id: str = "",
    ) -> ToolCallResult:
        """
        Execute a tool via the gateway's /v1/tools/execute endpoint.

        Injects graph_run_id + graph_node into metadata for correlation.
        Retries up to TOOL_CALL_MAX_RETRIES times on retryable (5xx) errors.
        Does NOT log payload — only hash + sizes.

        Args:
            tool: Tool name registered on the router.
            action: Tool action to invoke.
            params: Action parameters (defaults to {}).
            agent_id: Caller agent; falls back to settings.DEFAULT_AGENT_ID.
            workspace_id: Workspace; falls back to settings.DEFAULT_WORKSPACE_ID.
            user_id: Caller identity, forwarded as-is.
            graph_run_id: Graph run correlation id, injected into metadata.
            graph_node: Graph node name, injected into metadata.
            trace_id: Optional trace id, added to metadata only when non-empty.

        Returns:
            ToolCallResult — HTTP and tool failures never raise; every outcome
            is folded into the result object.
        """
        # BUGFIX: fail fast (and compatibly) when used outside `async with`;
        # previously this surfaced as an opaque AttributeError-based client_error.
        if self._client is None:
            return ToolCallResult(
                success=False,
                error_code="client_error",
                error_message="GatewayClient used outside async context manager",
                retryable=False,
            )

        payload: Dict[str, Any] = {
            "tool": tool,
            "action": action,
            "params": params or {},
            "agent_id": agent_id or settings.DEFAULT_AGENT_ID,
            "workspace_id": workspace_id or settings.DEFAULT_WORKSPACE_ID,
            "user_id": user_id,
            "metadata": {
                "graph_run_id": graph_run_id,
                "graph_node": graph_node,
                **({"trace_id": trace_id} if trace_id else {}),
            },
        }

        # Audit identifiers only — payload content is never logged.
        p_hash = _payload_hash(payload)
        p_size = _payload_size(payload)

        # Attempts 1..MAX_RETRIES+1: the "+1" is the initial (non-retry) call.
        for attempt in range(1, settings.TOOL_CALL_MAX_RETRIES + 2):
            t0 = time.monotonic()
            try:
                logger.info(
                    "gateway_call tool=%s action=%s node=%s run=%s "
                    "hash=%s size=%d attempt=%d",
                    tool, action, graph_node, graph_run_id, p_hash, p_size, attempt,
                )
                resp = await self._client.post(self._ENDPOINT, json=payload)
                elapsed_ms = (time.monotonic() - t0) * 1000

                if resp.status_code == 200:
                    body = resp.json()
                    if body.get("status") == "succeeded":
                        logger.info(
                            "gateway_ok tool=%s node=%s run=%s elapsed_ms=%.0f",
                            tool, graph_node, graph_run_id, elapsed_ms,
                        )
                        return ToolCallResult(
                            success=True, data=body.get("data"), elapsed_ms=elapsed_ms
                        )
                    # HTTP 200 but the tool itself reported failure.
                    err = body.get("error") or {}
                    retryable = err.get("retryable", False)
                    logger.warning(
                        "gateway_tool_fail tool=%s code=%s msg=%s retryable=%s",
                        tool, err.get("code"), err.get("message", "")[:120], retryable,
                    )
                    if retryable and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                        # Linear backoff for tool-level retryable failures.
                        await asyncio.sleep(1.5 * attempt)
                        continue
                    return ToolCallResult(
                        success=False,
                        error_code=err.get("code", "tool_error"),
                        error_message=err.get("message", "tool failed"),
                        retryable=retryable,
                        elapsed_ms=elapsed_ms,
                    )

                # BUGFIX: retry ALL 5xx per the documented contract ("Retries only
                # on 5xx"); previously only 502/503/504 were retried, so a 500 was
                # returned retryable=True but never actually retried.
                if resp.status_code >= 500 and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    logger.warning(
                        "gateway_http_%d tool=%s attempt=%d, retrying",
                        resp.status_code, tool, attempt,
                    )
                    await asyncio.sleep(2.0 * attempt)
                    continue

                # Non-retryable (4xx) or retries exhausted for this status.
                return ToolCallResult(
                    success=False,
                    error_code=f"http_{resp.status_code}",
                    error_message=f"HTTP {resp.status_code}",
                    retryable=resp.status_code >= 500,
                    elapsed_ms=elapsed_ms,
                )

            except httpx.TimeoutException:
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.warning(
                    "gateway_timeout tool=%s node=%s elapsed_ms=%.0f",
                    tool, graph_node, elapsed_ms,
                )
                if attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    await asyncio.sleep(2.0 * attempt)
                    continue
                return ToolCallResult(
                    success=False,
                    error_code="timeout",
                    error_message=f"Timeout after {settings.TOOL_CALL_TIMEOUT_SEC}s",
                    retryable=True,
                    elapsed_ms=elapsed_ms,
                )
            except Exception as e:
                # Non-HTTP client failure (connect error, bad JSON, …) — not retried.
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.error("gateway_error tool=%s: %s", tool, str(e)[:200])
                return ToolCallResult(
                    success=False,
                    error_code="client_error",
                    error_message=str(e)[:200],
                    retryable=False,
                    elapsed_ms=elapsed_ms,
                )

        # Exhausted retries
        return ToolCallResult(
            success=False,
            error_code="max_retries",
            error_message=f"Failed after {settings.TOOL_CALL_MAX_RETRIES + 1} attempts",
            retryable=False,
        )
|
||||
Reference in New Issue
Block a user