feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
233
services/sofiia-supervisor/app/gateway_client.py
Normal file
233
services/sofiia-supervisor/app/gateway_client.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""
|
||||
Sofiia Supervisor — Gateway Client
|
||||
|
||||
Thin HTTP wrapper around the DAARION router's /v1/tools/execute endpoint.
|
||||
|
||||
Security rules:
|
||||
- Only allowed destination: GATEWAY_BASE_URL (single allowlisted origin)
|
||||
- Payload is NOT logged; only hash + sizes in audit
|
||||
- Correlation: graph_run_id + graph_node injected into every request metadata
|
||||
- Retries only on 5xx (retryable=True) — max 2 attempts
|
||||
- Timeouts enforced per call
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import asyncio
import hashlib
import json
import logging
import time
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple

import httpx

from .config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Result ──────────────────────────────────────────────────────────────────
|
||||
|
||||
class ToolCallResult:
|
||||
__slots__ = ("success", "data", "error_code", "error_message", "retryable", "elapsed_ms")
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
success: bool,
|
||||
data: Any = None,
|
||||
error_code: str = "",
|
||||
error_message: str = "",
|
||||
retryable: bool = False,
|
||||
elapsed_ms: float = 0.0,
|
||||
):
|
||||
self.success = success
|
||||
self.data = data
|
||||
self.error_code = error_code
|
||||
self.error_message = error_message
|
||||
self.retryable = retryable
|
||||
self.elapsed_ms = elapsed_ms
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"ToolCallResult(success={self.success}, error={self.error_code})"
|
||||
|
||||
|
||||
# ─── Audit helpers (no payload) ──────────────────────────────────────────────
|
||||
|
||||
def _payload_hash(payload: Dict) -> str:
|
||||
"""SHA-256 of canonical JSON — for audit log without exposing content."""
|
||||
try:
|
||||
canon = json.dumps(payload, sort_keys=True, ensure_ascii=False)
|
||||
return hashlib.sha256(canon.encode()).hexdigest()[:16]
|
||||
except Exception:
|
||||
return "hash_error"
|
||||
|
||||
|
||||
def _payload_size(payload: Dict) -> int:
|
||||
try:
|
||||
return len(json.dumps(payload).encode())
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
# ─── Gateway Client ──────────────────────────────────────────────────────────
|
||||
|
||||
class GatewayClient:
    """
    HTTP client for calling DAARION router tool execution endpoint.

    Usage:
        async with GatewayClient() as gw:
            result = await gw.call_tool(
                tool="job_orchestrator_tool",
                action="start_task",
                params={"task_id": "release_check", "inputs": {...}},
                agent_id="sofiia",
                workspace_id="daarion",
                user_id="system",
                graph_run_id="gr_abc123",
                graph_node="start_job",
            )
    """

    _ENDPOINT = "/v1/tools/execute"

    def __init__(self):
        # Client is created lazily in __aenter__ so settings are read at use time.
        self._client: Optional[httpx.AsyncClient] = None

    async def __aenter__(self) -> "GatewayClient":
        self._client = httpx.AsyncClient(
            base_url=settings.GATEWAY_BASE_URL,
            timeout=settings.TOOL_CALL_TIMEOUT_SEC,
            headers=self._base_headers(),
        )
        return self

    async def __aexit__(self, *args):
        if self._client:
            await self._client.aclose()

    def _base_headers(self) -> Dict[str, str]:
        """Static headers for every request; bearer auth only if a key is configured."""
        headers = {"Content-Type": "application/json"}
        if settings.SUPERVISOR_API_KEY:
            headers["Authorization"] = f"Bearer {settings.SUPERVISOR_API_KEY}"
        return headers

    async def call_tool(
        self,
        tool: str,
        action: str,
        params: Optional[Dict[str, Any]] = None,
        agent_id: str = "",
        workspace_id: str = "",
        user_id: str = "",
        graph_run_id: str = "",
        graph_node: str = "",
        trace_id: str = "",
    ) -> ToolCallResult:
        """
        Execute a tool via the gateway's /v1/tools/execute endpoint.

        Injects graph_run_id + graph_node into metadata for correlation.
        Retries up to TOOL_CALL_MAX_RETRIES times on retryable (5xx) errors.
        Does NOT log payload — only hash + sizes.

        Args:
            tool: Tool name registered on the router.
            action: Tool action to invoke.
            params: Action parameters (defaults to {}).
            agent_id: Caller agent; falls back to settings.DEFAULT_AGENT_ID.
            workspace_id: Workspace; falls back to settings.DEFAULT_WORKSPACE_ID.
            user_id: Caller identity, forwarded as-is.
            graph_run_id: Graph run correlation id, injected into metadata.
            graph_node: Graph node name, injected into metadata.
            trace_id: Optional trace id, added to metadata only when non-empty.

        Returns:
            ToolCallResult — HTTP and tool failures never raise; every outcome
            is folded into the result object.
        """
        # BUGFIX: fail fast (and compatibly) when used outside `async with`;
        # previously this surfaced as an opaque AttributeError-based client_error.
        if self._client is None:
            return ToolCallResult(
                success=False,
                error_code="client_error",
                error_message="GatewayClient used outside async context manager",
                retryable=False,
            )

        payload: Dict[str, Any] = {
            "tool": tool,
            "action": action,
            "params": params or {},
            "agent_id": agent_id or settings.DEFAULT_AGENT_ID,
            "workspace_id": workspace_id or settings.DEFAULT_WORKSPACE_ID,
            "user_id": user_id,
            "metadata": {
                "graph_run_id": graph_run_id,
                "graph_node": graph_node,
                **({"trace_id": trace_id} if trace_id else {}),
            },
        }

        # Audit identifiers only — payload content is never logged.
        p_hash = _payload_hash(payload)
        p_size = _payload_size(payload)

        # Attempts 1..MAX_RETRIES+1: the "+1" is the initial (non-retry) call.
        for attempt in range(1, settings.TOOL_CALL_MAX_RETRIES + 2):
            t0 = time.monotonic()
            try:
                logger.info(
                    "gateway_call tool=%s action=%s node=%s run=%s "
                    "hash=%s size=%d attempt=%d",
                    tool, action, graph_node, graph_run_id, p_hash, p_size, attempt,
                )
                resp = await self._client.post(self._ENDPOINT, json=payload)
                elapsed_ms = (time.monotonic() - t0) * 1000

                if resp.status_code == 200:
                    body = resp.json()
                    if body.get("status") == "succeeded":
                        logger.info(
                            "gateway_ok tool=%s node=%s run=%s elapsed_ms=%.0f",
                            tool, graph_node, graph_run_id, elapsed_ms,
                        )
                        return ToolCallResult(
                            success=True, data=body.get("data"), elapsed_ms=elapsed_ms
                        )
                    # HTTP 200 but the tool itself reported failure.
                    err = body.get("error") or {}
                    retryable = err.get("retryable", False)
                    logger.warning(
                        "gateway_tool_fail tool=%s code=%s msg=%s retryable=%s",
                        tool, err.get("code"), err.get("message", "")[:120], retryable,
                    )
                    if retryable and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                        # Linear backoff for tool-level retryable failures.
                        await asyncio.sleep(1.5 * attempt)
                        continue
                    return ToolCallResult(
                        success=False,
                        error_code=err.get("code", "tool_error"),
                        error_message=err.get("message", "tool failed"),
                        retryable=retryable,
                        elapsed_ms=elapsed_ms,
                    )

                # BUGFIX: retry ALL 5xx per the documented contract ("Retries only
                # on 5xx"); previously only 502/503/504 were retried, so a 500 was
                # returned retryable=True but never actually retried.
                if resp.status_code >= 500 and attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    logger.warning(
                        "gateway_http_%d tool=%s attempt=%d, retrying",
                        resp.status_code, tool, attempt,
                    )
                    await asyncio.sleep(2.0 * attempt)
                    continue

                # Non-retryable (4xx) or retries exhausted for this status.
                return ToolCallResult(
                    success=False,
                    error_code=f"http_{resp.status_code}",
                    error_message=f"HTTP {resp.status_code}",
                    retryable=resp.status_code >= 500,
                    elapsed_ms=elapsed_ms,
                )

            except httpx.TimeoutException:
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.warning(
                    "gateway_timeout tool=%s node=%s elapsed_ms=%.0f",
                    tool, graph_node, elapsed_ms,
                )
                if attempt <= settings.TOOL_CALL_MAX_RETRIES:
                    await asyncio.sleep(2.0 * attempt)
                    continue
                return ToolCallResult(
                    success=False,
                    error_code="timeout",
                    error_message=f"Timeout after {settings.TOOL_CALL_TIMEOUT_SEC}s",
                    retryable=True,
                    elapsed_ms=elapsed_ms,
                )
            except Exception as e:
                # Non-HTTP client failure (connect error, bad JSON, …) — not retried.
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.error("gateway_error tool=%s: %s", tool, str(e)[:200])
                return ToolCallResult(
                    success=False,
                    error_code="client_error",
                    error_message=str(e)[:200],
                    retryable=False,
                    elapsed_ms=elapsed_ms,
                )

        # Exhausted retries
        return ToolCallResult(
            success=False,
            error_code="max_retries",
            error_message=f"Failed after {settings.TOOL_CALL_MAX_RETRIES + 1} attempts",
            retryable=False,
        )
|
||||
Reference in New Issue
Block a user