feat(platform): add new services, tools, tests and crews modules

New router intelligence modules (26 files): alert_ingest/store, audit_store,
architecture_pressure, backlog_generator/store, cost_analyzer, data_governance,
dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment,
platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files),
signature_state_store, sofiia_auto_router, tool_governance

New services:
- sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd config, React static assets
- memory-service: integration_endpoints, integrations, voice_endpoints, static UI
- aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents)
- sofiia-supervisor: new supervisor service
- aistalk-bridge-lite: Telegram bridge lite
- calendar-service: CalDAV calendar service with reminders
- mlx-stt-service / mlx-tts-service: Apple Silicon speech services
- binance-bot-monitor: market monitor service
- node-worker: STT/TTS memory providers

New tools (9): agent_email, browser_tool, contract_tool, observability_tool,
oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault

New crews: agromatrix_crew (12 modules: depth_classifier, doc_facts, doc_focus,
farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine,
session_context, style_adapter, telemetry)

Tests: 85+ test files for all new modules
Made-with: Cursor
Author: Apple
Date: 2026-03-03 07:14:14 -08:00
Commit: 129e4ea1fc (parent: e9dedffa48)
241 changed files with 69349 additions and 0 deletions

@@ -0,0 +1,595 @@
"""
Cost & Resource Analyzer (FinOps MVP)
Reads audit events from AuditStore and computes:
- Aggregated cost_units by tool/agent/workspace/status
- Top spenders (tools, agents, users)
- Anomalies (cost spikes, error rate spikes)
- Cost model weights
"cost_units" = cost_per_call(tool) + duration_ms * cost_per_ms(tool)
These are relative units, not real dollars.
No payload access — all inputs are aggregation parameters only.
"""
from __future__ import annotations
import datetime
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
# ─── Config loader ────────────────────────────────────────────────────────────
_weights_cache: Optional[Dict] = None
_WEIGHTS_PATH = os.path.join(
os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
"config", "cost_weights.yml",
)
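# Sketch of the cost_weights.yml shape the accessors below expect (keys
# inferred from this module's reads; the shipped file may carry more fields):
#
#   defaults: {cost_per_call: 1.0, cost_per_ms: 0.001}
#   tools:
#     browser_tool: {cost_per_call: 2.0, cost_per_ms: 0.002, category: web}
#   anomaly: {spike_ratio_threshold: 3.0, min_calls_threshold: 10}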
def _load_weights() -> Dict:
global _weights_cache
if _weights_cache is not None:
return _weights_cache
try:
import yaml
with open(_WEIGHTS_PATH, "r") as f:
_weights_cache = yaml.safe_load(f) or {}
except Exception as e:
logger.warning("cost_weights.yml not loaded: %s", e)
_weights_cache = {}
return _weights_cache
def reload_cost_weights() -> None:
"""Force reload weights (for tests)."""
global _weights_cache
_weights_cache = None
def get_weights_for_tool(tool: str) -> Tuple[float, float]:
"""Return (cost_per_call, cost_per_ms) for a tool."""
cfg = _load_weights()
defaults = cfg.get("defaults", {})
tool_cfg = (cfg.get("tools") or {}).get(tool, {})
cpc = float(tool_cfg.get("cost_per_call", defaults.get("cost_per_call", 1.0)))
cpm = float(tool_cfg.get("cost_per_ms", defaults.get("cost_per_ms", 0.001)))
return cpc, cpm
def compute_event_cost(event: Dict) -> float:
"""Compute cost_units for a single audit event."""
tool = event.get("tool", "")
duration_ms = float(event.get("duration_ms", 0))
cpc, cpm = get_weights_for_tool(tool)
return round(cpc + duration_ms * cpm, 4)
# ─── Time helpers ─────────────────────────────────────────────────────────────
def _now_utc() -> datetime.datetime:
return datetime.datetime.now(datetime.timezone.utc)
def _iso(dt: datetime.datetime) -> str:
return dt.isoformat()
def _parse_iso(s: str) -> datetime.datetime:
s = s.replace("Z", "+00:00")
try:
return datetime.datetime.fromisoformat(s)
except Exception:
return _now_utc()
def _bucket_hour(ts: str) -> str:
"""Truncate ISO ts to hour: '2026-02-23T10:00:00+00:00'."""
return ts[:13] + ":00"
# ─── Aggregation helpers ──────────────────────────────────────────────────────
def _aggregate(
events: List[Dict],
group_keys: List[str],
) -> Dict[str, Dict]:
"""
Aggregate events by composite key (e.g. ["tool"] or ["agent_id", "tool"]).
Returns {key_str: {count, cost_units, duration_sum, failed_count, ...}}.
"""
result: Dict[str, Dict] = defaultdict(lambda: {
"count": 0,
"cost_units": 0.0,
"duration_ms_sum": 0.0,
"failed_count": 0,
"denied_count": 0,
"in_size_sum": 0,
"out_size_sum": 0,
})
for ev in events:
parts = [str(ev.get(k, "unknown")) for k in group_keys]
key = ":".join(parts)
cost = compute_event_cost(ev)
status = ev.get("status", "pass")
r = result[key]
r["count"] += 1
r["cost_units"] = round(r["cost_units"] + cost, 4)
r["duration_ms_sum"] = round(r["duration_ms_sum"] + float(ev.get("duration_ms", 0)), 2)
r["in_size_sum"] += int(ev.get("in_size", 0))
r["out_size_sum"] += int(ev.get("out_size", 0))
if status in ("failed", "error"):
r["failed_count"] += 1
elif status == "denied":
r["denied_count"] += 1
# Enrich with averages
for key, r in result.items():
n = r["count"] or 1
r["avg_duration_ms"] = round(r["duration_ms_sum"] / n, 1)
r["avg_cost_units"] = round(r["cost_units"] / n, 4)
r["error_rate"] = round(r["failed_count"] / (r["count"] or 1), 4)
return dict(result)
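# Example: three events for a hypothetical tool "web_search" (durations
# 100/200/300 ms, one failed) under default weights (1.0/call, 0.001/ms)
# aggregate to:
#   {"web_search": {"count": 3, "cost_units": 3.6, "duration_ms_sum": 600.0,
#                   "failed_count": 1, "avg_duration_ms": 200.0,
#                   "avg_cost_units": 1.2, "error_rate": 0.3333, ...}}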
def _top_n(aggregated: Dict[str, Dict], key_field: str, n: int, sort_by: str = "cost_units") -> List[Dict]:
"""Sort aggregated dict by sort_by and return top N."""
items = [
{"key": k, key_field: k, **v}
for k, v in aggregated.items()
]
items.sort(key=lambda x: x.get(sort_by, 0), reverse=True)
return items[:n]
# ─── Actions ──────────────────────────────────────────────────────────────────
def action_report(
store,
time_range: Optional[Dict[str, str]] = None,
group_by: Optional[List[str]] = None,
top_n: int = 10,
include_failed: bool = True,
include_hourly: bool = False,
) -> Dict[str, Any]:
"""
Generate aggregated cost report for a time range.
Returns:
totals, breakdowns by group_by keys, top spenders, optional hourly trend.
"""
now = _now_utc()
tr = time_range or {}
from_ts = tr.get("from") or _iso(now - datetime.timedelta(days=7))
to_ts = tr.get("to") or _iso(now)
events = store.read(from_ts=from_ts, to_ts=to_ts, limit=200_000)
if not include_failed:
events = [e for e in events if e.get("status", "pass") not in ("failed", "error")]
# Totals
total_cost = sum(compute_event_cost(e) for e in events)
total_calls = len(events)
total_failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
total_denied = sum(1 for e in events if e.get("status") == "denied")
# Breakdowns
by_key = group_by or ["tool"]
breakdowns: Dict[str, List[Dict]] = {}
for gk in by_key:
agg = _aggregate(events, [gk])
breakdowns[gk] = _top_n(agg, gk, top_n)
# Hourly trend (optional, for last 7d max)
hourly: List[Dict] = []
if include_hourly and events:
hourly_agg: Dict[str, Dict] = defaultdict(lambda: {"count": 0, "cost_units": 0.0})
for ev in events:
bucket = _bucket_hour(ev.get("ts", ""))
hourly_agg[bucket]["count"] += 1
hourly_agg[bucket]["cost_units"] = round(
hourly_agg[bucket]["cost_units"] + compute_event_cost(ev), 4
)
hourly = [{"hour": k, **v} for k, v in sorted(hourly_agg.items())]
return {
"time_range": {"from": from_ts, "to": to_ts},
"totals": {
"calls": total_calls,
"cost_units": round(total_cost, 2),
"failed": total_failed,
"denied": total_denied,
"error_rate": round(total_failed / (total_calls or 1), 4),
},
"breakdowns": breakdowns,
**({"hourly": hourly} if include_hourly else {}),
}
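# Example: a report broken down by tool and agent with the hourly trend
# (time_range omitted, so the 7-day default window applies):
#   action_report(store, group_by=["tool", "agent_id"], include_hourly=True)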
def action_top(
store,
window_hours: int = 24,
top_n: int = 10,
) -> Dict[str, Any]:
"""
Quick top-N report for tools, agents, and users over window_hours.
"""
now = _now_utc()
from_ts = _iso(now - datetime.timedelta(hours=window_hours))
to_ts = _iso(now)
events = store.read(from_ts=from_ts, to_ts=to_ts, limit=100_000)
top_tools = _top_n(_aggregate(events, ["tool"]), "tool", top_n)
top_agents = _top_n(_aggregate(events, ["agent_id"]), "agent_id", top_n)
top_users = _top_n(_aggregate(events, ["user_id"]), "user_id", top_n)
top_workspaces = _top_n(_aggregate(events, ["workspace_id"]), "workspace_id", top_n)
return {
"window_hours": window_hours,
"time_range": {"from": from_ts, "to": to_ts},
"total_calls": len(events),
"top_tools": top_tools,
"top_agents": top_agents,
"top_users": top_users,
"top_workspaces": top_workspaces,
}
def action_anomalies(
store,
window_minutes: int = 60,
baseline_hours: int = 24,
ratio_threshold: Optional[float] = None,
min_calls: Optional[int] = None,
tools_filter: Optional[List[str]] = None,
) -> Dict[str, Any]:
"""
Detect cost/call spikes and elevated error rates.
Algorithm:
1. Compute per-tool metrics for window [now-window_minutes, now]
2. Compute per-tool metrics for baseline [now-baseline_hours, now-window_minutes]
3. Spike = window_rate / baseline_rate >= ratio_threshold AND calls >= min_calls
4. Error spike = failed_rate > 10% AND calls >= min_calls
"""
cfg = _load_weights()
anomaly_cfg = cfg.get("anomaly", {})
if ratio_threshold is None:
ratio_threshold = float(anomaly_cfg.get("spike_ratio_threshold", 3.0))
if min_calls is None:
min_calls = int(anomaly_cfg.get("min_calls_threshold", 10))
now = _now_utc()
window_from = _iso(now - datetime.timedelta(minutes=window_minutes))
baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
baseline_to = window_from # non-overlapping
# Fetch both windows
window_events = store.read(from_ts=window_from, to_ts=_iso(now), limit=50_000)
baseline_events = store.read(from_ts=baseline_from, to_ts=baseline_to, limit=200_000)
if tools_filter:
window_events = [e for e in window_events if e.get("tool") in tools_filter]
baseline_events = [e for e in baseline_events if e.get("tool") in tools_filter]
# Aggregate by tool
window_by_tool = _aggregate(window_events, ["tool"])
baseline_by_tool = _aggregate(baseline_events, ["tool"])
# Normalise baseline to per-minute rate
baseline_minutes = (baseline_hours * 60) - window_minutes
baseline_minutes = max(baseline_minutes, 1)
window_minutes_actual = float(window_minutes)
anomalies = []
all_tools = set(window_by_tool.keys()) | set(baseline_by_tool.keys())
for tool_key in sorted(all_tools):
w = window_by_tool.get(tool_key, {})
b = baseline_by_tool.get(tool_key, {})
w_calls = w.get("count", 0)
b_calls = b.get("count", 0)
if w_calls < min_calls:
continue # Not enough traffic for meaningful anomaly
# Per-minute rates
w_rate = w_calls / window_minutes_actual
b_rate = b_calls / baseline_minutes if b_calls > 0 else 0.0
# Cost spike
w_cost_pm = w.get("cost_units", 0) / window_minutes_actual
b_cost_pm = b.get("cost_units", 0) / baseline_minutes if b_calls > 0 else 0.0
call_ratio = (w_rate / b_rate) if b_rate > 0 else float("inf")
cost_ratio = (w_cost_pm / b_cost_pm) if b_cost_pm > 0 else float("inf")
if call_ratio >= ratio_threshold or cost_ratio >= ratio_threshold:
ratio_display = round(max(call_ratio, cost_ratio), 2)
if ratio_display == float("inf"):
ratio_display = "∞ (no baseline)"
w_cost = w.get("cost_units", 0)
b_cost = b.get("cost_units", 0)
anomalies.append({
"type": "cost_spike",
"key": f"tool:{tool_key}",
"tool": tool_key,
"window": f"last_{window_minutes}m",
"baseline": f"prev_{baseline_hours}h",
"window_calls": w_calls,
"baseline_calls": b_calls,
"window_cost_units": round(w_cost, 2),
"baseline_cost_units": round(b_cost, 2),
"ratio": ratio_display,
"recommendation": _spike_recommendation(tool_key, ratio_display, w_calls),
})
# Error rate spike
w_err_rate = w.get("error_rate", 0)
if w_err_rate > 0.10 and w_calls >= min_calls:
anomalies.append({
"type": "error_spike",
"key": f"tool:{tool_key}",
"tool": tool_key,
"window": f"last_{window_minutes}m",
"failed_calls": w.get("failed_count", 0),
"total_calls": w_calls,
"error_rate": round(w_err_rate, 4),
"recommendation": f"Investigate failures for '{tool_key}': {w.get('failed_count',0)} failed / {w_calls} calls ({round(w_err_rate*100,1)}% error rate).",
})
# De-duplicate tool+type combos (error_spike already separate)
seen = set()
unique_anomalies = []
for a in anomalies:
key = (a["type"], a.get("tool", ""))
if key not in seen:
unique_anomalies.append(a)
seen.add(key)
return {
"anomalies": unique_anomalies,
"anomaly_count": len(unique_anomalies),
"window_minutes": window_minutes,
"baseline_hours": baseline_hours,
"ratio_threshold": ratio_threshold,
"min_calls": min_calls,
"stats": {
"window_calls": len(window_events),
"baseline_calls": len(baseline_events),
},
}
def action_weights(repo_root: Optional[str] = None) -> Dict[str, Any]:
    """Return current cost weights configuration (repo_root is accepted for API parity but unused)."""
global _weights_cache
_weights_cache = None # Force reload
cfg = _load_weights()
return {
"defaults": cfg.get("defaults", {}),
"tools": cfg.get("tools", {}),
"anomaly": cfg.get("anomaly", {}),
"config_path": _WEIGHTS_PATH,
}
# ─── Recommendation templates ─────────────────────────────────────────────────
def _spike_recommendation(tool: str, ratio: Any, calls: int) -> str:
cfg = _load_weights()
tool_cfg = (cfg.get("tools") or {}).get(tool, {})
category = tool_cfg.get("category", "")
if category == "media":
return (
f"'{tool}' cost spike (ratio={ratio}, {calls} calls). "
"Consider: rate-limit per workspace, queue with priority, review calling agents."
)
if category == "release":
return (
f"'{tool}' called more frequently than baseline (ratio={ratio}). "
"Review if release_check is looping or being triggered too often."
)
if category == "web":
return (
f"'{tool}' spike (ratio={ratio}). Consider: result caching, dedup identical queries."
)
return (
f"'{tool}' cost spike (ratio={ratio}, {calls} calls in window). "
"Review caller agents and apply rate limits if needed."
)
# ─── backend=auto store resolver ─────────────────────────────────────────────
def _resolve_store(backend: str = "auto"):
"""
Return an AuditStore based on backend param.
backend='auto' (default): uses the globally configured store (which may be
AutoAuditStore, Postgres, or JSONL).
backend='jsonl': forces JsonlAuditStore (7-day window max recommended).
backend='memory': MemoryAuditStore (testing).
"""
from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore
if backend in ("auto", None, ""):
return get_audit_store()
if backend == "jsonl":
audit_dir = os.getenv(
"AUDIT_JSONL_DIR",
str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
)
return JsonlAuditStore(audit_dir)
if backend == "memory":
return MemoryAuditStore()
return get_audit_store()
# ─── Digest action ────────────────────────────────────────────────────────────
def action_digest(
store,
window_hours: int = 24,
baseline_hours: int = 168, # 7 days
top_n: int = 10,
max_markdown_chars: int = 3800,
) -> Dict:
"""
Daily/weekly cost digest: top tools/agents + anomalies + recommendations.
Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.
"""
now = _now_utc()
window_from = _iso(now - datetime.timedelta(hours=window_hours))
window_to = _iso(now)
baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
# ── Top ──────────────────────────────────────────────────────────────────
top_data = action_top(store, window_hours=window_hours, top_n=top_n)
top_tools = top_data.get("top_tools") or []
top_agents = top_data.get("top_agents") or []
total_calls = top_data.get("total_calls", 0)
# ── Anomalies ─────────────────────────────────────────────────────────────
anomaly_data = action_anomalies(
store,
window_minutes=int(window_hours * 60 / 4),
baseline_hours=baseline_hours,
min_calls=5,
)
anomalies = anomaly_data.get("anomalies") or []
# ── Total cost ────────────────────────────────────────────────────────────
events = store.read(from_ts=window_from, to_ts=window_to, limit=200_000)
total_cost = sum(compute_event_cost(e) for e in events)
failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
error_rate = round(failed / max(len(events), 1), 4)
# ── Recommendations ───────────────────────────────────────────────────────
recs = []
for a in anomalies[:5]:
r = a.get("recommendation", "")
if r:
recs.append(r)
if error_rate > 0.05:
recs.append(f"High error rate {round(error_rate*100,1)}% — investigate failing tools.")
if top_tools and top_tools[0].get("cost_units", 0) > 500:
tool_name = top_tools[0].get("tool", "?")
recs.append(f"Top spender '{tool_name}' used {top_tools[0]['cost_units']:.0f} cost units — review frequency.")
recs = list(dict.fromkeys(recs))[:8]
# ── Markdown ─────────────────────────────────────────────────────────────
period_label = f"Last {window_hours}h" if window_hours <= 48 else f"Last {window_hours//24}d"
lines = [
f"📊 **Cost Digest** ({period_label})",
f"Total calls: {total_calls} | Cost units: {total_cost:.0f} | Errors: {round(error_rate*100,1)}%",
"",
"**Top Tools:**",
]
for t in top_tools[:5]:
lines.append(f" • `{t.get('tool','?')}` — {t.get('cost_units',0):.1f}u, {t.get('count',0)} calls")
lines.append("")
lines.append("**Top Agents:**")
for a in top_agents[:3]:
lines.append(f" • `{a.get('agent_id','?')}` — {a.get('cost_units',0):.1f}u, {a.get('count',0)} calls")
if anomalies:
lines.append("")
lines.append(f"⚠️ **{len(anomalies)} Anomaly(ies):**")
for anm in anomalies[:3]:
lines.append(f" • [{anm.get('type','?')}] `{anm.get('tool','?')}` ratio={anm.get('ratio','?')}")
if recs:
lines.append("")
lines.append("💡 **Recommendations:**")
for r in recs[:5]:
lines.append(f" {r[:200]}")
markdown = "\n".join(lines)
if len(markdown) > max_markdown_chars:
markdown = markdown[:max_markdown_chars] + "\n…[truncated]"
return {
"period": period_label,
"window_hours": window_hours,
"time_range": {"from": window_from, "to": window_to},
"totals": {
"calls": total_calls,
"cost_units": round(total_cost, 2),
"failed": failed,
"error_rate": error_rate,
},
"top_tools": top_tools[:top_n],
"top_agents": top_agents[:top_n],
"anomalies": anomalies[:10],
"anomaly_count": len(anomalies),
"recommendations": recs,
"markdown": markdown,
}
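# Sketch of the rendered `markdown` field (values illustrative):
#   📊 **Cost Digest** (Last 24h)
#   Total calls: 1203 | Cost units: 845 | Errors: 2.1%
#
#   **Top Tools:**
#    • `browser_tool` — 412.0u, 300 calls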
# ─── Main entrypoint ─────────────────────────────────────────────────────────
def analyze_cost_dict(action: str, params: Optional[Dict] = None, store=None) -> Dict:
"""
Wrapper called by tool_manager handler.
Returns plain dict for ToolResult.
"""
params = params or {}
if store is None:
backend = params.get("backend", "auto")
store = _resolve_store(backend)
if action == "digest":
return action_digest(
store,
window_hours=int(params.get("window_hours", 24)),
baseline_hours=int(params.get("baseline_hours", 168)),
top_n=int(params.get("top_n", 10)),
max_markdown_chars=int(params.get("max_markdown_chars", 3800)),
)
if action == "report":
return action_report(
store,
time_range=params.get("time_range"),
group_by=params.get("group_by", ["tool"]),
top_n=int(params.get("top_n", 10)),
include_failed=bool(params.get("include_failed", True)),
include_hourly=bool(params.get("include_hourly", False)),
)
if action == "top":
return action_top(
store,
window_hours=int(params.get("window_hours", 24)),
top_n=int(params.get("top_n", 10)),
)
if action == "anomalies":
return action_anomalies(
store,
window_minutes=int(params.get("window_minutes", 60)),
baseline_hours=int(params.get("baseline_hours", 24)),
ratio_threshold=params.get("ratio_threshold"),
min_calls=params.get("min_calls"),
tools_filter=params.get("tools_filter"),
)
if action == "weights":
return action_weights()
return {"error": f"Unknown action '{action}'. Valid: digest, report, top, anomalies, weights"}