feat(platform): add new services, tools, tests and crews modules
New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
This commit is contained in:
595
services/router/cost_analyzer.py
Normal file
595
services/router/cost_analyzer.py
Normal file
@@ -0,0 +1,595 @@
|
||||
"""
|
||||
Cost & Resource Analyzer (FinOps MVP)
|
||||
|
||||
Reads audit events from AuditStore and computes:
|
||||
- Aggregated cost_units by tool/agent/workspace/status
|
||||
- Top spenders (tools, agents, users)
|
||||
- Anomalies (cost spikes, error rate spikes)
|
||||
- Cost model weights
|
||||
|
||||
"cost_units" = cost_per_call(tool) + duration_ms * cost_per_ms(tool)
|
||||
These are relative units, not real dollars.
|
||||
|
||||
No payload access — all inputs are aggregation parameters only.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ─── Config loader ────────────────────────────────────────────────────────────
|
||||
|
||||
_weights_cache: Optional[Dict] = None
|
||||
_WEIGHTS_PATH = os.path.join(
|
||||
os.getenv("REPO_ROOT", str(Path(__file__).parent.parent.parent)),
|
||||
"config", "cost_weights.yml",
|
||||
)
|
||||
|
||||
|
||||
def _load_weights() -> Dict:
    """Return the cached cost-weights config, loading it from YAML on first use.

    Falls back to an empty dict (and logs a warning) when the file or the
    yaml package is unavailable, so callers always receive a mapping.
    """
    global _weights_cache
    if _weights_cache is None:
        try:
            import yaml
            with open(_WEIGHTS_PATH, "r") as fh:
                loaded = yaml.safe_load(fh)
            _weights_cache = loaded or {}
        except Exception as exc:
            logger.warning("cost_weights.yml not loaded: %s", exc)
            _weights_cache = {}
    return _weights_cache


def reload_cost_weights() -> None:
    """Drop the cached weights so the next access re-reads the YAML (for tests)."""
    global _weights_cache
    _weights_cache = None


def get_weights_for_tool(tool: str) -> Tuple[float, float]:
    """Return (cost_per_call, cost_per_ms) for *tool*.

    A per-tool entry in the "tools" section wins; otherwise the "defaults"
    section applies, and finally the hard-coded fallbacks (1.0 per call,
    0.001 per millisecond).
    """
    cfg = _load_weights()
    base = cfg.get("defaults", {})
    overrides = (cfg.get("tools") or {}).get(tool, {})
    per_call = float(overrides.get("cost_per_call", base.get("cost_per_call", 1.0)))
    per_ms = float(overrides.get("cost_per_ms", base.get("cost_per_ms", 0.001)))
    return per_call, per_ms


def compute_event_cost(event: Dict) -> float:
    """Cost units for one audit event: cost_per_call + duration_ms * cost_per_ms."""
    per_call, per_ms = get_weights_for_tool(event.get("tool", ""))
    millis = float(event.get("duration_ms", 0))
    return round(per_call + millis * per_ms, 4)


# ─── Time helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _now_utc() -> datetime.datetime:
|
||||
return datetime.datetime.now(datetime.timezone.utc)
|
||||
|
||||
|
||||
def _iso(dt: datetime.datetime) -> str:
|
||||
return dt.isoformat()
|
||||
|
||||
|
||||
def _parse_iso(s: str) -> datetime.datetime:
|
||||
s = s.replace("Z", "+00:00")
|
||||
try:
|
||||
return datetime.datetime.fromisoformat(s)
|
||||
except Exception:
|
||||
return _now_utc()
|
||||
|
||||
|
||||
def _bucket_hour(ts: str) -> str:
    """Truncate an ISO timestamp to its hour bucket, e.g. '2026-02-23T10:00'.

    Keeps only the first 13 characters ('YYYY-MM-DDTHH') plus ':00', so any
    timezone offset and seconds are dropped.  Used purely as a grouping key
    for hourly aggregation; assumes all events share one offset (UTC per
    _now_utc) — confirm if mixing sources.
    """
    return ts[:13] + ":00"
# ─── Aggregation helpers ──────────────────────────────────────────────────────
|
||||
|
||||
def _aggregate(
    events: List[Dict],
    group_keys: List[str],
) -> Dict[str, Dict]:
    """
    Aggregate events by composite key (e.g. ["tool"] or ["agent_id", "tool"]).

    Each event contributes its call count, computed cost units, duration and
    payload sizes to the bucket named by the ':'-joined values of group_keys
    (missing keys become "unknown").  Statuses "failed"/"error" and "denied"
    are tallied separately; everything else counts as a success.

    Returns:
        {key_str: {count, cost_units, duration_ms_sum, failed_count,
                   denied_count, in_size_sum, out_size_sum,
                   avg_duration_ms, avg_cost_units, error_rate}}
    """
    result: Dict[str, Dict] = defaultdict(lambda: {
        "count": 0,
        "cost_units": 0.0,
        "duration_ms_sum": 0.0,
        "failed_count": 0,
        "denied_count": 0,
        "in_size_sum": 0,
        "out_size_sum": 0,
    })

    for ev in events:
        key = ":".join(str(ev.get(k, "unknown")) for k in group_keys)
        cost = compute_event_cost(ev)
        status = ev.get("status", "pass")

        r = result[key]
        r["count"] += 1
        # Round per update to keep float drift out of long-running sums.
        r["cost_units"] = round(r["cost_units"] + cost, 4)
        r["duration_ms_sum"] = round(r["duration_ms_sum"] + float(ev.get("duration_ms", 0)), 2)
        r["in_size_sum"] += int(ev.get("in_size", 0))
        r["out_size_sum"] += int(ev.get("out_size", 0))
        if status in ("failed", "error"):
            r["failed_count"] += 1
        elif status == "denied":
            r["denied_count"] += 1

    # Enrich with averages; guard the zero denominator once via n
    # (previously `r["count"] or 1` was recomputed for error_rate).
    for r in result.values():
        n = r["count"] or 1
        r["avg_duration_ms"] = round(r["duration_ms_sum"] / n, 1)
        r["avg_cost_units"] = round(r["cost_units"] / n, 4)
        r["error_rate"] = round(r["failed_count"] / n, 4)

    return dict(result)


def _top_n(aggregated: Dict[str, Dict], key_field: str, n: int, sort_by: str = "cost_units") -> List[Dict]:
|
||||
"""Sort aggregated dict by sort_by and return top N."""
|
||||
items = [
|
||||
{"key": k, key_field: k, **v}
|
||||
for k, v in aggregated.items()
|
||||
]
|
||||
items.sort(key=lambda x: x.get(sort_by, 0), reverse=True)
|
||||
return items[:n]
|
||||
|
||||
|
||||
# ─── Actions ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def action_report(
    store,
    time_range: Optional[Dict[str, str]] = None,
    group_by: Optional[List[str]] = None,
    top_n: int = 10,
    include_failed: bool = True,
    include_hourly: bool = False,
) -> Dict[str, Any]:
    """
    Generate aggregated cost report for a time range.

    Defaults to the last 7 days when time_range is absent.  When
    include_failed is False, failed/error events are excluded before any
    aggregation.

    Returns:
        totals, breakdowns by group_by keys, top spenders, optional hourly trend.
    """
    now = _now_utc()
    window = time_range or {}
    from_ts = window.get("from") or _iso(now - datetime.timedelta(days=7))
    to_ts = window.get("to") or _iso(now)

    events = store.read(from_ts=from_ts, to_ts=to_ts, limit=200_000)
    if not include_failed:
        events = [e for e in events if e.get("status", "pass") not in ("failed", "error")]

    # Totals across the (possibly filtered) event set.
    total_calls = len(events)
    total_cost = sum(compute_event_cost(e) for e in events)
    total_failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    total_denied = sum(1 for e in events if e.get("status") == "denied")

    # One top-N breakdown per requested grouping key (default: by tool).
    breakdowns: Dict[str, List[Dict]] = {
        gk: _top_n(_aggregate(events, [gk]), gk, top_n)
        for gk in (group_by or ["tool"])
    }

    # Optional hourly trend buckets, sorted chronologically.
    hourly: List[Dict] = []
    if include_hourly and events:
        per_hour: Dict[str, Dict] = defaultdict(lambda: {"count": 0, "cost_units": 0.0})
        for ev in events:
            bucket = per_hour[_bucket_hour(ev.get("ts", ""))]
            bucket["count"] += 1
            bucket["cost_units"] = round(bucket["cost_units"] + compute_event_cost(ev), 4)
        hourly = [{"hour": hour, **stats} for hour, stats in sorted(per_hour.items())]

    report: Dict[str, Any] = {
        "time_range": {"from": from_ts, "to": to_ts},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": total_failed,
            "denied": total_denied,
            "error_rate": round(total_failed / (total_calls or 1), 4),
        },
        "breakdowns": breakdowns,
    }
    if include_hourly:
        report["hourly"] = hourly
    return report


def action_top(
    store,
    window_hours: int = 24,
    top_n: int = 10,
) -> Dict[str, Any]:
    """
    Quick top-N report for tools, agents, users and workspaces over window_hours.
    """
    now = _now_utc()
    from_ts = _iso(now - datetime.timedelta(hours=window_hours))
    to_ts = _iso(now)

    events = store.read(from_ts=from_ts, to_ts=to_ts, limit=100_000)

    def leaders(field: str) -> List[Dict]:
        # Aggregate by one key and keep the biggest spenders for that key.
        return _top_n(_aggregate(events, [field]), field, top_n)

    return {
        "window_hours": window_hours,
        "time_range": {"from": from_ts, "to": to_ts},
        "total_calls": len(events),
        "top_tools": leaders("tool"),
        "top_agents": leaders("agent_id"),
        "top_users": leaders("user_id"),
        "top_workspaces": leaders("workspace_id"),
    }


def action_anomalies(
    store,
    window_minutes: int = 60,
    baseline_hours: int = 24,
    ratio_threshold: Optional[float] = None,
    min_calls: Optional[int] = None,
    tools_filter: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Detect cost/call spikes and elevated error rates.

    Algorithm:
    1. Compute per-tool metrics for window [now-window_minutes, now]
    2. Compute per-tool metrics for baseline [now-baseline_hours, now-window_minutes]
    3. Spike = window_rate / baseline_rate >= ratio_threshold AND calls >= min_calls
    4. Error spike = failed_rate > 10% AND calls >= min_calls

    Args:
        store: audit store exposing read(from_ts=..., to_ts=..., limit=...).
        window_minutes: size of the recent window tested for spikes.
        baseline_hours: lookback used as the comparison baseline.
        ratio_threshold: spike ratio; defaults to anomaly.spike_ratio_threshold
            from the weights config (3.0 if absent).
        min_calls: minimum window calls for a tool to be considered; defaults
            to anomaly.min_calls_threshold from the weights config (10 if absent).
        tools_filter: optional whitelist of tool names to inspect.

    Returns:
        Dict with "anomalies" (list), "anomaly_count", the effective
        thresholds, and raw window/baseline call counts under "stats".
    """
    # Thresholds come from config unless explicitly overridden by the caller.
    cfg = _load_weights()
    anomaly_cfg = cfg.get("anomaly", {})

    if ratio_threshold is None:
        ratio_threshold = float(anomaly_cfg.get("spike_ratio_threshold", 3.0))
    if min_calls is None:
        min_calls = int(anomaly_cfg.get("min_calls_threshold", 10))

    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(minutes=window_minutes))
    baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))
    baseline_to = window_from  # non-overlapping: baseline ends where window begins

    # Fetch both windows
    window_events = store.read(from_ts=window_from, to_ts=_iso(now), limit=50_000)
    baseline_events = store.read(from_ts=baseline_from, to_ts=baseline_to, limit=200_000)

    if tools_filter:
        window_events = [e for e in window_events if e.get("tool") in tools_filter]
        baseline_events = [e for e in baseline_events if e.get("tool") in tools_filter]

    # Aggregate by tool
    window_by_tool = _aggregate(window_events, ["tool"])
    baseline_by_tool = _aggregate(baseline_events, ["tool"])

    # Normalise baseline to per-minute rate so the two windows are comparable.
    baseline_minutes = (baseline_hours * 60) - window_minutes
    baseline_minutes = max(baseline_minutes, 1)
    window_minutes_actual = float(window_minutes)

    anomalies = []

    # Inspect every tool seen in either window, deterministically ordered.
    all_tools = set(window_by_tool.keys()) | set(baseline_by_tool.keys())
    for tool_key in sorted(all_tools):
        w = window_by_tool.get(tool_key, {})
        b = baseline_by_tool.get(tool_key, {})

        w_calls = w.get("count", 0)
        b_calls = b.get("count", 0)

        if w_calls < min_calls:
            continue  # Not enough traffic for meaningful anomaly

        # Per-minute rates
        w_rate = w_calls / window_minutes_actual
        b_rate = b_calls / baseline_minutes if b_calls > 0 else 0.0

        # Cost spike
        w_cost_pm = w.get("cost_units", 0) / window_minutes_actual
        b_cost_pm = b.get("cost_units", 0) / baseline_minutes if b_calls > 0 else 0.0

        # No baseline traffic => infinite ratio, flagged as "no baseline" below.
        call_ratio = (w_rate / b_rate) if b_rate > 0 else float("inf")
        cost_ratio = (w_cost_pm / b_cost_pm) if b_cost_pm > 0 else float("inf")

        if call_ratio >= ratio_threshold or cost_ratio >= ratio_threshold:
            # Report whichever ratio is more extreme.
            ratio_display = round(max(call_ratio, cost_ratio), 2)
            if ratio_display == float("inf"):
                ratio_display = "∞ (no baseline)"
            w_cost = w.get("cost_units", 0)
            b_cost = b.get("cost_units", 0)
            anomalies.append({
                "type": "cost_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "baseline": f"prev_{baseline_hours}h",
                "window_calls": w_calls,
                "baseline_calls": b_calls,
                "window_cost_units": round(w_cost, 2),
                "baseline_cost_units": round(b_cost, 2),
                "ratio": ratio_display,
                "recommendation": _spike_recommendation(tool_key, ratio_display, w_calls),
            })

        # Error rate spike (independent of cost spike; 10% hard-coded cutoff)
        w_err_rate = w.get("error_rate", 0)
        if w_err_rate > 0.10 and w_calls >= min_calls:
            anomalies.append({
                "type": "error_spike",
                "key": f"tool:{tool_key}",
                "tool": tool_key,
                "window": f"last_{window_minutes}m",
                "failed_calls": w.get("failed_count", 0),
                "total_calls": w_calls,
                "error_rate": round(w_err_rate, 4),
                "recommendation": f"Investigate failures for '{tool_key}': {w.get('failed_count',0)} failed / {w_calls} calls ({round(w_err_rate*100,1)}% error rate).",
            })

    # De-duplicate tool+type combos (error_spike already separate)
    seen = set()
    unique_anomalies = []
    for a in anomalies:
        key = (a["type"], a.get("tool", ""))
        if key not in seen:
            unique_anomalies.append(a)
            seen.add(key)

    return {
        "anomalies": unique_anomalies,
        "anomaly_count": len(unique_anomalies),
        "window_minutes": window_minutes,
        "baseline_hours": baseline_hours,
        "ratio_threshold": ratio_threshold,
        "min_calls": min_calls,
        "stats": {
            "window_calls": len(window_events),
            "baseline_calls": len(baseline_events),
        },
    }
def action_weights(repo_root: Optional[str] = None) -> Dict[str, Any]:
    """Return the current cost-weights configuration, re-read from disk.

    The *repo_root* parameter is accepted for interface compatibility but is
    not used here; the config location comes from _WEIGHTS_PATH.
    """
    global _weights_cache
    _weights_cache = None  # Drop the cache so edits to the YAML are picked up.
    cfg = _load_weights()
    payload: Dict[str, Any] = {
        section: cfg.get(section, {})
        for section in ("defaults", "tools", "anomaly")
    }
    payload["config_path"] = _WEIGHTS_PATH
    return payload


# ─── Recommendation templates ─────────────────────────────────────────────────
|
||||
|
||||
def _spike_recommendation(tool: str, ratio: Any, calls: int) -> str:
    """Build an operator-facing recommendation string for a cost spike.

    The wording is chosen by the tool's "category" in the weights config;
    unknown categories get a generic rate-limit suggestion.
    """
    cfg = _load_weights()
    category = ((cfg.get("tools") or {}).get(tool, {})).get("category", "")

    by_category = {
        "media": (
            f"'{tool}' cost spike (ratio={ratio}, {calls} calls). "
            "Consider: rate-limit per workspace, queue with priority, review calling agents."
        ),
        "release": (
            f"'{tool}' called more frequently than baseline (ratio={ratio}). "
            "Review if release_check is looping or being triggered too often."
        ),
        "web": (
            f"'{tool}' spike (ratio={ratio}). Consider: result caching, dedup identical queries."
        ),
    }
    fallback = (
        f"'{tool}' cost spike (ratio={ratio}, {calls} calls in window). "
        "Review caller agents and apply rate limits if needed."
    )
    return by_category.get(category, fallback)


# ─── backend=auto store resolver ─────────────────────────────────────────────
|
||||
|
||||
def _resolve_store(backend: str = "auto"):
    """
    Return an AuditStore based on the *backend* parameter.

    backend='auto' (default): uses the globally configured store (which may be
        AutoAuditStore, Postgres, or JSONL).
    backend='jsonl': forces JsonlAuditStore (7-day window max recommended).
    backend='memory': MemoryAuditStore (testing).
    Any other value falls back to the globally configured store.
    """
    # NOTE(review): imported at call time rather than module top — presumably
    # to avoid an import cycle with audit_store; confirm before hoisting.
    from audit_store import get_audit_store, JsonlAuditStore, MemoryAuditStore
    if backend in ("auto", None, ""):
        return get_audit_store()
    if backend == "jsonl":
        # os and Path are already imported at module level; the previous
        # function-local re-imports were redundant and have been removed.
        audit_dir = os.getenv(
            "AUDIT_JSONL_DIR",
            str(Path(os.getenv("REPO_ROOT", ".")) / "ops" / "audit"),
        )
        return JsonlAuditStore(audit_dir)
    if backend == "memory":
        return MemoryAuditStore()
    return get_audit_store()


# ─── Digest action ────────────────────────────────────────────────────────────
|
||||
|
||||
def action_digest(
    store,
    window_hours: int = 24,
    baseline_hours: int = 168,  # 7 days
    top_n: int = 10,
    max_markdown_chars: int = 3800,
) -> Dict:
    """
    Daily/weekly cost digest: top tools/agents + anomalies + recommendations.

    Returns both structured JSON and a Telegram/markdown-friendly `markdown` field.

    Args:
        store: audit store exposing read(from_ts=..., to_ts=..., limit=...).
        window_hours: reporting window size.
        baseline_hours: anomaly-detection baseline lookback.
        top_n: number of entries kept in top_tools/top_agents.
        max_markdown_chars: hard cap on the rendered markdown length
            (3800 default — presumably sized for Telegram's 4096 message
            limit; confirm against the bridge).
    """
    now = _now_utc()
    window_from = _iso(now - datetime.timedelta(hours=window_hours))
    window_to = _iso(now)
    baseline_from = _iso(now - datetime.timedelta(hours=baseline_hours))

    # ── Top spenders over the reporting window ───────────────────────────────
    top_data = action_top(store, window_hours=window_hours, top_n=top_n)
    top_tools = top_data.get("top_tools") or []
    top_agents = top_data.get("top_agents") or []
    total_calls = top_data.get("total_calls", 0)

    # ── Anomalies (spike window = quarter of the reporting window) ───────────
    anomaly_data = action_anomalies(
        store,
        window_minutes=int(window_hours * 60 / 4),
        baseline_hours=baseline_hours,
        min_calls=5,
    )
    anomalies = anomaly_data.get("anomalies") or []

    # ── Total cost / error rate over the full window ─────────────────────────
    events = store.read(from_ts=window_from, to_ts=window_to, limit=200_000)
    total_cost = sum(compute_event_cost(e) for e in events)
    failed = sum(1 for e in events if e.get("status") in ("failed", "error"))
    error_rate = round(failed / max(len(events), 1), 4)

    # ── Recommendations: anomaly recs first, then heuristic extras, deduped ──
    recs = []
    for a in anomalies[:5]:
        r = a.get("recommendation", "")
        if r:
            recs.append(r)
    if error_rate > 0.05:
        recs.append(f"High error rate {round(error_rate*100,1)}% — investigate failing tools.")
    if top_tools and top_tools[0].get("cost_units", 0) > 500:
        tool_name = top_tools[0].get("tool", "?")
        recs.append(f"Top spender '{tool_name}' used {top_tools[0]['cost_units']:.0f} cost units — review frequency.")
    # dict.fromkeys preserves first-seen order while removing duplicates.
    recs = list(dict.fromkeys(recs))[:8]

    # ── Markdown rendering ───────────────────────────────────────────────────
    period_label = f"Last {window_hours}h" if window_hours <= 48 else f"Last {window_hours//24}d"
    lines = [
        f"📊 **Cost Digest** ({period_label})",
        f"Total calls: {total_calls} | Cost units: {total_cost:.0f} | Errors: {round(error_rate*100,1)}%",
        "",
        "**Top Tools:**",
    ]
    for t in top_tools[:5]:
        lines.append(f"  • `{t.get('tool','?')}` — {t.get('cost_units',0):.1f}u, {t.get('count',0)} calls")
    lines.append("")
    lines.append("**Top Agents:**")
    for a in top_agents[:3]:
        lines.append(f"  • `{a.get('agent_id','?')}` — {a.get('cost_units',0):.1f}u, {a.get('count',0)} calls")

    if anomalies:
        lines.append("")
        lines.append(f"⚠️ **{len(anomalies)} Anomaly(ies):**")
        for anm in anomalies[:3]:
            lines.append(f"  • [{anm.get('type','?')}] `{anm.get('tool','?')}` ratio={anm.get('ratio','?')}")
    if recs:
        lines.append("")
        lines.append("💡 **Recommendations:**")
        for r in recs[:5]:
            lines.append(f"  {r[:200]}")

    markdown = "\n".join(lines)
    if len(markdown) > max_markdown_chars:
        markdown = markdown[:max_markdown_chars] + "\n…[truncated]"

    return {
        "period": period_label,
        "window_hours": window_hours,
        "time_range": {"from": window_from, "to": window_to},
        "totals": {
            "calls": total_calls,
            "cost_units": round(total_cost, 2),
            "failed": failed,
            "error_rate": error_rate,
        },
        "top_tools": top_tools[:top_n],
        "top_agents": top_agents[:top_n],
        "anomalies": anomalies[:10],
        "anomaly_count": len(anomalies),
        "recommendations": recs,
        "markdown": markdown,
    }
# ─── Main entrypoint ─────────────────────────────────────────────────────────
|
||||
|
||||
def analyze_cost_dict(action: str, params: Optional[Dict] = None, store=None) -> Dict:
    """
    Wrapper called by tool_manager handler.

    Dispatches *action* to the matching action_* function, resolving a store
    from params["backend"] when none is supplied.  Returns a plain dict for
    ToolResult; unknown actions yield an {"error": ...} dict.
    """
    params = params or {}
    if store is None:
        store = _resolve_store(params.get("backend", "auto"))

    def _digest() -> Dict:
        return action_digest(
            store,
            window_hours=int(params.get("window_hours", 24)),
            baseline_hours=int(params.get("baseline_hours", 168)),
            top_n=int(params.get("top_n", 10)),
            max_markdown_chars=int(params.get("max_markdown_chars", 3800)),
        )

    def _report() -> Dict:
        return action_report(
            store,
            time_range=params.get("time_range"),
            group_by=params.get("group_by", ["tool"]),
            top_n=int(params.get("top_n", 10)),
            include_failed=bool(params.get("include_failed", True)),
            include_hourly=bool(params.get("include_hourly", False)),
        )

    def _top() -> Dict:
        return action_top(
            store,
            window_hours=int(params.get("window_hours", 24)),
            top_n=int(params.get("top_n", 10)),
        )

    def _anomalies() -> Dict:
        return action_anomalies(
            store,
            window_minutes=int(params.get("window_minutes", 60)),
            baseline_hours=int(params.get("baseline_hours", 24)),
            ratio_threshold=params.get("ratio_threshold"),
            min_calls=params.get("min_calls"),
            tools_filter=params.get("tools_filter"),
        )

    dispatch = {
        "digest": _digest,
        "report": _report,
        "top": _top,
        "anomalies": _anomalies,
        "weights": lambda: action_weights(),
    }
    handler = dispatch.get(action)
    if handler is None:
        return {"error": f"Unknown action '{action}'. Valid: digest, report, top, anomalies, weights"}
    return handler()
Reference in New Issue
Block a user