New router intelligence modules (26 files): alert_ingest/store, audit_store, architecture_pressure, backlog_generator/store, cost_analyzer, data_governance, dependency_scanner, drift_analyzer, incident_* (5 files), llm_enrichment, platform_priority_digest, provider_budget, release_check_runner, risk_* (6 files), signature_state_store, sofiia_auto_router, tool_governance New services: - sofiia-console: Dockerfile, adapters/, monitor/nodes/ops/voice modules, launchd, react static - memory-service: integration_endpoints, integrations, voice_endpoints, static UI - aurora-service: full app suite (analysis, job_store, orchestrator, reporting, schemas, subagents) - sofiia-supervisor: new supervisor service - aistalk-bridge-lite: Telegram bridge lite - calendar-service: CalDAV calendar service with reminders - mlx-stt-service / mlx-tts-service: Apple Silicon speech services - binance-bot-monitor: market monitor service - node-worker: STT/TTS memory providers New tools (9): agent_email, browser_tool, contract_tool, observability_tool, oncall_tool, pr_reviewer_tool, repo_tool, safe_code_executor, secure_vault New crews: agromatrix_crew (10 modules: depth_classifier, doc_facts, doc_focus, farm_state, light_reply, llm_factory, memory_manager, proactivity, reflection_engine, session_context, style_adapter, telemetry) Tests: 85+ test files for all new modules Made-with: Cursor
420 lines
14 KiB
Python
"""Provider Budget Tracker — real-money token usage accounting.
|
|
|
|
Tracks:
|
|
- Tokens used (input/output) per provider per model
|
|
- Estimated USD cost based on published pricing
|
|
- Approximate balance (if configured via env var)
|
|
- Rolling 24h / 7d / 30d windows
|
|
|
|
Pricing table: updated Feb 2026 (USD per 1M tokens)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import threading
|
|
import time
|
|
from collections import defaultdict
|
|
from dataclasses import asdict, dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Pricing catalog (USD / 1M tokens) ─────────────────────────────────────────
|
|
|
|
# provider → model name (exact or prefix) → {"input": ..., "output": ...}
# prices in USD per 1M tokens. Each provider table carries a "_default"
# entry used when no model name matches.
# NOTE: the original annotation (Dict[str, Dict[str, float]]) described one
# nesting level too few; corrected to match the actual shape.
PRICING: Dict[str, Dict[str, Dict[str, float]]] = {
    "anthropic": {
        "claude-sonnet-4-5": {"input": 3.0, "output": 15.0},
        "claude-opus-4-5": {"input": 15.0, "output": 75.0},
        "claude-haiku-3-5": {"input": 0.8, "output": 4.0},
        "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
        "_default": {"input": 3.0, "output": 15.0},
    },
    "grok": {
        "grok-4-1-fast-reasoning": {"input": 5.0, "output": 15.0},
        "grok-3": {"input": 5.0, "output": 25.0},
        "grok-2-1212": {"input": 2.0, "output": 10.0},
        "_default": {"input": 5.0, "output": 15.0},
    },
    "deepseek": {
        "deepseek-chat": {"input": 0.27, "output": 1.10},
        "deepseek-reasoner": {"input": 0.55, "output": 2.19},
        "_default": {"input": 0.27, "output": 1.10},
    },
    "mistral": {
        "mistral-large-latest": {"input": 2.0, "output": 6.0},
        "mistral-small-latest": {"input": 0.2, "output": 0.6},
        "_default": {"input": 2.0, "output": 6.0},
    },
    "openai": {
        "gpt-4o": {"input": 2.5, "output": 10.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "_default": {"input": 2.5, "output": 10.0},
    },
    "glm": {
        "glm-4-plus": {"input": 0.05, "output": 0.05},
        "glm-4-flash": {"input": 0.0, "output": 0.0},  # free tier
        "glm-4.7-flash": {"input": 0.0, "output": 0.0},
        "glm-z1-plus": {"input": 0.07, "output": 0.07},
        "_default": {"input": 0.05, "output": 0.05},
    },
    "ollama": {
        # local inference: no metered cost
        "_default": {"input": 0.0, "output": 0.0},
    },
}


def get_price(provider: str, model: str) -> Dict[str, float]:
    """Return the {"input", "output"} per-1M-token prices for a model.

    Lookup order: exact model name, then the first key (in table order) that
    is a prefix of *model*, then the provider's "_default". Unknown providers
    fall back to the Anthropic table.
    """
    p = PRICING.get(provider.lower(), PRICING.get("anthropic"))
    # exact match
    if model in p:
        return p[model]
    # prefix match (e.g. "grok-3-mini" matches "grok-3")
    for k, v in p.items():
        if k != "_default" and model.startswith(k):
            return v
    return p.get("_default", {"input": 3.0, "output": 15.0})


def calc_cost_usd(provider: str, model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimated USD cost of one call: token counts times per-1M pricing."""
    price = get_price(provider, model)
    return (input_tokens * price["input"] + output_tokens * price["output"]) / 1_000_000
|
|
|
|
|
|
# ── Usage record ──────────────────────────────────────────────────────────────
|
|
|
|
@dataclass
class UsageRecord:
    """One logged LLM call: token counts plus the estimated USD cost."""

    ts: float  # unix timestamp (time.time()) when the call was recorded
    provider: str  # provider key, e.g. "anthropic", "grok"
    model: str  # model identifier as passed by the caller
    agent: str  # name of the agent that issued the call
    input_tokens: int
    output_tokens: int
    cost_usd: float  # estimated via calc_cost_usd() at record time
    latency_ms: int = 0  # 0 means "not measured"
    task_type: str = ""
    fallback_used: bool = False  # True if a fallback provider served this call
|
|
|
|
|
|
# ── Storage ────────────────────────────────────────────────────────────────────
|
|
|
|
# Data directory for budget state; overridable via the BUDGET_DATA_DIR env var.
_BUDGET_DIR = Path(os.getenv("BUDGET_DATA_DIR", os.path.expanduser("~/.sofiia/budget")))
# Append-only JSON-lines log: one UsageRecord per line.
_USAGE_FILE = _BUDGET_DIR / "usage.jsonl"
# Manually configured per-provider limits/balances (see set_provider_limit).
_LIMITS_FILE = _BUDGET_DIR / "limits.json"

# Serializes all file reads/writes above across threads in this process.
_lock = threading.Lock()
|
|
|
|
|
|
def _ensure_dir() -> None:
    """Create the budget data directory (and parents) if it does not exist."""
    _BUDGET_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def _append_usage(rec: UsageRecord) -> None:
    """Append one usage record to the JSONL log under the module lock."""
    _ensure_dir()
    payload = json.dumps(asdict(rec))
    with _lock, open(_USAGE_FILE, "a", encoding="utf-8") as fh:
        fh.write(payload + "\n")
|
|
|
|
|
|
def _load_usage(since_ts: float = 0.0) -> List[UsageRecord]:
    """Read usage records from the JSONL log, keeping those with ts >= since_ts.

    Blank and unparseable lines are skipped silently; a failure to open or
    read the file is logged and yields whatever was parsed so far.
    """
    if not _USAGE_FILE.exists():
        return []
    out: List[UsageRecord] = []
    with _lock:
        try:
            with open(_USAGE_FILE, "r", encoding="utf-8") as fh:
                for raw in fh:
                    raw = raw.strip()
                    if not raw:
                        continue
                    try:
                        payload = json.loads(raw)
                        if payload.get("ts", 0) >= since_ts:
                            out.append(UsageRecord(**payload))
                    except Exception:
                        # corrupt line or schema drift — skip, best-effort read
                        continue
        except Exception as e:
            logger.warning("budget: failed to load usage: %s", e)
    return out
|
|
|
|
|
|
# ── Manual balance config ──────────────────────────────────────────────────────
|
|
|
|
def _load_limits() -> Dict[str, Any]:
    """Load per-provider budget limits from disk; {} if missing or corrupt."""
    if not _LIMITS_FILE.exists():
        return {}
    try:
        with open(_LIMITS_FILE, "r") as fh:
            data = json.load(fh)
    except Exception:
        return {}
    return data
|
|
|
|
|
|
def _save_limits(data: Dict[str, Any]) -> None:
    """Persist the limits mapping to limits.json (pretty-printed)."""
    _ensure_dir()
    with _lock, open(_LIMITS_FILE, "w") as fh:
        json.dump(data, fh, indent=2)
|
|
|
|
|
|
# ── Public API ─────────────────────────────────────────────────────────────────
|
|
|
|
def track_usage(
    provider: str,
    model: str,
    agent: str,
    input_tokens: int,
    output_tokens: int,
    latency_ms: int = 0,
    task_type: str = "",
    fallback_used: bool = False,
) -> float:
    """Record token usage and return cost in USD.

    Computes the estimated cost from the pricing catalog, appends a
    UsageRecord to the JSONL log, and returns the cost.
    """
    usd = calc_cost_usd(provider, model, input_tokens, output_tokens)
    record = UsageRecord(
        ts=time.time(),
        provider=provider,
        model=model,
        agent=agent,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cost_usd=usd,
        latency_ms=latency_ms,
        task_type=task_type,
        fallback_used=fallback_used,
    )
    _append_usage(record)
    logger.debug(
        "💰 tracked: provider=%s model=%s tokens=%d+%d cost=$%.5f",
        provider, model, input_tokens, output_tokens, usd,
    )
    return usd
|
|
|
|
|
|
@dataclass
class ProviderStats:
    """Aggregated usage for one provider over a time window."""

    provider: str
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_cost_usd: float = 0.0
    call_count: int = 0
    avg_latency_ms: float = 0.0  # mean over records in the window that had latency
    top_models: List[Dict[str, Any]] = field(default_factory=list)  # top 3 by cost
    # Configured limits (from limits.json)
    monthly_limit_usd: Optional[float] = None
    topup_balance_usd: Optional[float] = None
    estimated_remaining_usd: Optional[float] = None  # top-up minus window cost
|
|
|
|
|
|
def get_stats(window_hours: int = 720) -> Dict[str, ProviderStats]:
    """
    Aggregate usage stats per provider for the given time window.
    Default window = 720h = 30 days.
    """
    cutoff = time.time() - window_hours * 3600
    stats = _aggregate_records(_load_usage(cutoff))

    # Overlay manually configured limits/balances onto the aggregates.
    limits = _load_limits()
    for provider, st in stats.items():
        cfg = limits.get(provider, {})
        if "monthly_limit_usd" in cfg:
            st.monthly_limit_usd = cfg["monthly_limit_usd"]
        if "topup_balance_usd" in cfg:
            st.topup_balance_usd = cfg["topup_balance_usd"]
            st.estimated_remaining_usd = round(cfg["topup_balance_usd"] - st.total_cost_usd, 4)

    return stats
|
|
|
|
|
|
def get_dashboard_data() -> Dict[str, Any]:
    """
    Returns structured data for the budget dashboard UI.
    Includes 24h, 7d, 30d windows.
    Single file read + in-memory filtering for all three windows.
    """
    now = time.time()
    ts_30d = now - 720 * 3600
    ts_7d = now - 168 * 3600
    ts_24h = now - 24 * 3600

    # One disk read for the widest window; narrower windows filter in memory.
    all_records = _load_usage(since_ts=ts_30d)
    records_7d = [r for r in all_records if r.ts >= ts_7d]
    records_24h = [r for r in records_7d if r.ts >= ts_24h]

    stats_30d = _aggregate_records(all_records)
    stats_7d = _aggregate_records(records_7d)
    stats_24h = _aggregate_records(records_24h)

    limits = _load_limits()

    # Apply limits to 30d stats
    for p, s in stats_30d.items():
        lim = limits.get(p, {})
        if "monthly_limit_usd" in lim:
            s.monthly_limit_usd = lim["monthly_limit_usd"]
        if "topup_balance_usd" in lim:
            s.topup_balance_usd = lim["topup_balance_usd"]
            # Remaining balance approximated as top-up minus 30d spend.
            s.estimated_remaining_usd = round(lim["topup_balance_usd"] - s.total_cost_usd, 4)

    # Show every priced provider (local ollama excluded) plus any provider
    # that actually appears in the usage log.
    all_providers = sorted({
        *(k for k in PRICING if k != "ollama"),
        *stats_30d.keys(),
    })

    providers_data = []
    for p in all_providers:
        # Providers with no usage fall back to zeroed stats so UI rows stay uniform.
        s30 = stats_30d.get(p, ProviderStats(provider=p))
        s7 = stats_7d.get(p, ProviderStats(provider=p))
        s24 = stats_24h.get(p, ProviderStats(provider=p))
        plim = limits.get(p, {})

        providers_data.append({
            "provider": p,
            "display_name": _provider_display_name(p),
            "icon": _provider_icon(p),
            # "available" = the provider's API-key env var is set and non-blank.
            "available": bool(os.getenv(_provider_env_key(p), "").strip()),
            "cost_24h": round(s24.total_cost_usd, 5),
            "cost_7d": round(s7.total_cost_usd, 5),
            "cost_30d": round(s30.total_cost_usd, 5),
            "calls_24h": s24.call_count,
            "calls_30d": s30.call_count,
            "tokens_24h": s24.total_input_tokens + s24.total_output_tokens,
            "tokens_30d": s30.total_input_tokens + s30.total_output_tokens,
            "avg_latency_ms": round(s30.avg_latency_ms),
            "monthly_limit_usd": s30.monthly_limit_usd,
            "topup_balance_usd": plim.get("topup_balance_usd"),
            "estimated_remaining_usd": s30.estimated_remaining_usd,
            "top_models": s30.top_models,
        })

    total_24h = sum(s.total_cost_usd for s in stats_24h.values())
    total_7d = sum(s.total_cost_usd for s in stats_7d.values())
    total_30d = sum(s.total_cost_usd for s in stats_30d.values())

    return {
        "providers": providers_data,
        "summary": {
            "total_cost_24h": round(total_24h, 5),
            "total_cost_7d": round(total_7d, 5),
            "total_cost_30d": round(total_30d, 5),
            "total_calls_30d": sum(s.call_count for s in stats_30d.values()),
        },
        "generated_at": now,
    }
|
|
|
|
|
|
def _aggregate_records(records: List[UsageRecord]) -> Dict[str, ProviderStats]:
    """Aggregate a list of records into per-provider stats.

    Sums tokens/cost/calls per provider, averages latency over only the
    records that carry a latency sample, and keeps each provider's top
    three models ranked by cost.

    FIX: the previous running average divided by the total call count, so
    records with latency_ms == 0 (skipped from the numerator) still inflated
    the denominator and dragged the average down. Latency is now accumulated
    separately and divided by the number of actual samples.
    """
    by_provider: Dict[str, ProviderStats] = {}
    # Per-provider latency accumulators (only records that reported latency).
    latency_sum: Dict[str, float] = defaultdict(float)
    latency_samples: Dict[str, int] = defaultdict(int)
    model_usage: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(
        lambda: defaultdict(lambda: {"calls": 0, "cost": 0.0, "tokens": 0})
    )
    for rec in records:
        p = rec.provider
        if p not in by_provider:
            by_provider[p] = ProviderStats(provider=p)
        s = by_provider[p]
        s.total_input_tokens += rec.input_tokens
        s.total_output_tokens += rec.output_tokens
        s.total_cost_usd += rec.cost_usd
        s.call_count += 1
        if rec.latency_ms:
            latency_sum[p] += rec.latency_ms
            latency_samples[p] += 1
        m = model_usage[p][rec.model]
        m["calls"] += 1
        m["cost"] += rec.cost_usd
        m["tokens"] += rec.input_tokens + rec.output_tokens

    for p, s in by_provider.items():
        if latency_samples[p]:
            s.avg_latency_ms = latency_sum[p] / latency_samples[p]
        top = sorted(model_usage[p].items(), key=lambda x: x[1]["cost"], reverse=True)[:3]
        s.top_models = [{"model": k, **v} for k, v in top]

    return by_provider
|
|
|
|
|
|
def rotate_usage_log(max_age_days: int = 90) -> int:
    """Remove records older than max_age_days. Returns count of removed lines.

    Unparseable lines are counted as removed. The pruned log is written to a
    temp file and atomically swapped in via os.replace, so a crash during
    rotation can no longer truncate the live log (the previous version opened
    the log with "w" in place, destroying all data before rewriting it).
    """
    if not _USAGE_FILE.exists():
        return 0
    cutoff = time.time() - max_age_days * 86400
    kept: List[str] = []
    removed = 0
    with _lock:
        try:
            with open(_USAGE_FILE, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        d = json.loads(line)
                        if d.get("ts", 0) >= cutoff:
                            kept.append(line)
                        else:
                            removed += 1
                    except Exception:
                        removed += 1
            # Atomic rewrite: write the survivors next to the log, then swap.
            tmp = _USAGE_FILE.with_suffix(".jsonl.tmp")
            with open(tmp, "w", encoding="utf-8") as f:
                for line in kept:
                    f.write(line + "\n")
            os.replace(tmp, _USAGE_FILE)
        except Exception as e:
            logger.warning("budget: rotate failed: %s", e)
    if removed:
        logger.info("budget: rotated %d old records (>%dd)", removed, max_age_days)
    return removed
|
|
|
|
|
|
def set_provider_limit(provider: str, monthly_limit_usd: Optional[float] = None, topup_balance_usd: Optional[float] = None) -> None:
    """Configure budget limits for a provider.

    Only the arguments that are not None are written; existing values for
    the other keys are preserved. Persisted to limits.json.
    """
    limits = _load_limits()
    entry = limits.setdefault(provider, {})
    if monthly_limit_usd is not None:
        entry["monthly_limit_usd"] = monthly_limit_usd
    if topup_balance_usd is not None:
        entry["topup_balance_usd"] = topup_balance_usd
    _save_limits(limits)
    logger.info("budget: set limits for %s: %s", provider, limits[provider])
|
|
|
|
|
|
def _provider_display_name(p: str) -> str:
|
|
return {
|
|
"anthropic": "Anthropic Claude",
|
|
"grok": "xAI Grok",
|
|
"deepseek": "DeepSeek",
|
|
"mistral": "Mistral AI",
|
|
"openai": "OpenAI",
|
|
"glm": "GLM / Z.AI",
|
|
"ollama": "Local (Ollama)",
|
|
}.get(p, p.title())
|
|
|
|
|
|
def _provider_icon(p: str) -> str:
|
|
return {
|
|
"anthropic": "🟣",
|
|
"grok": "⚡",
|
|
"deepseek": "🔵",
|
|
"mistral": "🌊",
|
|
"openai": "🟢",
|
|
"glm": "🐉",
|
|
"ollama": "🖥️",
|
|
}.get(p, "🤖")
|
|
|
|
|
|
def _provider_env_key(p: str) -> str:
|
|
return {
|
|
"anthropic": "ANTHROPIC_API_KEY",
|
|
"grok": "GROK_API_KEY",
|
|
"deepseek": "DEEPSEEK_API_KEY",
|
|
"mistral": "MISTRAL_API_KEY",
|
|
"openai": "OPENAI_API_KEY",
|
|
"glm": "GLM5_API_KEY",
|
|
}.get(p, f"{p.upper()}_API_KEY")
|