Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
337 lines
12 KiB
Python
337 lines
12 KiB
Python
#!/usr/bin/env python3
"""
voice_policy_update.py — Auto-update voice routing policy from audit results.

What it does:
1. Reads the N most recent audit JSON files from ops/voice_audit_results/
2. Computes p50/p95 per model across all runs
3. Generates ops/voice_latency_report.json — canonical latency snapshot
4. Proposes (and optionally applies) updates to router-config.yml:
   - prefer_models order (sorted by p50 ascending)
   - exclude_models (models with p50 > SLOW_THRESHOLD_MS)
   - auto_promote: set/update p95_ratio based on real data

Usage:
    python3 ops/voice_policy_update.py [--apply] [--n-runs 5]
    python3 ops/voice_policy_update.py            # dry-run: print proposals only
    python3 ops/voice_policy_update.py --apply    # write changes to router-config.yml

Safety:
- Always prints a diff before applying.
- Creates a backup: router-config.yml.bak before any write.
- Never removes a model that has no audit data (unknown ≠ slow).
- Never touches sections other than selection_policies.
"""
|
||
import argparse
import json
import os
import re
import shutil
import statistics
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import yaml
||
|
||
# This script lives in ops/, so two levels up is the repository root.
REPO_ROOT = Path(__file__).parent.parent
# Where voice_latency_audit.sh drops its audit_*.json result files.
AUDIT_DIR = REPO_ROOT / "ops" / "voice_audit_results"
# Canonical latency snapshot produced on every run (dry-run included).
REPORT_FILE = REPO_ROOT / "ops" / "voice_latency_report.json"
# The router policy file this script proposes/applies changes to.
ROUTER_CONFIG = REPO_ROOT / "services" / "router" / "router-config.yml"

# Models with p50 > this are considered "slow" for voice → excluded.
SLOW_THRESHOLD_MS = 15_000
# Models with p50 > this but ≤ SLOW_THRESHOLD_MS → warn only, keep in prefer list.
WARN_THRESHOLD_MS = 10_000
# auto_promote: promote the candidate model if its p95 ratio vs baseline < this.
AUTO_PROMOTE_RATIO_THRESHOLD = 0.9

# Scenario name → model mapping (from audit script naming convention).
SCENARIO_MODEL_MAP = {
    "CHAT_gemma3": "gemma3:latest",
    "CHAT_qwen3_8b": "qwen3:8b",
    "CHAT_qwen3_14b": "qwen3:14b",
    "CHAT_qwen35_35b": "qwen3.5:35b-a3b",
}
|
||
|
||
|
||
def load_audits(n: int = 10, audit_dir=None) -> list[dict]:
    """Load up to *n* most recent audit JSON documents.

    Args:
        n: maximum number of audit files to load.
        audit_dir: directory to scan; defaults to the module-level AUDIT_DIR.
            (Parameterized for reuse/testing; default behavior unchanged.)

    Returns:
        Parsed JSON documents, newest first. Files are named
        audit_<timestamp>.json, so a reverse lexical sort is newest-first.
        Unreadable or malformed files are skipped with a warning
        (best-effort, never fatal); missing directory yields [].
    """
    base = AUDIT_DIR if audit_dir is None else Path(audit_dir)
    if not base.exists():
        return []
    files = sorted(base.glob("audit_*.json"), reverse=True)[:n]
    audits = []
    for f in files:
        try:
            with open(f, encoding="utf-8") as fp:
                audits.append(json.load(fp))
        # Narrow catch: I/O, decoding, and parse errors are the expected
        # failure modes here; any other exception is a real bug and surfaces.
        except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
            print(f" warn: could not load {f}: {e}")
    return audits
|
||
|
||
|
||
def aggregate_model_latency(audits: list[dict], scenario_map=None) -> dict[str, dict]:
    """Aggregate per-model latency statistics across audit runs.

    Args:
        audits: parsed audit JSON documents (each carries a "results" list).
        scenario_map: scenario-name → model-name mapping; defaults to the
            module-level SCENARIO_MODEL_MAP. (Parameterized so callers/tests
            can supply their own convention; default behavior unchanged.)

    Returns:
        {model_name: {p50, p95, min, max, n, samples: [ms, ...]}}
        Only successful runs count: "status" == "ok" AND a positive "ms".
    """
    if scenario_map is None:
        scenario_map = SCENARIO_MODEL_MAP
    samples: dict[str, list[int]] = defaultdict(list)
    for audit in audits:
        for r in audit.get("results", []):
            model = scenario_map.get(r.get("scenario", ""))
            if model and r.get("status") == "ok" and r.get("ms", 0) > 0:
                samples[model].append(r["ms"])

    result = {}
    for model, ms_list in samples.items():
        s = sorted(ms_list)
        n = len(s)
        # Nearest-rank style percentiles; int(n * 0.95) < n for all n >= 1,
        # so the index is always in range.
        p50 = s[n // 2]
        p95 = s[int(n * 0.95)] if n > 1 else s[-1]
        result[model] = {
            "p50": p50,
            "p95": p95,
            "min": min(s),
            "max": max(s),
            "n": n,
            "samples": s,
        }
    return result
|
||
|
||
|
||
def aggregate_tts_latency(audits: list[dict]) -> dict:
    """Aggregate TTS latency across audit runs.

    Scans every result whose scenario name contains "TTS" and that
    completed successfully.

    Returns:
        {} when there is no data, otherwise {p50, p95, min, max, n}.
    """
    tts_ms = []
    for audit in audits:
        for r in audit.get("results", []):
            # Same success criteria as aggregate_model_latency: "ok" status
            # AND a positive "ms". The extra guard fixes a KeyError when a
            # result lacks an "ms" field and keeps zero readings out of stats.
            if "TTS" in r.get("scenario", "") and r.get("status") == "ok" and r.get("ms", 0) > 0:
                tts_ms.append(r["ms"])
    if not tts_ms:
        return {}
    s = sorted(tts_ms)
    return {
        "p50": s[len(s) // 2],
        "p95": s[int(len(s) * 0.95)] if len(s) > 1 else s[-1],
        "min": min(s), "max": max(s), "n": len(s),
    }
|
||
|
||
|
||
def generate_report(model_stats: dict, tts_stats: dict, n_audits: int) -> dict:
    """Assemble the canonical latency report from aggregated statistics.

    Ranks models by p50, tags each with ok/warn/exclude status against the
    module thresholds, derives the recommended prefer/exclude lists for
    voice_fast_uk, and — when both candidate and baseline have data —
    computes the auto_promote verdict (qwen3.5:35b-a3b vs qwen3:14b).
    """
    # Fastest first: every downstream list keeps this p50 ordering.
    ranked = sorted(model_stats.items(), key=lambda item: item[1]["p50"])

    models_section = {}
    for name, stats in ranked:
        if stats["p50"] > SLOW_THRESHOLD_MS:
            tag = "slow_exclude"
        elif stats["p50"] > WARN_THRESHOLD_MS:
            tag = "warn_borderline"
        else:
            tag = "ok"
        models_section[name] = {**stats, "status": tag}

    prefer = [name for name, stats in ranked if stats["p50"] <= SLOW_THRESHOLD_MS]
    exclude = [name for name, stats in ranked if stats["p50"] > SLOW_THRESHOLD_MS]
    # Known-slow models are excluded unconditionally, even without samples.
    for known_slow in ("glm-4.7-flash:32k", "glm-4.7-flash", "deepseek-r1:70b"):
        if known_slow not in exclude:
            exclude.append(known_slow)

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "n_audit_runs": n_audits,
        "slo": {
            "tts_p95_target_ms": 2500,
            "llm_fast_p95_target_ms": 9000,
            "e2e_p95_target_ms": 12000,
        },
        "tts": tts_stats,
        "models": models_section,
        "voice_fast_uk_recommended_prefer": prefer,
        "voice_fast_uk_recommended_exclude": exclude,
        "auto_promote_verdict": None,
    }

    # auto_promote verdict requires real samples for BOTH models.
    candidate = model_stats.get("qwen3.5:35b-a3b", {})
    baseline = model_stats.get("qwen3:14b", {})
    if candidate and baseline:
        # max(..., 1) guards against a zero baseline p95.
        ratio = candidate["p95"] / max(baseline["p95"], 1)
        if ratio < AUTO_PROMOTE_RATIO_THRESHOLD:
            outcome = "promote"
        elif ratio < 1.0:
            outcome = "keep_as_second"
        else:
            outcome = "move_to_quality_only"
        descriptions = {
            "promote": "qwen3.5 qualifies for auto_promote in voice_fast_uk (>10% faster)",
            "keep_as_second": "qwen3.5 faster but <10% — keep as 2nd in prefer list",
            "move_to_quality_only": "qwen3.5 slower than 14b — move to voice_quality_uk only",
        }
        report["auto_promote_verdict"] = {
            "candidate": "qwen3.5:35b-a3b",
            "baseline": "qwen3:14b",
            "ratio": round(ratio, 3),
            "threshold": AUTO_PROMOTE_RATIO_THRESHOLD,
            "verdict": outcome,
            "description": descriptions[outcome],
        }

    return report
|
||
|
||
|
||
def propose_config_updates(report: dict, current_config: dict) -> list[dict]:
    """Diff the report's recommendations against the live router config.

    Produces a list of {section, old, new, reason} proposals, strictly
    limited to the selection_policies.voice_fast_uk policy.
    """
    voice_fast = current_config.get("selection_policies", {}).get("voice_fast_uk", {})

    rec_prefer = report.get("voice_fast_uk_recommended_prefer", [])
    rec_exclude = report.get("voice_fast_uk_recommended_exclude", [])
    cur_prefer = voice_fast.get("prefer_models", [])
    cur_exclude = voice_fast.get("exclude_models", [])

    proposals: list[dict] = []

    # 1) Re-order prefer_models whenever the recommended ranking differs.
    if rec_prefer and rec_prefer != cur_prefer:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.prefer_models",
            "old": cur_prefer,
            "new": rec_prefer,
            "reason": f"Sorted by p50 from {report['n_audit_runs']} audit run(s)",
        })

    # 2) Append newly-discovered slow models (entries are never removed).
    additions = [m for m in rec_exclude if m not in cur_exclude]
    if additions:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.exclude_models",
            "old": cur_exclude,
            "new": cur_exclude + additions,
            "reason": f"Add slow models: {additions}",
        })

    # 3) Refresh the auto_promote p95 ratio when unset or drifted by > 0.05.
    ap_verdict = report.get("auto_promote_verdict")
    if ap_verdict:
        cur_ratio = (
            voice_fast.get("auto_promote", {})
            .get("condition", {})
            .get("p95_ratio_vs_next_model", None)
        )
        measured = ap_verdict["ratio"]
        if cur_ratio is None or abs(cur_ratio - measured) > 0.05:
            proposals.append({
                "section": "selection_policies.voice_fast_uk.auto_promote.condition.p95_ratio_vs_next_model",
                "old": cur_ratio,
                "new": round(measured, 3),
                "reason": f"Updated from real audit data. Verdict: {ap_verdict['verdict']}",
            })

    return proposals
|
||
|
||
|
||
def apply_updates(proposals: list[dict], config_path: Path) -> None:
    """Apply approved proposals to the router config file, backup first.

    Each proposal's dotted "section" path is walked into the parsed YAML
    (creating intermediate mappings as needed) and its "new" value is set.
    The file is rewritten in block style with key order preserved.
    """
    # Safety net: keep a .bak copy next to the config before any mutation.
    backup = config_path.with_suffix(".yml.bak")
    shutil.copy2(config_path, backup)
    print(f"\n Backup created: {backup}")

    with open(config_path) as f:
        config = yaml.safe_load(f)

    for proposal in proposals:
        dotted = proposal["section"]
        value = proposal["new"]
        *parents, leaf = dotted.split(".")
        node = config
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = value
        print(f" Applied: {dotted}")
        print(f" old: {proposal['old']}")
        print(f" new: {value}")

    with open(config_path, "w") as f:
        yaml.dump(config, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"\n Written: {config_path}")
|
||
|
||
|
||
def print_report_summary(report: dict) -> None:
    """Render the latency report to stdout in a human-readable form."""
    print("\n── Voice Latency Report ──")
    print(f" Generated: {report['generated_at']}")
    print(f" Audit runs analyzed: {report['n_audit_runs']}")

    tts_stats = report.get("tts", {})
    if tts_stats:
        # SLO badge: green while TTS p95 stays within the 2500 ms target.
        badge = "✅" if tts_stats.get("p95", 9999) <= 2500 else "⚠️ "
        print(f"\n TTS: p50={tts_stats.get('p50')}ms p95={tts_stats.get('p95')}ms {badge}")

    status_icons = {"ok": "✅", "warn_borderline": "⚠️ ", "slow_exclude": "❌"}
    print("\n LLM Model Ranking (by p50):")
    for name, stats in report.get("models", {}).items():
        marker = status_icons.get(stats["status"], " ")
        print(f" {marker} {name:25s} p50={stats['p50']}ms p95={stats['p95']}ms n={stats['n']}")

    promo = report.get("auto_promote_verdict")
    if promo:
        verdict_icons = {"promote": "✅", "keep_as_second": "ℹ️ ", "move_to_quality_only": "⚠️ "}
        marker = verdict_icons.get(promo["verdict"], " ")
        print(f"\n auto_promote: {marker} {promo['description']} (ratio={promo['ratio']})")

    print(f"\n Recommended prefer: {report.get('voice_fast_uk_recommended_prefer')}")
    # Hide the hard-coded always-exclude families (glm/deepseek) so only
    # data-driven exclude additions are shown here.
    discovered = [
        m for m in report.get("voice_fast_uk_recommended_exclude", [])
        if "glm" not in m and "deepseek" not in m
    ]
    print(f" Recommended exclude additions: {discovered}")
|
||
|
||
|
||
def main():
    """CLI entry point: aggregate audits, report, propose/apply config updates."""
    parser = argparse.ArgumentParser(description="Voice policy auto-update from audit data")
    parser.add_argument("--apply", action="store_true", help="Apply updates to router-config.yml")
    parser.add_argument("--n-runs", type=int, default=10, help="Number of recent audit runs to analyze")
    args = parser.parse_args()

    print("╔══════════════════════════════════════════╗")
    print("║ Voice Policy Auto-Update ║")
    print("╚══════════════════════════════════════════╝")

    audits = load_audits(args.n_runs)
    if not audits:
        print("\n No audit results found. Run: bash ops/voice_latency_audit.sh first.")
        return
    print(f"\n Loaded {len(audits)} audit run(s)")

    report = generate_report(
        aggregate_model_latency(audits),
        aggregate_tts_latency(audits),
        len(audits),
    )
    print_report_summary(report)

    # Persist the canonical snapshot regardless of dry-run/apply mode.
    with open(REPORT_FILE, "w") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\n Report saved: {REPORT_FILE}")

    with open(ROUTER_CONFIG) as f:
        live_config = yaml.safe_load(f)

    proposals = propose_config_updates(report, live_config)
    if not proposals:
        print("\n No config changes needed — policy is up to date.")
    else:
        print(f"\n── {len(proposals)} Proposed Config Update(s) ──")
        for proposal in proposals:
            print(f"\n [{proposal['section']}]")
            print(f" reason: {proposal['reason']}")
            print(f" old: {proposal['old']}")
            print(f" new: {proposal['new']}")

        if args.apply:
            print("\n── Applying updates ──")
            apply_updates(proposals, ROUTER_CONFIG)
            print("\n Done. Re-run tests: python3 -m pytest tests/test_voice_policy.py -v")
        else:
            print("\n Dry-run mode. To apply: python3 ops/voice_policy_update.py --apply")
|
||
|
||
|
||
# Script entry guard: lets the module be imported for its helpers
# without triggering the CLI.
if __name__ == "__main__":
    main()
|