#!/usr/bin/env python3
"""
voice_policy_update.py — Auto-update voice routing policy from audit results.

What it does:
1. Reads the N most recent audit JSON files from ops/voice_audit_results/
2. Computes p50/p95 per model across all runs
3. Generates ops/voice_latency_report.json — canonical latency snapshot
4. Proposes (and optionally applies) updates to router-config.yml:
   - prefer_models order (sorted by p50 ascending)
   - exclude_models (models with p50 > SLOW_THRESHOLD_MS)
   - auto_promote: set/update p95_ratio based on real data

Usage:
    python3 ops/voice_policy_update.py [--apply] [--n-runs 5]
    python3 ops/voice_policy_update.py            # dry-run: print proposals only
    python3 ops/voice_policy_update.py --apply    # write changes to router-config.yml

Safety:
- Always prints a diff before applying.
- Creates a backup: router-config.yml.bak before any write.
- Never removes a model that has no audit data (unknown ≠ slow).
- Never touches sections other than selection_policies.
"""

import argparse
import json
import os
import re
import shutil
import statistics
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import yaml

REPO_ROOT = Path(__file__).parent.parent
AUDIT_DIR = REPO_ROOT / "ops" / "voice_audit_results"
REPORT_FILE = REPO_ROOT / "ops" / "voice_latency_report.json"
ROUTER_CONFIG = REPO_ROOT / "services" / "router" / "router-config.yml"

# Models with p50 > this are considered "slow" for voice
SLOW_THRESHOLD_MS = 15_000
# Models with p50 > this but ≤ SLOW_THRESHOLD_MS → warn only
WARN_THRESHOLD_MS = 10_000
# auto_promote: apply if ratio < this
AUTO_PROMOTE_RATIO_THRESHOLD = 0.9

# Scenario name → model mapping (from audit script naming convention)
SCENARIO_MODEL_MAP = {
    "CHAT_gemma3": "gemma3:latest",
    "CHAT_qwen3_8b": "qwen3:8b",
    "CHAT_qwen3_14b": "qwen3:14b",
    "CHAT_qwen35_35b": "qwen3.5:35b-a3b",
}


def load_audits(n: int = 10) -> list[dict]:
    """Load the *n* most recent audit JSON files.

    Recency is determined by filename sort order (audit files embed a
    sortable timestamp by convention — assumed, TODO confirm against
    the audit script). Unreadable files are skipped with a warning.
    """
    if not AUDIT_DIR.exists():
        return []
    files = sorted(AUDIT_DIR.glob("audit_*.json"), reverse=True)[:n]
    audits = []
    for f in files:
        try:
            with open(f) as fp:
                audits.append(json.load(fp))
        except Exception as e:
            print(f" warn: could not load {f}: {e}")
    return audits


def aggregate_model_latency(audits: list[dict]) -> dict[str, dict]:
    """
    Returns {model_name: {p50, p95, min, max, n, samples: [ms, ...]}}
    Only includes successful runs.
    """
    samples: dict[str, list[int]] = defaultdict(list)
    for audit in audits:
        for r in audit.get("results", []):
            model = SCENARIO_MODEL_MAP.get(r.get("scenario", ""))
            # Only count successful runs with a positive latency sample.
            if model and r.get("status") == "ok" and r.get("ms", 0) > 0:
                samples[model].append(r["ms"])

    result = {}
    for model, ms_list in samples.items():
        s = sorted(ms_list)
        n = len(s)
        # Nearest-rank percentiles over the sorted samples; int() floors,
        # so the p95 index is always a valid position (< n).
        p50 = s[n // 2]
        p95 = s[int(n * 0.95)] if n > 1 else s[-1]
        result[model] = {
            "p50": p50,
            "p95": p95,
            "min": min(s),
            "max": max(s),
            "n": n,
            "samples": s,
        }
    return result


def aggregate_tts_latency(audits: list[dict]) -> dict:
    """Aggregate latency stats across all successful TTS scenarios.

    Returns {p50, p95, min, max, n}, or {} when there are no TTS samples.
    """
    tts_ms = []
    for audit in audits:
        for r in audit.get("results", []):
            # FIX: guard on a positive "ms" value (same rule as
            # aggregate_model_latency) — previously an "ok" result with a
            # missing "ms" key raised KeyError.
            if "TTS" in r.get("scenario", "") and r.get("status") == "ok" and r.get("ms", 0) > 0:
                tts_ms.append(r["ms"])
    if not tts_ms:
        return {}
    s = sorted(tts_ms)
    return {
        "p50": s[len(s) // 2],
        "p95": s[int(len(s) * 0.95)] if len(s) > 1 else s[-1],
        "min": min(s),
        "max": max(s),
        "n": len(s),
    }


def generate_report(model_stats: dict, tts_stats: dict, n_audits: int) -> dict:
    """Build the canonical latency report dict from aggregated stats.

    Includes per-model status classification, recommended prefer/exclude
    lists for voice_fast_uk, and the auto_promote verdict (qwen3.5 vs
    qwen3:14b) when both models have data.
    """
    now = datetime.now(timezone.utc).isoformat()
    report = {
        "generated_at": now,
        "n_audit_runs": n_audits,
        "slo": {
            "tts_p95_target_ms": 2500,
            "llm_fast_p95_target_ms": 9000,
            "e2e_p95_target_ms": 12000,
        },
        "tts": tts_stats,
        "models": {},
        "voice_fast_uk_recommended_prefer": [],
        "voice_fast_uk_recommended_exclude": [],
        "auto_promote_verdict": None,
    }

    # Sort models by p50
    ranked = sorted(model_stats.items(), key=lambda x: x[1]["p50"])
    for model, stats in ranked:
        status = "ok"
        if stats["p50"] > SLOW_THRESHOLD_MS:
            status = "slow_exclude"
        elif stats["p50"] > WARN_THRESHOLD_MS:
            status = "warn_borderline"
        report["models"][model] = {**stats, "status": status}

    # Build recommended prefer list (exclude slow models)
    prefer = [m for m, s in ranked if s["p50"] <= SLOW_THRESHOLD_MS]
    exclude = [m for m, s in ranked if s["p50"] > SLOW_THRESHOLD_MS]

    # Always exclude known-slow even if no data
    always_exclude = ["glm-4.7-flash:32k", "glm-4.7-flash", "deepseek-r1:70b"]
    for m in always_exclude:
        if m not in exclude:
            exclude.append(m)

    report["voice_fast_uk_recommended_prefer"] = prefer
    report["voice_fast_uk_recommended_exclude"] = exclude

    # auto_promote verdict: qwen3.5 vs qwen3:14b
    q35 = model_stats.get("qwen3.5:35b-a3b", {})
    q14 = model_stats.get("qwen3:14b", {})
    if q35 and q14:
        # max(..., 1) protects against a zero p95 baseline.
        ratio = q35["p95"] / max(q14["p95"], 1)
        verdict = "promote" if ratio < AUTO_PROMOTE_RATIO_THRESHOLD else (
            "keep_as_second" if ratio < 1.0 else "move_to_quality_only"
        )
        report["auto_promote_verdict"] = {
            "candidate": "qwen3.5:35b-a3b",
            "baseline": "qwen3:14b",
            "ratio": round(ratio, 3),
            "threshold": AUTO_PROMOTE_RATIO_THRESHOLD,
            "verdict": verdict,
            "description": {
                "promote": "qwen3.5 qualifies for auto_promote in voice_fast_uk (>10% faster)",
                "keep_as_second": "qwen3.5 faster but <10% — keep as 2nd in prefer list",
                "move_to_quality_only": "qwen3.5 slower than 14b — move to voice_quality_uk only",
            }[verdict],
        }
    return report


def propose_config_updates(report: dict, current_config: dict) -> list[dict]:
    """Returns list of {section, key, old, new, reason} proposals."""
    proposals = []
    sp = current_config.get("selection_policies", {})
    vf = sp.get("voice_fast_uk", {})

    rec_prefer = report.get("voice_fast_uk_recommended_prefer", [])
    rec_exclude = report.get("voice_fast_uk_recommended_exclude", [])
    current_prefer = vf.get("prefer_models", [])
    current_exclude = vf.get("exclude_models", [])

    # Prefer order: suggest if different from current
    if rec_prefer and rec_prefer != current_prefer:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.prefer_models",
            "old": current_prefer,
            "new": rec_prefer,
            "reason": f"Sorted by p50 from {report['n_audit_runs']} audit run(s)",
        })

    # Exclude: add newly-discovered slow models
    new_excludes = [m for m in rec_exclude if m not in current_exclude]
    if new_excludes:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.exclude_models",
            "old": current_exclude,
            "new": current_exclude + new_excludes,
            "reason": f"Add slow models: {new_excludes}",
        })

    # auto_promote ratio update
    ap_verdict = report.get("auto_promote_verdict")
    if ap_verdict:
        current_ratio = vf.get("auto_promote", {}).get("condition", {}).get(
            "p95_ratio_vs_next_model", None
        )
        new_ratio = ap_verdict["ratio"]
        # Only propose when the measured ratio drifts meaningfully (>0.05).
        if current_ratio is None or abs(current_ratio - new_ratio) > 0.05:
            proposals.append({
                "section": "selection_policies.voice_fast_uk.auto_promote.condition.p95_ratio_vs_next_model",
                "old": current_ratio,
                "new": round(new_ratio, 3),
                "reason": f"Updated from real audit data. Verdict: {ap_verdict['verdict']}",
            })

    return proposals


def apply_updates(proposals: list[dict], config_path: Path) -> None:
    """Write updates to router-config.yml with backup."""
    # with_suffix replaces ".yml" with ".yml.bak" → router-config.yml.bak
    backup = config_path.with_suffix(".yml.bak")
    shutil.copy2(config_path, backup)
    print(f"\n Backup created: {backup}")

    with open(config_path) as f:
        config = yaml.safe_load(f)

    for p in proposals:
        section = p["section"]
        new_val = p["new"]
        # Navigate and set: walk the dotted path, creating dicts as needed.
        keys = section.split(".")
        obj = config
        for k in keys[:-1]:
            obj = obj.setdefault(k, {})
        obj[keys[-1]] = new_val
        print(f" Applied: {section}")
        print(f" old: {p['old']}")
        print(f" new: {new_val}")

    with open(config_path, "w") as f:
        yaml.dump(config, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"\n Written: {config_path}")


def print_report_summary(report: dict) -> None:
    """Pretty-print the latency report to stdout."""
    print("\n── Voice Latency Report ──")
    print(f" Generated: {report['generated_at']}")
    print(f" Audit runs analyzed: {report['n_audit_runs']}")

    tts = report.get("tts", {})
    if tts:
        slo_ok = "✅" if tts.get("p95", 9999) <= 2500 else "⚠️ "
        print(f"\n TTS: p50={tts.get('p50')}ms p95={tts.get('p95')}ms {slo_ok}")

    print("\n LLM Model Ranking (by p50):")
    for model, stats in report.get("models", {}).items():
        icon = {"ok": "✅", "warn_borderline": "⚠️ ", "slow_exclude": "❌"}.get(stats["status"], " ")
        print(f" {icon} {model:25s} p50={stats['p50']}ms p95={stats['p95']}ms n={stats['n']}")

    ap = report.get("auto_promote_verdict")
    if ap:
        verdict_icon = {"promote": "✅", "keep_as_second": "ℹ️ ", "move_to_quality_only": "⚠️ "}.get(
            ap["verdict"], " "
        )
        print(f"\n auto_promote: {verdict_icon} {ap['description']} (ratio={ap['ratio']})")

    print(f"\n Recommended prefer: {report.get('voice_fast_uk_recommended_prefer')}")
    # Hide the hard-coded always-exclude models so only newly-measured
    # slow models are surfaced here.
    print(f" Recommended exclude additions: {[m for m in report.get('voice_fast_uk_recommended_exclude', []) if 'glm' not in m and 'deepseek' not in m]}")


def main():
    parser = argparse.ArgumentParser(description="Voice policy auto-update from audit data")
    parser.add_argument("--apply", action="store_true", help="Apply updates to router-config.yml")
    parser.add_argument("--n-runs", type=int, default=10, help="Number of recent audit runs to analyze")
    args = parser.parse_args()

    print("╔══════════════════════════════════════════╗")
    print("║ Voice Policy Auto-Update ║")
    print("╚══════════════════════════════════════════╝")

    audits = load_audits(args.n_runs)
    if not audits:
        print("\n No audit results found. Run: bash ops/voice_latency_audit.sh first.")
        return
    print(f"\n Loaded {len(audits)} audit run(s)")

    model_stats = aggregate_model_latency(audits)
    tts_stats = aggregate_tts_latency(audits)
    report = generate_report(model_stats, tts_stats, len(audits))
    print_report_summary(report)

    # Save report
    with open(REPORT_FILE, "w") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\n Report saved: {REPORT_FILE}")

    # FIX: fail gracefully (like the missing-audits case above) instead of
    # crashing with a traceback when the router config does not exist.
    if not ROUTER_CONFIG.exists():
        print(f"\n Router config not found: {ROUTER_CONFIG}")
        return

    # Load current config
    with open(ROUTER_CONFIG) as f:
        current_config = yaml.safe_load(f)

    proposals = propose_config_updates(report, current_config)
    if not proposals:
        print("\n No config changes needed — policy is up to date.")
    else:
        print(f"\n── {len(proposals)} Proposed Config Update(s) ──")
        for p in proposals:
            print(f"\n [{p['section']}]")
            print(f" reason: {p['reason']}")
            print(f" old: {p['old']}")
            print(f" new: {p['new']}")

        if args.apply:
            print("\n── Applying updates ──")
            apply_updates(proposals, ROUTER_CONFIG)
            print("\n Done. Re-run tests: python3 -m pytest tests/test_voice_policy.py -v")
        else:
            print("\n Dry-run mode. To apply: python3 ops/voice_policy_update.py --apply")


if __name__ == "__main__":
    main()