Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
337 lines
12 KiB
Python
337 lines
12 KiB
Python
#!/usr/bin/env python3
"""
voice_policy_update.py — Auto-update voice routing policy from audit results.

What it does:
1. Reads the N most recent audit JSON files from ops/voice_audit_results/
2. Computes p50/p95 per model across all runs
3. Generates ops/voice_latency_report.json — canonical latency snapshot
4. Proposes (and optionally applies) updates to router-config.yml:
   - prefer_models order (sorted by p50 ascending)
   - exclude_models (models with p50 > SLOW_THRESHOLD_MS)
   - auto_promote: set/update p95_ratio based on real data

Usage:
    python3 ops/voice_policy_update.py [--apply] [--n-runs 5]
    python3 ops/voice_policy_update.py            # dry-run: print proposals only
    python3 ops/voice_policy_update.py --apply    # write changes to router-config.yml

Safety:
- Always prints a diff before applying.
- Creates a backup: router-config.yml.bak before any write.
- Never removes a model that has no audit data (unknown ≠ slow).
- Never touches sections other than selection_policies.
"""
|
||
import argparse
import json
import os
import re
import shutil
import statistics
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path

import yaml
||
|
||
# This script lives in ops/, so two levels up is the repository root.
REPO_ROOT = Path(__file__).parent.parent
# Where voice_latency_audit.sh drops its audit_*.json result files.
AUDIT_DIR = REPO_ROOT / "ops" / "voice_audit_results"
# Canonical latency snapshot produced on every run (dry-run included).
REPORT_FILE = REPO_ROOT / "ops" / "voice_latency_report.json"
# The router policy file this script proposes/applies changes to.
ROUTER_CONFIG = REPO_ROOT / "services" / "router" / "router-config.yml"

# Models with p50 > this are considered "slow" for voice → excluded.
SLOW_THRESHOLD_MS = 15_000
# Models with p50 > this but ≤ SLOW_THRESHOLD_MS → warn only, keep in prefer list.
WARN_THRESHOLD_MS = 10_000
# auto_promote: promote the candidate model if its p95 ratio vs baseline < this.
AUTO_PROMOTE_RATIO_THRESHOLD = 0.9

# Scenario name → model mapping (from audit script naming convention).
SCENARIO_MODEL_MAP = {
    "CHAT_gemma3": "gemma3:latest",
    "CHAT_qwen3_8b": "qwen3:8b",
    "CHAT_qwen3_14b": "qwen3:14b",
    "CHAT_qwen35_35b": "qwen3.5:35b-a3b",
}
|
||
|
||
|
||
def load_audits(n: int = 10, audit_dir=None) -> list[dict]:
    """Load up to *n* most recent audit JSON documents.

    Args:
        n: maximum number of audit files to load.
        audit_dir: directory to scan; defaults to the module-level AUDIT_DIR.
            (Parameterized for reuse/testing; default behavior unchanged.)

    Returns:
        Parsed JSON documents, newest first. Files are named
        audit_<timestamp>.json, so a reverse lexical sort is newest-first.
        Unreadable or malformed files are skipped with a warning
        (best-effort, never fatal); missing directory yields [].
    """
    base = AUDIT_DIR if audit_dir is None else Path(audit_dir)
    if not base.exists():
        return []
    files = sorted(base.glob("audit_*.json"), reverse=True)[:n]
    audits = []
    for f in files:
        try:
            with open(f, encoding="utf-8") as fp:
                audits.append(json.load(fp))
        # Narrow catch: I/O, decoding, and parse errors are the expected
        # failure modes here; any other exception is a real bug and surfaces.
        except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
            print(f" warn: could not load {f}: {e}")
    return audits
|
||
|
||
|
||
def aggregate_model_latency(audits: list[dict], scenario_map=None) -> dict[str, dict]:
    """Aggregate per-model latency statistics across audit runs.

    Args:
        audits: parsed audit JSON documents (each carries a "results" list).
        scenario_map: scenario-name → model-name mapping; defaults to the
            module-level SCENARIO_MODEL_MAP. (Parameterized so callers/tests
            can supply their own convention; default behavior unchanged.)

    Returns:
        {model_name: {p50, p95, min, max, n, samples: [ms, ...]}}
        Only successful runs count: "status" == "ok" AND a positive "ms".
    """
    if scenario_map is None:
        scenario_map = SCENARIO_MODEL_MAP
    samples: dict[str, list[int]] = defaultdict(list)
    for audit in audits:
        for r in audit.get("results", []):
            model = scenario_map.get(r.get("scenario", ""))
            if model and r.get("status") == "ok" and r.get("ms", 0) > 0:
                samples[model].append(r["ms"])

    result = {}
    for model, ms_list in samples.items():
        s = sorted(ms_list)
        n = len(s)
        # Nearest-rank style percentiles; int(n * 0.95) < n for all n >= 1,
        # so the index is always in range.
        p50 = s[n // 2]
        p95 = s[int(n * 0.95)] if n > 1 else s[-1]
        result[model] = {
            "p50": p50,
            "p95": p95,
            "min": min(s),
            "max": max(s),
            "n": n,
            "samples": s,
        }
    return result
|
||
|
||
|
||
def aggregate_tts_latency(audits: list[dict]) -> dict:
    """Aggregate TTS latency across audit runs.

    Scans every result whose scenario name contains "TTS" and that
    completed successfully.

    Returns:
        {} when there is no data, otherwise {p50, p95, min, max, n}.
    """
    tts_ms = []
    for audit in audits:
        for r in audit.get("results", []):
            # Same success criteria as aggregate_model_latency: "ok" status
            # AND a positive "ms". The extra guard fixes a KeyError when a
            # result lacks an "ms" field and keeps zero readings out of stats.
            if "TTS" in r.get("scenario", "") and r.get("status") == "ok" and r.get("ms", 0) > 0:
                tts_ms.append(r["ms"])
    if not tts_ms:
        return {}
    s = sorted(tts_ms)
    return {
        "p50": s[len(s) // 2],
        "p95": s[int(len(s) * 0.95)] if len(s) > 1 else s[-1],
        "min": min(s), "max": max(s), "n": len(s),
    }
|
||
|
||
|
||
def generate_report(model_stats: dict, tts_stats: dict, n_audits: int) -> dict:
    """Assemble the canonical latency report from aggregated statistics.

    Ranks models by p50, tags each with ok/warn/exclude status against the
    module thresholds, derives the recommended prefer/exclude lists for
    voice_fast_uk, and — when both candidate and baseline have data —
    computes the auto_promote verdict (qwen3.5:35b-a3b vs qwen3:14b).
    """
    # Fastest first: every downstream list keeps this p50 ordering.
    ranked = sorted(model_stats.items(), key=lambda item: item[1]["p50"])

    models_section = {}
    for name, stats in ranked:
        if stats["p50"] > SLOW_THRESHOLD_MS:
            tag = "slow_exclude"
        elif stats["p50"] > WARN_THRESHOLD_MS:
            tag = "warn_borderline"
        else:
            tag = "ok"
        models_section[name] = {**stats, "status": tag}

    prefer = [name for name, stats in ranked if stats["p50"] <= SLOW_THRESHOLD_MS]
    exclude = [name for name, stats in ranked if stats["p50"] > SLOW_THRESHOLD_MS]
    # Known-slow models are excluded unconditionally, even without samples.
    for known_slow in ("glm-4.7-flash:32k", "glm-4.7-flash", "deepseek-r1:70b"):
        if known_slow not in exclude:
            exclude.append(known_slow)

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "n_audit_runs": n_audits,
        "slo": {
            "tts_p95_target_ms": 2500,
            "llm_fast_p95_target_ms": 9000,
            "e2e_p95_target_ms": 12000,
        },
        "tts": tts_stats,
        "models": models_section,
        "voice_fast_uk_recommended_prefer": prefer,
        "voice_fast_uk_recommended_exclude": exclude,
        "auto_promote_verdict": None,
    }

    # auto_promote verdict requires real samples for BOTH models.
    candidate = model_stats.get("qwen3.5:35b-a3b", {})
    baseline = model_stats.get("qwen3:14b", {})
    if candidate and baseline:
        # max(..., 1) guards against a zero baseline p95.
        ratio = candidate["p95"] / max(baseline["p95"], 1)
        if ratio < AUTO_PROMOTE_RATIO_THRESHOLD:
            outcome = "promote"
        elif ratio < 1.0:
            outcome = "keep_as_second"
        else:
            outcome = "move_to_quality_only"
        descriptions = {
            "promote": "qwen3.5 qualifies for auto_promote in voice_fast_uk (>10% faster)",
            "keep_as_second": "qwen3.5 faster but <10% — keep as 2nd in prefer list",
            "move_to_quality_only": "qwen3.5 slower than 14b — move to voice_quality_uk only",
        }
        report["auto_promote_verdict"] = {
            "candidate": "qwen3.5:35b-a3b",
            "baseline": "qwen3:14b",
            "ratio": round(ratio, 3),
            "threshold": AUTO_PROMOTE_RATIO_THRESHOLD,
            "verdict": outcome,
            "description": descriptions[outcome],
        }

    return report
|
||
|
||
|
||
def propose_config_updates(report: dict, current_config: dict) -> list[dict]:
    """Diff the report's recommendations against the live router config.

    Produces a list of {section, old, new, reason} proposals, strictly
    limited to the selection_policies.voice_fast_uk policy.
    """
    voice_fast = current_config.get("selection_policies", {}).get("voice_fast_uk", {})

    rec_prefer = report.get("voice_fast_uk_recommended_prefer", [])
    rec_exclude = report.get("voice_fast_uk_recommended_exclude", [])
    cur_prefer = voice_fast.get("prefer_models", [])
    cur_exclude = voice_fast.get("exclude_models", [])

    proposals: list[dict] = []

    # 1) Re-order prefer_models whenever the recommended ranking differs.
    if rec_prefer and rec_prefer != cur_prefer:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.prefer_models",
            "old": cur_prefer,
            "new": rec_prefer,
            "reason": f"Sorted by p50 from {report['n_audit_runs']} audit run(s)",
        })

    # 2) Append newly-discovered slow models (entries are never removed).
    additions = [m for m in rec_exclude if m not in cur_exclude]
    if additions:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.exclude_models",
            "old": cur_exclude,
            "new": cur_exclude + additions,
            "reason": f"Add slow models: {additions}",
        })

    # 3) Refresh the auto_promote p95 ratio when unset or drifted by > 0.05.
    ap_verdict = report.get("auto_promote_verdict")
    if ap_verdict:
        cur_ratio = (
            voice_fast.get("auto_promote", {})
            .get("condition", {})
            .get("p95_ratio_vs_next_model", None)
        )
        measured = ap_verdict["ratio"]
        if cur_ratio is None or abs(cur_ratio - measured) > 0.05:
            proposals.append({
                "section": "selection_policies.voice_fast_uk.auto_promote.condition.p95_ratio_vs_next_model",
                "old": cur_ratio,
                "new": round(measured, 3),
                "reason": f"Updated from real audit data. Verdict: {ap_verdict['verdict']}",
            })

    return proposals
|
||
|
||
|
||
def apply_updates(proposals: list[dict], config_path: Path) -> None:
    """Apply approved proposals to the router config file, backup first.

    Each proposal's dotted "section" path is walked into the parsed YAML
    (creating intermediate mappings as needed) and its "new" value is set.
    The file is rewritten in block style with key order preserved.
    """
    # Safety net: keep a .bak copy next to the config before any mutation.
    backup = config_path.with_suffix(".yml.bak")
    shutil.copy2(config_path, backup)
    print(f"\n Backup created: {backup}")

    with open(config_path) as f:
        config = yaml.safe_load(f)

    for proposal in proposals:
        dotted = proposal["section"]
        value = proposal["new"]
        *parents, leaf = dotted.split(".")
        node = config
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = value
        print(f" Applied: {dotted}")
        print(f" old: {proposal['old']}")
        print(f" new: {value}")

    with open(config_path, "w") as f:
        yaml.dump(config, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"\n Written: {config_path}")
|
||
|
||
|
||
def print_report_summary(report: dict) -> None:
    """Render the latency report to stdout in a human-readable form."""
    print("\n── Voice Latency Report ──")
    print(f" Generated: {report['generated_at']}")
    print(f" Audit runs analyzed: {report['n_audit_runs']}")

    tts_stats = report.get("tts", {})
    if tts_stats:
        # SLO badge: green while TTS p95 stays within the 2500 ms target.
        badge = "✅" if tts_stats.get("p95", 9999) <= 2500 else "⚠️ "
        print(f"\n TTS: p50={tts_stats.get('p50')}ms p95={tts_stats.get('p95')}ms {badge}")

    status_icons = {"ok": "✅", "warn_borderline": "⚠️ ", "slow_exclude": "❌"}
    print("\n LLM Model Ranking (by p50):")
    for name, stats in report.get("models", {}).items():
        marker = status_icons.get(stats["status"], " ")
        print(f" {marker} {name:25s} p50={stats['p50']}ms p95={stats['p95']}ms n={stats['n']}")

    promo = report.get("auto_promote_verdict")
    if promo:
        verdict_icons = {"promote": "✅", "keep_as_second": "ℹ️ ", "move_to_quality_only": "⚠️ "}
        marker = verdict_icons.get(promo["verdict"], " ")
        print(f"\n auto_promote: {marker} {promo['description']} (ratio={promo['ratio']})")

    print(f"\n Recommended prefer: {report.get('voice_fast_uk_recommended_prefer')}")
    # Hide the hard-coded always-exclude families (glm/deepseek) so only
    # data-driven exclude additions are shown here.
    discovered = [
        m for m in report.get("voice_fast_uk_recommended_exclude", [])
        if "glm" not in m and "deepseek" not in m
    ]
    print(f" Recommended exclude additions: {discovered}")
|
||
|
||
|
||
def main():
    """CLI entry point: aggregate audits, report, propose/apply config updates."""
    parser = argparse.ArgumentParser(description="Voice policy auto-update from audit data")
    parser.add_argument("--apply", action="store_true", help="Apply updates to router-config.yml")
    parser.add_argument("--n-runs", type=int, default=10, help="Number of recent audit runs to analyze")
    args = parser.parse_args()

    print("╔══════════════════════════════════════════╗")
    print("║ Voice Policy Auto-Update ║")
    print("╚══════════════════════════════════════════╝")

    audits = load_audits(args.n_runs)
    if not audits:
        print("\n No audit results found. Run: bash ops/voice_latency_audit.sh first.")
        return
    print(f"\n Loaded {len(audits)} audit run(s)")

    report = generate_report(
        aggregate_model_latency(audits),
        aggregate_tts_latency(audits),
        len(audits),
    )
    print_report_summary(report)

    # Persist the canonical snapshot regardless of dry-run/apply mode.
    with open(REPORT_FILE, "w") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    print(f"\n Report saved: {REPORT_FILE}")

    with open(ROUTER_CONFIG) as f:
        live_config = yaml.safe_load(f)

    proposals = propose_config_updates(report, live_config)
    if not proposals:
        print("\n No config changes needed — policy is up to date.")
    else:
        print(f"\n── {len(proposals)} Proposed Config Update(s) ──")
        for proposal in proposals:
            print(f"\n [{proposal['section']}]")
            print(f" reason: {proposal['reason']}")
            print(f" old: {proposal['old']}")
            print(f" new: {proposal['new']}")

        if args.apply:
            print("\n── Applying updates ──")
            apply_updates(proposals, ROUTER_CONFIG)
            print("\n Done. Re-run tests: python3 -m pytest tests/test_voice_policy.py -v")
        else:
            print("\n Dry-run mode. To apply: python3 ops/voice_policy_update.py --apply")
|
||
|
||
|
||
# Script entry guard: lets the module be imported for its helpers
# without triggering the CLI.
if __name__ == "__main__":
    main()
|