Files
microdao-daarion/ops/voice_policy_update.py
Apple 67225a39fa docs(platform): add policy configs, runbooks, ops scripts and platform documentation
Config policies (16 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
2026-03-03 07:14:53 -08:00

337 lines
12 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
voice_policy_update.py — Auto-update voice routing policy from audit results.
What it does:
1. Reads the N most recent audit JSON files from ops/voice_audit_results/
2. Computes p50/p95 per model across all runs
3. Generates ops/voice_latency_report.json — canonical latency snapshot
4. Proposes (and optionally applies) updates to router-config.yml:
- prefer_models order (sorted by p50 ascending)
- exclude_models (models with p50 > SLOW_THRESHOLD_MS)
- auto_promote: set/update p95_ratio based on real data
Usage:
python3 ops/voice_policy_update.py [--apply] [--n-runs 5]
python3 ops/voice_policy_update.py # dry-run: print proposals only
python3 ops/voice_policy_update.py --apply # write changes to router-config.yml
Safety:
- Always prints a diff before applying.
- Creates a backup: router-config.yml.bak before any write.
- Never removes a model that has no audit data (unknown ≠ slow).
- Never touches sections other than selection_policies.
"""
import argparse
import json
import os
import re
import shutil
import statistics
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
import yaml
# ── Paths ────────────────────────────────────────────────────────────────
REPO_ROOT = Path(__file__).parent.parent
AUDIT_DIR = REPO_ROOT / "ops" / "voice_audit_results"
REPORT_FILE = REPO_ROOT / "ops" / "voice_latency_report.json"
ROUTER_CONFIG = REPO_ROOT / "services" / "router" / "router-config.yml"

# ── Thresholds ───────────────────────────────────────────────────────────
# Models with p50 > this are considered "slow" for voice
SLOW_THRESHOLD_MS = 15_000
# Models with p50 > this but ≤ SLOW_THRESHOLD_MS → warn only
WARN_THRESHOLD_MS = 10_000
# auto_promote: apply if ratio < this
AUTO_PROMOTE_RATIO_THRESHOLD = 0.9

# Scenario name → model mapping (from audit script naming convention)
SCENARIO_MODEL_MAP = {
    "CHAT_gemma3": "gemma3:latest",
    "CHAT_qwen3_8b": "qwen3:8b",
    "CHAT_qwen3_14b": "qwen3:14b",
    "CHAT_qwen35_35b": "qwen3.5:35b-a3b",
}
def load_audits(n: int = 10) -> list[dict]:
    """Load up to *n* most recent audit JSON payloads from AUDIT_DIR.

    "Most recent" is derived from reverse-lexicographic filename order
    (the ``audit_*.json`` naming convention embeds a sortable timestamp —
    assumption based on the glob pattern; confirm against the audit script).
    Unreadable files are skipped with a warning instead of aborting.
    """
    if not AUDIT_DIR.exists():
        return []
    newest_first = sorted(AUDIT_DIR.glob("audit_*.json"), reverse=True)
    loaded: list[dict] = []
    for path in newest_first[:n]:
        try:
            loaded.append(json.loads(path.read_text()))
        except Exception as e:
            print(f" warn: could not load {path}: {e}")
    return loaded
def aggregate_model_latency(audits: list[dict]) -> dict[str, dict]:
    """Aggregate per-model latency stats across audit runs.

    Returns ``{model_name: {p50, p95, min, max, n, samples}}`` built from
    successful ("ok") runs with a positive "ms" only. Scenario names are
    mapped to model names via SCENARIO_MODEL_MAP; unmapped scenarios are
    ignored.
    """
    by_model: dict[str, list[int]] = defaultdict(list)
    for audit in audits:
        for run in audit.get("results", []):
            model = SCENARIO_MODEL_MAP.get(run.get("scenario", ""))
            if not model:
                continue
            if run.get("status") != "ok" or run.get("ms", 0) <= 0:
                continue
            by_model[model].append(run["ms"])

    stats: dict[str, dict] = {}
    for model, values in by_model.items():
        ordered = sorted(values)
        count = len(ordered)
        stats[model] = {
            # Upper median (index count // 2) — matches the original's
            # approximation rather than statistics.median.
            "p50": ordered[count // 2],
            # For a single sample the p95 index degenerates; use the
            # largest observation instead.
            "p95": ordered[int(count * 0.95)] if count > 1 else ordered[-1],
            "min": ordered[0],
            "max": ordered[-1],
            "n": count,
            "samples": ordered,
        }
    return stats
def aggregate_tts_latency(audits: list[dict]) -> dict:
    """Aggregate TTS latency stats across audit runs.

    Considers every result whose scenario name contains "TTS" and whose
    status is "ok". Returns {} when no TTS samples exist, otherwise
    ``{p50, p95, min, max, n}``.
    """
    tts_ms: list[int] = []
    for audit in audits:
        for r in audit.get("results", []):
            # Also require a positive "ms" (mirrors aggregate_model_latency)
            # so a malformed result without a latency field cannot raise
            # KeyError on r["ms"].
            if "TTS" in r.get("scenario", "") and r.get("status") == "ok" and r.get("ms", 0) > 0:
                tts_ms.append(r["ms"])
    if not tts_ms:
        return {}
    s = sorted(tts_ms)
    return {
        # Upper median; single-sample p95 falls back to the max observation.
        "p50": s[len(s) // 2],
        "p95": s[int(len(s) * 0.95)] if len(s) > 1 else s[-1],
        "min": s[0], "max": s[-1], "n": len(s),
    }
def generate_report(model_stats: dict, tts_stats: dict, n_audits: int) -> dict:
    """Build the canonical latency report dict from aggregated stats.

    Ranks models by p50, classifies each as ok / warn_borderline /
    slow_exclude against the module thresholds, derives the recommended
    prefer/exclude lists for voice_fast_uk, and — when both qwen3.5 and
    qwen3:14b have data — computes the auto_promote verdict.
    """
    ranked = sorted(model_stats.items(), key=lambda item: item[1]["p50"])

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "n_audit_runs": n_audits,
        "slo": {
            "tts_p95_target_ms": 2500,
            "llm_fast_p95_target_ms": 9000,
            "e2e_p95_target_ms": 12000,
        },
        "tts": tts_stats,
        "models": {},
        "voice_fast_uk_recommended_prefer": [],
        "voice_fast_uk_recommended_exclude": [],
        "auto_promote_verdict": None,
    }

    for model, stats in ranked:
        if stats["p50"] > SLOW_THRESHOLD_MS:
            status = "slow_exclude"
        elif stats["p50"] > WARN_THRESHOLD_MS:
            status = "warn_borderline"
        else:
            status = "ok"
        report["models"][model] = {**stats, "status": status}

    # Recommended prefer order is the p50 ranking with slow models removed.
    report["voice_fast_uk_recommended_prefer"] = [
        m for m, s in ranked if s["p50"] <= SLOW_THRESHOLD_MS
    ]
    exclude = [m for m, s in ranked if s["p50"] > SLOW_THRESHOLD_MS]
    # Known-slow models are always excluded, even without fresh audit data.
    for known_slow in ("glm-4.7-flash:32k", "glm-4.7-flash", "deepseek-r1:70b"):
        if known_slow not in exclude:
            exclude.append(known_slow)
    report["voice_fast_uk_recommended_exclude"] = exclude

    # auto_promote verdict: candidate qwen3.5 vs baseline qwen3:14b.
    q35 = model_stats.get("qwen3.5:35b-a3b", {})
    q14 = model_stats.get("qwen3:14b", {})
    if q35 and q14:
        # max(..., 1) guards against a zero-valued baseline p95.
        ratio = q35["p95"] / max(q14["p95"], 1)
        if ratio < AUTO_PROMOTE_RATIO_THRESHOLD:
            verdict = "promote"
        elif ratio < 1.0:
            verdict = "keep_as_second"
        else:
            verdict = "move_to_quality_only"
        descriptions = {
            "promote": "qwen3.5 qualifies for auto_promote in voice_fast_uk (>10% faster)",
            "keep_as_second": "qwen3.5 faster but <10% — keep as 2nd in prefer list",
            "move_to_quality_only": "qwen3.5 slower than 14b — move to voice_quality_uk only",
        }
        report["auto_promote_verdict"] = {
            "candidate": "qwen3.5:35b-a3b",
            "baseline": "qwen3:14b",
            "ratio": round(ratio, 3),
            "threshold": AUTO_PROMOTE_RATIO_THRESHOLD,
            "verdict": verdict,
            "description": descriptions[verdict],
        }
    return report
def propose_config_updates(report: dict, current_config: dict) -> list[dict]:
"""Returns list of {section, key, old, new, reason} proposals."""
proposals = []
sp = current_config.get("selection_policies", {})
vf = sp.get("voice_fast_uk", {})
rec_prefer = report.get("voice_fast_uk_recommended_prefer", [])
rec_exclude = report.get("voice_fast_uk_recommended_exclude", [])
current_prefer = vf.get("prefer_models", [])
current_exclude = vf.get("exclude_models", [])
# Prefer order: suggest if different from current
if rec_prefer and rec_prefer != current_prefer:
proposals.append({
"section": "selection_policies.voice_fast_uk.prefer_models",
"old": current_prefer,
"new": rec_prefer,
"reason": f"Sorted by p50 from {report['n_audit_runs']} audit run(s)",
})
# Exclude: add newly-discovered slow models
new_excludes = [m for m in rec_exclude if m not in current_exclude]
if new_excludes:
proposals.append({
"section": "selection_policies.voice_fast_uk.exclude_models",
"old": current_exclude,
"new": current_exclude + new_excludes,
"reason": f"Add slow models: {new_excludes}",
})
# auto_promote ratio update
ap_verdict = report.get("auto_promote_verdict")
if ap_verdict:
current_ratio = vf.get("auto_promote", {}).get("condition", {}).get(
"p95_ratio_vs_next_model", None
)
new_ratio = ap_verdict["ratio"]
if current_ratio is None or abs(current_ratio - new_ratio) > 0.05:
proposals.append({
"section": "selection_policies.voice_fast_uk.auto_promote.condition.p95_ratio_vs_next_model",
"old": current_ratio,
"new": round(new_ratio, 3),
"reason": f"Updated from real audit data. Verdict: {ap_verdict['verdict']}",
})
return proposals
def apply_updates(proposals: list[dict], config_path: Path) -> None:
    """Apply proposals to router-config.yml, keeping a .yml.bak backup.

    Each proposal's dotted "section" path is walked (creating missing
    intermediate mappings) and its leaf key is set to the proposed value.
    Note: yaml.dump rewrites the file, so any hand-written comments in the
    config are not preserved.
    """
    backup_path = config_path.with_suffix(".yml.bak")
    shutil.copy2(config_path, backup_path)
    print(f"\n Backup created: {backup_path}")

    with open(config_path) as fh:
        config = yaml.safe_load(fh)

    for proposal in proposals:
        dotted = proposal["section"]
        value = proposal["new"]
        *parents, leaf = dotted.split(".")
        node = config
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = value
        print(f" Applied: {dotted}")
        print(f" old: {proposal['old']}")
        print(f" new: {value}")

    with open(config_path, "w") as fh:
        yaml.dump(config, fh, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"\n Written: {config_path}")
def print_report_summary(report: dict) -> None:
    """Pretty-print the latency report to stdout (human console summary)."""
    print("\n── Voice Latency Report ──")
    print(f" Generated: {report['generated_at']}")
    print(f" Audit runs analyzed: {report['n_audit_runs']}")

    tts = report.get("tts", {})
    if tts:
        # Flag the TTS line when p95 misses the 2500 ms SLO target.
        slo_marker = "" if tts.get("p95", 9999) <= 2500 else "⚠️ "
        print(f"\n TTS: p50={tts.get('p50')}ms p95={tts.get('p95')}ms {slo_marker}")

    print("\n LLM Model Ranking (by p50):")
    status_icons = {"ok": "", "warn_borderline": "⚠️ ", "slow_exclude": ""}
    for model, stats in report.get("models", {}).items():
        marker = status_icons.get(stats["status"], " ")
        print(f" {marker} {model:25s} p50={stats['p50']}ms p95={stats['p95']}ms n={stats['n']}")

    ap = report.get("auto_promote_verdict")
    if ap:
        verdict_icons = {"promote": "", "keep_as_second": " ", "move_to_quality_only": "⚠️ "}
        verdict_marker = verdict_icons.get(ap["verdict"], " ")
        print(f"\n auto_promote: {verdict_marker} {ap['description']} (ratio={ap['ratio']})")

    print(f"\n Recommended prefer: {report.get('voice_fast_uk_recommended_prefer')}")
    # Hide the hard-coded always-exclude entries (glm/deepseek) so only
    # newly discovered slow models are shown as "additions".
    print(f" Recommended exclude additions: {[m for m in report.get('voice_fast_uk_recommended_exclude', []) if 'glm' not in m and 'deepseek' not in m]}")
def main():
    """CLI entry point: analyze audits, write the report, then propose
    (and with --apply, write) router-config.yml updates."""
    parser = argparse.ArgumentParser(description="Voice policy auto-update from audit data")
    parser.add_argument("--apply", action="store_true", help="Apply updates to router-config.yml")
    parser.add_argument("--n-runs", type=int, default=10, help="Number of recent audit runs to analyze")
    args = parser.parse_args()

    print("╔══════════════════════════════════════════╗")
    print("║ Voice Policy Auto-Update ║")
    print("╚══════════════════════════════════════════╝")

    audits = load_audits(args.n_runs)
    if not audits:
        print("\n No audit results found. Run: bash ops/voice_latency_audit.sh first.")
        return
    print(f"\n Loaded {len(audits)} audit run(s)")

    report = generate_report(
        aggregate_model_latency(audits),
        aggregate_tts_latency(audits),
        len(audits),
    )
    print_report_summary(report)

    # Persist the canonical latency snapshot.
    with open(REPORT_FILE, "w") as fh:
        json.dump(report, fh, indent=2, ensure_ascii=False)
    print(f"\n Report saved: {REPORT_FILE}")

    with open(ROUTER_CONFIG) as fh:
        current_config = yaml.safe_load(fh)

    proposals = propose_config_updates(report, current_config)
    if not proposals:
        print("\n No config changes needed — policy is up to date.")
    else:
        print(f"\n── {len(proposals)} Proposed Config Update(s) ──")
        for proposal in proposals:
            print(f"\n [{proposal['section']}]")
            print(f" reason: {proposal['reason']}")
            print(f" old: {proposal['old']}")
            print(f" new: {proposal['new']}")
        if args.apply:
            print("\n── Applying updates ──")
            apply_updates(proposals, ROUTER_CONFIG)
            print("\n Done. Re-run tests: python3 -m pytest tests/test_voice_policy.py -v")
        else:
            print("\n Dry-run mode. To apply: python3 ops/voice_policy_update.py --apply")


if __name__ == "__main__":
    main()