docs(platform): add policy configs, runbooks, ops scripts and platform documentation

Config policies (17 files): alert_routing, architecture_pressure, backlog,
cost_weights, data_governance, incident_escalation, incident_intelligence,
network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix,
release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout

Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard,
deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice,
cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule),
task_registry, voice alerts/ha/latency/policy

Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks,
NODA1/NODA2 status and setup, audit index and traces, backlog, incident,
supervisor, tools, voice, opencode, release, risk, aistalk, spacebot

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:14:53 -08:00
parent 129e4ea1fc
commit 67225a39fa
102 changed files with 20060 additions and 0 deletions

336
ops/voice_policy_update.py Normal file
View File

@@ -0,0 +1,336 @@
#!/usr/bin/env python3
"""
voice_policy_update.py — Auto-update voice routing policy from audit results.
What it does:
1. Reads the N most recent audit JSON files from ops/voice_audit_results/
2. Computes p50/p95 per model across all runs
3. Generates ops/voice_latency_report.json — canonical latency snapshot
4. Proposes (and optionally applies) updates to router-config.yml:
- prefer_models order (sorted by p50 ascending)
- exclude_models (models with p50 > SLOW_THRESHOLD_MS)
- auto_promote: set/update p95_ratio based on real data
Usage:
python3 ops/voice_policy_update.py [--apply] [--n-runs 5]
python3 ops/voice_policy_update.py # dry-run: print proposals only
python3 ops/voice_policy_update.py --apply # write changes to router-config.yml
Safety:
- Always prints a diff before applying.
- Creates a backup: router-config.yml.bak before any write.
- Never removes a model that has no audit data (unknown ≠ slow).
- Never touches sections other than selection_policies.
"""
import argparse
import json
import os
import re
import shutil
import statistics
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
import yaml
# Repo root resolved from this file's location (ops/ -> repo root).
REPO_ROOT = Path(__file__).parent.parent
# Directory the latency audit script writes audit_*.json results into.
AUDIT_DIR = REPO_ROOT / "ops" / "voice_audit_results"
# Canonical latency snapshot produced by this script (step 3 in module docstring).
REPORT_FILE = REPO_ROOT / "ops" / "voice_latency_report.json"
# Router policy file this script proposes (and with --apply, writes) changes to.
ROUTER_CONFIG = REPO_ROOT / "services" / "router" / "router-config.yml"
# Models with p50 > this are considered "slow" for voice
SLOW_THRESHOLD_MS = 15_000
# Models with p50 > this but ≤ SLOW_THRESHOLD_MS → warn only
WARN_THRESHOLD_MS = 10_000
# auto_promote: apply if ratio < this
AUTO_PROMOTE_RATIO_THRESHOLD = 0.9
# Scenario name → model mapping (from audit script naming convention)
SCENARIO_MODEL_MAP: dict[str, str] = {
    "CHAT_gemma3": "gemma3:latest",
    "CHAT_qwen3_8b": "qwen3:8b",
    "CHAT_qwen3_14b": "qwen3:14b",
    "CHAT_qwen35_35b": "qwen3.5:35b-a3b",
}
def load_audits(n: int = 10) -> list[dict]:
    """Return up to *n* most recent audit JSON payloads from AUDIT_DIR.

    Newest-first order comes from a reverse lexicographic sort of the
    filenames (assumes audit_<timestamp>.json naming — TODO confirm against
    the audit script). Unreadable or corrupt files are skipped with a
    warning rather than aborting the whole run.
    """
    if not AUDIT_DIR.exists():
        return []
    newest_first = sorted(AUDIT_DIR.glob("audit_*.json"), reverse=True)
    loaded: list[dict] = []
    for path in newest_first[:n]:
        try:
            loaded.append(json.loads(path.read_text()))
        except Exception as exc:  # best-effort: one bad file must not stop the rest
            print(f" warn: could not load {path}: {exc}")
    return loaded
def aggregate_model_latency(audits: list[dict]) -> dict[str, dict]:
    """Aggregate per-model latency stats across all audit runs.

    Returns {model_name: {p50, p95, min, max, n, samples}} built only from
    results with status "ok" and a positive "ms" value; scenarios without an
    entry in SCENARIO_MODEL_MAP are ignored.
    """
    by_model: dict[str, list[int]] = defaultdict(list)
    for audit in audits:
        for run in audit.get("results", []):
            model = SCENARIO_MODEL_MAP.get(run.get("scenario", ""))
            if model is None:
                continue
            if run.get("status") == "ok" and run.get("ms", 0) > 0:
                by_model[model].append(run["ms"])
    stats: dict[str, dict] = {}
    for model, values in by_model.items():
        ordered = sorted(values)
        count = len(ordered)
        stats[model] = {
            # Nearest-rank percentiles over the sorted samples; with a single
            # sample p95 falls back to that sample.
            "p50": ordered[count // 2],
            "p95": ordered[int(count * 0.95)] if count > 1 else ordered[-1],
            "min": ordered[0],
            "max": ordered[-1],
            "n": count,
            "samples": ordered,
        }
    return stats
def aggregate_tts_latency(audits: list[dict]) -> dict:
    """Aggregate TTS latency stats across all audit runs.

    Collects "ms" values from successful results whose scenario name contains
    "TTS" and returns {p50, p95, min, max, n}, or {} when there are no usable
    samples.
    """
    tts_ms = []
    for audit in audits:
        for r in audit.get("results", []):
            # Mirror aggregate_model_latency's guard: require status "ok" AND
            # a positive "ms". Without it, an "ok" result missing "ms" raised
            # KeyError and zero readings skewed the percentiles.
            if "TTS" in r.get("scenario", "") and r.get("status") == "ok" and r.get("ms", 0) > 0:
                tts_ms.append(r["ms"])
    if not tts_ms:
        return {}
    s = sorted(tts_ms)
    return {
        # Nearest-rank percentiles; single sample falls back to itself for p95.
        "p50": s[len(s) // 2],
        "p95": s[int(len(s) * 0.95)] if len(s) > 1 else s[-1],
        "min": min(s), "max": max(s), "n": len(s),
    }
def generate_report(model_stats: dict, tts_stats: dict, n_audits: int) -> dict:
    """Assemble the canonical latency report from aggregated stats.

    The report carries: SLO targets, per-model stats annotated with a status
    (ok / warn_borderline / slow_exclude by p50 thresholds), recommended
    prefer/exclude lists for voice_fast_uk, and an auto_promote verdict
    comparing qwen3.5:35b-a3b against qwen3:14b when both have data.
    """
    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "n_audit_runs": n_audits,
        "slo": {
            "tts_p95_target_ms": 2500,
            "llm_fast_p95_target_ms": 9000,
            "e2e_p95_target_ms": 12000,
        },
        "tts": tts_stats,
        "models": {},
        "voice_fast_uk_recommended_prefer": [],
        "voice_fast_uk_recommended_exclude": [],
        "auto_promote_verdict": None,
    }
    # Rank models fastest-first by p50 and tag each with a status.
    ranked = sorted(model_stats.items(), key=lambda item: item[1]["p50"])
    for model, stats in ranked:
        if stats["p50"] > SLOW_THRESHOLD_MS:
            status = "slow_exclude"
        elif stats["p50"] > WARN_THRESHOLD_MS:
            status = "warn_borderline"
        else:
            status = "ok"
        report["models"][model] = {**stats, "status": status}
    # Split the ranking into prefer (fast enough) and exclude (too slow).
    prefer: list[str] = []
    exclude: list[str] = []
    for model, stats in ranked:
        (prefer if stats["p50"] <= SLOW_THRESHOLD_MS else exclude).append(model)
    # Known-slow models are excluded unconditionally, even without audit data.
    for known_slow in ["glm-4.7-flash:32k", "glm-4.7-flash", "deepseek-r1:70b"]:
        if known_slow not in exclude:
            exclude.append(known_slow)
    report["voice_fast_uk_recommended_prefer"] = prefer
    report["voice_fast_uk_recommended_exclude"] = exclude
    # auto_promote verdict: qwen3.5 (candidate) vs qwen3:14b (baseline).
    q35 = model_stats.get("qwen3.5:35b-a3b", {})
    q14 = model_stats.get("qwen3:14b", {})
    if q35 and q14:
        ratio = q35["p95"] / max(q14["p95"], 1)
        if ratio < AUTO_PROMOTE_RATIO_THRESHOLD:
            verdict = "promote"
        elif ratio < 1.0:
            verdict = "keep_as_second"
        else:
            verdict = "move_to_quality_only"
        report["auto_promote_verdict"] = {
            "candidate": "qwen3.5:35b-a3b",
            "baseline": "qwen3:14b",
            "ratio": round(ratio, 3),
            "threshold": AUTO_PROMOTE_RATIO_THRESHOLD,
            "verdict": verdict,
            "description": {
                "promote": "qwen3.5 qualifies for auto_promote in voice_fast_uk (>10% faster)",
                "keep_as_second": "qwen3.5 faster but <10% — keep as 2nd in prefer list",
                "move_to_quality_only": "qwen3.5 slower than 14b — move to voice_quality_uk only",
            }[verdict],
        }
    return report
def propose_config_updates(report: dict, current_config: dict) -> list[dict]:
    """Diff the latency report against the router config.

    Returns a list of {section, old, new, reason} proposals; only the
    selection_policies.voice_fast_uk subtree is ever targeted.
    """
    proposals: list[dict] = []
    voice_fast = current_config.get("selection_policies", {}).get("voice_fast_uk", {})
    rec_prefer = report.get("voice_fast_uk_recommended_prefer", [])
    rec_exclude = report.get("voice_fast_uk_recommended_exclude", [])
    cur_prefer = voice_fast.get("prefer_models", [])
    cur_exclude = voice_fast.get("exclude_models", [])
    # 1) prefer_models: propose only when we have data and the order differs.
    if rec_prefer and rec_prefer != cur_prefer:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.prefer_models",
            "old": cur_prefer,
            "new": rec_prefer,
            "reason": f"Sorted by p50 from {report['n_audit_runs']} audit run(s)",
        })
    # 2) exclude_models: append newly-discovered slow models, keep existing ones.
    additions = [m for m in rec_exclude if m not in cur_exclude]
    if additions:
        proposals.append({
            "section": "selection_policies.voice_fast_uk.exclude_models",
            "old": cur_exclude,
            "new": cur_exclude + additions,
            "reason": f"Add slow models: {additions}",
        })
    # 3) auto_promote p95 ratio: refresh when missing or drifted by > 0.05.
    ap_verdict = report.get("auto_promote_verdict")
    if ap_verdict:
        condition = voice_fast.get("auto_promote", {}).get("condition", {})
        old_ratio = condition.get("p95_ratio_vs_next_model", None)
        fresh_ratio = ap_verdict["ratio"]
        if old_ratio is None or abs(old_ratio - fresh_ratio) > 0.05:
            proposals.append({
                "section": "selection_policies.voice_fast_uk.auto_promote.condition.p95_ratio_vs_next_model",
                "old": old_ratio,
                "new": round(fresh_ratio, 3),
                "reason": f"Updated from real audit data. Verdict: {ap_verdict['verdict']}",
            })
    return proposals
def apply_updates(proposals: list[dict], config_path: Path) -> None:
    """Write accepted proposals into the YAML config, keeping a .bak backup.

    Each proposal's dotted ``section`` path is walked top-down (creating
    intermediate mappings as needed) and the leaf key set to ``new``.
    """
    # Snapshot the current config before any mutation.
    backup_path = config_path.with_suffix(".yml.bak")
    shutil.copy2(config_path, backup_path)
    print(f"\n Backup created: {backup_path}")
    with open(config_path) as fh:
        config = yaml.safe_load(fh)
    for proposal in proposals:
        dotted = proposal["section"]
        *parents, leaf = dotted.split(".")
        node = config
        for key in parents:
            node = node.setdefault(key, {})
        node[leaf] = proposal["new"]
        print(f" Applied: {dotted}")
        print(f" old: {proposal['old']}")
        print(f" new: {proposal['new']}")
    with open(config_path, "w") as fh:
        yaml.dump(config, fh, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"\n Written: {config_path}")
def print_report_summary(report: dict) -> None:
    """Pretty-print the latency report to stdout (display only, no mutation)."""
    print("\n── Voice Latency Report ──")
    print(f" Generated: {report['generated_at']}")
    print(f" Audit runs analyzed: {report['n_audit_runs']}")
    tts = report.get("tts", {})
    if tts:
        # Flag TTS with a warning icon when p95 misses the 2500ms SLO target.
        slo_flag = "" if tts.get("p95", 9999) <= 2500 else "⚠️ "
        print(f"\n TTS: p50={tts.get('p50')}ms p95={tts.get('p95')}ms {slo_flag}")
    print("\n LLM Model Ranking (by p50):")
    status_icons = {"ok": "", "warn_borderline": "⚠️ ", "slow_exclude": ""}
    for name, stats in report.get("models", {}).items():
        icon = status_icons.get(stats["status"], " ")
        print(f" {icon} {name:25s} p50={stats['p50']}ms p95={stats['p95']}ms n={stats['n']}")
    ap = report.get("auto_promote_verdict")
    if ap:
        verdict_icons = {"promote": "", "keep_as_second": " ", "move_to_quality_only": "⚠️ "}
        verdict_icon = verdict_icons.get(ap["verdict"], " ")
        print(f"\n auto_promote: {verdict_icon} {ap['description']} (ratio={ap['ratio']})")
    print(f"\n Recommended prefer: {report.get('voice_fast_uk_recommended_prefer')}")
    # Display filter only: hide the hard-coded always-exclude models so the
    # line shows just the additions discovered from audit data.
    print(f" Recommended exclude additions: {[m for m in report.get('voice_fast_uk_recommended_exclude', []) if 'glm' not in m and 'deepseek' not in m]}")
def main():
    """CLI entry point: analyze audits, save the report, propose/apply updates."""
    parser = argparse.ArgumentParser(description="Voice policy auto-update from audit data")
    parser.add_argument("--apply", action="store_true", help="Apply updates to router-config.yml")
    parser.add_argument("--n-runs", type=int, default=10, help="Number of recent audit runs to analyze")
    args = parser.parse_args()
    print("╔══════════════════════════════════════════╗")
    print("║ Voice Policy Auto-Update ║")
    print("╚══════════════════════════════════════════╝")
    audits = load_audits(args.n_runs)
    if not audits:
        print("\n No audit results found. Run: bash ops/voice_latency_audit.sh first.")
        return
    print(f"\n Loaded {len(audits)} audit run(s)")
    report = generate_report(
        aggregate_model_latency(audits),
        aggregate_tts_latency(audits),
        len(audits),
    )
    print_report_summary(report)
    # The canonical latency snapshot is written on every run, apply or not.
    with open(REPORT_FILE, "w") as fh:
        json.dump(report, fh, indent=2, ensure_ascii=False)
    print(f"\n Report saved: {REPORT_FILE}")
    with open(ROUTER_CONFIG) as fh:
        current_config = yaml.safe_load(fh)
    proposals = propose_config_updates(report, current_config)
    if not proposals:
        print("\n No config changes needed — policy is up to date.")
    else:
        print(f"\n── {len(proposals)} Proposed Config Update(s) ──")
        for proposal in proposals:
            print(f"\n [{proposal['section']}]")
            print(f" reason: {proposal['reason']}")
            print(f" old: {proposal['old']}")
            print(f" new: {proposal['new']}")
        if args.apply:
            print("\n── Applying updates ──")
            apply_updates(proposals, ROUTER_CONFIG)
            print("\n Done. Re-run tests: python3 -m pytest tests/test_voice_policy.py -v")
        else:
            print("\n Dry-run mode. To apply: python3 ops/voice_policy_update.py --apply")


if __name__ == "__main__":
    main()