Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot Made-with: Cursor
119 lines
5.2 KiB
YAML
119 lines
5.2 KiB
YAML
groups:
|
|
- name: voice_slo
|
|
# Evaluation interval should match the Prometheus global evaluation_interval (default 1m); no group-level `interval:` override is set here.
|
|
# All thresholds align with config/slo_policy.yml voice_slo section.
|
|
rules:
|
|
|
|
# ── Alert 1: TTFA p95 breach (fast profile) ───────────────────────────────
# Fires when the p95 of voice_ttfa_ms (time-to-first-audio) for the
# voice_fast_uk profile, computed over a 10m rate window, stays above
# 5000 for 10 consecutive minutes.
# Root causes: slow LLM, Ollama overload, model cold-start.
- alert: VoiceTTFA_P95_Breach_Fast
  expr: |
    histogram_quantile(0.95,
      rate(voice_ttfa_ms_bucket{voice_profile="voice_fast_uk"}[10m])
    ) > 5000
  for: 10m
  labels:
    severity: warning
    team: platform
    profile: voice_fast_uk
  annotations:
    summary: "Voice TTFA p95 breach (fast profile)"
    # $value is already a millisecond figure. humanizeDuration expects
    # seconds, so piping through it rendered 5000 as "1h 23m 20sms".
    # Print the rounded number and let the literal "ms" suffix carry the unit.
    description: >
      voice_fast_uk TTFA p95 = {{ $value | printf "%.0f" }}ms > 5000ms SLO.
      Check: Ollama queue depth, gemma3 model availability, sofiia-console logs.
    runbook: "ops/runbook-alerts.md#voice-ttfa"
    dashboard: "grafana/d/voice-slo/voice-latency"
|
|
|
|
# TTFA p95 guard for the quality profile: fires once the 95th percentile of
# voice_ttfa_ms for voice_quality_uk (10m rate window) has been above 7000
# for 10 minutes.
- alert: VoiceTTFA_P95_Breach_Quality
  expr: |
    histogram_quantile(0.95,
      rate(voice_ttfa_ms_bucket{voice_profile="voice_quality_uk"}[10m])
    ) > 7000
  for: 10m
  labels:
    profile: voice_quality_uk
    team: platform
    severity: warning
  annotations:
    summary: 'Voice TTFA p95 breach (quality profile)'
    runbook: 'ops/runbook-alerts.md#voice-ttfa'
    description: >
      voice_quality_uk TTFA p95 = {{ $value }}ms > 7000ms SLO.
      Check: qwen3.5:35b-a3b availability, NODA2 GPU/CPU load.
|
|
|
|
# ── Alert 2: Underflow spike ───────────────────────────────────────────────
# Queue-starvation guard: the per-second rate of voice_queue_underflows_total
# over 5m must stay above 0.017 (roughly one event per minute) for 5 minutes.
# Root cause: TTS synthesis slower than playback (LLM too slow, long chunks,
# or network latency to memory-service).
- alert: VoiceQueueUnderflow_Spike
  expr: |
    rate(voice_queue_underflows_total[5m]) > 0.017
  for: 5m
  labels:
    team: platform
    severity: warning
  annotations:
    summary: 'Voice queue starvation detected'
    runbook: 'ops/runbook-alerts.md#voice-underflow'
    description: >
      Queue underflow rate = {{ $value | humanize }}/s (>1/min).
      Audio playback is outrunning TTS synthesis — users hear silence gaps.
      Check: TTS latency (voice_tts_first_ms), chunk size, LLM total time.
|
|
|
|
# ── Alert 3: TTS synthesis degradation ────────────────────────────────────
# Critical when the p95 of voice_tts_first_ms (first-sentence TTS), taken over
# a 10m rate window, has been above 2000 for 10 minutes. Points at edge-tts
# trouble: 403 auth errors, Microsoft endpoint throttling, network degradation.
- alert: VoiceTTS_P95_Degraded
  expr: |
    histogram_quantile(0.95,
      rate(voice_tts_first_ms_bucket[10m])
    ) > 2000
  for: 10m
  labels:
    team: platform
    severity: critical
  annotations:
    summary: 'Voice TTS synthesis degraded (p95 > 2s)'
    runbook: 'ops/runbook-alerts.md#voice-tts-degraded'
    description: >
      voice_tts_first_ms p95 = {{ $value }}ms > 2000ms.
      Likely edge-tts 403 or Microsoft endpoint issue.
      Check: memory-service /voice/health, voice_tts_errors_total{error_type="403"}.
|
|
|
|
# ── Alert 4: TTS error rate spike ─────────────────────────────────────────
# Critical when the 5m rate of voice_tts_errors_total exceeds 0.05/s for
# 3 minutes (edge-tts 403, network, or synthesis failures). The expression is
# not aggregated, so the description reads the engine and error_type labels
# from the firing series.
- alert: VoiceTTS_ErrorRate_High
  expr: |
    rate(voice_tts_errors_total[5m]) > 0.05
  for: 3m
  labels:
    team: platform
    severity: critical
  annotations:
    summary: 'Voice TTS error rate elevated'
    runbook: 'ops/runbook-alerts.md#voice-tts-error'
    description: >
      TTS errors = {{ $value | humanize }}/s.
      Engine: {{ $labels.engine }}, Error type: {{ $labels.error_type }}.
      Users may hear espeak fallback or silence.
|
|
|
|
# ── Alert 5: E2E latency breach ───────────────────────────────────────────
# Full round-trip SLO guard for the voice_fast_uk profile: warns when the p95
# of voice_e2e_ms over a 15m rate window has been above 9000 for 15 minutes.
# Catches combined LLM+TTS degradation.
- alert: VoiceE2E_P95_Breach
  expr: |
    histogram_quantile(0.95,
      rate(voice_e2e_ms_bucket{voice_profile="voice_fast_uk"}[15m])
    ) > 9000
  for: 15m
  labels:
    profile: voice_fast_uk
    team: platform
    severity: warning
  annotations:
    summary: 'Voice E2E latency p95 breach'
    runbook: 'ops/runbook-alerts.md#voice-e2e'
    description: >
      voice_fast_uk E2E p95 = {{ $value }}ms > 9000ms SLO.
      Full pipeline (STT+LLM+TTS) is degraded.
|