Config policies (16 files): alert_routing, architecture_pressure, backlog, cost_weights, data_governance, incident_escalation, incident_intelligence, network_allowlist, nodes_registry, observability_sources, rbac_tools_matrix, release_gate, risk_attribution, risk_policy, slo_policy, tool_limits, tools_rollout. Ops (22 files): Caddyfile, calendar compose, grafana voice dashboard, deployments/incidents logs, runbooks for alerts/audit/backlog/incidents/sofiia/voice, cron jobs, scripts (alert_triage, audit_cleanup, migrate_*, governance, schedule), task_registry, voice alerts/ha/latency/policy. Docs (30+ files): HUMANIZED_STEPAN v2.7-v3 changelogs and runbooks, NODA1/NODA2 status and setup, audit index and traces, backlog, incident, supervisor, tools, voice, opencode, release, risk, aistalk, spacebot. Made-with: Cursor
213 lines
7.2 KiB
JSON
213 lines
7.2 KiB
JSON
{
|
|
"__inputs": [
|
|
{
|
|
"name": "DS_PROMETHEUS",
|
|
"label": "Prometheus",
|
|
"description": "Prometheus datasource — point to your Prometheus instance",
|
|
"type": "datasource",
|
|
"pluginId": "prometheus",
|
|
"pluginName": "Prometheus"
|
|
}
|
|
],
|
|
"title": "DAARION Voice SLO Dashboard",
|
|
"uid": "voice-slo",
|
|
"description": "Voice pipeline SLO: TTFA, LLM latency, TTS health, queue underflows. Aligns with ops/voice_alerts.yml and config/slo_policy.yml.",
|
|
"tags": ["voice", "slo", "daarion"],
|
|
"timezone": "browser",
|
|
"refresh": "30s",
|
|
"time": { "from": "now-1h", "to": "now" },
|
|
"panels": [
|
|
|
|
{
|
|
"id": 1,
|
|
"title": "⏱ Time-to-First-Audio p50 / p95",
|
|
"description": "SLO: voice_fast_uk p95 ≤ 5000ms | voice_quality_uk p95 ≤ 7000ms",
|
|
"type": "timeseries",
|
|
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
|
|
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"unit": "ms",
|
|
"custom": { "lineWidth": 2 },
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "value": null, "color": "green" },
|
|
{ "value": 5000, "color": "yellow" },
|
|
{ "value": 7000, "color": "red" }
|
|
]
|
|
}
|
|
}
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, rate(voice_ttfa_ms_bucket{voice_profile='voice_fast_uk'}[$__rate_interval]))",
|
|
"legendFormat": "fast p50"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_ttfa_ms_bucket{voice_profile='voice_fast_uk'}[$__rate_interval]))",
|
|
"legendFormat": "fast p95"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_ttfa_ms_bucket{voice_profile='voice_quality_uk'}[$__rate_interval]))",
|
|
"legendFormat": "quality p95"
|
|
}
|
|
],
|
|
"options": {
|
|
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
|
}
|
|
},
|
|
|
|
{
|
|
"id": 2,
|
|
"title": "🤖 LLM Latency by Model",
|
|
"description": "LLM inference time per model. Use to identify slow models and trigger auto-promote.",
|
|
"type": "timeseries",
|
|
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
|
|
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
|
"fieldConfig": {
|
|
"defaults": { "unit": "ms" }
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, sum by (le, model) (rate(voice_llm_ms_bucket[$__rate_interval])))",
|
|
"legendFormat": "{{ model }} p50"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum by (le, model) (rate(voice_llm_ms_bucket[$__rate_interval])))",
|
|
"legendFormat": "{{ model }} p95"
|
|
}
|
|
],
|
|
"options": {
|
|
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
|
}
|
|
},
|
|
|
|
{
|
|
"id": 3,
|
|
"title": "🔊 TTS Health: Synthesis Time + Error Rate",
|
|
"description": "SLO: tts_first_ms p95 ≤ 2000ms. Error rate > 0.05/s → alert.",
|
|
"type": "timeseries",
|
|
"gridPos": { "x": 0, "y": 8, "w": 12, "h": 8 },
|
|
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
|
"fieldConfig": {
|
|
"defaults": { "unit": "ms" },
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "errors/s" },
|
|
"properties": [
|
|
{ "id": "custom.axisPlacement", "value": "right" },
|
|
{ "id": "unit", "value": "short" },
|
|
{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, rate(voice_tts_first_ms_bucket[$__rate_interval]))",
|
|
"legendFormat": "tts_first p50"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_tts_first_ms_bucket[$__rate_interval]))",
|
|
"legendFormat": "tts_first p95"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, sum by (le, engine) (rate(voice_tts_compute_ms_bucket[$__rate_interval])))",
|
|
"legendFormat": "{{ engine }} compute p95"
|
|
},
|
|
{
|
|
"expr": "rate(voice_tts_errors_total[$__rate_interval])",
|
|
"legendFormat": "errors/s"
|
|
}
|
|
],
|
|
"options": {
|
|
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
|
}
|
|
},
|
|
|
|
{
|
|
"id": 4,
|
|
"title": "📊 Queue Underflows + E2E Latency",
|
|
"description": "Underflow = playback outran TTS synthesis (silence gap). E2E SLO: p95 ≤ 9000ms.",
|
|
"type": "timeseries",
|
|
"gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
|
|
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
|
"fieldConfig": {
|
|
"defaults": { "unit": "ms" },
|
|
"overrides": [
|
|
{
|
|
"matcher": { "id": "byName", "options": "underflows/min" },
|
|
"properties": [
|
|
{ "id": "custom.axisPlacement", "value": "right" },
|
|
{ "id": "unit", "value": "short" },
|
|
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }
|
|
]
|
|
}
|
|
]
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_e2e_ms_bucket{voice_profile='voice_fast_uk'}[$__rate_interval]))",
|
|
"legendFormat": "e2e fast p95"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_e2e_ms_bucket{voice_profile='voice_quality_uk'}[$__rate_interval]))",
|
|
"legendFormat": "e2e quality p95"
|
|
},
|
|
{
|
|
"expr": "rate(voice_queue_underflows_total[$__rate_interval]) * 60",
|
|
"legendFormat": "underflows/min"
|
|
}
|
|
],
|
|
"options": {
|
|
"legend": { "displayMode": "table", "placement": "bottom", "calcs": ["lastNotNull", "max"] }
|
|
}
|
|
},
|
|
|
|
{
|
|
"id": 5,
|
|
"title": "🚦 SLO Status (Stat)",
|
|
"type": "stat",
|
|
"gridPos": { "x": 0, "y": 16, "w": 24, "h": 4 },
|
|
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"thresholds": {
|
|
"mode": "absolute",
|
|
"steps": [
|
|
{ "value": null, "color": "green" },
|
|
{ "value": 5000, "color": "yellow" },
|
|
{ "value": 7000, "color": "red" }
|
|
]
|
|
},
|
|
"mappings": []
|
|
}
|
|
},
|
|
"options": {
|
|
"reduceOptions": { "calcs": ["lastNotNull"] },
|
|
"orientation": "horizontal",
|
|
"colorMode": "background"
|
|
},
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_ttfa_ms_bucket{voice_profile='voice_fast_uk'}[10m]))",
|
|
"legendFormat": "TTFA fast p95"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_tts_first_ms_bucket[10m]))",
|
|
"legendFormat": "TTS first p95"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(voice_e2e_ms_bucket{voice_profile='voice_fast_uk'}[10m]))",
|
|
"legendFormat": "E2E fast p95"
|
|
},
|
|
{
|
|
"expr": "rate(voice_tts_errors_total[10m])",
|
|
"legendFormat": "TTS errors/s"
|
|
}
|
|
]
|
|
}
|
|
]
|
|
}
|