Files
microdao-daarion/infrastructure/observability/prometheus-rules.yaml
Apple 70fd268a0d 🚀 Production-ready: Auth enforcement + Observability + Policy
- Atomic генерація всіх секретів (generate-all-secrets.sh)
- Auth enforcement перевірка (enforce-auth.sh)
- Оновлений full flow test (must-pass)
- Prometheus alerting rules для Memory Module
- Matrix alerts bridge (алерти в ops room)
- Policy engine документація для пам'яті

Готово до production deployment!
2026-01-10 10:56:05 -08:00

107 lines
3.9 KiB
YAML

---
# Prometheus alerting rules for the Memory Module.
#
# One PrometheusRule resource with a single rule group, evaluated every 30s,
# covering three areas: NATS JetStream, workers, and the Memory Service itself.
# Every rule carries a `severity` label (critical | warning) and a `component`
# label (nats | worker | memory-service).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: memory-module-alerts
  namespace: monitoring
  labels:
    app: memory-module
spec:
  groups:
    - name: memory_module
      interval: 30s
      rules:
        # ------------------------------------------------------------------
        # NATS JetStream alerts
        # ------------------------------------------------------------------

        # More than 1000 messages held in the MM_ONLINE stream for 5 minutes.
        - alert: NATSOnlineBacklogHigh
          expr: nats_jetstream_stream_messages{stream="MM_ONLINE"} > 1000
          for: 5m
          labels:
            severity: critical
            component: nats
          annotations:
            summary: "MM_ONLINE backlog критично високий"
            description: "Backlog в MM_ONLINE stream: {{ $value }} messages. SLO порушено."

        # Redelivery rate above 100 for 2 minutes, on any consumer (no stream
        # filter). rate() yields a per-second average, so the description
        # reports the value in "/s".
        # NOTE(review): if the intended threshold was 100 redeliveries per
        # minute, the expression needs "* 60" — confirm with the SLO owner.
        - alert: NATSRedeliveriesSpike
          expr: rate(nats_jetstream_consumer_redeliveries_total[5m]) > 100
          for: 2m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Спік redeliveries в NATS"
            description: "Redeliveries rate: {{ $value }}/s. Можливі проблеми з воркерами."

        # More than 5000 unacknowledged messages on MM_ONLINE for 5 minutes.
        - alert: NATSAckPendingHigh
          expr: nats_jetstream_consumer_ack_pending{stream="MM_ONLINE"} > 5000
          for: 5m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Високий ack_pending в MM_ONLINE"
            description: "Ack pending: {{ $value }}. Воркери можуть бути перевантажені."

        # Stream usage above 80% of its configured byte limit for 10 minutes.
        # The "> 0" filter drops series with a zero or negative limit, so the
        # ratio can never become +Inf (which would satisfy "> 0.8") or negative.
        # NOTE(review): assumes an unlimited stream is exported with a
        # non-positive max_bytes — confirm against the exporter in use.
        - alert: NATSStreamStorageHigh
          expr: (nats_jetstream_stream_bytes / (nats_jetstream_stream_max_bytes > 0)) > 0.8
          for: 10m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Диск JetStream майже заповнений"
            description: "Використання: {{ $value | humanizePercentage }}"

        # ------------------------------------------------------------------
        # Worker alerts
        # ------------------------------------------------------------------

        # Last heartbeat is more than 120 s older than the evaluation time.
        # NOTE(review): together with "for: 2m" this fires roughly 4 minutes
        # after the last heartbeat, while the summary says 2 minutes — confirm
        # which delay is intended.
        - alert: WorkerOffline
          expr: time() - worker_last_heartbeat_seconds > 120
          for: 2m
          labels:
            severity: critical
            component: worker
          annotations:
            summary: "Worker offline більше 2 хвилин"
            description: "Worker {{ $labels.node_id }} (Tier {{ $labels.tier }}) не відповідає."

        # P95 duration of embed jobs above 0.5 s for 5 minutes.
        - alert: WorkerEmbedLatencyHigh
          expr: histogram_quantile(0.95, rate(worker_job_duration_seconds_bucket{type="embed"}[5m])) > 0.5
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "P95 latency для embed jobs > 500ms"
            description: "P95: {{ $value }}s (target: 300ms)"

        # Worker error rate above 10 per second for 5 minutes.
        - alert: WorkerErrorRateHigh
          expr: rate(worker_errors_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "Високий error rate в воркерів"
            description: "Error rate: {{ $value }}/s"

        # ------------------------------------------------------------------
        # Memory Service alerts
        # ------------------------------------------------------------------

        # Scrape target of job "memory-service" reported down for 1 minute.
        - alert: MemoryServiceDown
          expr: up{job="memory-service"} == 0
          for: 1m
          labels:
            severity: critical
            component: memory-service
          annotations:
            summary: "Memory Service недоступний"
            description: "Memory Service не відповідає на health checks."

        # P95 HTTP request duration above 1 s for 5 minutes.
        - alert: MemoryServiceLatencyHigh
          expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="memory-service"}[5m])) > 1.0
          for: 5m
          labels:
            severity: warning
            component: memory-service
          annotations:
            summary: "P95 latency Memory Service > 1s"
            description: "P95: {{ $value }}s"