---
# Prometheus alerting rules for the Memory Module.
#
# A single PrometheusRule resource holding one rule group, `memory_module`,
# evaluated every 30s. Alerts are grouped by the `component` label:
# nats, worker, memory-service.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: memory-module-alerts
  namespace: monitoring
  labels:
    app: memory-module
spec:
  groups:
    - name: memory_module
      interval: 30s
      rules:
        # NATS JetStream alerts

        # Message count of the MM_ONLINE stream stays above 1000 for 5m.
        - alert: NATSOnlineBacklogHigh
          expr: nats_jetstream_stream_messages{stream="MM_ONLINE"} > 1000
          for: 5m
          labels:
            severity: critical
            component: nats
          annotations:
            summary: "MM_ONLINE backlog критично високий"
            description: "Backlog в MM_ONLINE stream: {{ $value }} messages. SLO порушено."

        # rate() yields a per-second value, so the threshold is
        # 100 redeliveries/s and the description reports the value in /s.
        - alert: NATSRedeliveriesSpike
          expr: rate(nats_jetstream_consumer_redeliveries_total[5m]) > 100
          for: 2m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Спік redeliveries в NATS"
            description: "Redeliveries rate: {{ $value }}/s. Можливі проблеми з воркерами."

        # Ack-pending count on MM_ONLINE stays above 5000 for 5m.
        - alert: NATSAckPendingHigh
          expr: nats_jetstream_consumer_ack_pending{stream="MM_ONLINE"} > 5000
          for: 5m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Високий ack_pending в MM_ONLINE"
            description: "Ack pending: {{ $value }}. Воркери можуть бути перевантажені."

        # Ratio of stream bytes to the stream's max bytes stays above 0.8
        # for 10m.
        # NOTE(review): a stream whose max-bytes metric is 0 or negative would
        # produce a meaningless ratio here - confirm how the exporter reports
        # unlimited streams.
        - alert: NATSStreamStorageHigh
          expr: (nats_jetstream_stream_bytes / nats_jetstream_stream_max_bytes) > 0.8
          for: 10m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Диск JetStream майже заповнений"
            description: "Використання: {{ $value | humanizePercentage }}"

        # Worker alerts

        # Heartbeat timestamp is more than 120s behind the evaluation time,
        # and that condition has held for a further 2m (`for`).
        - alert: WorkerOffline
          expr: time() - worker_last_heartbeat_seconds > 120
          for: 2m
          labels:
            severity: critical
            component: worker
          annotations:
            summary: "Worker offline більше 2 хвилин"
            description: "Worker {{ $labels.node_id }} (Tier {{ $labels.tier }}) не відповідає."

        # P95 of embed job duration over a 5m window stays above 0.5s.
        # The folded scalar below is the same single-line expression.
        - alert: WorkerEmbedLatencyHigh
          expr: >-
            histogram_quantile(0.95,
            rate(worker_job_duration_seconds_bucket{type="embed"}[5m])) > 0.5
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "P95 latency для embed jobs > 500ms"
            description: "P95: {{ $value }}s (target: 300ms)"

        # Per-second worker error rate stays above 10 for 5m.
        - alert: WorkerErrorRateHigh
          expr: rate(worker_errors_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "Високий error rate в воркерів"
            description: "Error rate: {{ $value }}/s"

        # Memory Service alerts

        # Scrape target for job "memory-service" reports up == 0 for 1m.
        - alert: MemoryServiceDown
          expr: up{job="memory-service"} == 0
          for: 1m
          labels:
            severity: critical
            component: memory-service
          annotations:
            summary: "Memory Service недоступний"
            description: "Memory Service не відповідає на health checks."

        # P95 of HTTP request duration over a 5m window stays above 1.0s.
        # The folded scalar below is the same single-line expression.
        - alert: MemoryServiceLatencyHigh
          expr: >-
            histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket{job="memory-service"}[5m])) > 1.0
          for: 5m
          labels:
            severity: warning
            component: memory-service
          annotations:
            summary: "P95 latency Memory Service > 1s"
            description: "P95: {{ $value }}s"