- Atomic генерація всіх секретів (generate-all-secrets.sh)
- Auth enforcement перевірка (enforce-auth.sh)
- Оновлений full flow test (must-pass)
- Prometheus alerting rules для Memory Module
- Matrix alerts bridge (алерти в ops room)
- Policy engine документація для пам'яті

Готово до production deployment!
107 lines
3.9 KiB
YAML
107 lines
3.9 KiB
YAML
---
# Prometheus alerting rules for the Memory Module.
#
# A prometheus-operator PrometheusRule containing a single rule group,
# `memory_module`, evaluated every 30s. The alerts cover three components:
# NATS JetStream, workers, and the Memory Service.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: memory-module-alerts
  namespace: monitoring
  labels:
    app: memory-module
spec:
  groups:
    - name: memory_module
      interval: 30s
      rules:
        # ---------------------------------------------------------------
        # NATS JetStream alerts
        # ---------------------------------------------------------------

        # MM_ONLINE stream holds more than 1000 messages for 5 minutes.
        - alert: NATSOnlineBacklogHigh
          expr: nats_jetstream_stream_messages{stream="MM_ONLINE"} > 1000
          for: 5m
          labels:
            severity: critical
            component: nats
          annotations:
            summary: "MM_ONLINE backlog критично високий"
            description: "Backlog в MM_ONLINE stream: {{ $value }} messages. SLO порушено."

        # rate() returns a per-second average over the 5m window, so the
        # threshold is 100 redeliveries per second and $value is reported
        # in the description as a per-second figure.
        - alert: NATSRedeliveriesSpike
          expr: rate(nats_jetstream_consumer_redeliveries_total[5m]) > 100
          for: 2m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Спік redeliveries в NATS"
            description: "Redeliveries rate: {{ $value }}/s. Можливі проблеми з воркерами."

        # More than 5000 unacknowledged messages on MM_ONLINE consumers
        # for 5 minutes.
        - alert: NATSAckPendingHigh
          expr: nats_jetstream_consumer_ack_pending{stream="MM_ONLINE"} > 5000
          for: 5m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Високий ack_pending в MM_ONLINE"
            description: "Ack pending: {{ $value }}. Воркери можуть бути перевантажені."

        # Stream size above 80% of its configured maximum for 10 minutes.
        # $value is a 0..1 ratio, rendered as a percentage by
        # humanizePercentage.
        # NOTE(review): a stream whose max_bytes is reported as zero or
        # negative (e.g. "unlimited") would give a meaningless ratio here;
        # confirm how the exporter reports such streams.
        - alert: NATSStreamStorageHigh
          expr: (nats_jetstream_stream_bytes / nats_jetstream_stream_max_bytes) > 0.8
          for: 10m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Диск JetStream майже заповнений"
            description: "Використання: {{ $value | humanizePercentage }}"

        # ---------------------------------------------------------------
        # Worker alerts
        # ---------------------------------------------------------------

        # The expression only becomes true once the last heartbeat is older
        # than 120s; `for: 2m` then requires it to stay true for a further
        # 2 minutes before the alert fires.
        # NOTE(review): assumes worker_last_heartbeat_seconds is a Unix
        # timestamp in seconds, since it is subtracted from time(); confirm
        # against the exporter.
        - alert: WorkerOffline
          expr: time() - worker_last_heartbeat_seconds > 120
          for: 2m
          labels:
            severity: critical
            component: worker
          annotations:
            summary: "Worker offline більше 2 хвилин"
            description: "Worker {{ $labels.node_id }} (Tier {{ $labels.tier }}) не відповідає."

        # P95 duration of `embed` jobs above 0.5s for 5 minutes. The alert
        # threshold (500ms) is deliberately distinct from the 300ms target
        # quoted in the description text.
        - alert: WorkerEmbedLatencyHigh
          expr: histogram_quantile(0.95, rate(worker_job_duration_seconds_bucket{type="embed"}[5m])) > 0.5
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "P95 latency для embed jobs > 500ms"
            description: "P95: {{ $value }}s (target: 300ms)"

        # More than 10 worker errors per second (5m average) for 5 minutes.
        - alert: WorkerErrorRateHigh
          expr: rate(worker_errors_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "Високий error rate в воркерів"
            description: "Error rate: {{ $value }}/s"

        # ---------------------------------------------------------------
        # Memory Service alerts
        # ---------------------------------------------------------------

        # The `memory-service` scrape target has reported up == 0 for
        # 1 minute.
        - alert: MemoryServiceDown
          expr: up{job="memory-service"} == 0
          for: 1m
          labels:
            severity: critical
            component: memory-service
          annotations:
            summary: "Memory Service недоступний"
            description: "Memory Service не відповідає на health checks."

        # P95 HTTP request duration of the memory-service job above 1s for
        # 5 minutes.
        - alert: MemoryServiceLatencyHigh
          expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="memory-service"}[5m])) > 1.0
          for: 5m
          labels:
            severity: warning
            component: memory-service
          annotations:
            summary: "P95 latency Memory Service > 1s"
            description: "P95: {{ $value }}s"