🚀 Production-ready: Auth enforcement + Observability + Policy
- Atomic генерація всіх секретів (generate-all-secrets.sh)
- Auth enforcement перевірка (enforce-auth.sh)
- Оновлений full flow test (must-pass)
- Prometheus alerting rules для Memory Module
- Matrix alerts bridge (алерти в ops room)
- Policy engine документація для пам'яті

Готово до production deployment!
This commit is contained in:
106
infrastructure/observability/prometheus-rules.yaml
Normal file
106
infrastructure/observability/prometheus-rules.yaml
Normal file
@@ -0,0 +1,106 @@
|
||||
---
# Prometheus alerting rules for the Memory Module.
#
# Defines one rule group (`memory_module`, evaluated every 30s) covering three
# areas: NATS JetStream, workers, and the Memory Service itself.
# Every alert carries a `severity` and a `component` label.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: memory-module-alerts
  namespace: monitoring
  labels:
    app: memory-module
spec:
  groups:
    - name: memory_module
      interval: 30s
      rules:
        # NATS JetStream Alerts

        # More than 1000 messages held in the MM_ONLINE stream for 5 minutes.
        - alert: NATSOnlineBacklogHigh
          expr: nats_jetstream_stream_messages{stream="MM_ONLINE"} > 1000
          for: 5m
          labels:
            severity: critical
            component: nats
          annotations:
            summary: "MM_ONLINE backlog критично високий"
            description: "Backlog в MM_ONLINE stream: {{ $value }} messages. SLO порушено."

        # rate() yields a per-second average over the 5m window, so the
        # threshold and the reported value are redeliveries per second.
        - alert: NATSRedeliveriesSpike
          expr: rate(nats_jetstream_consumer_redeliveries_total[5m]) > 100
          for: 2m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Спік redeliveries в NATS"
            description: "Redeliveries rate: {{ $value }}/s. Можливі проблеми з воркерами."

        # More than 5000 unacknowledged messages on an MM_ONLINE consumer.
        - alert: NATSAckPendingHigh
          expr: nats_jetstream_consumer_ack_pending{stream="MM_ONLINE"} > 5000
          for: 5m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Високий ack_pending в MM_ONLINE"
            description: "Ack pending: {{ $value }}. Воркери можуть бути перевантажені."

        # Stream size above 80% of its configured maximum for 10 minutes.
        # NOTE(review): the ratio is only meaningful for streams with a positive
        # max_bytes; confirm how the exporter reports unlimited streams.
        - alert: NATSStreamStorageHigh
          expr: (nats_jetstream_stream_bytes / nats_jetstream_stream_max_bytes) > 0.8
          for: 10m
          labels:
            severity: warning
            component: nats
          annotations:
            summary: "Диск JetStream майже заповнений"
            description: "Використання: {{ $value | humanizePercentage }}"

        # Worker Alerts

        # Last heartbeat is more than 120s behind the evaluation time, and the
        # condition has held for a further 2 minutes.
        # NOTE(review): presumably worker_last_heartbeat_seconds is a Unix
        # timestamp in seconds; confirm against the worker's metric definition.
        - alert: WorkerOffline
          expr: time() - worker_last_heartbeat_seconds > 120
          for: 2m
          labels:
            severity: critical
            component: worker
          annotations:
            summary: "Worker offline більше 2 хвилин"
            description: "Worker {{ $labels.node_id }} (Tier {{ $labels.tier }}) не відповідає."

        # P95 of embed job duration above 0.5s, computed from histogram buckets.
        - alert: WorkerEmbedLatencyHigh
          expr: histogram_quantile(0.95, rate(worker_job_duration_seconds_bucket{type="embed"}[5m])) > 0.5
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "P95 latency для embed jobs > 500ms"
            description: "P95: {{ $value }}s (target: 300ms)"

        # More than 10 worker errors per second, averaged over 5 minutes.
        - alert: WorkerErrorRateHigh
          expr: rate(worker_errors_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
            component: worker
          annotations:
            summary: "Високий error rate в воркерів"
            description: "Error rate: {{ $value }}/s"

        # Memory Service Alerts

        # Scrape of the memory-service job has been failing for 1 minute.
        # NOTE(review): `up == 0` only matches while the target is still known
        # to Prometheus; consider also covering a missing target with absent().
        - alert: MemoryServiceDown
          expr: up{job="memory-service"} == 0
          for: 1m
          labels:
            severity: critical
            component: memory-service
          annotations:
            summary: "Memory Service недоступний"
            description: "Memory Service не відповідає на health checks."

        # P95 of HTTP request duration for memory-service above 1 second.
        - alert: MemoryServiceLatencyHigh
          expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="memory-service"}[5m])) > 1.0
          for: 5m
          labels:
            severity: warning
            component: memory-service
          annotations:
            summary: "P95 latency Memory Service > 1s"
            description: "P95: {{ $value }}s"
|
||||
Reference in New Issue
Block a user