feat: додано Node Registry, GreenFood, Monitoring та Utils
This commit is contained in:
129
monitoring/prometheus/alerts/daarion_alerts.yml
Normal file
129
monitoring/prometheus/alerts/daarion_alerts.yml
Normal file
@@ -0,0 +1,129 @@
|
||||
---
# Alerting rules for the DAARION platform. All rules live in one group and are
# evaluated together.
groups:
  - name: DAARION Platform
    # Evaluation interval for the rules in this group.
    interval: 30s
    rules:
      # ---------------------------------------------------------------------
      # Service Health Alerts
      # ---------------------------------------------------------------------

      # Matches every scrape target regardless of job, so it also fires
      # alongside the job-specific *Down alerts further below.
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.job }} has been down for more than 2 minutes"

      # Summed per job: the counter is split by status code (500, 502, ...),
      # so comparing each series on its own would test the threshold against
      # fragments of the error rate and could raise several alerts per job.
      - alert: HighErrorRate
        expr: sum by (job) (rate(http_requests_total{status=~"5.."}[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value }} errors/sec"

      # ---------------------------------------------------------------------
      # Router Alerts
      # ---------------------------------------------------------------------

      # 95th percentile request duration of the router above 10 seconds.
      - alert: RouterHighLatency
        expr: >-
          histogram_quantile(0.95,
          rate(http_request_duration_seconds_bucket{job="dagi-router"}[5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "DAGI Router high latency"
          description: "95th percentile latency is {{ $value }}s"

      # Total router throughput. Summed so that the 100 req/sec threshold
      # applies to the whole job rather than to each per-status series.
      - alert: RouterHighLoad
        expr: sum by (job) (rate(http_requests_total{job="dagi-router"}[1m])) > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "DAGI Router high load"
          description: "Request rate is {{ $value }} req/sec"

      # ---------------------------------------------------------------------
      # Telegram Gateway Alerts
      # ---------------------------------------------------------------------

      - alert: TelegramGatewayDown
        expr: up{job="telegram-gateway"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Telegram Gateway is down"
          description: "Telegram bots will not respond"

      - alert: TelegramMessageBacklog
        expr: telegram_message_queue_size > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Telegram message backlog"
          description: "{{ $value }} messages in queue"

      # ---------------------------------------------------------------------
      # LLM Performance
      # ---------------------------------------------------------------------

      # 95th percentile LLM request duration above 30 seconds.
      - alert: LLMHighLatency
        expr: histogram_quantile(0.95, rate(llm_request_duration_seconds_bucket[5m])) > 30
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "LLM high latency"
          description: "95th percentile LLM latency is {{ $value }}s"

      - alert: LLMErrorRate
        expr: rate(llm_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High LLM error rate"
          description: "LLM error rate is {{ $value }} errors/sec"

      # ---------------------------------------------------------------------
      # Database Alerts
      # ---------------------------------------------------------------------

      # `up` reflects whether the scrape of the job named "postgres" succeeded,
      # so this is only meaningful if that job targets an HTTP metrics endpoint.
      - alert: PostgreSQLDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "Database is unavailable"

      # ---------------------------------------------------------------------
      # NATS Alerts
      # ---------------------------------------------------------------------

      # `up` reflects whether the scrape of the job named "nats" succeeded.
      - alert: NATSDown
        expr: up{job="nats"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "NATS is down"
          description: "Message broker is unavailable"

      # ---------------------------------------------------------------------
      # Vector DB Alerts
      # ---------------------------------------------------------------------

      # NOTE(review): confirm that the deployed Qdrant version really exports
      # qdrant_memory_used_bytes and qdrant_memory_total_bytes; if either
      # metric is absent this expression returns nothing and never fires.
      - alert: QdrantHighMemory
        expr: qdrant_memory_used_bytes / qdrant_memory_total_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Qdrant high memory usage"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      # ---------------------------------------------------------------------
      # Disk Space Alerts
      # ---------------------------------------------------------------------

      # Both thresholds are evaluated independently, so a filesystem below 10%
      # free raises the warning and the critical alert at the same time.
      # The descriptions name the mountpoint and instance so the alert is
      # actionable without opening Prometheus.
      - alert: DiskSpaceWarning
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space"
          description: >-
            Only {{ $value | humanizePercentage }} disk space left
            on {{ $labels.mountpoint }} ({{ $labels.instance }})

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space"
          description: >-
            Only {{ $value | humanizePercentage }} disk space left
            on {{ $labels.mountpoint }} ({{ $labels.instance }})
|
||||
|
||||
124
monitoring/prometheus/prometheus.yml
Normal file
124
monitoring/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,124 @@
|
||||
---
# Prometheus Configuration for DAARION Platform

global:
  # Defaults; individual scrape jobs below override scrape_interval.
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'daarion-prod'
    environment: 'production'

# Alertmanager configuration
# The target list is empty, so firing alerts are not delivered anywhere until
# an Alertmanager address (such as the commented example) is added.
alerting:
  alertmanagers:
    - static_configs:
        - targets: []
          # - alertmanager:9093

# Load rules once and periodically evaluate them
rule_files:
  - "/etc/prometheus/alerts/*.yml"

# Scrape configurations
scrape_configs:
  # DAGI Router
  - job_name: 'dagi-router'
    static_configs:
      - targets: ['dagi-router:9102']
    metrics_path: '/metrics'
    scrape_interval: 10s

  # Telegram Gateway
  - job_name: 'telegram-gateway'
    static_configs:
      - targets: ['telegram-gateway:8000']
    metrics_path: '/metrics'
    scrape_interval: 10s

  # DAGI Gateway
  - job_name: 'dagi-gateway'
    static_configs:
      - targets: ['dagi-gateway:9300']
    metrics_path: '/metrics'
    scrape_interval: 10s

  # RBAC Service
  - job_name: 'dagi-rbac'
    static_configs:
      - targets: ['dagi-rbac:9200']
    metrics_path: '/metrics'
    scrape_interval: 15s

  # CrewAI Service
  - job_name: 'dagi-crewai'
    static_configs:
      - targets: ['dagi-crewai:9010']
    metrics_path: '/metrics'
    scrape_interval: 15s

  # Parser Service
  - job_name: 'dagi-parser'
    static_configs:
      - targets: ['dagi-parser:9400']
    metrics_path: '/metrics'
    scrape_interval: 20s

  # Vision Encoder
  - job_name: 'dagi-vision-encoder'
    static_configs:
      - targets: ['dagi-vision-encoder:8001']
    metrics_path: '/metrics'
    scrape_interval: 20s

  # DevTools
  - job_name: 'dagi-devtools'
    static_configs:
      - targets: ['dagi-devtools:8008']
    metrics_path: '/metrics'
    scrape_interval: 15s

  # STT Service
  - job_name: 'dagi-stt'
    static_configs:
      - targets: ['dagi-stt:9000']
    metrics_path: '/metrics'
    scrape_interval: 20s

  # TTS Service
  - job_name: 'dagi-tts'
    static_configs:
      - targets: ['dagi-tts:9101']
    metrics_path: '/metrics'
    scrape_interval: 20s

  # Qdrant Vector DB
  - job_name: 'dagi-qdrant'
    static_configs:
      - targets: ['dagi-qdrant:6333']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # NATS (via prometheus-nats-exporter)
  # The NATS monitoring endpoint nats:8222/varz returns JSON, which Prometheus
  # cannot parse, so scraping it directly always fails and reports up == 0.
  # NOTE(review): assumes the exporter runs as "nats-exporter" on its default
  # port 7777 and is pointed at http://nats:8222 -- adjust to your deployment.
  - job_name: 'nats'
    static_configs:
      - targets: ['nats-exporter:7777']
    metrics_path: '/metrics'
    scrape_interval: 15s

  # PostgreSQL (via postgres_exporter)
  # Port 5432 speaks the PostgreSQL wire protocol, not HTTP, so Prometheus
  # cannot scrape the database itself and the job would always report up == 0.
  # NOTE(review): assumes the exporter runs as "dagi-postgres-exporter" on its
  # default port 9187 -- adjust to your deployment.
  - job_name: 'postgres'
    static_configs:
      - targets: ['dagi-postgres-exporter:9187']
    metrics_path: '/metrics'
    scrape_interval: 30s

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host metrics from node_exporter running on the Docker host
  # (requires node_exporter to be installed and listening on port 9100)
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['host.docker.internal:9100']
    scrape_interval: 30s
|
||||
|
||||
Reference in New Issue
Block a user