---
# Prometheus alerting rules for node1: service liveness, host resources,
# Prometheus self-monitoring, nginx edge metrics, and agent pipeline health.
groups:
  - name: node1-services
    rules:
      # Service Down alerts
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."

      # Qdrant specific
      - alert: QdrantCollectionsLow
        expr: collections_total < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Qdrant collections count low"
          description: "Qdrant has only {{ $value }} collections, expected 18+."

      - alert: QdrantVectorsDropped
        expr: collections_vector_total < 500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Qdrant vector count dropped"
          description: "Only {{ $value }} vectors in Qdrant, expected 900+."

  - name: node1-host
    rules:
      # Disk space
      - alert: HostDiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 15
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low"
          description: "Only {{ $value | printf \"%.1f\" }}% free disk space on {{ $labels.mountpoint }}."

      # Memory
      - alert: HostMemoryHigh
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value | printf \"%.1f\" }}% on host."

      # Load average
      - alert: HostHighLoad
        expr: node_load15 > 10
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High system load"
          description: "15-minute load average is {{ $value | printf \"%.1f\" }}."

  - name: node1-prometheus
    rules:
      # Prometheus self
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Prometheus config reload failed"
          description: "Prometheus configuration reload has failed."

      - alert: PrometheusTargetDown
        expr: count(up == 0) > 3
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Multiple Prometheus targets down"
          description: "{{ $value }} scrape targets are down."

  - name: node1-nginx
    rules:
      # Rate limiting events (429s spike)
      - alert: NginxHighRateLimitHits
        expr: increase(nginx_http_requests_total{status="429"}[5m]) > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate limit hits"
          description: "More than 100 rate limit (429) responses in 5 minutes."

      # WAF drops (444s spike)
      - alert: NginxWAFDropsSpike
        expr: increase(nginx_http_requests_total{status="444"}[5m]) > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "WAF drops spike detected"
          description: "More than 50 WAF blocks (444) in 5 minutes - possible attack."

      # 5xx errors from gateway upstream
      - alert: NginxUpstream5xxErrors
        expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Gateway returning 5xx errors"
          description: "Gateway upstream is returning 5xx errors at rate {{ $value }}/s."

      # Auth failures spike (401s)
      - alert: NginxAuthFailuresSpike
        expr: increase(nginx_http_requests_total{status="401"}[10m]) > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failures"
          description: "More than 200 auth failures (401) in 10 minutes - possible brute force."

  - name: agent-e2e-prober
    rules:
      # Agent not responding
      - alert: AgentE2EFailure
        expr: agent_e2e_success{target="gateway_health"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Agent E2E probe failing"
          description: "Gateway health check failing for 2+ minutes. Agents may not be responding."

      - alert: AgentE2EHighLatency
        expr: agent_e2e_latency_seconds{target="gateway_health"} > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Agent E2E high latency"
          description: "Agent response latency is {{ $value }}s, expected <10s."

      - alert: AgentPingFailure
        expr: agent_e2e_success{target="agent_ping"} == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Agent ping failing"
          description: "Router/memory connectivity check failing. Pipeline may be broken."

  - name: agent-errors
    rules:
      # Agent errors spike
      - alert: AgentErrorsSpike
        expr: rate(gateway_errors_total[5m]) > 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Agent errors detected"
          description: "Errors from {{ $labels.source }}: {{ $labels.type }} at rate {{ $value }}/s"

      # Router call failures
      - alert: RouterCallFailures
        expr: rate(gateway_router_calls_total{status!="success"}[5m]) > 0
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Router call failures detected"
          description: "Router calls failing with status={{ $labels.status }} at rate {{ $value }}/s"

      # High latency p95
      - alert: RouterLatencyHigh
        expr: histogram_quantile(0.95, rate(gateway_router_latency_seconds_bucket[5m])) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Router latency p95 > 30s"
          description: "Router p95 latency is {{ $value }}s, expected <30s"

  - name: agent-traffic
    rules:
      # No traffic (agents silent)
      - alert: AgentsSilent
        expr: increase(gateway_telegram_messages_total[1h]) == 0
        for: 2h
        labels:
          severity: info
        annotations:
          summary: "No agent activity for 2+ hours"
          description: "No Telegram messages processed in last hour. Check if this is expected."