Files
microdao-daarion/monitoring/prometheus/rules/node1.rules.yml
Apple ef3473db21 snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.

Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles

Excluded from snapshot: venv/, .env, data/, backups, .tgz archives

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-09 08:46:46 -08:00

201 lines
5.8 KiB
YAML

---
groups:
- name: node1-services
  rules:
  # Generic liveness: fires for any scrape target reporting down.
  - alert: ServiceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Service {{ $labels.job }} is down"
      description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."
  # Qdrant specific
  - alert: QdrantCollectionsLow
    # NOTE(review): threshold is 10 but the description claims 18+ expected —
    # confirm whether 10 is a deliberate alerting margin or a stale threshold.
    expr: collections_total < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Qdrant collections count low"
      description: "Qdrant has only {{ $value }} collections, expected 18+."
  - alert: QdrantVectorsDropped
    expr: collections_vector_total < 500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Qdrant vector count dropped"
      description: "Only {{ $value }} vectors in Qdrant, expected 900+."
- name: node1-host
  rules:
  # Disk space: under 15% free on real filesystems (tmpfs/overlay excluded).
  - alert: HostDiskSpaceLow
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 15
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Disk space low"
      description: "Only {{ $value | printf \"%.1f\" }}% free disk space on {{ $labels.mountpoint }}."
  # Memory: sustained usage above 90% of total.
  - alert: HostMemoryHigh
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage"
      description: "Memory usage is {{ $value | printf \"%.1f\" }}% on host."
  # Load average: 15-minute load sustained above 10 for 15 minutes.
  - alert: HostHighLoad
    expr: node_load15 > 10
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "High system load"
      description: "15-minute load average is {{ $value | printf \"%.1f\" }}."
- name: node1-prometheus
  rules:
  # Prometheus self-monitoring.
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus config reload failed"
      description: "Prometheus configuration reload has failed."
  # Escalates when more than 3 scrape targets are down simultaneously
  # (single-target outages are covered by ServiceDown).
  - alert: PrometheusTargetDown
    expr: count(up == 0) > 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Multiple Prometheus targets down"
      description: "{{ $value }} scrape targets are down."
- name: node1-nginx
  rules:
  # Rate limiting events (429s spike)
  - alert: NginxHighRateLimitHits
    expr: increase(nginx_http_requests_total{status="429"}[5m]) > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High rate limit hits"
      description: "More than 100 rate limit (429) responses in 5 minutes."
  # WAF drops (444s spike)
  - alert: NginxWAFDropsSpike
    expr: increase(nginx_http_requests_total{status="444"}[5m]) > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "WAF drops spike detected"
      description: "More than 50 WAF blocks (444) in 5 minutes - possible attack."
  # 5xx errors from gateway upstream
  - alert: NginxUpstream5xxErrors
    expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Gateway returning 5xx errors"
      description: "Gateway upstream is returning 5xx errors at rate {{ $value }}/s."
  # Auth failures spike (401s)
  - alert: NginxAuthFailuresSpike
    expr: increase(nginx_http_requests_total{status="401"}[10m]) > 200
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High authentication failures"
      description: "More than 200 auth failures (401) in 10 minutes - possible brute force."
- name: agent-e2e-prober
  rules:
  # Gateway health probe reporting failure for 2+ minutes.
  - alert: AgentE2EFailure
    expr: agent_e2e_success{target="gateway_health"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Agent E2E probe failing"
      description: "Gateway health check failing for 2+ minutes. Agents may not be responding."
  # End-to-end probe latency above the 10s budget.
  - alert: AgentE2EHighLatency
    expr: agent_e2e_latency_seconds{target="gateway_health"} > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Agent E2E high latency"
      description: "Agent response latency is {{ $value }}s, expected <10s."
  # Router/memory connectivity probe failing.
  - alert: AgentPingFailure
    expr: agent_e2e_success{target="agent_ping"} == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "Agent ping failing"
      description: "Router/memory connectivity check failing. Pipeline may be broken."
- name: agent-errors
  rules:
  # Any nonzero gateway error rate sustained for 3 minutes.
  - alert: AgentErrorsSpike
    expr: rate(gateway_errors_total[5m]) > 0
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "Agent errors detected"
      description: "Errors from {{ $labels.source }}: {{ $labels.type }} at rate {{ $value }}/s"
  # Router call failures (any status other than "success").
  - alert: RouterCallFailures
    expr: rate(gateway_router_calls_total{status!="success"}[5m]) > 0
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "Router call failures detected"
      description: "Router calls failing with status={{ $labels.status }} at rate {{ $value }}/s"
  # p95 router latency over the 30s budget.
  - alert: RouterLatencyHigh
    expr: histogram_quantile(0.95, rate(gateway_router_latency_seconds_bucket[5m])) > 30
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Router latency p95 > 30s"
      description: "Router p95 latency is {{ $value }}s, expected <30s"
- name: agent-traffic
  rules:
  # Informational: no Telegram traffic processed for 2+ hours.
  - alert: AgentsSilent
    expr: increase(gateway_telegram_messages_total[1h]) == 0
    for: 2h
    labels:
      severity: info
    annotations:
      summary: "No agent activity for 2+ hours"
      description: "No Telegram messages processed in last hour. Check if this is expected."