Files
microdao-daarion/monitoring/prometheus/rules/node1.rules.yml
Apple ef3473db21 snapshot: NODE1 production state 2026-02-09
Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.

Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles

Excluded from snapshot: venv/, .env, data/, backups, .tgz archives

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-09 08:46:46 -08:00

201 lines
5.8 KiB
YAML

---
groups:
- name: node1-services
  rules:
  # Generic liveness: fires for any scrape target reporting down.
  - alert: ServiceDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Service {{ $labels.job }} is down"
      description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."
  # Qdrant specific
  - alert: QdrantCollectionsLow
    # NOTE(review): threshold is 10 but the description claims 18+ expected —
    # confirm whether 10 is a deliberate alerting margin or a stale threshold.
    expr: collections_total < 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Qdrant collections count low"
      description: "Qdrant has only {{ $value }} collections, expected 18+."
  - alert: QdrantVectorsDropped
    expr: collections_vector_total < 500
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Qdrant vector count dropped"
      description: "Only {{ $value }} vectors in Qdrant, expected 900+."
- name: node1-host
  rules:
  # Disk space: under 15% free on real filesystems (tmpfs/overlay excluded).
  - alert: HostDiskSpaceLow
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 15
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Disk space low"
      description: "Only {{ $value | printf \"%.1f\" }}% free disk space on {{ $labels.mountpoint }}."
  # Memory: sustained usage above 90% of total.
  - alert: HostMemoryHigh
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage"
      description: "Memory usage is {{ $value | printf \"%.1f\" }}% on host."
  # Load average: 15-minute load sustained above 10 for 15 minutes.
  - alert: HostHighLoad
    expr: node_load15 > 10
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "High system load"
      description: "15-minute load average is {{ $value | printf \"%.1f\" }}."
- name: node1-prometheus
  rules:
  # Prometheus self-monitoring.
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful != 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Prometheus config reload failed"
      description: "Prometheus configuration reload has failed."
  # Escalates when more than 3 scrape targets are down simultaneously
  # (single-target outages are covered by ServiceDown).
  - alert: PrometheusTargetDown
    expr: count(up == 0) > 3
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Multiple Prometheus targets down"
      description: "{{ $value }} scrape targets are down."
- name: node1-nginx
  rules:
  # Rate limiting events (429s spike)
  - alert: NginxHighRateLimitHits
    expr: increase(nginx_http_requests_total{status="429"}[5m]) > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High rate limit hits"
      description: "More than 100 rate limit (429) responses in 5 minutes."
  # WAF drops (444s spike)
  - alert: NginxWAFDropsSpike
    expr: increase(nginx_http_requests_total{status="444"}[5m]) > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "WAF drops spike detected"
      description: "More than 50 WAF blocks (444) in 5 minutes - possible attack."
  # 5xx errors from gateway upstream
  - alert: NginxUpstream5xxErrors
    expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Gateway returning 5xx errors"
      description: "Gateway upstream is returning 5xx errors at rate {{ $value }}/s."
  # Auth failures spike (401s)
  - alert: NginxAuthFailuresSpike
    expr: increase(nginx_http_requests_total{status="401"}[10m]) > 200
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High authentication failures"
      description: "More than 200 auth failures (401) in 10 minutes - possible brute force."
- name: agent-e2e-prober
  rules:
  # Gateway health probe reporting failure for 2+ minutes.
  - alert: AgentE2EFailure
    expr: agent_e2e_success{target="gateway_health"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Agent E2E probe failing"
      description: "Gateway health check failing for 2+ minutes. Agents may not be responding."
  # End-to-end probe latency above the 10s budget.
  - alert: AgentE2EHighLatency
    expr: agent_e2e_latency_seconds{target="gateway_health"} > 10
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Agent E2E high latency"
      description: "Agent response latency is {{ $value }}s, expected <10s."
  # Router/memory connectivity probe failing.
  - alert: AgentPingFailure
    expr: agent_e2e_success{target="agent_ping"} == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: "Agent ping failing"
      description: "Router/memory connectivity check failing. Pipeline may be broken."
- name: agent-errors
  rules:
  # Any nonzero gateway error rate sustained for 3 minutes.
  - alert: AgentErrorsSpike
    expr: rate(gateway_errors_total[5m]) > 0
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "Agent errors detected"
      description: "Errors from {{ $labels.source }}: {{ $labels.type }} at rate {{ $value }}/s"
  # Router call failures (any status other than "success").
  - alert: RouterCallFailures
    expr: rate(gateway_router_calls_total{status!="success"}[5m]) > 0
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "Router call failures detected"
      description: "Router calls failing with status={{ $labels.status }} at rate {{ $value }}/s"
  # p95 router latency over the 30s budget.
  - alert: RouterLatencyHigh
    expr: histogram_quantile(0.95, rate(gateway_router_latency_seconds_bucket[5m])) > 30
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Router latency p95 > 30s"
      description: "Router p95 latency is {{ $value }}s, expected <30s"
- name: agent-traffic
  rules:
  # Informational: no Telegram traffic processed for 2+ hours.
  - alert: AgentsSilent
    expr: increase(gateway_telegram_messages_total[1h]) == 0
    for: 2h
    labels:
      severity: info
    annotations:
      summary: "No agent activity for 2+ hours"
      description: "No Telegram messages processed in last hour. Check if this is expected."