Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
201 lines · 5.8 KiB · YAML
groups:
  # Alerts for the core services scraped on NODE1 (gateway, router, Qdrant, ...).
  - name: node1-services
    rules:
      # Service Down alerts — fires per scrape job once `up` has been 0 for 2m.
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 2 minutes."

      # Qdrant specific — collection count below the expected baseline (18+)
      # suggests a wiped or partially-restored vector store.
      - alert: QdrantCollectionsLow
        expr: collections_total < 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Qdrant collections count low"
          description: "Qdrant has only {{ $value }} collections, expected 18+."

      # Vector count well below the expected baseline (900+) — data loss signal.
      - alert: QdrantVectorsDropped
        expr: collections_vector_total < 500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Qdrant vector count dropped"
          description: "Only {{ $value }} vectors in Qdrant, expected 900+."
- name: node1-host
|
|
rules:
|
|
# Disk space
|
|
- alert: HostDiskSpaceLow
|
|
expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 15
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Disk space low"
|
|
description: "Only {{ $value | printf \"%.1f\" }}% free disk space on {{ $labels.mountpoint }}."
|
|
|
|
# Memory
|
|
- alert: HostMemoryHigh
|
|
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High memory usage"
|
|
description: "Memory usage is {{ $value | printf \"%.1f\" }}% on host."
|
|
|
|
# Load average
|
|
- alert: HostHighLoad
|
|
expr: node_load15 > 10
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High system load"
|
|
description: "15-minute load average is {{ $value | printf \"%.1f\" }}."
|
|
|
|
- name: node1-prometheus
|
|
rules:
|
|
# Prometheus self
|
|
- alert: PrometheusConfigReloadFailed
|
|
expr: prometheus_config_last_reload_successful != 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Prometheus config reload failed"
|
|
description: "Prometheus configuration reload has failed."
|
|
|
|
- alert: PrometheusTargetDown
|
|
expr: count(up == 0) > 3
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Multiple Prometheus targets down"
|
|
description: "{{ $value }} scrape targets are down."
|
|
|
|
- name: node1-nginx
|
|
rules:
|
|
# Rate limiting events (429s spike)
|
|
- alert: NginxHighRateLimitHits
|
|
expr: increase(nginx_http_requests_total{status="429"}[5m]) > 100
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High rate limit hits"
|
|
description: "More than 100 rate limit (429) responses in 5 minutes."
|
|
|
|
# WAF drops (444s spike)
|
|
- alert: NginxWAFDropsSpike
|
|
expr: increase(nginx_http_requests_total{status="444"}[5m]) > 50
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "WAF drops spike detected"
|
|
description: "More than 50 WAF blocks (444) in 5 minutes - possible attack."
|
|
|
|
# 5xx errors from gateway upstream
|
|
- alert: NginxUpstream5xxErrors
|
|
expr: rate(nginx_http_requests_total{status=~"5.."}[5m]) > 0.1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Gateway returning 5xx errors"
|
|
description: "Gateway upstream is returning 5xx errors at rate {{ $value }}/s."
|
|
|
|
# Auth failures spike (401s)
|
|
- alert: NginxAuthFailuresSpike
|
|
expr: increase(nginx_http_requests_total{status="401"}[10m]) > 200
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High authentication failures"
|
|
description: "More than 200 auth failures (401) in 10 minutes - possible brute force."
|
|
|
|
- name: agent-e2e-prober
|
|
rules:
|
|
# Agent not responding
|
|
- alert: AgentE2EFailure
|
|
expr: agent_e2e_success{target="gateway_health"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Agent E2E probe failing"
|
|
description: "Gateway health check failing for 2+ minutes. Agents may not be responding."
|
|
|
|
- alert: AgentE2EHighLatency
|
|
expr: agent_e2e_latency_seconds{target="gateway_health"} > 10
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Agent E2E high latency"
|
|
description: "Agent response latency is {{ $value }}s, expected <10s."
|
|
|
|
- alert: AgentPingFailure
|
|
expr: agent_e2e_success{target="agent_ping"} == 0
|
|
for: 3m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Agent ping failing"
|
|
description: "Router/memory connectivity check failing. Pipeline may be broken."
|
|
|
|
- name: agent-errors
|
|
rules:
|
|
# Agent errors spike
|
|
- alert: AgentErrorsSpike
|
|
expr: rate(gateway_errors_total[5m]) > 0
|
|
for: 3m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Agent errors detected"
|
|
description: "Errors from {{ $labels.source }}: {{ $labels.type }} at rate {{ $value }}/s"
|
|
|
|
# Router call failures
|
|
- alert: RouterCallFailures
|
|
expr: rate(gateway_router_calls_total{status!="success"}[5m]) > 0
|
|
for: 3m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Router call failures detected"
|
|
description: "Router calls failing with status={{ $labels.status }} at rate {{ $value }}/s"
|
|
|
|
# High latency p95
|
|
- alert: RouterLatencyHigh
|
|
expr: histogram_quantile(0.95, rate(gateway_router_latency_seconds_bucket[5m])) > 30
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Router latency p95 > 30s"
|
|
description: "Router p95 latency is {{ $value }}s, expected <30s"
|
|
|
|
- name: agent-traffic
|
|
rules:
|
|
# No traffic (agents silent)
|
|
- alert: AgentsSilent
|
|
expr: increase(gateway_telegram_messages_total[1h]) == 0
|
|
for: 2h
|
|
labels:
|
|
severity: info
|
|
annotations:
|
|
summary: "No agent activity for 2+ hours"
|
|
description: "No Telegram messages processed in last hour. Check if this is expected."
|