feat(P0/P1/P2): Add E2E agent prober, version pinning, prometheus fixes
This commit is contained in:
@@ -3106,3 +3106,46 @@ async def health():
|
||||
"agents_count": len(AGENT_REGISTRY),
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
|
||||
@router.post("/debug/agent_ping")
|
||||
async def debug_agent_ping(request: dict = None):
|
||||
"""
|
||||
E2E probe endpoint - tests full agent pipeline.
|
||||
Used by agent-e2e-prober for monitoring.
|
||||
Returns success only if router responds.
|
||||
"""
|
||||
import time
|
||||
start = time.time()
|
||||
|
||||
try:
|
||||
# Test 1: Check router connectivity
|
||||
router_url = os.getenv("ROUTER_URL", "http://router:8000")
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
router_resp = await client.get(f"{router_url}/health")
|
||||
router_ok = router_resp.status_code == 200
|
||||
|
||||
# Test 2: Check memory service connectivity
|
||||
memory_url = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
memory_resp = await client.get(f"{memory_url}/health")
|
||||
memory_ok = memory_resp.status_code == 200
|
||||
|
||||
latency = time.time() - start
|
||||
|
||||
return {
|
||||
"success": router_ok and memory_ok,
|
||||
"latency_seconds": round(latency, 3),
|
||||
"checks": {
|
||||
"router": router_ok,
|
||||
"memory_service": memory_ok,
|
||||
},
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e)[:100],
|
||||
"latency_seconds": round(time.time() - start, 3),
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
61
scripts/node1/verify_agents.sh
Normal file
61
scripts/node1/verify_agents.sh
Normal file
@@ -0,0 +1,61 @@
|
||||
#!/bin/bash
|
||||
# Verify Agents Script - checks that agents are responding
|
||||
# Usage: ./verify_agents.sh
|
||||
|
||||
set -e
|
||||
|
||||
echo "=== AGENT E2E VERIFICATION ==="
|
||||
echo ""
|
||||
|
||||
# 1. Check prober metrics
|
||||
echo "1. Checking prober metrics..."
|
||||
PROBER_METRICS=$(curl -s http://localhost:9108/metrics 2>/dev/null || echo "FAIL")
|
||||
if echo "$PROBER_METRICS" | grep -q "agent_e2e_success"; then
|
||||
echo " ✅ Prober metrics available"
|
||||
echo "$PROBER_METRICS" | grep "agent_e2e_success" | head -3
|
||||
else
|
||||
echo " ❌ Prober metrics NOT available (prober may not be running)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 2. Check Prometheus targets
|
||||
echo "2. Checking Prometheus targets..."
|
||||
PROM_TARGETS=$(curl -s http://localhost:9090/api/v1/targets 2>/dev/null || echo "FAIL")
|
||||
if echo "$PROM_TARGETS" | grep -q "prober"; then
|
||||
echo " ✅ Prober found in Prometheus targets"
|
||||
else
|
||||
echo " ⚠️ Prober not in Prometheus targets yet"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
|
||||
# 3. Direct agent ping test
|
||||
echo "3. Testing /debug/agent_ping..."
|
||||
PING_RESULT=$(curl -s -X POST http://localhost:9300/debug/agent_ping -H "Content-Type: application/json" -d '{}' 2>/dev/null || echo '{"error":"connection failed"}')
|
||||
echo " $PING_RESULT"
|
||||
|
||||
echo ""
|
||||
|
||||
# 4. Gateway health
|
||||
echo "4. Gateway health..."
|
||||
HEALTH=$(curl -s http://localhost:9300/health 2>/dev/null || echo '{"status":"error"}')
|
||||
if echo "$HEALTH" | grep -q "healthy"; then
|
||||
echo " ✅ Gateway healthy"
|
||||
echo " $HEALTH" | head -c 200
|
||||
else
|
||||
echo " ❌ Gateway unhealthy: $HEALTH"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
# 5. Webhook test
|
||||
echo "5. Testing webhook (Helion)..."
|
||||
WEBHOOK=$(curl -s -X POST http://localhost:9300/helion/telegram/webhook \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"update_id":1}' 2>/dev/null || echo '{"error":"failed"}')
|
||||
echo " $WEBHOOK"
|
||||
|
||||
echo ""
|
||||
echo "=== VERIFICATION COMPLETE ==="
|
||||
18
services/agent-e2e-prober/Dockerfile
Normal file
18
services/agent-e2e-prober/Dockerfile
Normal file
@@ -0,0 +1,18 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application
|
||||
COPY main.py .
|
||||
|
||||
# Run as non-root
|
||||
RUN useradd -m -u 1000 prober
|
||||
USER prober
|
||||
|
||||
EXPOSE 9108
|
||||
|
||||
CMD ["python", "-u", "main.py"]
|
||||
179
services/agent-e2e-prober/main.py
Normal file
179
services/agent-e2e-prober/main.py
Normal file
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E2E Agent Prober - перевіряє що агенти відповідають
|
||||
Експортує Prometheus метрики на :9108/metrics
|
||||
"""
|
||||
import asyncio
|
||||
import time
|
||||
import os
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from prometheus_client import start_http_server, Counter, Gauge, Histogram
|
||||
import httpx
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
|
||||
PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60")) # seconds
|
||||
PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30")) # seconds
|
||||
METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
|
||||
|
||||
# Prometheus metrics
|
||||
agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
|
||||
agent_e2e_latency = Gauge('agent_e2e_latency_seconds', 'Latency of last E2E probe', ['target'])
|
||||
agent_e2e_failures_total = Counter('agent_e2e_failures_total', 'Total E2E probe failures', ['target', 'reason'])
|
||||
agent_e2e_runs_total = Counter('agent_e2e_runs_total', 'Total E2E probe runs', ['target'])
|
||||
|
||||
# Histogram for latency distribution
|
||||
agent_e2e_latency_histogram = Histogram(
|
||||
'agent_e2e_latency_histogram_seconds',
|
||||
'E2E probe latency distribution',
|
||||
['target'],
|
||||
buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60]
|
||||
)
|
||||
|
||||
|
||||
async def probe_gateway_health() -> tuple[bool, float, str]:
|
||||
"""Probe gateway /health endpoint"""
|
||||
start = time.time()
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
resp = await client.get(f"{GATEWAY_URL}/health")
|
||||
latency = time.time() - start
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("status") == "healthy":
|
||||
return True, latency, ""
|
||||
else:
|
||||
return False, latency, f"unhealthy: {data.get('status')}"
|
||||
else:
|
||||
return False, latency, f"http_{resp.status_code}"
|
||||
except httpx.TimeoutException:
|
||||
return False, time.time() - start, "timeout"
|
||||
except Exception as e:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def probe_agent_ping() -> tuple[bool, float, str]:
|
||||
"""Probe gateway /debug/agent_ping endpoint (E2E through router)"""
|
||||
start = time.time()
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
resp = await client.post(
|
||||
f"{GATEWAY_URL}/debug/agent_ping",
|
||||
json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("success"):
|
||||
return True, latency, ""
|
||||
else:
|
||||
return False, latency, data.get("error", "unknown")
|
||||
else:
|
||||
return False, latency, f"http_{resp.status_code}"
|
||||
except httpx.TimeoutException:
|
||||
return False, time.time() - start, "timeout"
|
||||
except Exception as e:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
"""
|
||||
Probe via webhook endpoint with minimal test message.
|
||||
This tests the full path: gateway -> router -> completion
|
||||
"""
|
||||
start = time.time()
|
||||
try:
|
||||
# Minimal Telegram-like update that gateway can process
|
||||
test_update = {
|
||||
"update_id": int(time.time()),
|
||||
"message": {
|
||||
"message_id": 1,
|
||||
"from": {"id": 0, "is_bot": True, "first_name": "E2EProber"},
|
||||
"chat": {"id": 0, "type": "private"},
|
||||
"date": int(time.time()),
|
||||
"text": "/health" # Simple health check command
|
||||
}
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
# Use helion webhook as it's the most tested
|
||||
resp = await client.post(
|
||||
f"{GATEWAY_URL}/helion/telegram/webhook",
|
||||
json=test_update
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
if resp.status_code == 200:
|
||||
return True, latency, ""
|
||||
else:
|
||||
return False, latency, f"http_{resp.status_code}"
|
||||
except httpx.TimeoutException:
|
||||
return False, time.time() - start, "timeout"
|
||||
except Exception as e:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def run_probes():
|
||||
"""Run all probes and update metrics"""
|
||||
# Probe 1: Gateway health
|
||||
success, latency, reason = await probe_gateway_health()
|
||||
agent_e2e_runs_total.labels(target="gateway_health").inc()
|
||||
agent_e2e_success.labels(target="gateway_health").set(1 if success else 0)
|
||||
agent_e2e_latency.labels(target="gateway_health").set(latency)
|
||||
agent_e2e_latency_histogram.labels(target="gateway_health").observe(latency)
|
||||
if not success:
|
||||
agent_e2e_failures_total.labels(target="gateway_health", reason=reason).inc()
|
||||
logger.info(f"gateway_health: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
# Probe 2: Agent ping (if endpoint exists)
|
||||
success, latency, reason = await probe_agent_ping()
|
||||
agent_e2e_runs_total.labels(target="agent_ping").inc()
|
||||
agent_e2e_success.labels(target="agent_ping").set(1 if success else 0)
|
||||
agent_e2e_latency.labels(target="agent_ping").set(latency)
|
||||
agent_e2e_latency_histogram.labels(target="agent_ping").observe(latency)
|
||||
if not success:
|
||||
agent_e2e_failures_total.labels(target="agent_ping", reason=reason).inc()
|
||||
logger.info(f"agent_ping: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
# Probe 3: Webhook E2E (full path test)
|
||||
success, latency, reason = await probe_webhook_echo()
|
||||
agent_e2e_runs_total.labels(target="webhook_e2e").inc()
|
||||
agent_e2e_success.labels(target="webhook_e2e").set(1 if success else 0)
|
||||
agent_e2e_latency.labels(target="webhook_e2e").set(latency)
|
||||
agent_e2e_latency_histogram.labels(target="webhook_e2e").observe(latency)
|
||||
if not success:
|
||||
agent_e2e_failures_total.labels(target="webhook_e2e", reason=reason).inc()
|
||||
logger.info(f"webhook_e2e: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
|
||||
async def main():
|
||||
logger.info(f"Starting E2E Agent Prober")
|
||||
logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
|
||||
logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
|
||||
logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
|
||||
logger.info(f" METRICS_PORT: {METRICS_PORT}")
|
||||
|
||||
# Start Prometheus metrics server
|
||||
start_http_server(METRICS_PORT)
|
||||
logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
|
||||
|
||||
# Initial probe
|
||||
await run_probes()
|
||||
|
||||
# Continuous probing
|
||||
while True:
|
||||
await asyncio.sleep(PROBE_INTERVAL)
|
||||
try:
|
||||
await run_probes()
|
||||
except Exception as e:
|
||||
logger.error(f"Probe error: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
2
services/agent-e2e-prober/requirements.txt
Normal file
2
services/agent-e2e-prober/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
httpx>=0.25.0
|
||||
prometheus-client>=0.19.0
|
||||
Reference in New Issue
Block a user