feat(P0/P1/P2): Add E2E agent prober, version pinning, prometheus fixes
This commit is contained in:
@@ -3106,3 +3106,46 @@ async def health():
|
|||||||
"agents_count": len(AGENT_REGISTRY),
|
"agents_count": len(AGENT_REGISTRY),
|
||||||
"timestamp": datetime.utcnow().isoformat(),
|
"timestamp": datetime.utcnow().isoformat(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/debug/agent_ping")
async def debug_agent_ping(request: dict = None):
    """
    E2E probe endpoint used by agent-e2e-prober for monitoring.

    Sends a GET to the ``/health`` endpoint of the router (``ROUTER_URL``,
    default ``http://router:8000``) and of the memory service
    (``MEMORY_SERVICE_URL``, default ``http://memory-service:8000``).

    Args:
        request: Optional JSON body sent by the caller. It is accepted for
            compatibility but its content is not inspected.

    Returns:
        A dict with ``success`` (True only when BOTH dependencies answer
        HTTP 200), ``latency_seconds`` and ``timestamp``. ``checks`` holds
        the per-dependency result when both requests completed. ``error``
        is present whenever ``success`` is False.
    """
    import time

    # Monotonic clock: wall-clock adjustments must not skew the latency.
    start = time.perf_counter()

    try:
        router_url = os.getenv("ROUTER_URL", "http://router:8000")
        memory_url = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")

        # A single client (one connection pool) serves both checks.
        async with httpx.AsyncClient(timeout=10.0) as client:
            # Test 1: Check router connectivity
            router_resp = await client.get(f"{router_url}/health")
            router_ok = router_resp.status_code == 200

            # Test 2: Check memory service connectivity
            memory_resp = await client.get(f"{memory_url}/health")
            memory_ok = memory_resp.status_code == 200

        checks = {
            "router": router_ok,
            "memory_service": memory_ok,
        }
        result = {
            "success": router_ok and memory_ok,
            "latency_seconds": round(time.perf_counter() - start, 3),
            "checks": checks,
            "timestamp": datetime.utcnow().isoformat(),
        }
        if not result["success"]:
            # Name the failing dependency so callers do not have to fall
            # back to an "unknown" failure reason.
            failed = ", ".join(name for name, ok in checks.items() if not ok)
            result["error"] = f"failed checks: {failed}"
        return result
    except Exception as e:
        # Connection errors, timeouts, etc.: report them in the payload
        # rather than turning the probe into an HTTP 500.
        return {
            "success": False,
            "error": str(e)[:100],
            "latency_seconds": round(time.perf_counter() - start, 3),
            "timestamp": datetime.utcnow().isoformat(),
        }
|
||||||
|
|||||||
61
scripts/node1/verify_agents.sh
Normal file
61
scripts/node1/verify_agents.sh
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
#!/bin/bash
# Verify Agents Script - checks that agents are responding
# Usage: ./verify_agents.sh
#
# Each check is best-effort: a failed request is reported in the output
# instead of aborting the script.

set -e

# Upper bound (seconds) for every HTTP request, so that a hung service
# cannot stall the whole verification run.
CURL_MAX_TIME=10

echo "=== AGENT E2E VERIFICATION ==="
echo ""

# 1. Check prober metrics
echo "1. Checking prober metrics..."
PROBER_METRICS=$(curl -s --max-time "$CURL_MAX_TIME" http://localhost:9108/metrics 2>/dev/null || echo "FAIL")
if echo "$PROBER_METRICS" | grep -q "agent_e2e_success"; then
    echo " ✅ Prober metrics available"
    echo "$PROBER_METRICS" | grep "agent_e2e_success" | head -3
else
    echo " ❌ Prober metrics NOT available (prober may not be running)"
fi

echo ""

# 2. Check Prometheus targets
echo "2. Checking Prometheus targets..."
PROM_TARGETS=$(curl -s --max-time "$CURL_MAX_TIME" http://localhost:9090/api/v1/targets 2>/dev/null || echo "FAIL")
if echo "$PROM_TARGETS" | grep -q "prober"; then
    echo " ✅ Prober found in Prometheus targets"
else
    echo " ⚠️ Prober not in Prometheus targets yet"
fi

echo ""

# 3. Direct agent ping test
echo "3. Testing /debug/agent_ping..."
PING_RESULT=$(curl -s --max-time "$CURL_MAX_TIME" -X POST http://localhost:9300/debug/agent_ping -H "Content-Type: application/json" -d '{}' 2>/dev/null || echo '{"error":"connection failed"}')
echo " $PING_RESULT"

echo ""

# 4. Gateway health
echo "4. Gateway health..."
HEALTH=$(curl -s --max-time "$CURL_MAX_TIME" http://localhost:9300/health 2>/dev/null || echo '{"status":"error"}')
# Match the exact status value: a bare grep for "healthy" would also accept
# a body that says "unhealthy".
if echo "$HEALTH" | grep -Eq '"status"[[:space:]]*:[[:space:]]*"healthy"'; then
    echo " ✅ Gateway healthy"
    echo " $HEALTH" | head -c 200
else
    echo " ❌ Gateway unhealthy: $HEALTH"
fi

echo ""
echo ""

# 5. Webhook test
echo "5. Testing webhook (Helion)..."
WEBHOOK=$(curl -s --max-time "$CURL_MAX_TIME" -X POST http://localhost:9300/helion/telegram/webhook \
    -H "Content-Type: application/json" \
    -d '{"update_id":1}' 2>/dev/null || echo '{"error":"failed"}')
echo " $WEBHOOK"

echo ""
echo "=== VERIFICATION COMPLETE ==="
|
||||||
18
services/agent-e2e-prober/Dockerfile
Normal file
18
services/agent-e2e-prober/Dockerfile
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Container image for the agent E2E prober: installs the Python
# dependencies and runs main.py as an unprivileged user.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies
# requirements.txt is copied on its own, before the application code, so
# this layer is rebuilt only when the dependency list changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application
COPY main.py .

# Run as non-root
RUN useradd -m -u 1000 prober
USER prober

# Port the prober serves Prometheus metrics on (its METRICS_PORT default)
EXPOSE 9108

# -u: unbuffered stdout/stderr so log lines show up immediately
CMD ["python", "-u", "main.py"]
|
||||||
179
services/agent-e2e-prober/main.py
Normal file
179
services/agent-e2e-prober/main.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
E2E Agent Prober - checks that the agents are responding.
Exports Prometheus metrics on :9108/metrics.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from prometheus_client import start_http_server, Counter, Gauge, Histogram
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
# Root logging setup; lines look like "<time> [<LEVEL>] <message>"
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Configuration - every value can be overridden through the environment
GATEWAY_URL = os.environ.get("GATEWAY_URL", "http://gateway:9300")
# Pause between two probe rounds, in seconds
PROBE_INTERVAL = int(os.environ.get("PROBE_INTERVAL", "60"))
# Per-request HTTP timeout, in seconds
PROBE_TIMEOUT = int(os.environ.get("PROBE_TIMEOUT", "30"))
# TCP port of the Prometheus metrics endpoint
METRICS_PORT = int(os.environ.get("METRICS_PORT", "9108"))
|
||||||
|
|
||||||
|
# Prometheus metrics. Every series carries a 'target' label that names the
# probe; the failure counter additionally carries a 'reason' label.
agent_e2e_success = Gauge(
    'agent_e2e_success',
    'Whether last E2E probe succeeded',
    labelnames=['target'],
)
agent_e2e_latency = Gauge(
    'agent_e2e_latency_seconds',
    'Latency of last E2E probe',
    labelnames=['target'],
)
agent_e2e_failures_total = Counter(
    'agent_e2e_failures_total',
    'Total E2E probe failures',
    labelnames=['target', 'reason'],
)
agent_e2e_runs_total = Counter(
    'agent_e2e_runs_total',
    'Total E2E probe runs',
    labelnames=['target'],
)

# Latency distribution (bucket upper bounds are in seconds)
agent_e2e_latency_histogram = Histogram(
    'agent_e2e_latency_histogram_seconds',
    'E2E probe latency distribution',
    labelnames=['target'],
    buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60],
)
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_gateway_health() -> tuple[bool, float, str]:
    """Probe the gateway ``/health`` endpoint.

    Returns:
        ``(success, latency_seconds, reason)``. ``success`` is True only
        for an HTTP 200 response whose JSON body has
        ``status == "healthy"``. ``reason`` is an empty string on success;
        on failure it is a short tag (``timeout``, ``http_<code>``,
        ``unhealthy: <status>`` or ``error: <ExceptionClass>``).
    """
    # Monotonic clock: immune to wall-clock adjustments (NTP, manual set).
    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.get(f"{GATEWAY_URL}/health")
        latency = time.perf_counter() - start

        if resp.status_code != 200:
            return False, latency, f"http_{resp.status_code}"

        status = resp.json().get("status")
        if status == "healthy":
            return True, latency, ""
        return False, latency, f"unhealthy: {status}"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        # The reason ends up as a Prometheus label value. Raw exception
        # text can embed hosts, ports or errno details and would create an
        # unbounded number of series, so only the class name is returned
        # and the full message goes to the log.
        logger.warning("gateway_health probe failed: %s", e)
        return False, time.perf_counter() - start, f"error: {type(e).__name__}"
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_agent_ping() -> tuple[bool, float, str]:
    """Probe the gateway ``/debug/agent_ping`` endpoint.

    Returns:
        ``(success, latency_seconds, reason)``. ``success`` is True only
        for an HTTP 200 response whose JSON body has a truthy ``success``
        field. ``reason`` is an empty string on success; on failure it is
        a short tag (``timeout``, ``http_<code>``, ``upstream_error``,
        ``unknown`` or ``error: <ExceptionClass>``).
    """
    # Monotonic clock: immune to wall-clock adjustments (NTP, manual set).
    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.post(
                f"{GATEWAY_URL}/debug/agent_ping",
                json={"probe": True, "timestamp": datetime.utcnow().isoformat()},
            )
        latency = time.perf_counter() - start

        if resp.status_code != 200:
            return False, latency, f"http_{resp.status_code}"

        data = resp.json()
        if data.get("success"):
            return True, latency, ""

        if "error" not in data:
            return False, latency, "unknown"
        # The endpoint's error text is free-form. The reason becomes a
        # Prometheus label value, so the detail is logged and a fixed tag
        # is returned to keep the number of series bounded.
        logger.warning("agent_ping reported failure: %s", data["error"])
        return False, latency, "upstream_error"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        # Same cardinality concern: class name in the label, text in the log.
        logger.warning("agent_ping probe failed: %s", e)
        return False, time.perf_counter() - start, f"error: {type(e).__name__}"
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_webhook_echo() -> tuple[bool, float, str]:
    """
    Probe the Helion Telegram webhook with a minimal synthetic update.

    The update carries the text ``/health`` from a bot-flagged sender with
    id 0. Only the HTTP status of the webhook response is checked here.
    NOTE(review): the original note says this exercises the full path
    gateway -> router -> completion; that depends on how the gateway
    handles the update and cannot be confirmed from this file.

    Returns:
        ``(success, latency_seconds, reason)``. ``success`` is True for an
        HTTP 200 response. ``reason`` is an empty string on success; on
        failure it is ``timeout``, ``http_<code>`` or
        ``error: <ExceptionClass>``.
    """
    # Monotonic clock for the latency; immune to wall-clock adjustments.
    start = time.perf_counter()
    try:
        # The payload fields are real Unix timestamps, so they keep using
        # the wall clock.
        now = int(time.time())
        # Minimal Telegram-like update that gateway can process
        test_update = {
            "update_id": now,
            "message": {
                "message_id": 1,
                "from": {"id": 0, "is_bot": True, "first_name": "E2EProber"},
                "chat": {"id": 0, "type": "private"},
                "date": now,
                "text": "/health",  # Simple health check command
            },
        }

        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            # Use helion webhook as it's the most tested
            resp = await client.post(
                f"{GATEWAY_URL}/helion/telegram/webhook",
                json=test_update,
            )
        latency = time.perf_counter() - start

        if resp.status_code == 200:
            return True, latency, ""
        return False, latency, f"http_{resp.status_code}"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        # The reason becomes a Prometheus label value; raw exception text
        # would create unbounded series, so it is logged instead.
        logger.warning("webhook_e2e probe failed: %s", e)
        return False, time.perf_counter() - start, f"error: {type(e).__name__}"
|
||||||
|
|
||||||
|
|
||||||
|
async def run_probes():
    """Execute every probe once, in order, and publish its outcome.

    For each probe the run counter is incremented, the success and latency
    gauges are overwritten, the latency is added to the histogram, the
    failure counter is incremented (labelled with the reason) when the
    probe failed, and one summary line is logged.
    """
    probes = (
        ("gateway_health", probe_gateway_health),  # gateway /health
        ("agent_ping", probe_agent_ping),          # /debug/agent_ping
        ("webhook_e2e", probe_webhook_echo),       # webhook path
    )

    for target, probe in probes:
        ok, elapsed, why = await probe()

        agent_e2e_runs_total.labels(target=target).inc()
        agent_e2e_success.labels(target=target).set(1 if ok else 0)
        agent_e2e_latency.labels(target=target).set(elapsed)
        agent_e2e_latency_histogram.labels(target=target).observe(elapsed)
        if not ok:
            agent_e2e_failures_total.labels(target=target, reason=why).inc()

        logger.info(f"{target}: success={ok}, latency={elapsed:.3f}s, reason={why}")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Start the metrics HTTP server, then probe forever.

    One probe round runs immediately at startup; after each round the loop
    sleeps ``PROBE_INTERVAL`` seconds before running the next one. An
    unexpected error in a round is logged with its traceback and does not
    stop the loop.
    """
    logger.info("Starting E2E Agent Prober")
    logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
    logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
    logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
    logger.info(f" METRICS_PORT: {METRICS_PORT}")

    # Start Prometheus metrics server
    start_http_server(METRICS_PORT)
    logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")

    # Continuous probing. The first round goes through the same guarded
    # path as every later one, so a startup failure cannot kill the process.
    while True:
        try:
            await run_probes()
        except Exception as e:
            # logger.exception records the traceback along with the message.
            logger.exception(f"Probe error: {e}")
        await asyncio.sleep(PROBE_INTERVAL)


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
2
services/agent-e2e-prober/requirements.txt
Normal file
2
services/agent-e2e-prober/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
httpx>=0.25.0
|
||||||
|
prometheus-client>=0.19.0
|
||||||
Reference in New Issue
Block a user