feat(P0/P1/P2): Add E2E agent prober, version pinning, prometheus fixes

This commit is contained in:
Apple
2026-01-28 07:06:07 -08:00
parent 9dcc3563f6
commit a3923cd96f
5 changed files with 303 additions and 0 deletions

View File

@@ -3106,3 +3106,46 @@ async def health():
"agents_count": len(AGENT_REGISTRY),
"timestamp": datetime.utcnow().isoformat(),
}
async def _check_dependency_health(client, name: str, base_url: str) -> tuple:
    """GET ``{base_url}/health`` and return ``(name, ok, failure_reason)``.

    Exceptions raised by the request are converted into a failure reason
    instead of propagating, so one unreachable dependency cannot hide the
    status of the others. ``failure_reason`` is ``""`` when ``ok`` is True.
    """
    try:
        resp = await client.get(f"{base_url}/health")
    except Exception as exc:  # probe boundary: report the failure, don't raise
        # str(exc) can be empty, so always include the exception type.
        detail = str(exc)
        reason = f"{type(exc).__name__}: {detail}" if detail else type(exc).__name__
        return name, False, reason
    if resp.status_code == 200:
        return name, True, ""
    return name, False, f"http_{resp.status_code}"


@router.post("/debug/agent_ping")
async def debug_agent_ping(request: dict = None):
    """
    E2E probe endpoint, used by agent-e2e-prober for monitoring.

    Calls ``/health`` on the router (``ROUTER_URL``) and on the memory service
    (``MEMORY_SERVICE_URL``). The two checks run concurrently and independently,
    so the response always reports the state of both.

    The request body is accepted but not used.

    Returns a dict with:
        success: True only if every dependency answered HTTP 200.
        latency_seconds: total time spent in this handler, rounded to ms.
        checks: per-dependency booleans (``router``, ``memory_service``).
        timestamp: UTC time in ISO format.
        error: present only on failure; short description (max 100 chars).
    """
    import asyncio
    import time

    # perf_counter is monotonic, so latency is immune to wall-clock adjustments.
    start = time.perf_counter()
    targets = {
        "router": os.getenv("ROUTER_URL", "http://router:8000"),
        "memory_service": os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000"),
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            results = await asyncio.gather(
                *(
                    _check_dependency_health(client, name, url)
                    for name, url in targets.items()
                )
            )
    except Exception as e:
        # Only reached if the client itself could not be set up or torn down.
        detail = str(e)
        message = f"{type(e).__name__}: {detail}" if detail else type(e).__name__
        return {
            "success": False,
            "error": message[:100],
            "latency_seconds": round(time.perf_counter() - start, 3),
            "timestamp": datetime.utcnow().isoformat(),
        }

    checks = {name: ok for name, ok, _ in results}
    failures = [f"{name}: {reason}" for name, ok, reason in results if not ok]
    response = {
        "success": all(checks.values()),
        "latency_seconds": round(time.perf_counter() - start, 3),
        "checks": checks,
        "timestamp": datetime.utcnow().isoformat(),
    }
    if failures:
        # Give callers a reason even when the failure was a non-200 status.
        response["error"] = "; ".join(failures)[:100]
    return response

View File

@@ -0,0 +1,61 @@
#!/bin/bash
# Verify Agents Script - checks that agents are responding
# Usage: ./verify_agents.sh
#
# Optional environment overrides (defaults are the original hard-coded values):
#   VERIFY_PROBER_URL      prober metrics base URL   (default http://localhost:9108)
#   VERIFY_PROMETHEUS_URL  Prometheus base URL       (default http://localhost:9090)
#   VERIFY_GATEWAY_URL     gateway base URL          (default http://localhost:9300)
#   VERIFY_CURL_TIMEOUT    per-request limit, in sec (default 30)
#
# The script only reports: every request has a fallback value, so a failed
# check prints a marker and the run continues to the end.
set -e

PROBER_URL="${VERIFY_PROBER_URL:-http://localhost:9108}"
PROMETHEUS_URL="${VERIFY_PROMETHEUS_URL:-http://localhost:9090}"
GATEWAY_URL="${VERIFY_GATEWAY_URL:-http://localhost:9300}"
CURL_TIMEOUT="${VERIFY_CURL_TIMEOUT:-30}"

# Silent curl with a hard time limit so a hung service cannot stall the script.
fetch() {
    curl -s --max-time "$CURL_TIMEOUT" "$@" 2>/dev/null
}

echo "=== AGENT E2E VERIFICATION ==="
echo ""

# 1. Check prober metrics
echo "1. Checking prober metrics..."
PROBER_METRICS=$(fetch "$PROBER_URL/metrics" || echo "FAIL")
if grep -q "agent_e2e_success" <<<"$PROBER_METRICS"; then
    echo " ✅ Prober metrics available"
    echo "$PROBER_METRICS" | grep "agent_e2e_success" | head -3
else
    echo " ❌ Prober metrics NOT available (prober may not be running)"
fi
echo ""

# 2. Check Prometheus targets
echo "2. Checking Prometheus targets..."
PROM_TARGETS=$(fetch "$PROMETHEUS_URL/api/v1/targets" || echo "FAIL")
if grep -q "prober" <<<"$PROM_TARGETS"; then
    echo " ✅ Prober found in Prometheus targets"
else
    echo " ⚠️ Prober not in Prometheus targets yet"
fi
echo ""

# 3. Direct agent ping test
echo "3. Testing /debug/agent_ping..."
PING_RESULT=$(fetch -X POST "$GATEWAY_URL/debug/agent_ping" -H "Content-Type: application/json" -d '{}' || echo '{"error":"connection failed"}')
echo " $PING_RESULT"
echo ""

# 4. Gateway health
echo "4. Gateway health..."
HEALTH=$(fetch "$GATEWAY_URL/health" || echo '{"status":"error"}')
if grep -q "healthy" <<<"$HEALTH"; then
    echo " ✅ Gateway healthy"
    # Print at most 200 bytes of the health payload.
    echo " $HEALTH" | head -c 200
else
    echo " ❌ Gateway unhealthy: $HEALTH"
fi
echo ""
echo ""

# 5. Webhook test
echo "5. Testing webhook (Helion)..."
WEBHOOK=$(fetch -X POST "$GATEWAY_URL/helion/telegram/webhook" \
    -H "Content-Type: application/json" \
    -d '{"update_id":1}' || echo '{"error":"failed"}')
echo " $WEBHOOK"
echo ""
echo "=== VERIFICATION COMPLETE ==="

View File

@@ -0,0 +1,18 @@
FROM python:3.11-slim

WORKDIR /app

# Install Python dependencies before copying the application, so the code
# copy is a separate, later layer.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt

# Application code (single-file service)
COPY main.py ./

# Drop root: create an unprivileged user with uid 1000 and switch to it
RUN useradd --create-home --uid 1000 prober
USER prober

# Port 9108 (main.py's default METRICS_PORT)
EXPOSE 9108

# -u keeps stdout/stderr unbuffered
CMD ["python", "-u", "main.py"]

View File

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
E2E Agent Prober - checks that agents are responding.
Exports Prometheus metrics on :9108/metrics
"""
import asyncio
import time
import os
import logging
from datetime import datetime
from prometheus_client import start_http_server, Counter, Gauge, Histogram
import httpx
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)

# Configuration (each value can be overridden through the environment)
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60"))  # seconds between probe rounds
PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30"))  # seconds, passed to httpx as timeout
METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))

# Prometheus metrics, all labelled by probe target
agent_e2e_success = Gauge(
    name="agent_e2e_success",
    documentation="Whether last E2E probe succeeded",
    labelnames=["target"],
)
agent_e2e_latency = Gauge(
    name="agent_e2e_latency_seconds",
    documentation="Latency of last E2E probe",
    labelnames=["target"],
)
agent_e2e_failures_total = Counter(
    name="agent_e2e_failures_total",
    documentation="Total E2E probe failures",
    labelnames=["target", "reason"],
)
agent_e2e_runs_total = Counter(
    name="agent_e2e_runs_total",
    documentation="Total E2E probe runs",
    labelnames=["target"],
)

# Latency distribution; bucket upper bounds are in seconds
agent_e2e_latency_histogram = Histogram(
    name="agent_e2e_latency_histogram_seconds",
    documentation="E2E probe latency distribution",
    labelnames=["target"],
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
)
async def probe_gateway_health() -> tuple[bool, float, str]:
    """Probe the gateway ``/health`` endpoint.

    Returns:
        ``(success, latency_seconds, reason)``. ``reason`` is ``""`` on
        success, otherwise one of ``http_<status>``, ``unhealthy: <status>``,
        ``timeout`` or ``error: <ExceptionClass>``.

    The reason is used as a Prometheus label value by the caller, so exception
    messages are logged here instead of being embedded in it. Only the
    exception class name goes into the reason, which keeps the number of label
    values small.
    """
    # perf_counter is monotonic, so latency is immune to wall-clock adjustments.
    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.get(f"{GATEWAY_URL}/health")
            latency = time.perf_counter() - start
        if resp.status_code != 200:
            return False, latency, f"http_{resp.status_code}"
        data = resp.json()
        if data.get("status") == "healthy":
            return True, latency, ""
        return False, latency, f"unhealthy: {data.get('status')}"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        logger.warning("gateway_health probe raised: %r", e)
        return False, time.perf_counter() - start, f"error: {type(e).__name__}"
async def probe_agent_ping() -> tuple[bool, float, str]:
    """Probe the gateway ``/debug/agent_ping`` endpoint.

    Returns:
        ``(success, latency_seconds, reason)``. ``reason`` is ``""`` on
        success, otherwise one of:

        * ``checks_failed: <names>`` - the response's ``checks`` map listed
          one or more failing dependencies (names sorted, comma-separated);
        * ``gateway_error`` - the response carried an ``error`` text only
          (the text itself is logged, not returned);
        * ``unknown`` - ``success`` was falsy with no further detail;
        * ``http_<status>``, ``timeout`` or ``error: <ExceptionClass>``.

    The reason is used as a Prometheus label value by the caller, so free-form
    error text is logged here rather than returned.
    """
    # perf_counter is monotonic, so latency is immune to wall-clock adjustments.
    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.post(
                f"{GATEWAY_URL}/debug/agent_ping",
                json={"probe": True, "timestamp": datetime.utcnow().isoformat()},
            )
            latency = time.perf_counter() - start
        if resp.status_code != 200:
            return False, latency, f"http_{resp.status_code}"
        data = resp.json()
        if data.get("success"):
            return True, latency, ""

        error_text = data.get("error")
        if error_text:
            logger.warning("agent_ping reported failure: %s", error_text)

        checks = data.get("checks")
        failed = (
            sorted(str(name) for name, ok in checks.items() if not ok)
            if isinstance(checks, dict)
            else []
        )
        if failed:
            return False, latency, "checks_failed: " + ",".join(failed)
        if error_text:
            return False, latency, "gateway_error"
        return False, latency, "unknown"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        logger.warning("agent_ping probe raised: %r", e)
        return False, time.perf_counter() - start, f"error: {type(e).__name__}"
async def probe_webhook_echo() -> tuple[bool, float, str]:
    """
    Probe the Helion Telegram webhook with a minimal synthetic update.

    Posts a Telegram-like update (text ``/health``, chat id 0, sender flagged
    as a bot) to ``/helion/telegram/webhook`` and treats HTTP 200 as success.
    NOTE(review): only the status code is inspected, so this confirms the
    webhook accepted the update. Whether the request travelled the whole
    gateway -> router -> completion path cannot be verified from here.

    Returns:
        ``(success, latency_seconds, reason)``. ``reason`` is ``""`` on
        success, otherwise ``http_<status>``, ``timeout`` or
        ``error: <ExceptionClass>``. The exception message is logged rather
        than returned, because the caller uses the reason as a Prometheus
        label value.
    """
    # perf_counter is monotonic, so latency is immune to wall-clock adjustments.
    start = time.perf_counter()
    try:
        # Wall-clock time is intentional here: it fills the update's
        # epoch-seconds fields and is not used for latency measurement.
        now = int(time.time())
        test_update = {
            "update_id": now,
            "message": {
                "message_id": 1,
                "from": {"id": 0, "is_bot": True, "first_name": "E2EProber"},
                "chat": {"id": 0, "type": "private"},
                "date": now,
                "text": "/health",  # Simple health check command
            },
        }
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            # Use helion webhook as it's the most tested
            resp = await client.post(
                f"{GATEWAY_URL}/helion/telegram/webhook",
                json=test_update,
            )
            latency = time.perf_counter() - start
        if resp.status_code == 200:
            return True, latency, ""
        return False, latency, f"http_{resp.status_code}"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        logger.warning("webhook_e2e probe raised: %r", e)
        return False, time.perf_counter() - start, f"error: {type(e).__name__}"
async def run_probes():
    """Run every probe once, in order, and record the outcome in the metrics.

    For each target the run counter is incremented, the success and latency
    gauges are overwritten, the latency is added to the histogram, and on
    failure the failure counter for that reason is incremented. One log line
    is written per probe.
    """
    probe_table = (
        ("gateway_health", probe_gateway_health),  # Probe 1: Gateway health
        ("agent_ping", probe_agent_ping),          # Probe 2: Agent ping
        ("webhook_e2e", probe_webhook_echo),       # Probe 3: Webhook E2E
    )
    for target, probe in probe_table:
        ok, elapsed, why = await probe()

        agent_e2e_runs_total.labels(target=target).inc()
        agent_e2e_success.labels(target=target).set(1 if ok else 0)
        agent_e2e_latency.labels(target=target).set(elapsed)
        agent_e2e_latency_histogram.labels(target=target).observe(elapsed)
        if not ok:
            agent_e2e_failures_total.labels(target=target, reason=why).inc()

        logger.info(f"{target}: success={ok}, latency={elapsed:.3f}s, reason={why}")
async def main():
    """Start the Prometheus metrics server and run the probes forever.

    Probes run immediately at start-up and then again after every
    ``PROBE_INTERVAL`` seconds of sleep. Every round, including the first, is
    guarded: an unexpected exception is logged with its traceback and the loop
    continues, so a single bad round cannot stop the prober.
    """
    logger.info("Starting E2E Agent Prober")
    logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
    logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
    logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
    logger.info(f" METRICS_PORT: {METRICS_PORT}")

    # Start Prometheus metrics server
    start_http_server(METRICS_PORT)
    logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")

    # Probe first, then sleep: same cadence as an initial probe followed by a
    # sleep/probe loop, but with the first round guarded as well.
    while True:
        try:
            await run_probes()
        except Exception as e:
            # logger.exception keeps the traceback, which logger.error dropped.
            logger.exception("Probe error: %s", e)
        await asyncio.sleep(PROBE_INTERVAL)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,2 @@
httpx>=0.25.0
prometheus-client>=0.19.0