feat(P0/P1/P2): Add E2E agent prober, version pinning, prometheus fixes

2026-01-28 07:06:07 -08:00
parent 9dcc3563f6
commit a3923cd96f
5 changed files with 303 additions and 0 deletions
--- a/gateway-bot/http_api.py
+++ b/gateway-bot/http_api.py
@@ -3106,3 +3106,46 @@ async def health():
        "agents_count": len(AGENT_REGISTRY),
        "timestamp": datetime.utcnow().isoformat(),
    }
@router.post("/debug/agent_ping")
async def debug_agent_ping(request: dict = None):
    """
    E2E probe endpoint - checks the gateway's downstream dependencies.

    Used by agent-e2e-prober for monitoring. Issues GET <base>/health against
    the router (ROUTER_URL) and the memory service (MEMORY_SERVICE_URL) and
    reports success only when BOTH answer HTTP 200.

    Each dependency is checked independently, so one unreachable service does
    not hide the state of the other: "checks" is always present in the
    response.

    Args:
        request: Optional JSON body sent by the caller; accepted but not
            inspected.

    Returns:
        dict with keys:
            success: True only if every check passed.
            latency_seconds: total time spent on all checks, rounded to 3
                decimals.
            checks: per-dependency booleans ("router", "memory_service").
            timestamp: naive UTC ISO-8601 string (same format as /health).
            error: present only when a check raised; truncated to 100 chars.
    """
    import time

    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    started = time.monotonic()
    targets = {
        "router": os.getenv("ROUTER_URL", "http://router:8000"),
        "memory_service": os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000"),
    }
    checks = {}
    errors = []
    async with httpx.AsyncClient(timeout=10.0) as client:
        for name, base_url in targets.items():
            try:
                resp = await client.get(f"{base_url}/health")
                checks[name] = resp.status_code == 200
            except Exception as e:
                # Probe boundary: any failure (timeout, DNS, malformed URL)
                # marks this dependency as down instead of failing the request.
                checks[name] = False
                # str(e) can be empty for some exceptions, so keep the class name.
                errors.append(f"{name}: {type(e).__name__}: {e}")
    result = {
        "success": all(checks.values()),
        "latency_seconds": round(time.monotonic() - started, 3),
        "checks": checks,
        "timestamp": datetime.utcnow().isoformat(),
    }
    if errors:
        result["error"] = "; ".join(errors)[:100]
    return result
--- a/scripts/node1/verify_agents.sh
+++ b/scripts/node1/verify_agents.sh
@@ -0,0 +1,61 @@
 #!/bin/bash
 # Verify Agents Script - checks that agents are responding
 # Usage: ./verify_agents.sh
 #
 # Environment:
 #   CURL_TIMEOUT - per-request curl timeout in seconds (default: 30)
 #
 # Exit status: 1 if a hard check (prober metrics, gateway health) failed,
 # otherwise 0. Steps 2, 3 and 5 only print what they observe.
 set -e
 # Bound every request so a hung service cannot stall the whole script.
 CURL_TIMEOUT="${CURL_TIMEOUT:-30}"
 FAILED=0
 echo "=== AGENT E2E VERIFICATION ==="
 echo ""
 # 1. Check prober metrics
 echo "1. Checking prober metrics..."
 PROBER_METRICS=$(curl -s --max-time "$CURL_TIMEOUT" http://localhost:9108/metrics 2>/dev/null || echo "FAIL")
 if echo "$PROBER_METRICS" | grep -q "agent_e2e_success"; then
    echo "   ✅ Prober metrics available"
    echo "$PROBER_METRICS" | grep "agent_e2e_success" | head -3
 else
    echo "   ❌ Prober metrics NOT available (prober may not be running)"
    FAILED=$((FAILED + 1))
 fi
 echo ""
 # 2. Check Prometheus targets (warning only: the target may not be scraped yet)
 echo "2. Checking Prometheus targets..."
 PROM_TARGETS=$(curl -s --max-time "$CURL_TIMEOUT" http://localhost:9090/api/v1/targets 2>/dev/null || echo "FAIL")
 if echo "$PROM_TARGETS" | grep -q "prober"; then
    echo "   ✅ Prober found in Prometheus targets"
 else
    echo "   ⚠️ Prober not in Prometheus targets yet"
 fi
 echo ""
 # 3. Direct agent ping test (informational: prints the raw JSON response)
 echo "3. Testing /debug/agent_ping..."
 PING_RESULT=$(curl -s --max-time "$CURL_TIMEOUT" -X POST http://localhost:9300/debug/agent_ping -H "Content-Type: application/json" -d '{}' 2>/dev/null || echo '{"error":"connection failed"}')
 echo "   $PING_RESULT"
 echo ""
 # 4. Gateway health
 echo "4. Gateway health..."
 HEALTH=$(curl -s --max-time "$CURL_TIMEOUT" http://localhost:9300/health 2>/dev/null || echo '{"status":"error"}')
 if echo "$HEALTH" | grep -q "healthy"; then
    echo "   ✅ Gateway healthy"
    # Print at most 200 bytes of the health payload.
    echo "   $HEALTH" | head -c 200
 else
    echo "   ❌ Gateway unhealthy: $HEALTH"
    FAILED=$((FAILED + 1))
 fi
 echo ""
 echo ""
 # 5. Webhook test (informational: prints the raw response)
 echo "5. Testing webhook (Helion)..."
 WEBHOOK=$(curl -s --max-time "$CURL_TIMEOUT" -X POST http://localhost:9300/helion/telegram/webhook \
    -H "Content-Type: application/json" \
    -d '{"update_id":1}' 2>/dev/null || echo '{"error":"failed"}')
 echo "   $WEBHOOK"
 echo ""
 echo "=== VERIFICATION COMPLETE ==="
 # Report failures through the exit status so callers can act on the result.
 if [ "$FAILED" -gt 0 ]; then
    echo "$FAILED check(s) failed"
    exit 1
 fi
--- a/services/agent-e2e-prober/Dockerfile
+++ b/services/agent-e2e-prober/Dockerfile
@@ -0,0 +1,18 @@
 # Image for the agent-e2e-prober service: runs main.py on python:3.11-slim.
 FROM python:3.11-slim
 WORKDIR /app
 # Install dependencies first so this layer can be reused from the build
 # cache when only main.py changes.
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy application
 COPY main.py .
 # Run as non-root: create user "prober" (UID 1000) and switch to it.
 RUN useradd -m -u 1000 prober
 USER prober
 # Metrics port; matches the METRICS_PORT default (9108) in main.py.
 EXPOSE 9108
 # -u: unbuffered stdout/stderr so log lines show up immediately.
 CMD ["python", "-u", "main.py"]
--- a/services/agent-e2e-prober/main.py
+++ b/services/agent-e2e-prober/main.py
@@ -0,0 +1,179 @@
 #!/usr/bin/env python3
 """
 E2E Agent Prober - verifies that agents respond.
 Exports Prometheus metrics on :9108/metrics (port set by METRICS_PORT).
 """
 import asyncio
 import time
 import os
 import logging
 from datetime import datetime
 from prometheus_client import start_http_server, Counter, Gauge, Histogram
 import httpx
 # Log to the root handler with timestamp and level; INFO and above.
 logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
 logger = logging.getLogger(__name__)
 # Configuration (all values are read once from the environment at import time)
 # Base URL of the gateway that every probe targets.
 GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
 PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60"))  # seconds slept between probe rounds
 PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30"))  # seconds; passed as the httpx client timeout
 # Port on which start_http_server() serves the metrics.
 METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
 # Prometheus metrics
 # Every metric is labelled by `target`, the probe name set in run_probes().
 # Gauges hold the outcome of the most recent run only (success is set to 1 or 0).
 agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
 agent_e2e_latency = Gauge('agent_e2e_latency_seconds', 'Latency of last E2E probe', ['target'])
 # `reason` is the third value returned by the probe functions.
 # NOTE(review): keep the set of reason values small; each distinct value creates a new time series.
 agent_e2e_failures_total = Counter('agent_e2e_failures_total', 'Total E2E probe failures', ['target', 'reason'])
 agent_e2e_runs_total = Counter('agent_e2e_runs_total', 'Total E2E probe runs', ['target'])
 # Histogram for latency distribution
 agent_e2e_latency_histogram = Histogram(
    'agent_e2e_latency_histogram_seconds',
    'E2E probe latency distribution',
    ['target'],
    # Bucket upper bounds in seconds.
    buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60]
 )
 async def probe_gateway_health() -> tuple[bool, float, str]:
    """
    Probe the gateway /health endpoint.

    Returns:
        (success, latency_seconds, reason). ``reason`` is "" on success.
        On failure it is one of a small, bounded set of values, because
        run_probes() uses it as a Prometheus label: "unhealthy",
        "http_<status>", "timeout" or "error_<ExceptionClass>". Free-form
        detail (remote status text, exception message) is logged instead.
    """
    # Monotonic clock: latency must not be affected by wall-clock adjustments.
    start = time.monotonic()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.get(f"{GATEWAY_URL}/health")
            latency = time.monotonic() - start
            if resp.status_code != 200:
                return False, latency, f"http_{resp.status_code}"
            data = resp.json()
            if data.get("status") == "healthy":
                return True, latency, ""
            logger.warning("gateway_health: gateway reported status %r", data.get("status"))
            return False, latency, "unhealthy"
    except httpx.TimeoutException:
        return False, time.monotonic() - start, "timeout"
    except Exception as e:
        # Probe boundary: never raise; report the exception class as the
        # reason and keep the message in the log only.
        logger.warning("gateway_health probe error: %s", e)
        return False, time.monotonic() - start, f"error_{type(e).__name__}"
 async def probe_agent_ping() -> tuple[bool, float, str]:
    """
    Probe the gateway /debug/agent_ping endpoint (E2E through router).

    POSTs a small JSON body and treats the probe as successful when the
    response is HTTP 200 with a truthy "success" field.

    Returns:
        (success, latency_seconds, reason). ``reason`` is "" on success.
        On failure it is bounded, because run_probes() uses it as a
        Prometheus label: "failed_checks:<names>" (names of the falsy entries
        in the response's "checks" dict), "ping_failed", "http_<status>",
        "timeout" or "error_<ExceptionClass>". The endpoint's free-form
        "error" text is logged, not used as a label.
    """
    # Monotonic clock: latency must not be affected by wall-clock adjustments.
    start = time.monotonic()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.post(
                f"{GATEWAY_URL}/debug/agent_ping",
                json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
            )
            latency = time.monotonic() - start
            if resp.status_code != 200:
                return False, latency, f"http_{resp.status_code}"
            data = resp.json()
            if data.get("success"):
                return True, latency, ""
            if data.get("error"):
                logger.warning("agent_ping reported error: %s", data.get("error"))
            failed = sorted(
                name for name, ok in (data.get("checks") or {}).items() if not ok
            )
            if failed:
                return False, latency, "failed_checks:" + ",".join(failed)
            return False, latency, "ping_failed"
    except httpx.TimeoutException:
        return False, time.monotonic() - start, "timeout"
    except Exception as e:
        # Probe boundary: never raise; report the exception class as the
        # reason and keep the message in the log only.
        logger.warning("agent_ping probe error: %s", e)
        return False, time.monotonic() - start, f"error_{type(e).__name__}"
 async def probe_webhook_echo() -> tuple[bool, float, str]:
    """
    Probe via the webhook endpoint with a minimal test message.

    POSTs a Telegram-like update to /helion/telegram/webhook. Intended to
    exercise the path gateway -> router -> completion; note that only the
    HTTP status of the webhook response is checked here.

    Returns:
        (success, latency_seconds, reason). ``reason`` is "" on success.
        On failure it is bounded, because run_probes() uses it as a
        Prometheus label: "http_<status>", "timeout" or
        "error_<ExceptionClass>".
    """
    # Monotonic clock for latency; wall clock only for the payload fields.
    start = time.monotonic()
    try:
        now = int(time.time())
        # Minimal Telegram-like update that gateway can process
        test_update = {
            "update_id": now,
            "message": {
                "message_id": 1,
                "from": {"id": 0, "is_bot": True, "first_name": "E2EProber"},
                "chat": {"id": 0, "type": "private"},
                "date": now,
                "text": "/health"  # Simple health check command
            }
        }
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            # Use helion webhook as it's the most tested
            resp = await client.post(
                f"{GATEWAY_URL}/helion/telegram/webhook",
                json=test_update
            )
            latency = time.monotonic() - start
            if resp.status_code == 200:
                return True, latency, ""
            return False, latency, f"http_{resp.status_code}"
    except httpx.TimeoutException:
        return False, time.monotonic() - start, "timeout"
    except Exception as e:
        # Probe boundary: never raise; report the exception class as the
        # reason and keep the message in the log only.
        logger.warning("webhook_e2e probe error: %s", e)
        return False, time.monotonic() - start, f"error_{type(e).__name__}"
 async def run_probes():
    """
    Run all probes, one after another, and update the metrics.

    For each probe: bump the run counter, set the success and latency
    gauges, record the latency in the histogram, bump the failure counter
    (labelled with the probe's reason) when it failed, and log the outcome.
    """
    # (metric target label, probe coroutine function), in execution order.
    probes = (
        ("gateway_health", probe_gateway_health),  # Probe 1: Gateway health
        ("agent_ping", probe_agent_ping),          # Probe 2: Agent ping
        ("webhook_e2e", probe_webhook_echo),       # Probe 3: Webhook E2E
    )
    for target, probe in probes:
        ok, elapsed, why = await probe()
        agent_e2e_runs_total.labels(target=target).inc()
        agent_e2e_success.labels(target=target).set(1 if ok else 0)
        agent_e2e_latency.labels(target=target).set(elapsed)
        agent_e2e_latency_histogram.labels(target=target).observe(elapsed)
        if not ok:
            agent_e2e_failures_total.labels(target=target, reason=why).inc()
        logger.info(f"{target}: success={ok}, latency={elapsed:.3f}s, reason={why}")
 async def main():
    """
    Start the Prometheus metrics server, then run the probes forever.

    A probe round runs immediately at startup and then again after every
    PROBE_INTERVAL seconds of sleep. Every round, including the first one,
    is guarded: an unexpected exception is logged with its traceback and
    the loop carries on, so the metrics endpoint stays up.
    """
    logger.info("Starting E2E Agent Prober")
    logger.info(f"  GATEWAY_URL: {GATEWAY_URL}")
    logger.info(f"  PROBE_INTERVAL: {PROBE_INTERVAL}s")
    logger.info(f"  PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
    logger.info(f"  METRICS_PORT: {METRICS_PORT}")
    # Start Prometheus metrics server
    start_http_server(METRICS_PORT)
    logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
    # Continuous probing; the first round runs right away.
    while True:
        try:
            await run_probes()
        except Exception as e:
            # Top-level boundary: keep the prober alive, keep the traceback.
            logger.exception("Probe error: %s", e)
        await asyncio.sleep(PROBE_INTERVAL)
 if __name__ == "__main__":
    asyncio.run(main())
--- a/services/agent-e2e-prober/requirements.txt
+++ b/services/agent-e2e-prober/requirements.txt
@@ -0,0 +1,2 @@
 # Runtime dependencies of the agent-e2e-prober (imported by main.py).
 # NOTE(review): these are lower bounds, not exact pins; a rebuild may pick up newer releases.
 # HTTP client used by the probe functions.
 httpx>=0.25.0
 # Metric types (Counter, Gauge, Histogram) and the metrics HTTP server.
 prometheus-client>=0.19.0