feat(P0/P1/P2): Add E2E agent prober, version pinning, prometheus fixes
This commit is contained in:
18
services/agent-e2e-prober/Dockerfile
Normal file
18
services/agent-e2e-prober/Dockerfile
Normal file
@@ -0,0 +1,18 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application
|
||||
COPY main.py .
|
||||
|
||||
# Run as non-root
|
||||
RUN useradd -m -u 1000 prober
|
||||
USER prober
|
||||
|
||||
EXPOSE 9108
|
||||
|
||||
CMD ["python", "-u", "main.py"]
|
||||
179
services/agent-e2e-prober/main.py
Normal file
179
services/agent-e2e-prober/main.py
Normal file
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
E2E Agent Prober - перевіряє що агенти відповідають
|
||||
Експортує Prometheus метрики на :9108/metrics
|
||||
"""
|
||||
import asyncio
|
||||
import time
|
||||
import os
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from prometheus_client import start_http_server, Counter, Gauge, Histogram
|
||||
import httpx
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration
|
||||
GATEWAY_URL = os.getenv("GATEWAY_URL", "http://gateway:9300")
|
||||
PROBE_INTERVAL = int(os.getenv("PROBE_INTERVAL", "60")) # seconds
|
||||
PROBE_TIMEOUT = int(os.getenv("PROBE_TIMEOUT", "30")) # seconds
|
||||
METRICS_PORT = int(os.getenv("METRICS_PORT", "9108"))
|
||||
|
||||
# Prometheus metrics
|
||||
agent_e2e_success = Gauge('agent_e2e_success', 'Whether last E2E probe succeeded', ['target'])
|
||||
agent_e2e_latency = Gauge('agent_e2e_latency_seconds', 'Latency of last E2E probe', ['target'])
|
||||
agent_e2e_failures_total = Counter('agent_e2e_failures_total', 'Total E2E probe failures', ['target', 'reason'])
|
||||
agent_e2e_runs_total = Counter('agent_e2e_runs_total', 'Total E2E probe runs', ['target'])
|
||||
|
||||
# Histogram for latency distribution
|
||||
agent_e2e_latency_histogram = Histogram(
|
||||
'agent_e2e_latency_histogram_seconds',
|
||||
'E2E probe latency distribution',
|
||||
['target'],
|
||||
buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60]
|
||||
)
|
||||
|
||||
|
||||
async def probe_gateway_health() -> tuple[bool, float, str]:
|
||||
"""Probe gateway /health endpoint"""
|
||||
start = time.time()
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
resp = await client.get(f"{GATEWAY_URL}/health")
|
||||
latency = time.time() - start
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("status") == "healthy":
|
||||
return True, latency, ""
|
||||
else:
|
||||
return False, latency, f"unhealthy: {data.get('status')}"
|
||||
else:
|
||||
return False, latency, f"http_{resp.status_code}"
|
||||
except httpx.TimeoutException:
|
||||
return False, time.time() - start, "timeout"
|
||||
except Exception as e:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def probe_agent_ping() -> tuple[bool, float, str]:
|
||||
"""Probe gateway /debug/agent_ping endpoint (E2E through router)"""
|
||||
start = time.time()
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
resp = await client.post(
|
||||
f"{GATEWAY_URL}/debug/agent_ping",
|
||||
json={"probe": True, "timestamp": datetime.utcnow().isoformat()}
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("success"):
|
||||
return True, latency, ""
|
||||
else:
|
||||
return False, latency, data.get("error", "unknown")
|
||||
else:
|
||||
return False, latency, f"http_{resp.status_code}"
|
||||
except httpx.TimeoutException:
|
||||
return False, time.time() - start, "timeout"
|
||||
except Exception as e:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def probe_webhook_echo() -> tuple[bool, float, str]:
|
||||
"""
|
||||
Probe via webhook endpoint with minimal test message.
|
||||
This tests the full path: gateway -> router -> completion
|
||||
"""
|
||||
start = time.time()
|
||||
try:
|
||||
# Minimal Telegram-like update that gateway can process
|
||||
test_update = {
|
||||
"update_id": int(time.time()),
|
||||
"message": {
|
||||
"message_id": 1,
|
||||
"from": {"id": 0, "is_bot": True, "first_name": "E2EProber"},
|
||||
"chat": {"id": 0, "type": "private"},
|
||||
"date": int(time.time()),
|
||||
"text": "/health" # Simple health check command
|
||||
}
|
||||
}
|
||||
|
||||
async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
|
||||
# Use helion webhook as it's the most tested
|
||||
resp = await client.post(
|
||||
f"{GATEWAY_URL}/helion/telegram/webhook",
|
||||
json=test_update
|
||||
)
|
||||
latency = time.time() - start
|
||||
|
||||
if resp.status_code == 200:
|
||||
return True, latency, ""
|
||||
else:
|
||||
return False, latency, f"http_{resp.status_code}"
|
||||
except httpx.TimeoutException:
|
||||
return False, time.time() - start, "timeout"
|
||||
except Exception as e:
|
||||
return False, time.time() - start, f"error: {str(e)[:50]}"
|
||||
|
||||
|
||||
async def run_probes():
|
||||
"""Run all probes and update metrics"""
|
||||
# Probe 1: Gateway health
|
||||
success, latency, reason = await probe_gateway_health()
|
||||
agent_e2e_runs_total.labels(target="gateway_health").inc()
|
||||
agent_e2e_success.labels(target="gateway_health").set(1 if success else 0)
|
||||
agent_e2e_latency.labels(target="gateway_health").set(latency)
|
||||
agent_e2e_latency_histogram.labels(target="gateway_health").observe(latency)
|
||||
if not success:
|
||||
agent_e2e_failures_total.labels(target="gateway_health", reason=reason).inc()
|
||||
logger.info(f"gateway_health: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
# Probe 2: Agent ping (if endpoint exists)
|
||||
success, latency, reason = await probe_agent_ping()
|
||||
agent_e2e_runs_total.labels(target="agent_ping").inc()
|
||||
agent_e2e_success.labels(target="agent_ping").set(1 if success else 0)
|
||||
agent_e2e_latency.labels(target="agent_ping").set(latency)
|
||||
agent_e2e_latency_histogram.labels(target="agent_ping").observe(latency)
|
||||
if not success:
|
||||
agent_e2e_failures_total.labels(target="agent_ping", reason=reason).inc()
|
||||
logger.info(f"agent_ping: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
# Probe 3: Webhook E2E (full path test)
|
||||
success, latency, reason = await probe_webhook_echo()
|
||||
agent_e2e_runs_total.labels(target="webhook_e2e").inc()
|
||||
agent_e2e_success.labels(target="webhook_e2e").set(1 if success else 0)
|
||||
agent_e2e_latency.labels(target="webhook_e2e").set(latency)
|
||||
agent_e2e_latency_histogram.labels(target="webhook_e2e").observe(latency)
|
||||
if not success:
|
||||
agent_e2e_failures_total.labels(target="webhook_e2e", reason=reason).inc()
|
||||
logger.info(f"webhook_e2e: success={success}, latency={latency:.3f}s, reason={reason}")
|
||||
|
||||
|
||||
async def main():
|
||||
logger.info(f"Starting E2E Agent Prober")
|
||||
logger.info(f" GATEWAY_URL: {GATEWAY_URL}")
|
||||
logger.info(f" PROBE_INTERVAL: {PROBE_INTERVAL}s")
|
||||
logger.info(f" PROBE_TIMEOUT: {PROBE_TIMEOUT}s")
|
||||
logger.info(f" METRICS_PORT: {METRICS_PORT}")
|
||||
|
||||
# Start Prometheus metrics server
|
||||
start_http_server(METRICS_PORT)
|
||||
logger.info(f"Prometheus metrics available at :{METRICS_PORT}/metrics")
|
||||
|
||||
# Initial probe
|
||||
await run_probes()
|
||||
|
||||
# Continuous probing
|
||||
while True:
|
||||
await asyncio.sleep(PROBE_INTERVAL)
|
||||
try:
|
||||
await run_probes()
|
||||
except Exception as e:
|
||||
logger.error(f"Probe error: {e}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
2
services/agent-e2e-prober/requirements.txt
Normal file
2
services/agent-e2e-prober/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
httpx>=0.25.0
|
||||
prometheus-client>=0.19.0
|
||||
Reference in New Issue
Block a user