Files
microdao-daarion/services/agent-e2e-prober/main.py

180 lines
7.0 KiB
Python

#!/usr/bin/env python3
"""
E2E Agent Prober - checks that the agents are responding.
Exports Prometheus metrics on :9108/metrics (default port; override with METRICS_PORT).
"""
import asyncio
import time
import os
import logging
from datetime import datetime
from prometheus_client import start_http_server, Counter, Gauge, Histogram
import httpx
# Root logger: INFO and above, one timestamped, level-tagged line per record.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Configuration. Each value is read from the environment once, at import time,
# and falls back to the default shown when the variable is unset.
GATEWAY_URL = os.environ.get("GATEWAY_URL", "http://gateway:9300")  # base URL every probe targets
PROBE_INTERVAL = int(os.environ.get("PROBE_INTERVAL", "60"))  # seconds slept between probe rounds
PROBE_TIMEOUT = int(os.environ.get("PROBE_TIMEOUT", "30"))  # seconds; HTTP client timeout per probe
METRICS_PORT = int(os.environ.get("METRICS_PORT", "9108"))  # port of the Prometheus exporter
# Prometheus metrics. Every series carries a `target` label naming the probe
# that produced it.

# Outcome of the most recent probe per target (set to 1 on success, 0 on failure).
agent_e2e_success = Gauge(
    'agent_e2e_success',
    'Whether last E2E probe succeeded',
    labelnames=('target',),
)

# Latency, in seconds, of the most recent probe per target.
agent_e2e_latency = Gauge(
    'agent_e2e_latency_seconds',
    'Latency of last E2E probe',
    labelnames=('target',),
)

# Failure count, additionally broken down by the probe's failure reason.
agent_e2e_failures_total = Counter(
    'agent_e2e_failures_total',
    'Total E2E probe failures',
    labelnames=('target', 'reason'),
)

# Number of probe executions per target, successful or not.
agent_e2e_runs_total = Counter(
    'agent_e2e_runs_total',
    'Total E2E probe runs',
    labelnames=('target',),
)

# Latency distribution per target; bucket upper bounds are in seconds.
agent_e2e_latency_histogram = Histogram(
    'agent_e2e_latency_histogram_seconds',
    'E2E probe latency distribution',
    labelnames=('target',),
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
)
async def probe_gateway_health() -> tuple[bool, float, str]:
    """Probe the gateway's ``/health`` endpoint.

    Sends ``GET {GATEWAY_URL}/health`` with a ``PROBE_TIMEOUT``-second client
    timeout. The probe succeeds only when the response is HTTP 200 and its
    JSON body has ``"status": "healthy"``.

    Returns:
        A ``(success, latency, reason)`` tuple. ``latency`` is the elapsed
        time in seconds until the response arrived (or until the failure).
        ``reason`` is ``""`` on success; on failure it is one of
        ``"unhealthy: <status>"``, ``"http_<status code>"``, ``"timeout"`` or
        ``"error: <first 50 characters of the exception text>"``.

    Any ``Exception`` raised while probing is converted into a failed result
    rather than propagated.
    """
    # perf_counter() is monotonic: unlike time.time(), the measured interval
    # cannot be skewed (or go negative) if the wall clock is adjusted
    # while the request is in flight.
    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.get(f"{GATEWAY_URL}/health")
            latency = time.perf_counter() - start

            if resp.status_code != 200:
                return False, latency, f"http_{resp.status_code}"

            data = resp.json()
            if data.get("status") == "healthy":
                return True, latency, ""
            return False, latency, f"unhealthy: {data.get('status')}"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        # Deliberately broad: the prober must report a failure, not crash.
        # This also covers a body that is not JSON or not a JSON object.
        return False, time.perf_counter() - start, f"error: {str(e)[:50]}"
async def probe_agent_ping() -> tuple[bool, float, str]:
    """Probe the gateway's ``/debug/agent_ping`` endpoint (E2E through router).

    Sends ``POST {GATEWAY_URL}/debug/agent_ping`` with a JSON body of
    ``{"probe": true, "timestamp": <naive UTC ISO-8601 string>}`` and a
    ``PROBE_TIMEOUT``-second client timeout. The probe succeeds only when the
    response is HTTP 200 and its JSON body has a truthy ``"success"`` field.

    Returns:
        A ``(success, latency, reason)`` tuple. ``latency`` is the elapsed
        time in seconds until the response arrived (or until the failure).
        ``reason`` is ``""`` on success; on failure it is the body's
        ``"error"`` value as a string (``"unknown"`` when absent or empty),
        ``"http_<status code>"``, ``"timeout"`` or
        ``"error: <first 50 characters of the exception text>"``.

    Any ``Exception`` raised while probing is converted into a failed result
    rather than propagated.
    """
    # Local import: only this probe needs it, and it keeps the block
    # self-contained with respect to the file's import section.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Taking an aware "now"
    # and dropping tzinfo yields the exact same naive-UTC ISO string, so the
    # payload sent to the gateway is unchanged.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # perf_counter() is monotonic: unlike time.time(), the measured interval
    # cannot be skewed by wall-clock adjustments during the request.
    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            resp = await client.post(
                f"{GATEWAY_URL}/debug/agent_ping",
                json={"probe": True, "timestamp": timestamp},
            )
            latency = time.perf_counter() - start

            if resp.status_code != 200:
                return False, latency, f"http_{resp.status_code}"

            data = resp.json()
            if data.get("success"):
                return True, latency, ""
            # The body is external input: coerce to str so the declared
            # return type holds even if "error" is null or a non-string.
            return False, latency, str(data.get("error") or "unknown")
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        # Deliberately broad: the prober must report a failure, not crash.
        return False, time.perf_counter() - start, f"error: {str(e)[:50]}"
async def probe_webhook_echo() -> tuple[bool, float, str]:
    """Probe via the webhook endpoint with a minimal test message.

    This tests the full path: gateway -> router -> completion. A minimal
    Telegram-like update carrying the text ``/health`` is POSTed to
    ``{GATEWAY_URL}/helion/telegram/webhook`` with a ``PROBE_TIMEOUT``-second
    client timeout. The probe succeeds when the response is HTTP 200; the
    response body is not inspected.

    Returns:
        A ``(success, latency, reason)`` tuple. ``latency`` is the elapsed
        time in seconds until the response arrived (or until the failure);
        it includes building the test payload. ``reason`` is ``""`` on
        success, otherwise ``"http_<status code>"``, ``"timeout"`` or
        ``"error: <first 50 characters of the exception text>"``.

    Any ``Exception`` raised while probing is converted into a failed result
    rather than propagated.
    """
    # perf_counter() is monotonic: unlike time.time(), the measured interval
    # cannot be skewed by wall-clock adjustments during the request.
    start = time.perf_counter()
    try:
        # The payload fields are epoch timestamps, so these intentionally
        # stay on the wall clock (time.time()).
        now_epoch = int(time.time())

        # Minimal Telegram-like update that gateway can process
        test_update = {
            "update_id": now_epoch,
            "message": {
                "message_id": 1,
                "from": {"id": 0, "is_bot": True, "first_name": "E2EProber"},
                "chat": {"id": 0, "type": "private"},
                "date": now_epoch,
                "text": "/health",  # Simple health check command
            },
        }

        async with httpx.AsyncClient(timeout=PROBE_TIMEOUT) as client:
            # Use helion webhook as it's the most tested
            resp = await client.post(
                f"{GATEWAY_URL}/helion/telegram/webhook",
                json=test_update,
            )
            latency = time.perf_counter() - start

            if resp.status_code == 200:
                return True, latency, ""
            return False, latency, f"http_{resp.status_code}"
    except httpx.TimeoutException:
        return False, time.perf_counter() - start, "timeout"
    except Exception as e:
        # Deliberately broad: the prober must report a failure, not crash.
        return False, time.perf_counter() - start, f"error: {str(e)[:50]}"
def _record_probe_result(target: str, success: bool, latency: float, reason: str) -> None:
    """Publish one probe outcome to the Prometheus metrics and the log."""
    agent_e2e_runs_total.labels(target=target).inc()
    agent_e2e_success.labels(target=target).set(1 if success else 0)
    agent_e2e_latency.labels(target=target).set(latency)
    agent_e2e_latency_histogram.labels(target=target).observe(latency)
    if not success:
        # Failures are counted per (target, reason) pair.
        agent_e2e_failures_total.labels(target=target, reason=reason).inc()
    logger.info(
        "%s: success=%s, latency=%.3fs, reason=%s",
        target, success, latency, reason,
    )


async def run_probes():
    """Run all probes and update metrics.

    The probes run one after another, in a fixed order, and each result is
    recorded before the next probe starts:

    1. ``gateway_health`` - gateway ``/health`` endpoint
    2. ``agent_ping``     - ``/debug/agent_ping`` (if the endpoint exists)
    3. ``webhook_e2e``    - webhook full-path test
    """
    probes = (
        ("gateway_health", probe_gateway_health),
        ("agent_ping", probe_agent_ping),
        ("webhook_e2e", probe_webhook_echo),
    )
    for target, probe in probes:
        success, latency, reason = await probe()
        _record_probe_result(target, success, latency, reason)
async def main():
    """Start the metrics exporter and run probe rounds forever.

    Logs the effective configuration, starts the Prometheus HTTP server on
    ``METRICS_PORT``, then loops: run one round of probes, sleep
    ``PROBE_INTERVAL`` seconds, repeat. The first round runs immediately.

    An ``Exception`` escaping a probe round is logged with its traceback and
    does not stop the loop. This function does not return.
    """
    logger.info("Starting E2E Agent Prober")
    logger.info(" GATEWAY_URL: %s", GATEWAY_URL)
    logger.info(" PROBE_INTERVAL: %ss", PROBE_INTERVAL)
    logger.info(" PROBE_TIMEOUT: %ss", PROBE_TIMEOUT)
    logger.info(" METRICS_PORT: %s", METRICS_PORT)

    # Start Prometheus metrics server
    start_http_server(METRICS_PORT)
    logger.info("Prometheus metrics available at :%s/metrics", METRICS_PORT)

    # Probe first, then sleep: the initial round is protected by the same
    # error handling as every later round, so a failure in the very first
    # round cannot take the prober down before it has published anything.
    while True:
        try:
            await run_probes()
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception("Probe error: %s", e)
        await asyncio.sleep(PROBE_INTERVAL)


# Script entry point: run the prober until the process is stopped.
if __name__ == "__main__":
    asyncio.run(main())