feat(P1): Add /metrics endpoint to gateway
This commit is contained in:
90
gateway-bot/metrics.py
Normal file
90
gateway-bot/metrics.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Gateway Prometheus Metrics
|
||||
Standardized metrics for observability
|
||||
"""
|
||||
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST
|
||||
|
||||
# === HTTP Metrics ===
|
||||
# Counter of HTTP requests, partitioned by the "method", "endpoint" and
# "status" labels.
HTTP_REQUESTS_TOTAL = Counter(
    name="gateway_http_requests_total",
    documentation="Total HTTP requests",
    labelnames=("method", "endpoint", "status"),
)
|
||||
|
||||
# Histogram of HTTP request durations, labelled by "method" and "endpoint".
# The explicit bucket bounds range from 0.1 to 120 (seconds, per the metric name).
HTTP_REQUEST_DURATION = Histogram(
    name="gateway_http_request_duration_seconds",
    documentation="HTTP request duration",
    labelnames=("method", "endpoint"),
    buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
)
|
||||
|
||||
# === Agent Metrics ===
|
||||
# Counter of agent runs, labelled by "agent" and "status".
# Intended "status" values: started, completed, failed.
AGENT_RUNS_TOTAL = Counter(
    name="gateway_agent_runs_total",
    documentation="Total agent runs",
    labelnames=("agent", "status"),
)
|
||||
|
||||
# Histogram of agent run durations, labelled by "agent".
# The explicit bucket bounds range from 1 to 300 (seconds, per the metric name).
AGENT_RUN_DURATION = Histogram(
    name="gateway_agent_run_duration_seconds",
    documentation="Agent run duration",
    labelnames=("agent",),
    buckets=(1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0),
)
|
||||
|
||||
# === LLM/Router Metrics ===
|
||||
# Counter of calls made to the router, labelled by "status".
# Intended "status" values: success, error, timeout.
ROUTER_CALLS_TOTAL = Counter(
    name="gateway_router_calls_total",
    documentation="Total calls to router",
    labelnames=("status",),
)
|
||||
|
||||
# Unlabelled histogram of router call latency.
# The explicit bucket bounds range from 0.5 to 120 (seconds, per the metric name).
ROUTER_LATENCY = Histogram(
    name="gateway_router_latency_seconds",
    documentation="Router call latency",
    buckets=(0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
)
|
||||
|
||||
# === Memory Service Metrics ===
|
||||
# Counter of calls to the memory service, labelled by "operation" and "status".
# Intended "operation" values: save, search, delete.
MEMORY_CALLS_TOTAL = Counter(
    name="gateway_memory_calls_total",
    documentation="Total calls to memory service",
    labelnames=("operation", "status"),
)
|
||||
|
||||
# === Telegram Metrics ===
|
||||
# Counter of processed Telegram messages, labelled by "agent" and "direction".
# Intended "direction" values: incoming, outgoing.
TELEGRAM_MESSAGES_TOTAL = Counter(
    name="gateway_telegram_messages_total",
    documentation="Total Telegram messages processed",
    labelnames=("agent", "direction"),
)
|
||||
|
||||
# === Errors ===
|
||||
# Counter of errors, labelled by "type" and "source".
ERRORS_TOTAL = Counter(
    name="gateway_errors_total",
    documentation="Total errors",
    labelnames=("type", "source"),
)
|
||||
|
||||
# === Active connections ===
|
||||
# Unlabelled gauge for the number of requests currently in flight.
ACTIVE_REQUESTS = Gauge(
    name="gateway_active_requests",
    documentation="Currently active requests",
)
|
||||
|
||||
# === Service info ===
|
||||
# Static service metadata published as an Info metric.
SERVICE_INFO = Info(
    name="gateway_service",
    documentation="Gateway service information",
)
# Populated once at import time.
# NOTE(review): version and node are hard-coded here — confirm they match
# the deployed build/host.
SERVICE_INFO.info(
    {
        "version": "2.0.0",
        "node": "node1",
    }
)
|
||||
|
||||
|
||||
def get_metrics():
    """Serialize the current metrics for a Prometheus scrape.

    Returns:
        The result of ``generate_latest()`` called with no arguments.
    """
    exposition = generate_latest()
    return exposition
|
||||
|
||||
|
||||
def get_content_type():
    """Give the content type to send with the metrics payload.

    Returns:
        The ``CONTENT_TYPE_LATEST`` constant from ``prometheus_client``.
    """
    content_type = CONTENT_TYPE_LATEST
    return content_type
|
||||
Reference in New Issue
Block a user