164 lines
4.4 KiB
Python
164 lines
4.4 KiB
Python
"""
Gateway Prometheus Metrics

Standardized metrics for observability.
"""
|
|
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST
|
|
|
|
# === HTTP Metrics ===
# Counter of HTTP requests, labelled by method, endpoint and status.
HTTP_REQUESTS_TOTAL = Counter(
    name="gateway_http_requests_total",
    documentation="Total HTTP requests",
    labelnames=["method", "endpoint", "status"],
)

# Histogram of HTTP request durations. The metric name declares seconds;
# bucket upper bounds run from 0.1 up to 120.
HTTP_REQUEST_DURATION = Histogram(
    name="gateway_http_request_duration_seconds",
    documentation="HTTP request duration",
    labelnames=["method", "endpoint"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0],
)
|
|
|
|
# === Agent Metrics ===
# Counter of agent runs per agent.
# Label "status" takes: started, completed, failed.
AGENT_RUNS_TOTAL = Counter(
    name="gateway_agent_runs_total",
    documentation="Total agent runs",
    labelnames=["agent", "status"],
)

# Histogram of agent run durations per agent. The metric name declares
# seconds; bucket upper bounds run from 1 up to 300.
AGENT_RUN_DURATION = Histogram(
    name="gateway_agent_run_duration_seconds",
    documentation="Agent run duration",
    labelnames=["agent"],
    buckets=[1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
)
|
|
|
|
# === LLM/Router Metrics ===
# Counter of router calls.
# Label "status" takes: success, error, timeout.
ROUTER_CALLS_TOTAL = Counter(
    name="gateway_router_calls_total",
    documentation="Total calls to router",
    labelnames=["status"],
)

# Unlabelled histogram of router call latency. The metric name declares
# seconds; bucket upper bounds run from 0.5 up to 120.
ROUTER_LATENCY = Histogram(
    name="gateway_router_latency_seconds",
    documentation="Router call latency",
    buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0],
)
|
|
|
|
# === Experience Bus Phase-4 Metrics ===
# Counter of experience event publish/store outcomes.
# Label "status" takes: ok, err.
GATEWAY_EXPERIENCE_PUBLISHED_TOTAL = Counter(
    name="gateway_experience_published_total",
    documentation="Gateway experience event publish/store status",
    labelnames=["status"],
)

# Counter of policy (SOWA) decisions, labelled by decision and reason.
GATEWAY_POLICY_DECISIONS_TOTAL = Counter(
    name="gateway_policy_decisions_total",
    documentation="Gateway policy (SOWA) decisions",
    labelnames=["sowa_decision", "reason"],
)

# Counter of user signals detected in the webhook stream.
# Label "user_signal" takes: none, positive, negative, retry, timeout.
GATEWAY_USER_SIGNAL_TOTAL = Counter(
    name="gateway_user_signal_total",
    documentation="Detected user signals from webhook stream",
    labelnames=["user_signal"],
)

# Unlabelled histogram of end-to-end webhook latency. Unlike the
# "*_seconds" histograms in this module, this one is described as
# milliseconds; bucket upper bounds run from 5 up to 10000.
GATEWAY_WEBHOOK_LATENCY_MS = Histogram(
    name="gateway_webhook_latency_ms",
    documentation="Gateway webhook end-to-end latency in milliseconds",
    buckets=[5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000],
)
|
|
|
|
# Counter of experience events emitted from the webhook handler.
# Label "status" takes: ok | err.
# Label "path" takes: normal | early_return | exception.
GATEWAY_EXPERIENCE_EMITTED_TOTAL = Counter(
    name="gateway_experience_emitted_total",
    documentation="Gateway experience events emitted from webhook handler",
    labelnames=["status", "path"],
)

# Counter of early-return branches, labelled by reason.
GATEWAY_EARLY_RETURN_TOTAL = Counter(
    name="gateway_early_return_total",
    documentation="Gateway early return branches observed by reason",
    labelnames=["reason"],
)

# Unlabelled histogram of event finalize latency, described as
# milliseconds; bucket upper bounds run from 1 up to 5000.
GATEWAY_EVENT_FINALIZE_LATENCY_MS = Histogram(
    name="gateway_event_finalize_latency_ms",
    documentation="Gateway event finalize latency in milliseconds",
    buckets=[1, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000],
)
|
|
|
|
# Counter of anti-silent actions, labelled by action, reason and chat type.
# Values noted by the original author: ACK_EMITTED, ACK_SUPPRESSED_COOLDOWN
# (presumably values of the "action" label -- confirm against callers).
GATEWAY_ANTI_SILENT_TOTAL = Counter(
    name="gateway_anti_silent_total",
    documentation="Gateway anti-silent actions by reason/chat type",
    labelnames=["action", "reason", "chat_type"],
)

# Counter of ACK messages sent, labelled by template id and chat type.
GATEWAY_ACK_SENT_TOTAL = Counter(
    name="gateway_ack_sent_total",
    documentation="Gateway ACK messages sent by template/chat type",
    labelnames=["template_id", "chat_type"],
)

# Counter of anti-silent tuning applications, labelled by reason,
# chat type and template id.
GATEWAY_ANTI_SILENT_TUNING_APPLIED_TOTAL = Counter(
    name="gateway_anti_silent_tuning_applied_total",
    documentation="Gateway anti-silent tuning applications by reason/chat type/template",
    labelnames=["reason", "chat_type", "template_id"],
)
|
|
|
|
# Counter of public-layer access decisions, labelled by decision,
# agent id and chat type.
# Values noted by the original author: allow, deny, rate_limited
# (presumably values of the "decision" label -- confirm against callers).
GATEWAY_ACCESS_DECISIONS_TOTAL = Counter(
    name="gateway_access_decisions_total",
    documentation="Gateway access decisions for public layer",
    labelnames=["decision", "agent_id", "chat_type"],
)

# Counter of rate-limit hits, labelled by scope, agent id and chat type.
# Values noted by the original author: user_global, user_agent, group_agent
# (presumably values of the "scope" label -- confirm against callers).
GATEWAY_RATE_LIMITED_TOTAL = Counter(
    name="gateway_rate_limited_total",
    documentation="Gateway rate limit hits by scope",
    labelnames=["scope", "agent_id", "chat_type"],
)
|
|
|
|
# === Memory Service Metrics ===
# Counter of memory service calls, labelled by operation and status.
# Label "operation" takes: save, search, delete.
MEMORY_CALLS_TOTAL = Counter(
    name="gateway_memory_calls_total",
    documentation="Total calls to memory service",
    labelnames=["operation", "status"],
)
|
|
|
|
# === Telegram Metrics ===
# Counter of processed Telegram messages, labelled by agent and direction.
# Label "direction" takes: incoming, outgoing.
TELEGRAM_MESSAGES_TOTAL = Counter(
    name="gateway_telegram_messages_total",
    documentation="Total Telegram messages processed",
    labelnames=["agent", "direction"],
)
|
|
|
|
# === Errors ===
# Counter of errors, labelled by error type and source.
ERRORS_TOTAL = Counter(
    name="gateway_errors_total",
    documentation="Total errors",
    labelnames=["type", "source"],
)
|
|
|
|
# === Active connections ===
# Unlabelled gauge for the number of requests currently in flight.
ACTIVE_REQUESTS = Gauge(
    name="gateway_active_requests",
    documentation="Currently active requests",
)
|
|
|
|
# === Service info ===
# Info metric carrying static key/value details about this service.
SERVICE_INFO = Info(
    name="gateway_service",
    documentation="Gateway service information",
)

# Populated once at import time with fixed values.
# NOTE(review): version and node are hard-coded here -- confirm whether
# they should come from deployment configuration instead.
SERVICE_INFO.info(
    {
        "version": "2.0.0",
        "node": "node1",
    }
)
|
|
|
|
|
|
def get_metrics():
    """Serialize the collected metrics in Prometheus format.

    Returns:
        The result of calling ``generate_latest()`` with no arguments.
    """
    exposition = generate_latest()
    return exposition
|
|
|
|
|
|
def get_content_type():
    """Provide the content type to send alongside the metrics payload.

    Returns:
        The ``CONTENT_TYPE_LATEST`` constant imported from
        ``prometheus_client``.
    """
    content_type = CONTENT_TYPE_LATEST
    return content_type
|