Files

164 lines
4.4 KiB
Python

"""
Gateway Prometheus Metrics
Standardized metrics for observability
"""
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST
# === HTTP Metrics ===

# Counter of HTTP requests, partitioned by method, endpoint and status.
# Created without an explicit registry, so it lands in prometheus_client's
# default registry when this module is imported.
HTTP_REQUESTS_TOTAL = Counter(
    name="gateway_http_requests_total",
    documentation="Total HTTP requests",
    labelnames=("method", "endpoint", "status"),
)

# Histogram of HTTP request durations; the metric name states the unit is
# seconds, so the bucket bounds below span 0.1 s up to 120 s.
# NOTE(review): "endpoint" is a free-form label here -- confirm callers pass a
# route template rather than a raw path, otherwise series count can grow.
HTTP_REQUEST_DURATION = Histogram(
    name="gateway_http_request_duration_seconds",
    documentation="HTTP request duration",
    labelnames=("method", "endpoint"),
    buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
)
# === Agent Metrics ===

# Counter of agent runs, partitioned by agent and run status.
# Status values noted by the author: started, completed, failed.
AGENT_RUNS_TOTAL = Counter(
    name="gateway_agent_runs_total",
    documentation="Total agent runs",
    labelnames=("agent", "status"),
)

# Histogram of agent run durations per agent; the metric name states seconds,
# with bucket bounds from 1 s up to 300 s.
AGENT_RUN_DURATION = Histogram(
    name="gateway_agent_run_duration_seconds",
    documentation="Agent run duration",
    labelnames=("agent",),
    buckets=(1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0),
)
# === LLM/Router Metrics ===

# Counter of calls to the router, partitioned by outcome.
# Status values noted by the author: success, error, timeout.
ROUTER_CALLS_TOTAL = Counter(
    name="gateway_router_calls_total",
    documentation="Total calls to router",
    labelnames=("status",),
)

# Unlabelled histogram of router call latency; the metric name states seconds,
# with bucket bounds from 0.5 s up to 120 s.
ROUTER_LATENCY = Histogram(
    name="gateway_router_latency_seconds",
    documentation="Router call latency",
    buckets=(0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
)
# === Experience Bus Phase-4 Metrics ===

# Counter of experience event publish/store outcomes.
# Status values noted by the author: ok, err.
GATEWAY_EXPERIENCE_PUBLISHED_TOTAL = Counter(
    name="gateway_experience_published_total",
    documentation="Gateway experience event publish/store status",
    labelnames=("status",),
)

# Counter of policy decisions, partitioned by the decision and its reason.
GATEWAY_POLICY_DECISIONS_TOTAL = Counter(
    name="gateway_policy_decisions_total",
    documentation="Gateway policy (SOWA) decisions",
    labelnames=("sowa_decision", "reason"),
)

# Counter of user signals detected in the webhook stream.
# Signal values noted by the author: none, positive, negative, retry, timeout.
GATEWAY_USER_SIGNAL_TOTAL = Counter(
    name="gateway_user_signal_total",
    documentation="Detected user signals from webhook stream",
    labelnames=("user_signal",),
)

# Unlabelled histogram of end-to-end webhook latency. Unlike the *_seconds
# histograms in this module, this one is declared in milliseconds (see the
# metric name and help text), so bucket bounds run from 5 ms to 10 000 ms.
GATEWAY_WEBHOOK_LATENCY_MS = Histogram(
    name="gateway_webhook_latency_ms",
    documentation="Gateway webhook end-to-end latency in milliseconds",
    buckets=(5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000),
)

# Counter of experience events emitted from the webhook handler.
# Author's notes: status is ok|err; path is normal|early_return|exception.
GATEWAY_EXPERIENCE_EMITTED_TOTAL = Counter(
    name="gateway_experience_emitted_total",
    documentation="Gateway experience events emitted from webhook handler",
    labelnames=("status", "path"),
)

# Counter of early-return branches, partitioned by reason.
GATEWAY_EARLY_RETURN_TOTAL = Counter(
    name="gateway_early_return_total",
    documentation="Gateway early return branches observed by reason",
    labelnames=("reason",),
)

# Unlabelled histogram of event finalize latency, declared in milliseconds
# (see the metric name and help text); bucket bounds run from 1 ms to 5000 ms.
GATEWAY_EVENT_FINALIZE_LATENCY_MS = Histogram(
    name="gateway_event_finalize_latency_ms",
    documentation="Gateway event finalize latency in milliseconds",
    buckets=(1, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000),
)
# Counter of anti-silent actions, partitioned by action, reason and chat type.
# Action values noted by the author: ACK_EMITTED, ACK_SUPPRESSED_COOLDOWN.
GATEWAY_ANTI_SILENT_TOTAL = Counter(
    name="gateway_anti_silent_total",
    documentation="Gateway anti-silent actions by reason/chat type",
    labelnames=("action", "reason", "chat_type"),
)

# Counter of ACK messages sent, partitioned by template and chat type.
GATEWAY_ACK_SENT_TOTAL = Counter(
    name="gateway_ack_sent_total",
    documentation="Gateway ACK messages sent by template/chat type",
    labelnames=("template_id", "chat_type"),
)

# Counter of anti-silent tuning applications, partitioned by reason,
# chat type and template.
GATEWAY_ANTI_SILENT_TUNING_APPLIED_TOTAL = Counter(
    name="gateway_anti_silent_tuning_applied_total",
    documentation="Gateway anti-silent tuning applications by reason/chat type/template",
    labelnames=("reason", "chat_type", "template_id"),
)

# Counter of access decisions for the public layer.
# Decision values noted by the author: allow, deny, rate_limited.
# NOTE(review): "agent_id" is a label -- confirm the set of agent ids is small
# and bounded so the number of series stays manageable.
GATEWAY_ACCESS_DECISIONS_TOTAL = Counter(
    name="gateway_access_decisions_total",
    documentation="Gateway access decisions for public layer",
    labelnames=("decision", "agent_id", "chat_type"),
)

# Counter of rate-limit hits, partitioned by scope, agent and chat type.
# Scope values noted by the author: user_global, user_agent, group_agent.
GATEWAY_RATE_LIMITED_TOTAL = Counter(
    name="gateway_rate_limited_total",
    documentation="Gateway rate limit hits by scope",
    labelnames=("scope", "agent_id", "chat_type"),
)
# === Memory Service Metrics ===

# Counter of calls to the memory service, partitioned by operation and status.
# Operation values noted by the author: save, search, delete.
MEMORY_CALLS_TOTAL = Counter(
    name="gateway_memory_calls_total",
    documentation="Total calls to memory service",
    labelnames=("operation", "status"),
)
# === Telegram Metrics ===

# Counter of processed Telegram messages, partitioned by agent and direction.
# Direction values noted by the author: incoming, outgoing.
TELEGRAM_MESSAGES_TOTAL = Counter(
    name="gateway_telegram_messages_total",
    documentation="Total Telegram messages processed",
    labelnames=("agent", "direction"),
)
# === Errors ===

# Counter of errors, partitioned by error type and the source reporting it.
# The label is literally named "type"; it is only a label string here and does
# not shadow the Python builtin.
ERRORS_TOTAL = Counter(
    name="gateway_errors_total",
    documentation="Total errors",
    labelnames=("type", "source"),
)
# === Active connections ===

# Unlabelled gauge for the number of requests currently in flight.
# Nothing in the visible module changes its value; callers are expected to
# raise and lower it around request handling.
ACTIVE_REQUESTS = Gauge(
    name="gateway_active_requests",
    documentation="Currently active requests",
)
# === Service info ===

# Info metric carrying static key/value facts about this gateway instance.
SERVICE_INFO = Info(
    name="gateway_service",
    documentation="Gateway service information",
)

# Populated once at import time with a hard-coded version and node name.
# NOTE(review): both values are literals -- confirm they are kept in sync with
# the actual release/deployment.
SERVICE_INFO.info(dict(version="2.0.0", node="node1"))
def get_metrics() -> bytes:
    """Render the current metrics in the Prometheus exposition format.

    Calls ``generate_latest()`` with no arguments, so the output covers
    prometheus_client's default registry.

    Returns:
        The encoded exposition payload, suitable as an HTTP response body.
    """
    exposition = generate_latest()
    return exposition
def get_content_type() -> str:
    """Return the Content-Type header value for the Prometheus payload.

    Returns:
        prometheus_client's ``CONTENT_TYPE_LATEST`` constant, unchanged.
    """
    content_type = CONTENT_TYPE_LATEST
    return content_type