""" Gateway Prometheus Metrics Стандартизовані метрики для observability """ from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST # === HTTP Metrics === HTTP_REQUESTS_TOTAL = Counter( "gateway_http_requests_total", "Total HTTP requests", ["method", "endpoint", "status"] ) HTTP_REQUEST_DURATION = Histogram( "gateway_http_request_duration_seconds", "HTTP request duration", ["method", "endpoint"], buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0] ) # === Agent Metrics === AGENT_RUNS_TOTAL = Counter( "gateway_agent_runs_total", "Total agent runs", ["agent", "status"] # status: started, completed, failed ) AGENT_RUN_DURATION = Histogram( "gateway_agent_run_duration_seconds", "Agent run duration", ["agent"], buckets=[1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0] ) # === LLM/Router Metrics === ROUTER_CALLS_TOTAL = Counter( "gateway_router_calls_total", "Total calls to router", ["status"] # success, error, timeout ) ROUTER_LATENCY = Histogram( "gateway_router_latency_seconds", "Router call latency", buckets=[0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0] ) # === Experience Bus Phase-4 Metrics === GATEWAY_EXPERIENCE_PUBLISHED_TOTAL = Counter( "gateway_experience_published_total", "Gateway experience event publish/store status", ["status"] # ok, err ) GATEWAY_POLICY_DECISIONS_TOTAL = Counter( "gateway_policy_decisions_total", "Gateway policy (SOWA) decisions", ["sowa_decision", "reason"] ) GATEWAY_USER_SIGNAL_TOTAL = Counter( "gateway_user_signal_total", "Detected user signals from webhook stream", ["user_signal"] # none, positive, negative, retry, timeout ) GATEWAY_WEBHOOK_LATENCY_MS = Histogram( "gateway_webhook_latency_ms", "Gateway webhook end-to-end latency in milliseconds", buckets=[5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000] ) GATEWAY_EXPERIENCE_EMITTED_TOTAL = Counter( "gateway_experience_emitted_total", "Gateway experience events emitted from webhook handler", ["status", "path"] # status: ok|err, path: normal|early_return|exception ) GATEWAY_EARLY_RETURN_TOTAL = Counter( "gateway_early_return_total", "Gateway early return branches observed by reason", ["reason"] ) GATEWAY_EVENT_FINALIZE_LATENCY_MS = Histogram( "gateway_event_finalize_latency_ms", "Gateway event finalize latency in milliseconds", buckets=[1, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000] ) GATEWAY_ANTI_SILENT_TOTAL = Counter( "gateway_anti_silent_total", "Gateway anti-silent actions by reason/chat type", ["action", "reason", "chat_type"] # ACK_EMITTED, ACK_SUPPRESSED_COOLDOWN ) GATEWAY_ACK_SENT_TOTAL = Counter( "gateway_ack_sent_total", "Gateway ACK messages sent by template/chat type", ["template_id", "chat_type"] ) GATEWAY_ANTI_SILENT_TUNING_APPLIED_TOTAL = Counter( "gateway_anti_silent_tuning_applied_total", "Gateway anti-silent tuning applications by reason/chat type/template", ["reason", "chat_type", "template_id"] ) GATEWAY_ACCESS_DECISIONS_TOTAL = Counter( "gateway_access_decisions_total", "Gateway access decisions for public layer", ["decision", "agent_id", "chat_type"] # allow, deny, rate_limited ) GATEWAY_RATE_LIMITED_TOTAL = Counter( "gateway_rate_limited_total", "Gateway rate limit hits by scope", ["scope", "agent_id", "chat_type"] # user_global, user_agent, group_agent ) # === Memory Service Metrics === MEMORY_CALLS_TOTAL = Counter( "gateway_memory_calls_total", "Total calls to memory service", ["operation", "status"] # operation: save, search, delete ) # === Telegram Metrics === TELEGRAM_MESSAGES_TOTAL = Counter( "gateway_telegram_messages_total", "Total Telegram messages processed", ["agent", "direction"] # direction: incoming, outgoing ) # === Errors === ERRORS_TOTAL = Counter( "gateway_errors_total", "Total errors", ["type", "source"] ) # === Active connections === ACTIVE_REQUESTS = Gauge( "gateway_active_requests", "Currently active requests" ) # === Service info === SERVICE_INFO = Info( "gateway_service", "Gateway service information" ) SERVICE_INFO.info({"version": "2.0.0", "node": "node1"}) def get_metrics(): """Return metrics in Prometheus format""" return generate_latest() def get_content_type(): """Return Prometheus content type""" return CONTENT_TYPE_LATEST