# microdao-daarion/gateway-bot/metrics.py

"""
Gateway Prometheus Metrics
Стандартизовані метрики для observability
"""
from prometheus_client import Counter, Histogram, Gauge, Info, generate_latest, CONTENT_TYPE_LATEST

# === HTTP Metrics ===
# Count of HTTP requests, labelled by method, endpoint and status.
HTTP_REQUESTS_TOTAL = Counter(
    "gateway_http_requests_total",
    documentation="Total HTTP requests",
    labelnames=("method", "endpoint", "status"),
)

# Distribution of HTTP request durations, labelled by method and endpoint.
# The metric name declares the unit as seconds; bucket bounds span 0.1-120.
HTTP_REQUEST_DURATION = Histogram(
    "gateway_http_request_duration_seconds",
    documentation="HTTP request duration",
    labelnames=("method", "endpoint"),
    buckets=(0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
)

# === Agent Metrics ===
# Count of agent runs per agent.
# Expected "status" values: started, completed, failed.
AGENT_RUNS_TOTAL = Counter(
    "gateway_agent_runs_total",
    documentation="Total agent runs",
    labelnames=("agent", "status"),
)

# Distribution of agent run durations per agent.
# The metric name declares the unit as seconds; bucket bounds span 1-300.
AGENT_RUN_DURATION = Histogram(
    "gateway_agent_run_duration_seconds",
    documentation="Agent run duration",
    labelnames=("agent",),
    buckets=(1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0),
)

# === LLM/Router Metrics ===
# Count of calls made to the router.
# Expected "status" values: success, error, timeout.
ROUTER_CALLS_TOTAL = Counter(
    "gateway_router_calls_total",
    documentation="Total calls to router",
    labelnames=("status",),
)

# Distribution of router call latency (unlabelled).
# The metric name declares the unit as seconds; bucket bounds span 0.5-120.
ROUTER_LATENCY = Histogram(
    "gateway_router_latency_seconds",
    documentation="Router call latency",
    buckets=(0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0),
)

# === Experience Bus Phase-4 Metrics ===
# Outcome of publishing/storing experience events.
# Expected "status" values: ok, err.
GATEWAY_EXPERIENCE_PUBLISHED_TOTAL = Counter(
    "gateway_experience_published_total",
    documentation="Gateway experience event publish/store status",
    labelnames=("status",),
)

# Policy (SOWA) decisions, labelled by the decision and its reason.
GATEWAY_POLICY_DECISIONS_TOTAL = Counter(
    "gateway_policy_decisions_total",
    documentation="Gateway policy (SOWA) decisions",
    labelnames=("sowa_decision", "reason"),
)

# User signals detected in the webhook stream.
# Expected "user_signal" values: none, positive, negative, retry, timeout.
GATEWAY_USER_SIGNAL_TOTAL = Counter(
    "gateway_user_signal_total",
    documentation="Detected user signals from webhook stream",
    labelnames=("user_signal",),
)

# End-to-end webhook latency (unlabelled).
# NOTE: unlike the "*_seconds" histograms in this module, this one is
# documented in milliseconds; bucket bounds span 5-10000.
GATEWAY_WEBHOOK_LATENCY_MS = Histogram(
    "gateway_webhook_latency_ms",
    documentation="Gateway webhook end-to-end latency in milliseconds",
    buckets=(5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000),
)

# Experience events emitted from the webhook handler.
# Expected "status" values: ok, err.
# Expected "path" values: normal, early_return, exception.
GATEWAY_EXPERIENCE_EMITTED_TOTAL = Counter(
    "gateway_experience_emitted_total",
    documentation="Gateway experience events emitted from webhook handler",
    labelnames=("status", "path"),
)

# Early-return branches taken, labelled by reason.
GATEWAY_EARLY_RETURN_TOTAL = Counter(
    "gateway_early_return_total",
    documentation="Gateway early return branches observed by reason",
    labelnames=("reason",),
)

# Event finalize latency (unlabelled), documented in milliseconds;
# bucket bounds span 1-5000.
GATEWAY_EVENT_FINALIZE_LATENCY_MS = Histogram(
    "gateway_event_finalize_latency_ms",
    documentation="Gateway event finalize latency in milliseconds",
    buckets=(1, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000),
)

# Anti-silent actions, labelled by action, reason and chat type.
# Expected "action" values: ACK_EMITTED, ACK_SUPPRESSED_COOLDOWN.
GATEWAY_ANTI_SILENT_TOTAL = Counter(
    "gateway_anti_silent_total",
    documentation="Gateway anti-silent actions by reason/chat type",
    labelnames=("action", "reason", "chat_type"),
)

# ACK messages sent, labelled by template and chat type.
GATEWAY_ACK_SENT_TOTAL = Counter(
    "gateway_ack_sent_total",
    documentation="Gateway ACK messages sent by template/chat type",
    labelnames=("template_id", "chat_type"),
)

# Anti-silent tuning applications, labelled by reason, chat type and template.
GATEWAY_ANTI_SILENT_TUNING_APPLIED_TOTAL = Counter(
    "gateway_anti_silent_tuning_applied_total",
    documentation="Gateway anti-silent tuning applications by reason/chat type/template",
    labelnames=("reason", "chat_type", "template_id"),
)

# Access decisions for the public layer, per agent and chat type.
# Expected "decision" values: allow, deny, rate_limited.
GATEWAY_ACCESS_DECISIONS_TOTAL = Counter(
    "gateway_access_decisions_total",
    documentation="Gateway access decisions for public layer",
    labelnames=("decision", "agent_id", "chat_type"),
)

# Rate-limit hits, per agent and chat type.
# Expected "scope" values: user_global, user_agent, group_agent.
GATEWAY_RATE_LIMITED_TOTAL = Counter(
    "gateway_rate_limited_total",
    documentation="Gateway rate limit hits by scope",
    labelnames=("scope", "agent_id", "chat_type"),
)

# === Memory Service Metrics ===
# Count of calls to the memory service, labelled by operation and status.
# Expected "operation" values: save, search, delete.
MEMORY_CALLS_TOTAL = Counter(
    "gateway_memory_calls_total",
    documentation="Total calls to memory service",
    labelnames=("operation", "status"),
)

# === Telegram Metrics ===
# Count of Telegram messages processed, labelled by agent and direction.
# Expected "direction" values: incoming, outgoing.
TELEGRAM_MESSAGES_TOTAL = Counter(
    "gateway_telegram_messages_total",
    documentation="Total Telegram messages processed",
    labelnames=("agent", "direction"),
)

# === Errors ===
# Count of errors, labelled by error type and the source that reported it.
ERRORS_TOTAL = Counter(
    "gateway_errors_total",
    documentation="Total errors",
    labelnames=("type", "source"),
)

# === Active connections ===
# Unlabelled gauge for the number of requests currently in flight.
ACTIVE_REQUESTS = Gauge(
    "gateway_active_requests",
    documentation="Currently active requests",
)

# === Service info ===
# Static key/value description of this service, set once at import time.
SERVICE_INFO = Info(
    "gateway_service",
    documentation="Gateway service information",
)
# NOTE(review): version and node are hard-coded here — confirm whether they
# should come from deployment configuration instead.
SERVICE_INFO.info(dict(version="2.0.0", node="node1"))


def get_metrics():
    """Render the current metrics with ``generate_latest``.

    Returns:
        Whatever ``generate_latest()`` produces when called without
        arguments, i.e. the serialized exposition payload.
    """
    exposition = generate_latest()
    return exposition


def get_content_type():
    """Give the content type to send alongside the metrics payload.

    Returns:
        The ``CONTENT_TYPE_LATEST`` constant from ``prometheus_client``.
    """
    content_type = CONTENT_TYPE_LATEST
    return content_type