# microdao-daarion/services/router/agent_metrics.py
"""
AgentOps Metrics
================
Prometheus метрики для моніторингу агентів.
Метрики:
- Per-agent: latency, tokens, errors, tool calls, budget
- Per-channel: RAG hit rate, index lag
- Per-node: GPU util, VRAM, queue lag
"""
import time
import logging
from typing import Optional, Dict, Any
from contextlib import contextmanager
from functools import wraps
logger = logging.getLogger(__name__)
# Try to import prometheus_client
try:
from prometheus_client import (
Counter, Histogram, Gauge, Summary,
CollectorRegistry, generate_latest, CONTENT_TYPE_LATEST
)
PROMETHEUS_AVAILABLE = True
except ImportError:
PROMETHEUS_AVAILABLE = False
logger.warning("prometheus_client not installed, metrics disabled")
# ==================== REGISTRY ====================
if PROMETHEUS_AVAILABLE:
    # Dedicated registry (instead of the process-global default) so only the
    # metrics defined in this module are exposed by the /metrics endpoint.
    REGISTRY = CollectorRegistry(auto_describe=True)

    # ==================== AGENT METRICS ====================

    # Request latency per agent and operation (e.g. "chat").
    # Buckets span sub-second responses up to long LLM generations (60 s).
    AGENT_LATENCY = Histogram(
        'agent_latency_seconds',
        'Agent request latency in seconds',
        ['agent_id', 'operation'],
        buckets=(0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
        registry=REGISTRY
    )

    # Token usage, split by direction and labelled with the model name.
    AGENT_TOKENS_IN = Counter(
        'agent_tokens_in_total',
        'Total input tokens processed by agent',
        ['agent_id', 'model'],
        registry=REGISTRY
    )
    AGENT_TOKENS_OUT = Counter(
        'agent_tokens_out_total',
        'Total output tokens generated by agent',
        ['agent_id', 'model'],
        registry=REGISTRY
    )

    # Request counts by outcome.
    AGENT_REQUESTS = Counter(
        'agent_requests_total',
        'Total agent requests',
        ['agent_id', 'status'],  # status: success, error, timeout
        registry=REGISTRY
    )

    # Errors labelled by exception class name (see MetricsCollector).
    AGENT_ERRORS = Counter(
        'agent_errors_total',
        'Total agent errors',
        ['agent_id', 'error_type'],
        registry=REGISTRY
    )

    # Tool calls made by an agent, labelled by tool name.
    AGENT_TOOL_CALLS = Counter(
        'agent_tool_calls_total',
        'Total tool calls by agent',
        ['agent_id', 'tool'],
        registry=REGISTRY
    )

    # Budget consumption. Gauge (not Counter) because record_budget() sets
    # the current consumed amount rather than incrementing deltas.
    AGENT_BUDGET = Gauge(
        'agent_budget_consumed',
        'Budget consumed by agent per user',
        ['agent_id', 'user_id'],
        registry=REGISTRY
    )

    # In-flight requests; incremented/decremented by track_request().
    AGENT_ACTIVE_REQUESTS = Gauge(
        'agent_active_requests',
        'Number of active requests per agent',
        ['agent_id'],
        registry=REGISTRY
    )

    # ==================== CHANNEL METRICS ====================

    # RAG hit rate is derived in dashboards as hits / (hits + misses).
    CHANNEL_RAG_HITS = Counter(
        'channel_rag_hits_total',
        'Total RAG cache hits',
        ['channel_id'],
        registry=REGISTRY
    )
    CHANNEL_RAG_MISSES = Counter(
        'channel_rag_misses_total',
        'Total RAG cache misses',
        ['channel_id'],
        registry=REGISTRY
    )

    # Staleness of the channel's search index.
    CHANNEL_INDEX_LAG = Gauge(
        'channel_index_lag_seconds',
        'Time since last index update',
        ['channel_id'],
        registry=REGISTRY
    )

    # Message queue depth per channel.
    CHANNEL_QUEUE_SIZE = Gauge(
        'channel_queue_size',
        'Number of messages in channel queue',
        ['channel_id'],
        registry=REGISTRY
    )

    # ==================== NODE METRICS ====================

    # GPU utilization. NOTE(review): collect_gpu_metrics() stores a 0..1
    # ratio here even though the help text says "percentage" — confirm
    # which one dashboards expect.
    NODE_GPU_UTIL = Gauge(
        'node_gpu_utilization',
        'GPU utilization percentage',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )

    # VRAM usage in bytes (converted from nvidia-smi MiB in the collector).
    NODE_VRAM_USED = Gauge(
        'node_vram_used_bytes',
        'VRAM used in bytes',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )
    NODE_VRAM_TOTAL = Gauge(
        'node_vram_total_bytes',
        'Total VRAM in bytes',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )

    # Generic per-queue processing lag.
    NODE_QUEUE_LAG = Gauge(
        'node_queue_lag_seconds',
        'Queue processing lag',
        ['node_id', 'queue'],
        registry=REGISTRY
    )

    # NATS JetStream consumer lag, measured as pending messages.
    NODE_NATS_LAG = Gauge(
        'node_nats_stream_lag',
        'NATS stream consumer lag (pending messages)',
        ['node_id', 'stream'],
        registry=REGISTRY
    )

    # ==================== MEMORY METRICS ====================

    MEMORY_OPERATIONS = Counter(
        'memory_operations_total',
        'Total memory operations',
        ['operation', 'store'],  # store: postgres, qdrant, neo4j, redis
        registry=REGISTRY
    )
    # Memory-store latency; buckets are much tighter than AGENT_LATENCY
    # because store round-trips are expected to be milliseconds.
    MEMORY_LATENCY = Histogram(
        'memory_latency_seconds',
        'Memory operation latency',
        ['operation', 'store'],
        buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
        registry=REGISTRY
    )

    # ==================== HANDOFF METRICS ====================

    # Agent-to-agent handoffs (see MetricsCollector.record_handoff).
    HANDOFF_REQUESTS = Counter(
        'handoff_requests_total',
        'Total handoff requests between agents',
        ['from_agent', 'to_agent', 'status'],
        registry=REGISTRY
    )
    HANDOFF_LATENCY = Histogram(
        'handoff_latency_seconds',
        'Handoff latency between agents',
        ['from_agent', 'to_agent'],
        buckets=(0.1, 0.5, 1.0, 2.5, 5.0, 10.0),
        registry=REGISTRY
    )

    # ==================== EXPERIENCE BUS METRICS ====================

    EXPERIENCE_PUBLISHED = Counter(
        'experience_published_total',
        'Total experience events publish attempts',
        ['source', 'transport', 'status'],  # transport: jetstream|core|none
        registry=REGISTRY
    )
    EXPERIENCE_DB_INSERT = Counter(
        'experience_db_insert_total',
        'Total experience event DB insert attempts',
        ['source', 'status'],  # status: ok|error|skipped
        registry=REGISTRY
    )
    EXPERIENCE_DEDUP_DROPPED = Counter(
        'experience_dedup_dropped_total',
        'Total experience events dropped by dedup',
        ['source'],
        registry=REGISTRY
    )
    EXPERIENCE_SAMPLED = Counter(
        'experience_sampled_total',
        'Total experience events sampled in/out',
        ['source', 'decision', 'reason'],  # decision: in|out
        registry=REGISTRY
    )
    LESSONS_RETRIEVED = Counter(
        'lessons_retrieved_total',
        'Total lessons retrieval attempts',
        ['status'],  # status: ok|timeout|err
        registry=REGISTRY
    )
    # Labelled with a pre-bucketed count string (see inc_lessons_attached).
    LESSONS_ATTACHED = Counter(
        'lessons_attached_total',
        'Total lessons attached buckets',
        ['count'],  # count: 0|1-3|4-7
        registry=REGISTRY
    )
    # NOTE: milliseconds, unlike the other *_seconds histograms.
    LESSONS_ATTACH_LATENCY = Histogram(
        'lessons_attach_latency_ms',
        'Lessons retrieval latency in milliseconds',
        buckets=(1, 2, 5, 10, 25, 50, 100, 250, 500, 1000, 2500),
        registry=REGISTRY
    )
# ==================== METRIC HELPERS ====================
class MetricsCollector:
    """
    Helper for recording per-agent Prometheus metrics.

    All methods are safe no-ops when prometheus_client is not installed,
    so callers never need to check PROMETHEUS_AVAILABLE themselves.

    Usage:
        metrics = MetricsCollector(agent_id="helion")
        with metrics.track_request("chat"):
            # Do work
            pass
        metrics.record_tokens(input_tokens=100, output_tokens=50, model="qwen3:8b")
        metrics.record_tool_call("web_search")
    """

    def __init__(self, agent_id: str, node_id: str = "node1"):
        self.agent_id = agent_id
        self.node_id = node_id
        # Cached once; every recording method early-returns on this flag.
        self._enabled = PROMETHEUS_AVAILABLE

    @contextmanager
    def track_request(self, operation: str):
        """Context manager to track one request.

        Increments the active-request gauge for the duration of the block,
        observes latency and counts the request by outcome. If the block
        raises, the error type is counted and the exception is re-raised.
        """
        if not self._enabled:
            yield
            return
        AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).inc()
        # perf_counter() is monotonic; time.time() can jump under NTP
        # adjustments and yield negative or inflated durations.
        start = time.perf_counter()
        status = "success"
        try:
            yield
        except Exception as e:
            status = "error"
            AGENT_ERRORS.labels(
                agent_id=self.agent_id,
                error_type=type(e).__name__
            ).inc()
            raise
        finally:
            # Runs on success, error and generator close alike, so the
            # active-request gauge can never leak.
            duration = time.perf_counter() - start
            AGENT_LATENCY.labels(
                agent_id=self.agent_id,
                operation=operation
            ).observe(duration)
            AGENT_REQUESTS.labels(
                agent_id=self.agent_id,
                status=status
            ).inc()
            AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).dec()

    def record_tokens(self, input_tokens: int, output_tokens: int, model: str):
        """Record input/output token usage for the given model."""
        if not self._enabled:
            return
        AGENT_TOKENS_IN.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(input_tokens)
        AGENT_TOKENS_OUT.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(output_tokens)

    def record_tool_call(self, tool: str):
        """Record one call to the named tool."""
        if not self._enabled:
            return
        AGENT_TOOL_CALLS.labels(
            agent_id=self.agent_id,
            tool=tool
        ).inc()

    def record_budget(self, user_id: str, amount: float):
        """Set the current budget consumed for a user (gauge, not delta)."""
        if not self._enabled:
            return
        AGENT_BUDGET.labels(
            agent_id=self.agent_id,
            user_id=user_id
        ).set(amount)

    def record_error(self, error_type: str):
        """Record an error by type (for errors outside track_request)."""
        if not self._enabled:
            return
        AGENT_ERRORS.labels(
            agent_id=self.agent_id,
            error_type=error_type
        ).inc()

    def record_rag_hit(self, channel_id: str, hit: bool):
        """Record a RAG cache hit (hit=True) or miss (hit=False)."""
        if not self._enabled:
            return
        if hit:
            CHANNEL_RAG_HITS.labels(channel_id=channel_id).inc()
        else:
            CHANNEL_RAG_MISSES.labels(channel_id=channel_id).inc()

    def record_memory_operation(self, operation: str, store: str, duration: float):
        """Record one memory-store operation and its latency in seconds."""
        if not self._enabled:
            return
        MEMORY_OPERATIONS.labels(operation=operation, store=store).inc()
        MEMORY_LATENCY.labels(operation=operation, store=store).observe(duration)

    def record_handoff(self, to_agent: str, status: str, duration: Optional[float] = None):
        """Record a handoff to another agent; latency is observed only
        when a duration (seconds) is supplied."""
        if not self._enabled:
            return
        HANDOFF_REQUESTS.labels(
            from_agent=self.agent_id,
            to_agent=to_agent,
            status=status
        ).inc()
        if duration is not None:
            HANDOFF_LATENCY.labels(
                from_agent=self.agent_id,
                to_agent=to_agent
            ).observe(duration)
def track_agent_request(agent_id: str, operation: str):
    """Decorator that tracks a function call via MetricsCollector.track_request.

    Generalized to wrap both async and sync callables (the original only
    supported coroutines); the wrapped function keeps its return value
    and exceptions propagate unchanged.
    """
    import inspect

    def decorator(func):
        if inspect.iscoroutinefunction(func):
            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                metrics = MetricsCollector(agent_id)
                with metrics.track_request(operation):
                    return await func(*args, **kwargs)
            return async_wrapper

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            metrics = MetricsCollector(agent_id)
            with metrics.track_request(operation):
                return func(*args, **kwargs)
        return sync_wrapper
    return decorator
def inc_experience_published(source: str, transport: str, status: str) -> None:
    """Count one experience-event publish attempt."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_PUBLISHED.labels(
            source=source, transport=transport, status=status
        ).inc()
def inc_experience_db_insert(source: str, status: str) -> None:
    """Count one experience-event DB insert attempt."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_DB_INSERT.labels(source=source, status=status).inc()
def inc_experience_dedup_dropped(source: str) -> None:
    """Count one experience event dropped by deduplication."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_DEDUP_DROPPED.labels(source=source).inc()
def inc_experience_sampled(source: str, decision: str, reason: str) -> None:
    """Count one experience-event sampling decision (in/out)."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_SAMPLED.labels(
            source=source, decision=decision, reason=reason
        ).inc()
def inc_lessons_retrieved(status: str) -> None:
    """Count one lessons retrieval attempt by status (ok|timeout|err)."""
    if PROMETHEUS_AVAILABLE:
        LESSONS_RETRIEVED.labels(status=status).inc()
def inc_lessons_attached(count: int) -> None:
    """Count a lessons-attachment event, bucketed by how many were attached.

    Non-numeric counts are treated as 0 rather than raising.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    try:
        attached = int(count)
    except Exception:
        attached = 0
    # NOTE(review): any value above 7 also lands in "4-7" — presumably the
    # attach path caps at 7 lessons; confirm upstream.
    bucket = "0" if attached <= 0 else ("1-3" if attached <= 3 else "4-7")
    LESSONS_ATTACHED.labels(count=bucket).inc()
def observe_lessons_attach_latency(latency_ms: float) -> None:
    """Observe lessons retrieval latency (milliseconds), best-effort.

    Invalid (non-convertible) values are silently ignored.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    try:
        LESSONS_ATTACH_LATENCY.observe(float(latency_ms))
    except Exception:
        pass
# ==================== GPU METRICS COLLECTOR ====================
async def collect_gpu_metrics(node_id: str = "node1"):
    """
    Collect per-GPU utilization and VRAM metrics via nvidia-smi.

    Should be called periodically (e.g. from a background task). Does
    nothing when prometheus_client or nvidia-smi is unavailable; all
    failures are swallowed and logged at debug level.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    import asyncio
    import subprocess
    try:
        # Run nvidia-smi in a worker thread: subprocess.run blocks, and
        # calling it directly inside an async def would stall the event loop.
        result = await asyncio.to_thread(
            subprocess.run,
            ['nvidia-smi', '--query-gpu=index,utilization.gpu,memory.used,memory.total',
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return
        for line in result.stdout.strip().split('\n'):
            parts = line.split(',')
            if len(parts) < 4:
                continue
            gpu_id = parts[0].strip()
            util = float(parts[1].strip())
            # nvidia-smi reports memory in MiB with nounits; convert to bytes.
            mem_used = int(parts[2].strip()) * 1024 * 1024
            mem_total = int(parts[3].strip()) * 1024 * 1024
            # NOTE(review): stored as a 0..1 ratio although the gauge's help
            # text says "percentage" — confirm which one dashboards expect.
            NODE_GPU_UTIL.labels(node_id=node_id, gpu_id=gpu_id).set(util / 100.0)
            NODE_VRAM_USED.labels(node_id=node_id, gpu_id=gpu_id).set(mem_used)
            NODE_VRAM_TOTAL.labels(node_id=node_id, gpu_id=gpu_id).set(mem_total)
    except Exception as e:
        logger.debug(f"GPU metrics collection failed: {e}")
# ==================== FASTAPI INTEGRATION ====================
def setup_metrics_endpoint(app):
    """
    Register a GET /metrics endpoint on a FastAPI app.

    Usage:
        from agent_metrics import setup_metrics_endpoint
        setup_metrics_endpoint(app)
    """
    if not PROMETHEUS_AVAILABLE:
        logger.warning("Prometheus not available, /metrics endpoint disabled")
        return
    from fastapi import Response

    @app.get("/metrics")
    async def metrics():
        # Serialize the module registry in the Prometheus text format.
        payload = generate_latest(REGISTRY)
        return Response(content=payload, media_type=CONTENT_TYPE_LATEST)

    logger.info("Metrics endpoint enabled at /metrics")
# ==================== NATS STREAM LAG COLLECTOR ====================
async def collect_nats_metrics(node_id: str = "node1", nats_url: str = "nats://nats:4222"):
    """
    Collect NATS JetStream metrics (pending messages per stream).

    Best-effort: connection or per-stream failures are swallowed and
    logged at debug level.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    try:
        import nats
        nc = await nats.connect(nats_url)
        try:
            js = nc.jetstream()
            # Streams we expect to exist; missing ones are skipped below.
            streams = ["MESSAGES", "AGENT_OPS", "AUDIT", "MEMORY"]
            for stream_name in streams:
                try:
                    info = await js.stream_info(stream_name)
                    # Record pending messages as lag indicator
                    pending = info.state.messages
                    NODE_NATS_LAG.labels(node_id=node_id, stream=stream_name).set(pending)
                except Exception:
                    # Narrowed from a bare except: (which would also swallow
                    # KeyboardInterrupt/CancelledError); stream may not exist.
                    continue
        finally:
            # Close in finally so the connection is never leaked if a
            # collection step raises.
            await nc.close()
    except Exception as e:
        logger.debug(f"NATS metrics collection failed: {e}")
# ==================== METRICS EXPORT ====================
def get_metrics():
    """Return the registry serialized in the Prometheus text format (bytes)."""
    if PROMETHEUS_AVAILABLE:
        return generate_latest(REGISTRY)
    return b"# prometheus_client not available"
def get_content_type():
    """Return the Content-Type header value for the metrics payload."""
    if PROMETHEUS_AVAILABLE:
        return CONTENT_TYPE_LATEST
    return "text/plain"