""" AgentOps Metrics ================ Prometheus метрики для моніторингу агентів. Метрики: - Per-agent: latency, tokens, errors, tool calls, budget - Per-channel: RAG hit rate, index lag - Per-node: GPU util, VRAM, queue lag """ import time import logging from typing import Optional, Dict, Any from contextlib import contextmanager from functools import wraps logger = logging.getLogger(__name__) # Try to import prometheus_client try: from prometheus_client import ( Counter, Histogram, Gauge, Summary, CollectorRegistry, generate_latest, CONTENT_TYPE_LATEST ) PROMETHEUS_AVAILABLE = True except ImportError: PROMETHEUS_AVAILABLE = False logger.warning("prometheus_client not installed, metrics disabled") # ==================== REGISTRY ==================== if PROMETHEUS_AVAILABLE: # Use default registry or create custom REGISTRY = CollectorRegistry(auto_describe=True) # ==================== AGENT METRICS ==================== # Request latency AGENT_LATENCY = Histogram( 'agent_latency_seconds', 'Agent request latency in seconds', ['agent_id', 'operation'], buckets=(0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0), registry=REGISTRY ) # Token usage AGENT_TOKENS_IN = Counter( 'agent_tokens_in_total', 'Total input tokens processed by agent', ['agent_id', 'model'], registry=REGISTRY ) AGENT_TOKENS_OUT = Counter( 'agent_tokens_out_total', 'Total output tokens generated by agent', ['agent_id', 'model'], registry=REGISTRY ) # Request counts AGENT_REQUESTS = Counter( 'agent_requests_total', 'Total agent requests', ['agent_id', 'status'], # status: success, error, timeout registry=REGISTRY ) # Error rate AGENT_ERRORS = Counter( 'agent_errors_total', 'Total agent errors', ['agent_id', 'error_type'], registry=REGISTRY ) # Tool calls AGENT_TOOL_CALLS = Counter( 'agent_tool_calls_total', 'Total tool calls by agent', ['agent_id', 'tool'], registry=REGISTRY ) # Budget consumption AGENT_BUDGET = Gauge( 'agent_budget_consumed', 'Budget consumed by agent per user', ['agent_id', 'user_id'], 
registry=REGISTRY ) # Active requests AGENT_ACTIVE_REQUESTS = Gauge( 'agent_active_requests', 'Number of active requests per agent', ['agent_id'], registry=REGISTRY ) # ==================== CHANNEL METRICS ==================== # RAG hit rate CHANNEL_RAG_HITS = Counter( 'channel_rag_hits_total', 'Total RAG cache hits', ['channel_id'], registry=REGISTRY ) CHANNEL_RAG_MISSES = Counter( 'channel_rag_misses_total', 'Total RAG cache misses', ['channel_id'], registry=REGISTRY ) # Index lag CHANNEL_INDEX_LAG = Gauge( 'channel_index_lag_seconds', 'Time since last index update', ['channel_id'], registry=REGISTRY ) # Message queue CHANNEL_QUEUE_SIZE = Gauge( 'channel_queue_size', 'Number of messages in channel queue', ['channel_id'], registry=REGISTRY ) # ==================== NODE METRICS ==================== # GPU utilization NODE_GPU_UTIL = Gauge( 'node_gpu_utilization', 'GPU utilization percentage', ['node_id', 'gpu_id'], registry=REGISTRY ) # VRAM usage NODE_VRAM_USED = Gauge( 'node_vram_used_bytes', 'VRAM used in bytes', ['node_id', 'gpu_id'], registry=REGISTRY ) NODE_VRAM_TOTAL = Gauge( 'node_vram_total_bytes', 'Total VRAM in bytes', ['node_id', 'gpu_id'], registry=REGISTRY ) # Queue lag NODE_QUEUE_LAG = Gauge( 'node_queue_lag_seconds', 'Queue processing lag', ['node_id', 'queue'], registry=REGISTRY ) # NATS stream lag NODE_NATS_LAG = Gauge( 'node_nats_stream_lag', 'NATS stream consumer lag (pending messages)', ['node_id', 'stream'], registry=REGISTRY ) # ==================== MEMORY METRICS ==================== MEMORY_OPERATIONS = Counter( 'memory_operations_total', 'Total memory operations', ['operation', 'store'], # store: postgres, qdrant, neo4j, redis registry=REGISTRY ) MEMORY_LATENCY = Histogram( 'memory_latency_seconds', 'Memory operation latency', ['operation', 'store'], buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0), registry=REGISTRY ) # ==================== HANDOFF METRICS ==================== HANDOFF_REQUESTS = Counter( 
# ==================== HANDOFF METRICS ====================

if PROMETHEUS_AVAILABLE:
    HANDOFF_REQUESTS = Counter(
        'handoff_requests_total',
        'Total handoff requests between agents',
        ['from_agent', 'to_agent', 'status'],
        registry=REGISTRY
    )

    HANDOFF_LATENCY = Histogram(
        'handoff_latency_seconds',
        'Handoff latency between agents',
        ['from_agent', 'to_agent'],
        buckets=(0.1, 0.5, 1.0, 2.5, 5.0, 10.0),
        registry=REGISTRY
    )


# ==================== METRIC HELPERS ====================

class MetricsCollector:
    """
    Helper class for collecting metrics.

    Usage:
        metrics = MetricsCollector(agent_id="helion")

        with metrics.track_request("chat"):
            # Do work
            pass

        metrics.record_tokens(input_tokens=100, output_tokens=50, model="qwen3:8b")
        metrics.record_tool_call("web_search")
    """

    def __init__(self, agent_id: str, node_id: str = "node1"):
        self.agent_id = agent_id
        self.node_id = node_id
        # Every record_* method becomes a silent no-op when prometheus_client
        # is not installed.
        self._enabled = PROMETHEUS_AVAILABLE

    @contextmanager
    def track_request(self, operation: str):
        """Context manager tracking latency, outcome and in-flight count.

        Increments the active-request gauge for the duration of the block,
        observes latency into AGENT_LATENCY, counts the request as
        success/error in AGENT_REQUESTS, and counts errors by exception
        class name in AGENT_ERRORS. Exceptions are re-raised.
        """
        if not self._enabled:
            yield
            return

        AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).inc()
        # perf_counter is monotonic: durations are immune to NTP/system clock
        # adjustments, unlike time.time().
        start = time.perf_counter()
        status = "success"
        try:
            yield
        except Exception as e:
            status = "error"
            AGENT_ERRORS.labels(
                agent_id=self.agent_id,
                error_type=type(e).__name__
            ).inc()
            raise
        finally:
            # Runs on both success and error paths, so the gauge never leaks.
            duration = time.perf_counter() - start
            AGENT_LATENCY.labels(
                agent_id=self.agent_id,
                operation=operation
            ).observe(duration)
            AGENT_REQUESTS.labels(
                agent_id=self.agent_id,
                status=status
            ).inc()
            AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).dec()

    def record_tokens(self, input_tokens: int, output_tokens: int, model: str):
        """Record input/output token usage for the given model."""
        if not self._enabled:
            return
        AGENT_TOKENS_IN.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(input_tokens)
        AGENT_TOKENS_OUT.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(output_tokens)

    def record_tool_call(self, tool: str):
        """Record a single tool invocation."""
        if not self._enabled:
            return
        AGENT_TOOL_CALLS.labels(
            agent_id=self.agent_id,
            tool=tool
        ).inc()

    def record_budget(self, user_id: str, amount: float):
        """Record budget consumption for a user (gauge: last value wins)."""
        if not self._enabled:
            return
        AGENT_BUDGET.labels(
            agent_id=self.agent_id,
            user_id=user_id
        ).set(amount)

    def record_error(self, error_type: str):
        """Record an error by type name (for errors outside track_request)."""
        if not self._enabled:
            return
        AGENT_ERRORS.labels(
            agent_id=self.agent_id,
            error_type=error_type
        ).inc()

    def record_rag_hit(self, channel_id: str, hit: bool):
        """Record a RAG cache hit (hit=True) or miss (hit=False)."""
        if not self._enabled:
            return
        if hit:
            CHANNEL_RAG_HITS.labels(channel_id=channel_id).inc()
        else:
            CHANNEL_RAG_MISSES.labels(channel_id=channel_id).inc()

    def record_memory_operation(self, operation: str, store: str, duration: float):
        """Record one memory-store operation and its latency in seconds."""
        if not self._enabled:
            return
        MEMORY_OPERATIONS.labels(operation=operation, store=store).inc()
        MEMORY_LATENCY.labels(operation=operation, store=store).observe(duration)

    def record_handoff(self, to_agent: str, status: str, duration: Optional[float] = None):
        """Record a handoff to another agent.

        Latency is observed only when `duration` (seconds) is provided.
        """
        if not self._enabled:
            return
        HANDOFF_REQUESTS.labels(
            from_agent=self.agent_id,
            to_agent=to_agent,
            status=status
        ).inc()
        if duration is not None:
            HANDOFF_LATENCY.labels(
                from_agent=self.agent_id,
                to_agent=to_agent
            ).observe(duration)


def track_agent_request(agent_id: str, operation: str):
    """Decorator tracking latency/outcome of an agent request.

    NOTE: supports async callables only — the wrapper awaits the target.
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            metrics = MetricsCollector(agent_id)
            with metrics.track_request(operation):
                return await func(*args, **kwargs)
        return wrapper
    return decorator


# ==================== GPU METRICS COLLECTOR ====================

async def collect_gpu_metrics(node_id: str = "node1"):
    """
    Collect GPU metrics using nvidia-smi.

    Should be called periodically. Failures (no driver, timeout, parse
    errors) are logged at debug level and otherwise ignored.
    """
    if not PROMETHEUS_AVAILABLE:
        return

    import subprocess

    try:
        # argv list with shell=False: no shell-injection surface.
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=index,utilization.gpu,memory.used,memory.total',
             '--format=csv,noheader,nounits'],
            capture_output=True,
            text=True,
            timeout=5
        )
        if result.returncode != 0:
            return

        for line in result.stdout.strip().split('\n'):
            parts = line.split(',')
            if len(parts) >= 4:
                gpu_id = parts[0].strip()
                util = float(parts[1].strip())
                mem_used = int(parts[2].strip()) * 1024 * 1024  # MB to bytes
                mem_total = int(parts[3].strip()) * 1024 * 1024

                # nvidia-smi already reports utilization in percent (0-100),
                # matching the 'GPU utilization percentage' metric help text;
                # do NOT divide by 100 or a 0-1 ratio gets exported instead.
                NODE_GPU_UTIL.labels(node_id=node_id, gpu_id=gpu_id).set(util)
                NODE_VRAM_USED.labels(node_id=node_id, gpu_id=gpu_id).set(mem_used)
                NODE_VRAM_TOTAL.labels(node_id=node_id, gpu_id=gpu_id).set(mem_total)
    except Exception as e:
        logger.debug(f"GPU metrics collection failed: {e}")
""" if not PROMETHEUS_AVAILABLE: return import subprocess try: # Run nvidia-smi result = subprocess.run( ['nvidia-smi', '--query-gpu=index,utilization.gpu,memory.used,memory.total', '--format=csv,noheader,nounits'], capture_output=True, text=True, timeout=5 ) if result.returncode != 0: return for line in result.stdout.strip().split('\n'): parts = line.split(',') if len(parts) >= 4: gpu_id = parts[0].strip() util = float(parts[1].strip()) mem_used = int(parts[2].strip()) * 1024 * 1024 # MB to bytes mem_total = int(parts[3].strip()) * 1024 * 1024 NODE_GPU_UTIL.labels(node_id=node_id, gpu_id=gpu_id).set(util / 100.0) NODE_VRAM_USED.labels(node_id=node_id, gpu_id=gpu_id).set(mem_used) NODE_VRAM_TOTAL.labels(node_id=node_id, gpu_id=gpu_id).set(mem_total) except Exception as e: logger.debug(f"GPU metrics collection failed: {e}") # ==================== FASTAPI INTEGRATION ==================== def setup_metrics_endpoint(app): """ Add /metrics endpoint to FastAPI app. Usage: from agent_metrics import setup_metrics_endpoint setup_metrics_endpoint(app) """ if not PROMETHEUS_AVAILABLE: logger.warning("Prometheus not available, /metrics endpoint disabled") return from fastapi import Response @app.get("/metrics") async def metrics(): return Response( content=generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST ) logger.info("Metrics endpoint enabled at /metrics") # ==================== NATS STREAM LAG COLLECTOR ==================== async def collect_nats_metrics(node_id: str = "node1", nats_url: str = "nats://nats:4222"): """ Collect NATS JetStream metrics. 
""" if not PROMETHEUS_AVAILABLE: return try: import nats nc = await nats.connect(nats_url) js = nc.jetstream() # Get stream info streams = ["MESSAGES", "AGENT_OPS", "AUDIT", "MEMORY"] for stream_name in streams: try: info = await js.stream_info(stream_name) # Record pending messages as lag indicator pending = info.state.messages NODE_NATS_LAG.labels(node_id=node_id, stream=stream_name).set(pending) except: pass await nc.close() except Exception as e: logger.debug(f"NATS metrics collection failed: {e}") # ==================== METRICS EXPORT ==================== def get_metrics(): """Return metrics in Prometheus format""" if not PROMETHEUS_AVAILABLE: return b"# prometheus_client not available" return generate_latest(REGISTRY) def get_content_type(): """Return Prometheus content type""" if not PROMETHEUS_AVAILABLE: return "text/plain" return CONTENT_TYPE_LATEST