# microdao-daarion/services/router/agent_metrics.py
"""
AgentOps Metrics
================
Prometheus метрики для моніторингу агентів.
Метрики:
- Per-agent: latency, tokens, errors, tool calls, budget
- Per-channel: RAG hit rate, index lag
- Per-node: GPU util, VRAM, queue lag
"""
import time
import logging
from typing import Optional, Dict, Any
from contextlib import contextmanager
from functools import wraps
logger = logging.getLogger(__name__)
# Try to import prometheus_client
try:
from prometheus_client import (
Counter, Histogram, Gauge, Summary,
CollectorRegistry, generate_latest, CONTENT_TYPE_LATEST
)
PROMETHEUS_AVAILABLE = True
except ImportError:
PROMETHEUS_AVAILABLE = False
logger.warning("prometheus_client not installed, metrics disabled")
# ==================== REGISTRY ====================
if PROMETHEUS_AVAILABLE:
    # Dedicated registry (instead of the process-global default) so only the
    # metrics defined in this module are exposed by the /metrics endpoint.
    REGISTRY = CollectorRegistry(auto_describe=True)

    # ==================== AGENT METRICS ====================

    # Request latency per agent and operation (e.g. "chat").
    # Buckets span sub-second responses up to long LLM generations (60 s).
    AGENT_LATENCY = Histogram(
        'agent_latency_seconds',
        'Agent request latency in seconds',
        ['agent_id', 'operation'],
        buckets=(0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
        registry=REGISTRY
    )

    # Token usage, split by direction and labelled with the model name.
    AGENT_TOKENS_IN = Counter(
        'agent_tokens_in_total',
        'Total input tokens processed by agent',
        ['agent_id', 'model'],
        registry=REGISTRY
    )
    AGENT_TOKENS_OUT = Counter(
        'agent_tokens_out_total',
        'Total output tokens generated by agent',
        ['agent_id', 'model'],
        registry=REGISTRY
    )

    # Request counts by outcome.
    AGENT_REQUESTS = Counter(
        'agent_requests_total',
        'Total agent requests',
        ['agent_id', 'status'],  # status: success, error, timeout
        registry=REGISTRY
    )

    # Errors labelled by exception class name (see MetricsCollector).
    AGENT_ERRORS = Counter(
        'agent_errors_total',
        'Total agent errors',
        ['agent_id', 'error_type'],
        registry=REGISTRY
    )

    # Tool calls made by an agent, labelled by tool name.
    AGENT_TOOL_CALLS = Counter(
        'agent_tool_calls_total',
        'Total tool calls by agent',
        ['agent_id', 'tool'],
        registry=REGISTRY
    )

    # Budget consumption. Gauge (not Counter) because record_budget() sets
    # the current consumed amount rather than incrementing deltas.
    AGENT_BUDGET = Gauge(
        'agent_budget_consumed',
        'Budget consumed by agent per user',
        ['agent_id', 'user_id'],
        registry=REGISTRY
    )

    # In-flight requests; incremented/decremented by track_request().
    AGENT_ACTIVE_REQUESTS = Gauge(
        'agent_active_requests',
        'Number of active requests per agent',
        ['agent_id'],
        registry=REGISTRY
    )

    # ==================== CHANNEL METRICS ====================

    # RAG hit rate is derived in dashboards as hits / (hits + misses).
    CHANNEL_RAG_HITS = Counter(
        'channel_rag_hits_total',
        'Total RAG cache hits',
        ['channel_id'],
        registry=REGISTRY
    )
    CHANNEL_RAG_MISSES = Counter(
        'channel_rag_misses_total',
        'Total RAG cache misses',
        ['channel_id'],
        registry=REGISTRY
    )

    # Staleness of the channel's search index.
    CHANNEL_INDEX_LAG = Gauge(
        'channel_index_lag_seconds',
        'Time since last index update',
        ['channel_id'],
        registry=REGISTRY
    )

    # Message queue depth per channel.
    CHANNEL_QUEUE_SIZE = Gauge(
        'channel_queue_size',
        'Number of messages in channel queue',
        ['channel_id'],
        registry=REGISTRY
    )

    # ==================== NODE METRICS ====================

    # GPU utilization. NOTE(review): collect_gpu_metrics() stores a 0..1
    # ratio here even though the help text says "percentage" — confirm
    # which one dashboards expect.
    NODE_GPU_UTIL = Gauge(
        'node_gpu_utilization',
        'GPU utilization percentage',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )

    # VRAM usage in bytes (converted from nvidia-smi MiB in the collector).
    NODE_VRAM_USED = Gauge(
        'node_vram_used_bytes',
        'VRAM used in bytes',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )
    NODE_VRAM_TOTAL = Gauge(
        'node_vram_total_bytes',
        'Total VRAM in bytes',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )

    # Generic per-queue processing lag.
    NODE_QUEUE_LAG = Gauge(
        'node_queue_lag_seconds',
        'Queue processing lag',
        ['node_id', 'queue'],
        registry=REGISTRY
    )

    # NATS JetStream consumer lag, measured as pending messages.
    NODE_NATS_LAG = Gauge(
        'node_nats_stream_lag',
        'NATS stream consumer lag (pending messages)',
        ['node_id', 'stream'],
        registry=REGISTRY
    )

    # ==================== MEMORY METRICS ====================

    MEMORY_OPERATIONS = Counter(
        'memory_operations_total',
        'Total memory operations',
        ['operation', 'store'],  # store: postgres, qdrant, neo4j, redis
        registry=REGISTRY
    )
    # Memory-store latency; buckets are much tighter than AGENT_LATENCY
    # because store round-trips are expected to be milliseconds.
    MEMORY_LATENCY = Histogram(
        'memory_latency_seconds',
        'Memory operation latency',
        ['operation', 'store'],
        buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
        registry=REGISTRY
    )

    # ==================== HANDOFF METRICS ====================

    # Agent-to-agent handoffs (see MetricsCollector.record_handoff).
    HANDOFF_REQUESTS = Counter(
        'handoff_requests_total',
        'Total handoff requests between agents',
        ['from_agent', 'to_agent', 'status'],
        registry=REGISTRY
    )
    HANDOFF_LATENCY = Histogram(
        'handoff_latency_seconds',
        'Handoff latency between agents',
        ['from_agent', 'to_agent'],
        buckets=(0.1, 0.5, 1.0, 2.5, 5.0, 10.0),
        registry=REGISTRY
    )

    # ==================== EXPERIENCE BUS METRICS ====================

    EXPERIENCE_PUBLISHED = Counter(
        'experience_published_total',
        'Total experience events publish attempts',
        ['source', 'transport', 'status'],  # transport: jetstream|core|none
        registry=REGISTRY
    )
    EXPERIENCE_DB_INSERT = Counter(
        'experience_db_insert_total',
        'Total experience event DB insert attempts',
        ['source', 'status'],  # status: ok|error|skipped
        registry=REGISTRY
    )
    EXPERIENCE_DEDUP_DROPPED = Counter(
        'experience_dedup_dropped_total',
        'Total experience events dropped by dedup',
        ['source'],
        registry=REGISTRY
    )
    EXPERIENCE_SAMPLED = Counter(
        'experience_sampled_total',
        'Total experience events sampled in/out',
        ['source', 'decision', 'reason'],  # decision: in|out
        registry=REGISTRY
    )
    LESSONS_RETRIEVED = Counter(
        'lessons_retrieved_total',
        'Total lessons retrieval attempts',
        ['status'],  # status: ok|timeout|err
        registry=REGISTRY
    )
    # Labelled with a pre-bucketed count string (see inc_lessons_attached).
    LESSONS_ATTACHED = Counter(
        'lessons_attached_total',
        'Total lessons attached buckets',
        ['count'],  # count: 0|1-3|4-7
        registry=REGISTRY
    )
    # NOTE: milliseconds, unlike the other *_seconds histograms.
    LESSONS_ATTACH_LATENCY = Histogram(
        'lessons_attach_latency_ms',
        'Lessons retrieval latency in milliseconds',
        buckets=(1, 2, 5, 10, 25, 50, 100, 250, 500, 1000, 2500),
        registry=REGISTRY
    )
# ==================== METRIC HELPERS ====================
class MetricsCollector:
    """
    Helper for recording per-agent Prometheus metrics.

    All methods are safe no-ops when prometheus_client is not installed,
    so callers never need to check PROMETHEUS_AVAILABLE themselves.

    Usage:
        metrics = MetricsCollector(agent_id="helion")
        with metrics.track_request("chat"):
            # Do work
            pass
        metrics.record_tokens(input_tokens=100, output_tokens=50, model="qwen3:8b")
        metrics.record_tool_call("web_search")
    """

    def __init__(self, agent_id: str, node_id: str = "node1"):
        self.agent_id = agent_id
        self.node_id = node_id
        # Cached once; every recording method early-returns on this flag.
        self._enabled = PROMETHEUS_AVAILABLE

    @contextmanager
    def track_request(self, operation: str):
        """Context manager to track one request.

        Increments the active-request gauge for the duration of the block,
        observes latency and counts the request by outcome. If the block
        raises, the error type is counted and the exception is re-raised.
        """
        if not self._enabled:
            yield
            return
        AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).inc()
        # perf_counter() is monotonic; time.time() can jump under NTP
        # adjustments and yield negative or inflated durations.
        start = time.perf_counter()
        status = "success"
        try:
            yield
        except Exception as e:
            status = "error"
            AGENT_ERRORS.labels(
                agent_id=self.agent_id,
                error_type=type(e).__name__
            ).inc()
            raise
        finally:
            # Runs on success, error and generator close alike, so the
            # active-request gauge can never leak.
            duration = time.perf_counter() - start
            AGENT_LATENCY.labels(
                agent_id=self.agent_id,
                operation=operation
            ).observe(duration)
            AGENT_REQUESTS.labels(
                agent_id=self.agent_id,
                status=status
            ).inc()
            AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).dec()

    def record_tokens(self, input_tokens: int, output_tokens: int, model: str):
        """Record input/output token usage for the given model."""
        if not self._enabled:
            return
        AGENT_TOKENS_IN.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(input_tokens)
        AGENT_TOKENS_OUT.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(output_tokens)

    def record_tool_call(self, tool: str):
        """Record one call to the named tool."""
        if not self._enabled:
            return
        AGENT_TOOL_CALLS.labels(
            agent_id=self.agent_id,
            tool=tool
        ).inc()

    def record_budget(self, user_id: str, amount: float):
        """Set the current budget consumed for a user (gauge, not delta)."""
        if not self._enabled:
            return
        AGENT_BUDGET.labels(
            agent_id=self.agent_id,
            user_id=user_id
        ).set(amount)

    def record_error(self, error_type: str):
        """Record an error by type (for errors outside track_request)."""
        if not self._enabled:
            return
        AGENT_ERRORS.labels(
            agent_id=self.agent_id,
            error_type=error_type
        ).inc()

    def record_rag_hit(self, channel_id: str, hit: bool):
        """Record a RAG cache hit (hit=True) or miss (hit=False)."""
        if not self._enabled:
            return
        if hit:
            CHANNEL_RAG_HITS.labels(channel_id=channel_id).inc()
        else:
            CHANNEL_RAG_MISSES.labels(channel_id=channel_id).inc()

    def record_memory_operation(self, operation: str, store: str, duration: float):
        """Record one memory-store operation and its latency in seconds."""
        if not self._enabled:
            return
        MEMORY_OPERATIONS.labels(operation=operation, store=store).inc()
        MEMORY_LATENCY.labels(operation=operation, store=store).observe(duration)

    def record_handoff(self, to_agent: str, status: str, duration: Optional[float] = None):
        """Record a handoff to another agent; latency is observed only
        when a duration (seconds) is supplied."""
        if not self._enabled:
            return
        HANDOFF_REQUESTS.labels(
            from_agent=self.agent_id,
            to_agent=to_agent,
            status=status
        ).inc()
        if duration is not None:
            HANDOFF_LATENCY.labels(
                from_agent=self.agent_id,
                to_agent=to_agent
            ).observe(duration)
def track_agent_request(agent_id: str, operation: str):
    """Decorator that tracks a function call via MetricsCollector.track_request.

    Generalized to wrap both async and sync callables (the original only
    supported coroutines); the wrapped function keeps its return value
    and exceptions propagate unchanged.
    """
    import inspect

    def decorator(func):
        if inspect.iscoroutinefunction(func):
            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                metrics = MetricsCollector(agent_id)
                with metrics.track_request(operation):
                    return await func(*args, **kwargs)
            return async_wrapper

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            metrics = MetricsCollector(agent_id)
            with metrics.track_request(operation):
                return func(*args, **kwargs)
        return sync_wrapper
    return decorator
def inc_experience_published(source: str, transport: str, status: str) -> None:
    """Count one experience-event publish attempt."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_PUBLISHED.labels(
            source=source, transport=transport, status=status
        ).inc()
def inc_experience_db_insert(source: str, status: str) -> None:
    """Count one experience-event DB insert attempt."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_DB_INSERT.labels(source=source, status=status).inc()
def inc_experience_dedup_dropped(source: str) -> None:
    """Count one experience event dropped by deduplication."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_DEDUP_DROPPED.labels(source=source).inc()
def inc_experience_sampled(source: str, decision: str, reason: str) -> None:
    """Count one experience-event sampling decision (in/out)."""
    if PROMETHEUS_AVAILABLE:
        EXPERIENCE_SAMPLED.labels(
            source=source, decision=decision, reason=reason
        ).inc()
def inc_lessons_retrieved(status: str) -> None:
    """Count one lessons retrieval attempt by status (ok|timeout|err)."""
    if PROMETHEUS_AVAILABLE:
        LESSONS_RETRIEVED.labels(status=status).inc()
def inc_lessons_attached(count: int) -> None:
    """Count a lessons-attachment event, bucketed by how many were attached.

    Non-numeric counts are treated as 0 rather than raising.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    try:
        attached = int(count)
    except Exception:
        attached = 0
    # NOTE(review): any value above 7 also lands in "4-7" — presumably the
    # attach path caps at 7 lessons; confirm upstream.
    bucket = "0" if attached <= 0 else ("1-3" if attached <= 3 else "4-7")
    LESSONS_ATTACHED.labels(count=bucket).inc()
def observe_lessons_attach_latency(latency_ms: float) -> None:
    """Observe lessons retrieval latency (milliseconds), best-effort.

    Invalid (non-convertible) values are silently ignored.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    try:
        LESSONS_ATTACH_LATENCY.observe(float(latency_ms))
    except Exception:
        pass
# ==================== GPU METRICS COLLECTOR ====================
async def collect_gpu_metrics(node_id: str = "node1"):
    """
    Collect per-GPU utilization and VRAM metrics via nvidia-smi.

    Should be called periodically (e.g. from a background task). Does
    nothing when prometheus_client or nvidia-smi is unavailable; all
    failures are swallowed and logged at debug level.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    import asyncio
    import subprocess
    try:
        # Run nvidia-smi in a worker thread: subprocess.run blocks, and
        # calling it directly inside an async def would stall the event loop.
        result = await asyncio.to_thread(
            subprocess.run,
            ['nvidia-smi', '--query-gpu=index,utilization.gpu,memory.used,memory.total',
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return
        for line in result.stdout.strip().split('\n'):
            parts = line.split(',')
            if len(parts) < 4:
                continue
            gpu_id = parts[0].strip()
            util = float(parts[1].strip())
            # nvidia-smi reports memory in MiB with nounits; convert to bytes.
            mem_used = int(parts[2].strip()) * 1024 * 1024
            mem_total = int(parts[3].strip()) * 1024 * 1024
            # NOTE(review): stored as a 0..1 ratio although the gauge's help
            # text says "percentage" — confirm which one dashboards expect.
            NODE_GPU_UTIL.labels(node_id=node_id, gpu_id=gpu_id).set(util / 100.0)
            NODE_VRAM_USED.labels(node_id=node_id, gpu_id=gpu_id).set(mem_used)
            NODE_VRAM_TOTAL.labels(node_id=node_id, gpu_id=gpu_id).set(mem_total)
    except Exception as e:
        logger.debug(f"GPU metrics collection failed: {e}")
# ==================== FASTAPI INTEGRATION ====================
def setup_metrics_endpoint(app):
    """
    Register a GET /metrics endpoint on a FastAPI app.

    Usage:
        from agent_metrics import setup_metrics_endpoint
        setup_metrics_endpoint(app)
    """
    if not PROMETHEUS_AVAILABLE:
        logger.warning("Prometheus not available, /metrics endpoint disabled")
        return
    from fastapi import Response

    @app.get("/metrics")
    async def metrics():
        # Serialize the module registry in the Prometheus text format.
        payload = generate_latest(REGISTRY)
        return Response(content=payload, media_type=CONTENT_TYPE_LATEST)

    logger.info("Metrics endpoint enabled at /metrics")
# ==================== NATS STREAM LAG COLLECTOR ====================
async def collect_nats_metrics(node_id: str = "node1", nats_url: str = "nats://nats:4222"):
    """
    Collect NATS JetStream metrics (pending messages per stream).

    Best-effort: connection or per-stream failures are swallowed and
    logged at debug level.
    """
    if not PROMETHEUS_AVAILABLE:
        return
    try:
        import nats
        nc = await nats.connect(nats_url)
        try:
            js = nc.jetstream()
            # Streams we expect to exist; missing ones are skipped below.
            streams = ["MESSAGES", "AGENT_OPS", "AUDIT", "MEMORY"]
            for stream_name in streams:
                try:
                    info = await js.stream_info(stream_name)
                    # Record pending messages as lag indicator
                    pending = info.state.messages
                    NODE_NATS_LAG.labels(node_id=node_id, stream=stream_name).set(pending)
                except Exception:
                    # Narrowed from a bare except: (which would also swallow
                    # KeyboardInterrupt/CancelledError); stream may not exist.
                    continue
        finally:
            # Close in finally so the connection is never leaked if a
            # collection step raises.
            await nc.close()
    except Exception as e:
        logger.debug(f"NATS metrics collection failed: {e}")
# ==================== METRICS EXPORT ====================
def get_metrics():
    """Return the registry serialized in the Prometheus text format (bytes)."""
    if PROMETHEUS_AVAILABLE:
        return generate_latest(REGISTRY)
    return b"# prometheus_client not available"
def get_content_type():
    """Return the Content-Type header value for the metrics payload."""
    if PROMETHEUS_AVAILABLE:
        return CONTENT_TYPE_LATEST
    return "text/plain"