Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.
Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles
Excluded from snapshot: venv/, .env, data/, backups, .tgz archives
Co-authored-by: Cursor <cursoragent@cursor.com>
469 lines
13 KiB
Python
469 lines
13 KiB
Python
"""
|
|
AgentOps Metrics
|
|
================
|
|
Prometheus метрики для моніторингу агентів.
|
|
|
|
Метрики:
|
|
- Per-agent: latency, tokens, errors, tool calls, budget
|
|
- Per-channel: RAG hit rate, index lag
|
|
- Per-node: GPU util, VRAM, queue lag
|
|
"""
|
|
|
|
import time
|
|
import logging
|
|
from typing import Optional, Dict, Any
|
|
from contextlib import contextmanager
|
|
from functools import wraps
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Try to import prometheus_client
|
|
try:
|
|
from prometheus_client import (
|
|
Counter, Histogram, Gauge, Summary,
|
|
CollectorRegistry, generate_latest, CONTENT_TYPE_LATEST
|
|
)
|
|
PROMETHEUS_AVAILABLE = True
|
|
except ImportError:
|
|
PROMETHEUS_AVAILABLE = False
|
|
logger.warning("prometheus_client not installed, metrics disabled")
|
|
|
|
|
|
# ==================== REGISTRY ====================

if PROMETHEUS_AVAILABLE:
    # Use default registry or create custom.
    # A dedicated registry means /metrics exposes only these series
    # (no default process/platform collectors mixed in).
    REGISTRY = CollectorRegistry(auto_describe=True)

    # ==================== AGENT METRICS ====================

    # Request latency, labelled by agent and operation (e.g. "chat").
    AGENT_LATENCY = Histogram(
        'agent_latency_seconds',
        'Agent request latency in seconds',
        ['agent_id', 'operation'],
        buckets=(0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
        registry=REGISTRY
    )

    # Token usage (input side).
    AGENT_TOKENS_IN = Counter(
        'agent_tokens_in_total',
        'Total input tokens processed by agent',
        ['agent_id', 'model'],
        registry=REGISTRY
    )

    # Token usage (output side).
    AGENT_TOKENS_OUT = Counter(
        'agent_tokens_out_total',
        'Total output tokens generated by agent',
        ['agent_id', 'model'],
        registry=REGISTRY
    )

    # Request counts
    AGENT_REQUESTS = Counter(
        'agent_requests_total',
        'Total agent requests',
        ['agent_id', 'status'],  # status: success, error, timeout
        registry=REGISTRY
    )

    # Error rate; error_type is the exception class name (see
    # MetricsCollector.track_request) or a caller-supplied string.
    AGENT_ERRORS = Counter(
        'agent_errors_total',
        'Total agent errors',
        ['agent_id', 'error_type'],
        registry=REGISTRY
    )

    # Tool calls
    AGENT_TOOL_CALLS = Counter(
        'agent_tool_calls_total',
        'Total tool calls by agent',
        ['agent_id', 'tool'],
        registry=REGISTRY
    )

    # Budget consumption — a gauge SET to the absolute consumed amount,
    # not incremented (see MetricsCollector.record_budget).
    AGENT_BUDGET = Gauge(
        'agent_budget_consumed',
        'Budget consumed by agent per user',
        ['agent_id', 'user_id'],
        registry=REGISTRY
    )

    # Active (in-flight) requests; inc/dec'd around each tracked request.
    AGENT_ACTIVE_REQUESTS = Gauge(
        'agent_active_requests',
        'Number of active requests per agent',
        ['agent_id'],
        registry=REGISTRY
    )

    # ==================== CHANNEL METRICS ====================

    # RAG hit rate: hits and misses are separate counters;
    # rate = hits / (hits + misses) at query time.
    CHANNEL_RAG_HITS = Counter(
        'channel_rag_hits_total',
        'Total RAG cache hits',
        ['channel_id'],
        registry=REGISTRY
    )

    CHANNEL_RAG_MISSES = Counter(
        'channel_rag_misses_total',
        'Total RAG cache misses',
        ['channel_id'],
        registry=REGISTRY
    )

    # Index lag
    CHANNEL_INDEX_LAG = Gauge(
        'channel_index_lag_seconds',
        'Time since last index update',
        ['channel_id'],
        registry=REGISTRY
    )

    # Message queue
    CHANNEL_QUEUE_SIZE = Gauge(
        'channel_queue_size',
        'Number of messages in channel queue',
        ['channel_id'],
        registry=REGISTRY
    )

    # ==================== NODE METRICS ====================

    # GPU utilization.
    # NOTE(review): collect_gpu_metrics() stores a 0-1 fraction here
    # (util / 100.0) even though the help text says "percentage" —
    # confirm which scale the dashboards expect.
    NODE_GPU_UTIL = Gauge(
        'node_gpu_utilization',
        'GPU utilization percentage',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )

    # VRAM usage in bytes (the collector converts from nvidia-smi MiB).
    NODE_VRAM_USED = Gauge(
        'node_vram_used_bytes',
        'VRAM used in bytes',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )

    NODE_VRAM_TOTAL = Gauge(
        'node_vram_total_bytes',
        'Total VRAM in bytes',
        ['node_id', 'gpu_id'],
        registry=REGISTRY
    )

    # Queue lag
    NODE_QUEUE_LAG = Gauge(
        'node_queue_lag_seconds',
        'Queue processing lag',
        ['node_id', 'queue'],
        registry=REGISTRY
    )

    # NATS stream lag: pending message count per stream
    # (populated by collect_nats_metrics).
    NODE_NATS_LAG = Gauge(
        'node_nats_stream_lag',
        'NATS stream consumer lag (pending messages)',
        ['node_id', 'stream'],
        registry=REGISTRY
    )

    # ==================== MEMORY METRICS ====================

    MEMORY_OPERATIONS = Counter(
        'memory_operations_total',
        'Total memory operations',
        ['operation', 'store'],  # store: postgres, qdrant, neo4j, redis
        registry=REGISTRY
    )

    # Sub-second buckets: memory stores are expected to answer locally/fast.
    MEMORY_LATENCY = Histogram(
        'memory_latency_seconds',
        'Memory operation latency',
        ['operation', 'store'],
        buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
        registry=REGISTRY
    )

    # ==================== HANDOFF METRICS ====================

    HANDOFF_REQUESTS = Counter(
        'handoff_requests_total',
        'Total handoff requests between agents',
        ['from_agent', 'to_agent', 'status'],
        registry=REGISTRY
    )

    HANDOFF_LATENCY = Histogram(
        'handoff_latency_seconds',
        'Handoff latency between agents',
        ['from_agent', 'to_agent'],
        buckets=(0.1, 0.5, 1.0, 2.5, 5.0, 10.0),
        registry=REGISTRY
    )
|
|
|
|
|
|
# ==================== METRIC HELPERS ====================
|
|
|
|
class MetricsCollector:
    """
    Helper class for collecting per-agent metrics.

    Every recording method is a no-op when prometheus_client is not
    installed (``self._enabled`` is False), so callers never need to
    guard their own calls.

    Usage:
        metrics = MetricsCollector(agent_id="helion")

        with metrics.track_request("chat"):
            # Do work
            pass

        metrics.record_tokens(input_tokens=100, output_tokens=50, model="qwen3:8b")
        metrics.record_tool_call("web_search")
    """

    def __init__(self, agent_id: str, node_id: str = "node1") -> None:
        self.agent_id = agent_id
        self.node_id = node_id
        # Snapshot availability once; all methods early-return on False.
        self._enabled = PROMETHEUS_AVAILABLE

    @contextmanager
    def track_request(self, operation: str):
        """Context manager to track request latency.

        Increments AGENT_ACTIVE_REQUESTS for the duration of the block,
        observes AGENT_LATENCY, counts the request in AGENT_REQUESTS with
        status "success" or "error", and counts the exception class name
        in AGENT_ERRORS before re-raising.
        """
        if not self._enabled:
            yield
            return

        AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).inc()
        start = time.time()
        status = "success"

        try:
            yield
        except Exception as e:
            status = "error"
            AGENT_ERRORS.labels(
                agent_id=self.agent_id,
                error_type=type(e).__name__
            ).inc()
            raise
        finally:
            # finally: runs on success, error, and generator close alike,
            # so the active-requests gauge can never be left incremented.
            duration = time.time() - start
            AGENT_LATENCY.labels(
                agent_id=self.agent_id,
                operation=operation
            ).observe(duration)
            AGENT_REQUESTS.labels(
                agent_id=self.agent_id,
                status=status
            ).inc()
            AGENT_ACTIVE_REQUESTS.labels(agent_id=self.agent_id).dec()

    def record_tokens(self, input_tokens: int, output_tokens: int, model: str) -> None:
        """Record input/output token usage for the given model."""
        if not self._enabled:
            return

        AGENT_TOKENS_IN.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(input_tokens)

        AGENT_TOKENS_OUT.labels(
            agent_id=self.agent_id,
            model=model
        ).inc(output_tokens)

    def record_tool_call(self, tool: str) -> None:
        """Record a single tool invocation."""
        if not self._enabled:
            return

        AGENT_TOOL_CALLS.labels(
            agent_id=self.agent_id,
            tool=tool
        ).inc()

    def record_budget(self, user_id: str, amount: float) -> None:
        """Record budget consumption.

        Note: the gauge is SET to the absolute ``amount``, not incremented —
        callers pass the current total, not a delta.
        """
        if not self._enabled:
            return

        AGENT_BUDGET.labels(
            agent_id=self.agent_id,
            user_id=user_id
        ).set(amount)

    def record_error(self, error_type: str) -> None:
        """Record an error by type name (e.g. an exception class name)."""
        if not self._enabled:
            return

        AGENT_ERRORS.labels(
            agent_id=self.agent_id,
            error_type=error_type
        ).inc()

    def record_rag_hit(self, channel_id: str, hit: bool) -> None:
        """Record a RAG cache hit (``hit=True``) or miss for a channel."""
        if not self._enabled:
            return

        if hit:
            CHANNEL_RAG_HITS.labels(channel_id=channel_id).inc()
        else:
            CHANNEL_RAG_MISSES.labels(channel_id=channel_id).inc()

    def record_memory_operation(self, operation: str, store: str, duration: float) -> None:
        """Record one memory operation and its latency in seconds."""
        if not self._enabled:
            return

        MEMORY_OPERATIONS.labels(operation=operation, store=store).inc()
        MEMORY_LATENCY.labels(operation=operation, store=store).observe(duration)

    def record_handoff(self, to_agent: str, status: str, duration: Optional[float] = None) -> None:
        """Record a handoff from this agent to ``to_agent``.

        ``duration`` (seconds) is optional; HANDOFF_LATENCY is only observed
        when it is provided. (Fixed: the parameter was annotated
        ``float = None`` — implicit Optional is disallowed by PEP 484.)
        """
        if not self._enabled:
            return

        HANDOFF_REQUESTS.labels(
            from_agent=self.agent_id,
            to_agent=to_agent,
            status=status
        ).inc()

        if duration is not None:
            HANDOFF_LATENCY.labels(
                from_agent=self.agent_id,
                to_agent=to_agent
            ).observe(duration)
|
|
|
|
|
|
def track_agent_request(agent_id: str, operation: str):
    """Decorator for tracking agent requests.

    Wraps an async handler so each invocation runs inside a fresh
    MetricsCollector(agent_id).track_request(operation) context:
    latency, status counts and active-request gauge are all recorded.
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            collector = MetricsCollector(agent_id)
            with collector.track_request(operation):
                result = await func(*args, **kwargs)
            return result
        return wrapper
    return decorator
|
|
|
|
|
|
# ==================== GPU METRICS COLLECTOR ====================
|
|
|
|
async def collect_gpu_metrics(node_id: str = "node1"):
    """
    Collect GPU metrics using nvidia-smi.
    Should be called periodically.

    Best-effort: silently does nothing when prometheus_client is missing,
    and logs at debug level when nvidia-smi is absent or fails.
    """
    if not PROMETHEUS_AVAILABLE:
        return

    import subprocess

    query = [
        'nvidia-smi',
        '--query-gpu=index,utilization.gpu,memory.used,memory.total',
        '--format=csv,noheader,nounits',
    ]
    try:
        proc = subprocess.run(query, capture_output=True, text=True, timeout=5)
        if proc.returncode != 0:
            return

        # One CSV row per GPU: index, util%, mem.used(MiB), mem.total(MiB)
        for row in proc.stdout.strip().split('\n'):
            fields = [field.strip() for field in row.split(',')]
            if len(fields) < 4:
                continue
            gpu_id = fields[0]
            utilization = float(fields[1])
            # nvidia-smi reports MiB; the gauges export bytes.
            used_bytes = int(fields[2]) * 1024 * 1024
            total_bytes = int(fields[3]) * 1024 * 1024

            # NOTE(review): stored as a 0-1 fraction despite the gauge's
            # "percentage" help text — confirm dashboard expectations.
            NODE_GPU_UTIL.labels(node_id=node_id, gpu_id=gpu_id).set(utilization / 100.0)
            NODE_VRAM_USED.labels(node_id=node_id, gpu_id=gpu_id).set(used_bytes)
            NODE_VRAM_TOTAL.labels(node_id=node_id, gpu_id=gpu_id).set(total_bytes)

    except Exception as e:
        logger.debug(f"GPU metrics collection failed: {e}")
|
|
|
|
|
|
# ==================== FASTAPI INTEGRATION ====================
|
|
|
|
def setup_metrics_endpoint(app):
    """
    Add /metrics endpoint to FastAPI app.

    Usage:
        from agent_metrics import setup_metrics_endpoint
        setup_metrics_endpoint(app)

    Does nothing (beyond a warning) when prometheus_client is missing.
    """
    if not PROMETHEUS_AVAILABLE:
        logger.warning("Prometheus not available, /metrics endpoint disabled")
        return

    # Imported lazily so this module does not hard-depend on FastAPI.
    from fastapi import Response

    @app.get("/metrics")
    async def metrics():
        payload = generate_latest(REGISTRY)
        return Response(content=payload, media_type=CONTENT_TYPE_LATEST)

    logger.info("Metrics endpoint enabled at /metrics")
|
|
|
|
|
|
# ==================== NATS STREAM LAG COLLECTOR ====================
|
|
|
|
async def collect_nats_metrics(node_id: str = "node1", nats_url: str = "nats://nats:4222"):
    """
    Collect NATS JetStream metrics.

    Connects to NATS, reads stream_info for each known stream and exports
    the pending-message count as a lag gauge. Best-effort: failures are
    logged at debug level and never raised to the caller.

    Fixes over the previous version:
    - The per-stream handler used a bare ``except:``, which also swallowed
      ``asyncio.CancelledError`` (a BaseException since 3.8) and would keep
      a cancelled task spinning; it now catches only ``Exception``.
    - ``nc.close()`` is now in a ``finally`` so the connection cannot leak
      if the stream loop raises.
    """
    if not PROMETHEUS_AVAILABLE:
        return

    try:
        import nats
        nc = await nats.connect(nats_url)
        try:
            js = nc.jetstream()

            # Known streams; missing ones are skipped individually below.
            streams = ["MESSAGES", "AGENT_OPS", "AUDIT", "MEMORY"]
            for stream_name in streams:
                try:
                    info = await js.stream_info(stream_name)
                    # Record pending messages as lag indicator
                    pending = info.state.messages
                    NODE_NATS_LAG.labels(node_id=node_id, stream=stream_name).set(pending)
                except Exception:
                    # Stream may not exist (yet); skip it but leave a trace.
                    logger.debug("stream_info failed for stream %s", stream_name)
        finally:
            await nc.close()
    except Exception as e:
        logger.debug(f"NATS metrics collection failed: {e}")
|
|
|
|
|
|
# ==================== METRICS EXPORT ====================
|
|
|
|
def get_metrics():
    """Return metrics in Prometheus text exposition format (bytes)."""
    if PROMETHEUS_AVAILABLE:
        return generate_latest(REGISTRY)
    return b"# prometheus_client not available"
|
|
|
|
|
|
def get_content_type():
    """Return the Content-Type header value for Prometheus exports."""
    if PROMETHEUS_AVAILABLE:
        return CONTENT_TYPE_LATEST
    return "text/plain"
|