Some checks failed
Update Documentation / update-repos-info (push) Has been cancelled
- Capability Registry (Postgres heartbeat) - NATS Client (підписка на streams) - Job Executor (виконання jobs) - Metrics Exporter (Prometheus) - Dockerfile для deployment - Виправлено server_name в NATS (emptyDir) TODO: Реальна реалізація embed/retrieve/summarize, Matrix Gateway, Auth
70 lines
2.1 KiB
Python
70 lines
2.1 KiB
Python
"""
Metrics Exporter — Prometheus metrics for the worker
"""
|
|
|
|
import asyncio
|
|
from typing import Optional
|
|
from aiohttp import web
|
|
from prometheus_client import Counter, Histogram, Gauge, generate_latest
|
|
|
|
|
|
class MetricsExporter:
    """Expose worker metrics over HTTP in the Prometheus text format.

    The constructor creates the collectors (job counters, a job-duration
    histogram, GPU/VRAM gauges and an error counter); ``start`` brings up
    an aiohttp server with a single ``GET /metrics`` route and ``stop``
    tears it down again.

    NOTE(review): the collectors are created without an explicit
    ``registry`` argument, so they presumably register with
    prometheus_client's process-wide default registry. If so, constructing
    a second ``MetricsExporter`` in the same process would fail on
    duplicate metric names — confirm before instantiating more than once.
    """

    def __init__(self, port: int = 9090):
        """Create the collectors; no network activity happens here.

        Args:
            port: TCP port the metrics server listens on once ``start``
                is awaited.
        """
        self.port = port
        # Both stay None until ``start`` has been awaited.
        self.app: Optional[web.Application] = None
        self.runner: Optional[web.AppRunner] = None

        # Metrics
        self.jobs_processed = Counter(
            "worker_jobs_processed_total",
            "Total jobs processed",
            ["type", "status"],
        )
        self.job_duration = Histogram(
            "worker_job_duration_seconds",
            "Job execution duration",
            ["type"],
            buckets=[0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
        )
        self.gpu_utilization = Gauge(
            "worker_gpu_utilization",
            "GPU utilization percentage",
            ["node_id"],
        )
        self.vram_usage = Gauge(
            "worker_vram_usage_bytes",
            "VRAM usage in bytes",
            ["node_id"],
        )
        self.errors_total = Counter(
            "worker_errors_total",
            "Total errors",
            ["type", "error_type"],
        )

    async def start(self) -> None:
        """Start the metrics server on ``0.0.0.0:<port>``."""
        self.app = web.Application()
        self.app.router.add_get("/metrics", self.metrics_handler)

        self.runner = web.AppRunner(self.app)
        await self.runner.setup()

        site = web.TCPSite(self.runner, "0.0.0.0", self.port)
        await site.start()

        print(f"✅ Metrics server запущено на порту {self.port}")

    async def stop(self) -> None:
        """Stop the metrics server; does nothing if it was never started."""
        if self.runner:
            await self.runner.cleanup()
            print("✅ Metrics server зупинено")

    async def metrics_handler(self, request: "web.Request") -> "web.Response":
        """Handle ``GET /metrics`` by returning the current metrics snapshot.

        Args:
            request: The incoming aiohttp request (unused).

        Returns:
            A ``text/plain`` response whose body is the output of
            ``generate_latest()``.
        """
        # generate_latest() returns bytes. aiohttp's ``text=`` parameter only
        # accepts str and raises TypeError for bytes, so the payload has to be
        # sent via ``body=`` with the charset declared explicitly.
        return web.Response(
            body=generate_latest(),
            content_type="text/plain",
            charset="utf-8",
        )
|