feat: add node-registry dashboard, matrix-presence-aggregator, ocr-service updates
This commit is contained in:
369
services/node-registry/app/dashboard.py
Normal file
369
services/node-registry/app/dashboard.py
Normal file
@@ -0,0 +1,369 @@
|
||||
"""
|
||||
Node Dashboard API - Aggregator for node status and metrics
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import httpx
|
||||
import psutil
|
||||
from typing import Dict, Any, Optional, List
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Probe timeout in seconds
|
||||
PROBE_TIMEOUT = 0.5
|
||||
PROBE_TIMEOUT_LONG = 1.0
|
||||
|
||||
|
||||
class DashboardAggregator:
    """Aggregates data from multiple services for node dashboard.

    Each ``probe_*`` coroutine issues short-timeout HTTP GETs against a
    service assumed to run on ``node_ip`` and normalises the outcome into
    a status dict; probe failures never propagate to the caller.
    """

    def __init__(self, node_ip: str = "localhost"):
        # Host/IP used to build probe URLs ("localhost" for same-host probing).
        self.node_ip = node_ip
        # Shared async client; PROBE_TIMEOUT applies unless a call overrides it.
        self.client = httpx.AsyncClient(timeout=PROBE_TIMEOUT)

    async def close(self):
        """Release the underlying HTTP client."""
        await self.client.aclose()

    async def _probe(self, url: str, timeout: float = PROBE_TIMEOUT) -> Dict[str, Any]:
        """Execute one HTTP GET probe with a timeout.

        Returns ``{"status": "up", "data": ..., "latency_ms": ...}`` on
        HTTP 200, ``{"status": "degraded", "error": ...}`` for any other
        status code, and ``{"status": "down", "error": ...}`` on transport
        errors. Never raises.
        """
        try:
            resp = await self.client.get(url, timeout=timeout)
            if resp.status_code == 200:
                return {
                    "status": "up",
                    "data": resp.json(),
                    "latency_ms": int(resp.elapsed.total_seconds() * 1000),
                }
            return {"status": "degraded", "error": f"HTTP {resp.status_code}"}
        except httpx.TimeoutException:
            return {"status": "down", "error": "timeout"}
        except httpx.ConnectError:
            return {"status": "down", "error": "connection refused"}
        except Exception as e:  # e.g. invalid JSON body, DNS failure
            return {"status": "down", "error": str(e)}

    @staticmethod
    def _read_infra_sync() -> Dict[str, Any]:
        """Collect CPU/RAM/disk/GPU metrics.

        Blocking (psutil samples CPU for ~100 ms and nvidia-smi is a
        subprocess), so callers must run this in a worker thread to keep
        the event loop responsive.
        """
        import subprocess

        cpu_pct = psutil.cpu_percent(interval=0.1)  # blocks ~100 ms while sampling
        mem = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        result = {
            "cpu_usage_pct": round(cpu_pct, 1),
            "ram": {
                "total_gb": round(mem.total / (1024**3), 1),
                "used_gb": round(mem.used / (1024**3), 1),
            },
            "disk": {
                "total_gb": round(disk.total / (1024**3), 1),
                "used_gb": round(disk.used / (1024**3), 1),
            },
            "gpus": [],
        }

        # Best-effort GPU info: nvidia-smi is absent on non-NVIDIA hosts,
        # so missing binary / bad exit simply leaves "gpus" empty.
        try:
            nvidia_output = subprocess.run(
                ['nvidia-smi',
                 '--query-gpu=name,memory.total,memory.used,utilization.gpu',
                 '--format=csv,noheader,nounits'],
                capture_output=True, text=True, timeout=2,
            )
        except (OSError, subprocess.SubprocessError):
            return result

        if nvidia_output.returncode == 0:
            for line in nvidia_output.stdout.strip().split('\n'):
                parts = [p.strip() for p in line.split(',')]
                if len(parts) >= 4:
                    try:
                        result["gpus"].append({
                            "name": parts[0],
                            "vram_gb": round(float(parts[1]) / 1024, 1),
                            "used_gb": round(float(parts[2]) / 1024, 1),
                            "sm_util_pct": int(parts[3]),
                        })
                    except ValueError:
                        # Malformed CSV row (e.g. "N/A"); skip this GPU only.
                        continue
        return result

    async def get_infra_metrics(self) -> Dict[str, Any]:
        """Get infrastructure metrics using psutil, off the event loop."""
        try:
            loop = asyncio.get_running_loop()
            # The psutil/nvidia-smi work blocks; run it in the default executor.
            return await loop.run_in_executor(None, self._read_infra_sync)
        except Exception as e:
            logger.error(f"Failed to get infra metrics: {e}")
            return {
                "cpu_usage_pct": 0,
                "ram": {"total_gb": 0, "used_gb": 0},
                "disk": {"total_gb": 0, "used_gb": 0},
                "gpus": [],
            }

    async def probe_swapper(self, port: int = 8890) -> Dict[str, Any]:
        """Probe Swapper service (/health and /models, concurrently)."""
        base_url = f"http://{self.node_ip}:{port}"

        # The two endpoints are independent; probe them in parallel.
        health_result, models_result = await asyncio.gather(
            self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG),
            self._probe(f"{base_url}/models", PROBE_TIMEOUT_LONG),
        )

        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": health_result.get("latency_ms", 0),
            # Storage is a placeholder until the Swapper exposes it.
            "storage": {"total_gb": 0, "used_gb": 0, "free_gb": 0},
            "models": [],
        }

        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["active_model"] = data.get("active_model")
            result["mode"] = data.get("mode")

        if models_result.get("status") == "up":
            data = models_result.get("data", {})
            for m in data.get("models", []):
                result["models"].append({
                    "name": m.get("name"),
                    "size_gb": m.get("size_gb", 0),
                    "device": m.get("device", "disk"),
                    "state": m.get("status", "unloaded"),
                })

        return result

    async def probe_router(self, port: int = 9102) -> Dict[str, Any]:
        """Probe DAGI Router service (/health and /backends/status)."""
        base_url = f"http://{self.node_ip}:{port}"

        health_result, backends_result = await asyncio.gather(
            self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG),
            self._probe(f"{base_url}/backends/status", PROBE_TIMEOUT_LONG),
        )

        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "version": "unknown",
            "backends": [],
            # Metrics placeholders; the router does not report these yet.
            "metrics": {
                "requests_1m": 0,
                "requests_1h": 0,
                "error_rate_1h": 0,
                "avg_latency_ms_1h": 0,
            },
        }

        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["version"] = data.get("version", "unknown")
            result["nats_connected"] = data.get("nats_connected", False)

        if backends_result.get("status") == "up":
            # /backends/status is expected to return a JSON list — TODO confirm.
            for backend in backends_result.get("data", []):
                result["backends"].append({
                    "name": backend.get("name"),
                    "status": backend.get("status"),
                    "latency_ms": backend.get("latency_ms", 0),
                    "error": backend.get("error"),
                })

        return result

    async def probe_service(self, name: str, port: int, health_path: str = "/health") -> Dict[str, Any]:
        """Probe a generic AI service via its health endpoint."""
        base_url = f"http://{self.node_ip}:{port}"

        result = await self._probe(f"{base_url}{health_path}")

        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "error": result.get("error"),
        }

    async def probe_ollama(self, port: int = 11434) -> Dict[str, Any]:
        """Probe Ollama service and list its installed models."""
        base_url = f"http://{self.node_ip}:{port}"

        result = await self._probe(f"{base_url}/api/tags", PROBE_TIMEOUT_LONG)

        models = []
        if result.get("status") == "up":
            data = result.get("data", {})
            for m in data.get("models", []):
                models.append(m.get("name"))

        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "models": models[:10],  # Limit to 10 models
            "error": result.get("error"),
        }

    async def probe_matrix(self, synapse_port: int = 8018, presence_port: int = 8085) -> Dict[str, Any]:
        """Probe Matrix homeserver and presence bridge, concurrently."""
        synapse_result, presence_result = await asyncio.gather(
            self._probe(f"http://{self.node_ip}:{synapse_port}/_matrix/client/versions"),
            self._probe(f"http://{self.node_ip}:{presence_port}/health"),
        )

        return {
            "enabled": synapse_result.get("status") == "up",
            "homeserver": f"http://{self.node_ip}:{synapse_port}",
            "synapse": {
                "status": synapse_result.get("status", "unknown"),
                "latency_ms": synapse_result.get("latency_ms", 0),
            },
            "presence_bridge": {
                "status": presence_result.get("status", "unknown"),
                "latency_ms": presence_result.get("latency_ms", 0),
            },
        }

    async def probe_monitoring(self, prometheus_port: int = 9090, grafana_port: int = 3001) -> Dict[str, Any]:
        """Probe Prometheus readiness and Grafana health, concurrently."""
        prometheus_result, grafana_result = await asyncio.gather(
            self._probe(f"http://{self.node_ip}:{prometheus_port}/-/ready"),
            self._probe(f"http://{self.node_ip}:{grafana_port}/api/health"),
        )

        return {
            "prometheus": {
                "url": f"http://{self.node_ip}:{prometheus_port}",
                "status": prometheus_result.get("status", "unknown"),
            },
            "grafana": {
                "url": f"http://{self.node_ip}:{grafana_port}",
                "status": grafana_result.get("status", "unknown"),
            },
            "logging": {
                # Loki probing not implemented yet.
                "loki": {"status": "unknown"},
            },
        }

    async def get_agents_summary(self, city_service_port: int = 7001) -> Dict[str, Any]:
        """Summarise agents reported by the city service.

        City service uses the /city/agents endpoint; it is expected to
        return a JSON list of agent dicts — TODO confirm against caller.
        """
        result = await self._probe(
            f"http://{self.node_ip}:{city_service_port}/city/agents", PROBE_TIMEOUT_LONG
        )

        summary = {
            "total": 0,
            "running": 0,
            "by_kind": {},
            "top": [],
        }

        if result.get("status") == "up":
            agents = result.get("data", [])
            summary["total"] = len(agents)

            for agent in agents:
                kind = agent.get("kind", "unknown")
                summary["by_kind"][kind] = summary["by_kind"].get(kind, 0) + 1
                if agent.get("status") in ["online", "busy"]:
                    summary["running"] += 1

            # Top 5 agents: first five online/busy ones, in server order.
            online_agents = [a for a in agents if a.get("status") in ["online", "busy"]][:5]
            for agent in online_agents:
                summary["top"].append({
                    "agent_id": agent.get("id"),
                    "display_name": agent.get("display_name"),
                    "kind": agent.get("kind"),
                    "status": agent.get("status"),
                    "node_id": agent.get("node_id"),
                })

        return summary
|
||||
|
||||
|
||||
async def build_dashboard(node_profile: Dict[str, Any], node_ip: str = "localhost") -> Dict[str, Any]:
    """
    Build complete dashboard from node profile.

    Args:
        node_profile: Node profile from registry (with modules, gpu, roles)
        node_ip: IP address to probe services

    Returns:
        Complete dashboard JSON
    """
    aggregator = DashboardAggregator(node_ip)

    try:
        # Map module id -> port for every module that declares one.
        module_ports = {}
        for module in node_profile.get("modules", []):
            if module.get("port"):
                module_ports[module["id"]] = module["port"]

        # Coroutines to run; infra metrics are always collected.
        tasks = {
            "infra": aggregator.get_infra_metrics(),
        }

        # Add probes based on installed modules
        if "ai.swapper" in module_ports:
            tasks["swapper"] = aggregator.probe_swapper(module_ports["ai.swapper"])

        if "ai.router" in module_ports:
            tasks["router"] = aggregator.probe_router(module_ports["ai.router"])

        if "ai.ollama" in module_ports:
            tasks["ollama"] = aggregator.probe_ollama(module_ports["ai.ollama"])

        # Generic AI services share the plain /health probe.
        ai_services = ["ai.stt", "ai.tts", "ai.ocr", "ai.memory", "ai.crewai"]
        for svc in ai_services:
            if svc in module_ports:
                svc_name = svc.replace("ai.", "")
                tasks[f"svc_{svc_name}"] = aggregator.probe_service(svc_name, module_ports[svc])

        # Matrix (probed when either component is installed)
        synapse_port = module_ports.get("matrix.synapse", 8018)
        presence_port = module_ports.get("matrix.presence", 8085)
        if "matrix.synapse" in module_ports or "matrix.presence" in module_ports:
            tasks["matrix"] = aggregator.probe_matrix(synapse_port, presence_port)

        # Monitoring is always probed (Grafana port uses the probe's default).
        prometheus_port = module_ports.get("monitoring.prometheus", 9090)
        tasks["monitoring"] = aggregator.probe_monitoring(prometheus_port)

        # Agents
        city_port = module_ports.get("daarion.city", 7001)
        if "daarion.city" in module_ports or "daarion.agents" in module_ports:
            tasks["agents"] = aggregator.get_agents_summary(city_port)

        # Execute all probes genuinely in parallel. return_exceptions=True
        # keeps one failing probe from cancelling the rest; failures are
        # folded into per-probe error entries below.
        names = list(tasks)
        outcomes = await asyncio.gather(*tasks.values(), return_exceptions=True)

        results = {}
        for name, outcome in zip(names, outcomes):
            if isinstance(outcome, Exception):
                logger.error(f"Probe {name} failed: {outcome}")
                results[name] = {"status": "error", "error": str(outcome)}
            else:
                results[name] = outcome

        # Build dashboard response
        dashboard = {
            "node": {
                "node_id": node_profile.get("node_id"),
                "name": node_profile.get("name"),
                "roles": node_profile.get("roles", []),
                "status": node_profile.get("status", "unknown"),
                "public_hostname": node_profile.get("ip_address"),
                "environment": node_profile.get("role", "production"),
                "gpu": node_profile.get("gpu"),
                "modules": node_profile.get("modules", []),
                "version": node_profile.get("version", "1.0.0"),
            },
            "infra": results.get("infra", {}),
            "ai": {
                "swapper": results.get("swapper", {"status": "not_installed"}),
                "router": results.get("router", {"status": "not_installed"}),
                "ollama": results.get("ollama", {"status": "not_installed"}),
                "services": {},
            },
            "agents": results.get("agents", {"total": 0, "running": 0, "by_kind": {}, "top": []}),
            "matrix": results.get("matrix", {"enabled": False}),
            "monitoring": results.get("monitoring", {}),
        }

        # Fold the generic AI service probes into ai.services by name.
        for key, value in results.items():
            if key.startswith("svc_"):
                svc_name = key.replace("svc_", "")
                dashboard["ai"]["services"][svc_name] = value

        return dashboard

    finally:
        # Always release the HTTP client, even when probing fails.
        await aggregator.close()
|
||||
|
||||
@@ -1,24 +1,24 @@
|
||||
"""
PostgreSQL Database connection for Node Registry
"""
import os
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, Session
from contextlib import contextmanager
import logging

logger = logging.getLogger(__name__)

# Database URL from environment
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/node_registry")

# Create engine. pool_pre_ping validates connections before use so stale
# PostgreSQL connections are recycled transparently.
engine = create_engine(
    DATABASE_URL,
    pool_pre_ping=True,
    pool_size=5,
    max_overflow=10,
    echo=os.getenv("NODE_REGISTRY_ENV") == "development",  # Log SQL in dev
)
|
||||
|
||||
# Create session factory
|
||||
@@ -65,7 +65,7 @@ def check_db_connection() -> bool:
|
||||
"""Check if database connection is working"""
|
||||
try:
|
||||
with engine.connect() as conn:
|
||||
conn.execute("SELECT 1")
|
||||
conn.execute(text("SELECT 1"))
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Database connection failed: {e}")
|
||||
@@ -75,8 +75,7 @@ def check_db_connection() -> bool:
|
||||
def get_db_info() -> dict:
    """Get database connection information.

    Returns the backend type, the URL with any credentials stripped
    (everything before '@' is dropped), and a live connectivity flag.
    """
    return {
        "type": "postgresql",
        # Hide user:password — keep only the host/db part after '@'.
        "url": DATABASE_URL.split("@")[-1] if "@" in DATABASE_URL else DATABASE_URL,
        "connected": check_db_connection(),
    }
|
||||
|
||||
|
||||
@@ -488,6 +488,187 @@ async def discover_nodes(query: NodeDiscoveryQuery, db: Session = Depends(get_db
|
||||
raise HTTPException(status_code=500, detail=f"Discovery failed: {str(e)}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Node Profile Endpoints (Standard v1)
|
||||
# ============================================================================
|
||||
|
||||
from app.dashboard import build_dashboard
|
||||
|
||||
|
||||
@app.get("/api/v1/nodes/self/dashboard")
async def get_self_dashboard(db: Session = Depends(get_db)):
    """
    Get dashboard for current node (self).

    Uses the first node in registry as "self" for now.
    In production, this would use JWT claims to identify the node.
    """
    try:
        from sqlalchemy import text

        # Earliest-registered node stands in for "self" (simplified for v1).
        row = db.execute(text("""
            SELECT node_id FROM nodes ORDER BY registered_at LIMIT 1
        """)).fetchone()

        if row is None:
            raise HTTPException(status_code=404, detail="No nodes registered")

        # Delegate to the per-node dashboard endpoint.
        return await get_node_dashboard(row[0], db)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Failed to get self dashboard: {e}")
        raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/api/v1/nodes/{node_id}/dashboard")
async def get_node_dashboard(node_id: str, db: Session = Depends(get_db)):
    """
    Get complete node dashboard with live status.

    Aggregates:
    - Node profile (roles, modules, GPU)
    - Infrastructure metrics (CPU, RAM, Disk, GPU)
    - AI services status (Swapper, Router, STT, Vision, OCR)
    - Agents summary
    - Matrix integration status
    - Monitoring status

    Raises 404 when the node is unknown, 500 on any probe/query failure.
    """
    try:
        from sqlalchemy import text
        import os

        # Get node profile
        result = db.execute(text("""
            SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
                   status, roles, gpu, modules, version, vpn_ip
            FROM nodes
            WHERE node_id = :node_id
        """), {"node_id": node_id})

        row = result.fetchone()
        if not row:
            raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")

        profile = {
            "node_id": row[0],
            "name": row[1],
            "role": row[2],
            "type": row[3],
            "ip_address": row[4],
            "hostname": row[5],
            "status": row[6],
            "roles": list(row[7]) if row[7] else [],
            "gpu": row[8],
            "modules": row[9] if row[9] else [],
            "version": row[10] or "1.0.0",
        }

        # Probe target: default to the Docker gateway of dagi-network so a
        # containerised registry can reach services published on the host.
        node_ip = os.getenv("PROBE_HOST", "172.21.0.1")

        # HACK: NODE2 is remote — probe its registered IP directly.
        # TODO(review): generalize remote probing instead of special-casing
        # one node_id here.
        if node_id == "node-2-macbook-m4max":
            node_ip = row[4] or "192.168.1.33"

        dashboard = await build_dashboard(profile, node_ip)

        return dashboard

    except HTTPException:
        raise
    except Exception as e:
        # logger.exception captures the full traceback through the logging
        # system instead of printing it to stderr with traceback.print_exc().
        logger.exception(f"❌ Failed to get node dashboard: {e}")
        raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/api/v1/nodes/{node_id}/profile")
async def get_node_profile(node_id: str, db: Session = Depends(get_db)):
    """
    Get full node profile including modules, GPU, roles.
    Node Profile Standard v1.
    """
    try:
        from sqlalchemy import text

        result = db.execute(text("""
            SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
                   status, roles, gpu, modules, version, vpn_ip
            FROM nodes
            WHERE node_id = :node_id
        """), {"node_id": node_id})

        row = result.fetchone()
        if row is None:
            raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")

        # Columns 0-6 map one-to-one onto response keys.
        profile = dict(zip(
            ("node_id", "name", "role", "type", "ip_address", "hostname", "status"),
            row[:7],
        ))
        # Remaining columns need normalisation / defaults.
        profile["roles"] = list(row[7]) if row[7] else []
        profile["gpu"] = row[8]
        profile["modules"] = row[9] if row[9] else []
        profile["version"] = row[10] or "1.0.0"
        profile["vpn_ip"] = str(row[11]) if row[11] else None
        return profile

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Failed to get node profile: {e}")
        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/api/v1/nodes/profiles")
async def get_all_node_profiles(db: Session = Depends(get_db)):
    """
    Get all node profiles with modules.
    """
    try:
        from sqlalchemy import text

        result = db.execute(text("""
            SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
                   status, roles, gpu, modules, version, vpn_ip
            FROM nodes
            ORDER BY node_id
        """))

        def _row_to_profile(row):
            # Serialize one DB row into the Node Profile Standard v1 shape.
            return {
                "node_id": row[0],
                "name": row[1],
                "role": row[2],
                "type": row[3],
                "ip_address": row[4],
                "hostname": row[5],
                "status": row[6],
                "roles": list(row[7]) if row[7] else [],
                "gpu": row[8],
                "modules": row[9] if row[9] else [],
                "version": row[10] or "1.0.0",
            }

        nodes = [_row_to_profile(r) for r in result.fetchall()]
        return {"nodes": nodes, "total": len(nodes)}

    except Exception as e:
        logger.error(f"❌ Failed to get node profiles: {e}")
        raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Maintenance Endpoints
|
||||
# ============================================================================
|
||||
@@ -518,7 +699,7 @@ if __name__ == "__main__":
|
||||
print(f"🚀 Starting {SERVICE_NAME} v{VERSION}")
|
||||
print(f"📊 Environment: {ENV}")
|
||||
print(f"🔌 Port: {HTTP_PORT}")
|
||||
print(f"🗄️ Database: {DB_USER}@{DB_HOST}:{DB_PORT}/{DB_NAME}")
|
||||
print(f"🗄️ Database: {os.getenv('DATABASE_URL', 'not configured')}")
|
||||
print(f"📝 Log level: {LOG_LEVEL}")
|
||||
print()
|
||||
|
||||
|
||||
@@ -3,8 +3,8 @@ SQLAlchemy ORM Models for Node Registry
|
||||
"""
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index
|
||||
from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB
|
||||
from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index, ARRAY
|
||||
from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB, ARRAY as PG_ARRAY
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import relationship
|
||||
from sqlalchemy.types import TypeDecorator, String as SQLString, Text as SQLText
|
||||
@@ -90,6 +90,12 @@ class Node(Base):
|
||||
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
node_metadata = Column(JSONB, default={})
|
||||
|
||||
# Node Profile Standard v1 fields
|
||||
roles = Column(ARRAY(String), default=[]) # ['core', 'gateway', 'matrix', 'agents', 'gpu']
|
||||
gpu = Column(JSONB, default=None) # {"name": "NVIDIA RTX 4000", "vram_gb": 20}
|
||||
modules = Column(JSONB, default=[]) # [{"id": "ai.router", "status": "up", "port": 9102}, ...]
|
||||
version = Column(String(50), default='1.0.0')
|
||||
|
||||
# Relationships
|
||||
profiles = relationship("NodeProfile", back_populates="node", cascade="all, delete-orphan")
|
||||
heartbeats = relationship("HeartbeatLog", back_populates="node", cascade="all, delete-orphan")
|
||||
@@ -113,6 +119,11 @@ class Node(Base):
|
||||
"registered_at": self.registered_at.isoformat() if self.registered_at else None,
|
||||
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
|
||||
"metadata": self.node_metadata or {},
|
||||
# Node Profile Standard v1
|
||||
"roles": self.roles or [],
|
||||
"gpu": self.gpu,
|
||||
"modules": self.modules or [],
|
||||
"version": self.version or "1.0.0",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -9,3 +9,5 @@ alembic==1.14.0
|
||||
python-json-logger==3.2.1
|
||||
prometheus-client==0.21.0
|
||||
psycopg2-binary>=2.9.0
|
||||
psutil>=5.9.0
|
||||
requests>=2.28.0
|
||||
|
||||
Reference in New Issue
Block a user