feat: Add presence heartbeat for Matrix online status

- matrix-gateway: POST /internal/matrix/presence/online endpoint
- usePresenceHeartbeat hook with activity tracking
- Auto away after 5 min inactivity
- Offline on page close/visibility change
- Integrated in MatrixChatRoom component
This commit is contained in:
Apple
2025-11-27 00:19:40 -08:00
parent 5bed515852
commit 3de3c8cb36
6371 changed files with 1317450 additions and 932 deletions

View File

@@ -3,115 +3,314 @@
Node Registry Service
Central registry for DAGI network nodes (Node #1, Node #2, Node #N)
This is a stub implementation - full API will be implemented by Cursor.
Full implementation with database integration
"""
import os
import time
from datetime import datetime
from typing import Dict, Any
from typing import Dict, Any, Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from fastapi import FastAPI, HTTPException, Depends, Query
from fastapi.middleware.cors import CORSMiddleware
from sqlalchemy.orm import Session
import uvicorn
import logging
# Import our modules
from .database import get_db, get_db_info, check_db_connection, engine
from .models import Base, Node
from .schemas import (
NodeRegister, NodeResponse, NodeListResponse,
HeartbeatRequest, HeartbeatResponse,
NodeDiscoveryQuery, NodeDiscoveryResponse
)
from . import crud
from .system_metrics import get_all_metrics
from .agents_data import get_agents_by_node, get_agents_by_team
from .services_data import get_services_by_node
from .monitoring_api import (
get_agent_profile,
get_agents_registry,
get_alerts_payload,
get_ai_usage_metrics,
get_events_payload,
get_global_kpis,
get_infrastructure_metrics,
get_stack_models,
get_stack_services,
)
from .node1_prometheus import get_node1_metrics
from .node_connector import get_node_connector_report
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Environment configuration
HTTP_PORT = int(os.getenv("NODE_REGISTRY_HTTP_PORT", "9205"))
ENV = os.getenv("NODE_REGISTRY_ENV", "development")
LOG_LEVEL = os.getenv("NODE_REGISTRY_LOG_LEVEL", "info")
DB_HOST = os.getenv("NODE_REGISTRY_DB_HOST", "postgres")
DB_PORT = int(os.getenv("NODE_REGISTRY_DB_PORT", "5432"))
DB_NAME = os.getenv("NODE_REGISTRY_DB_NAME", "node_registry")
DB_USER = os.getenv("NODE_REGISTRY_DB_USER", "node_registry_user")
DB_PASSWORD = os.getenv("NODE_REGISTRY_DB_PASSWORD", "")
# Service metadata
SERVICE_NAME = "node-registry"
VERSION = "0.1.0-stub"
VERSION = "1.0.0"
START_TIME = time.time()
# Create FastAPI app
app = FastAPI(
title="Node Registry Service",
description="Central registry for DAGI network nodes",
description="Central registry for DAGI network nodes - Full Implementation",
version=VERSION,
docs_url="/docs" if ENV == "development" else None,
redoc_url="/redoc" if ENV == "development" else None,
)
# Models (stub - will be expanded by Cursor)
class HealthResponse(BaseModel):
status: str
service: str
version: str
environment: str
uptime_seconds: float
timestamp: str
database: Dict[str, Any]
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
class MetricsResponse(BaseModel):
service: str
uptime_seconds: float
total_nodes: int
active_nodes: int
timestamp: str
# ============================================================================
# Startup/Shutdown Events
# ============================================================================
# Health check endpoint
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""
Health check endpoint for monitoring systems.
@app.on_event("startup")
async def startup():
"""Initialize database on startup"""
logger.info(f"🚀 Starting {SERVICE_NAME} v{VERSION}")
logger.info(f"📊 Environment: {ENV}")
logger.info(f"🔌 Port: {HTTP_PORT}")
Returns service status, version, and database connectivity.
"""
# Create tables if they don't exist
try:
Base.metadata.create_all(bind=engine)
logger.info("✅ Database tables initialized")
except Exception as e:
logger.error(f"❌ Database initialization failed: {e}")
# Check database connection
if check_db_connection():
logger.info("✅ Database connection successful")
else:
logger.warning("⚠️ Database connection failed - service may not work correctly")
@app.on_event("shutdown")
async def shutdown():
"""Cleanup on shutdown"""
logger.info("👋 Shutting down Node Registry Service")
# ============================================================================
# Health & Metrics Endpoints
# ============================================================================
@app.get("/health")
async def health_check(db: Session = Depends(get_db)):
"""Health check endpoint for monitoring systems"""
uptime = time.time() - START_TIME
db_info = get_db_info()
# TODO: Implement actual DB health check
db_status = {
"connected": False,
"host": DB_HOST,
"port": DB_PORT,
"database": DB_NAME,
"message": "Not implemented (stub)"
# Check database and get stats
try:
stats = crud.get_network_stats(db)
except Exception as e:
logger.error(f"Failed to get network stats: {e}")
stats = {"total_nodes": 0, "online_nodes": 0}
return {
"status": "healthy" if db_info["connected"] else "degraded",
"service": SERVICE_NAME,
"version": VERSION,
"environment": ENV,
"uptime_seconds": uptime,
"timestamp": datetime.utcnow().isoformat() + "Z",
"database": db_info,
"network_stats": stats,
}
return HealthResponse(
status="healthy",
service=SERVICE_NAME,
version=VERSION,
environment=ENV,
uptime_seconds=uptime,
timestamp=datetime.utcnow().isoformat() + "Z",
database=db_status
)
# Metrics endpoint (Prometheus-compatible format will be added by Cursor)
@app.get("/metrics", response_model=MetricsResponse)
async def metrics():
"""
Metrics endpoint for Prometheus scraping.
TODO: Add proper Prometheus format (prometheus_client library)
"""
@app.get("/metrics")
async def metrics(db: Session = Depends(get_db)):
"""Metrics endpoint for monitoring"""
uptime = time.time() - START_TIME
# TODO: Implement actual metrics from database
return MetricsResponse(
service=SERVICE_NAME,
uptime_seconds=uptime,
total_nodes=0,
active_nodes=0,
timestamp=datetime.utcnow().isoformat() + "Z"
)
try:
stats = crud.get_network_stats(db)
except Exception:
stats = {"total_nodes": 0, "online_nodes": 0, "offline_nodes": 0}
return {
"service": SERVICE_NAME,
"uptime_seconds": uptime,
**stats,
"timestamp": datetime.utcnow().isoformat() + "Z"
}
@app.get("/api/node-metrics")
async def get_node_metrics() -> Dict[str, Any]:
"""
Get real-time system metrics for NODE2 (this machine)
Returns: CPU, RAM, Disk, GPU, Network metrics
"""
try:
metrics = get_all_metrics()
return {
"success": True,
"node_id": "node-2-local",
**metrics
}
except Exception as e:
logger.error(f"Failed to get system metrics: {e}")
return {
"success": False,
"error": str(e),
"timestamp": datetime.utcnow().isoformat() + "Z"
}
@app.get("/api/node1-metrics")
async def get_node1_metrics_endpoint():
"""Returns NODE1 metrics via Prometheus tunnel."""
try:
data = get_node1_metrics()
if not data.get("success"):
raise HTTPException(status_code=502, detail="Prometheus tunnel unavailable")
return data
except HTTPException:
raise
except Exception as e:
logger.error(f"Failed to get NODE1 metrics: {e}")
raise HTTPException(status_code=502, detail="Failed to query NODE1 Prometheus")
@app.get("/api/node-agents/{node_id}")
async def get_node_agents(node_id: str) -> Dict[str, Any]:
"""
Get list of agents for specific node
Returns: List of agents grouped by teams
"""
try:
agents_by_team = get_agents_by_team(node_id)
all_agents = get_agents_by_node(node_id)
return {
"success": True,
"node_id": node_id,
"total": len(all_agents),
"teams": agents_by_team,
"agents": all_agents
}
except Exception as e:
logger.error(f"Failed to get agents for {node_id}: {e}")
return {
"success": False,
"error": str(e),
"node_id": node_id
}
@app.get("/api/node-services/{node_id}")
async def get_node_services(node_id: str) -> Dict[str, Any]:
"""
Get list of services for specific node
Returns: List of running services
"""
try:
services = get_services_by_node(node_id)
running = [s for s in services if s.get("status") == "running"]
return {
"success": True,
"node_id": node_id,
"total": len(services),
"running": len(running),
"services": services
}
except Exception as e:
logger.error(f"Failed to get services for {node_id}: {e}")
return {
"success": False,
"error": str(e),
"node_id": node_id
}
# ============================================================================
# Monitoring API (Global dashboards)
# ============================================================================
@app.get("/api/monitoring/global-kpis")
async def monitoring_global_kpis():
"""Cluster-wide KPIs for System Overview dashboard."""
return get_global_kpis()
@app.get("/api/monitoring/infrastructure")
async def monitoring_infrastructure():
"""Infrastructure metrics (API, WS, NATS, DB)."""
return get_infrastructure_metrics()
@app.get("/api/monitoring/ai-usage")
async def monitoring_ai_usage():
"""AI usage summary (tokens, latency, quota)."""
return get_ai_usage_metrics()
@app.get("/api/monitoring/events/{node_id}")
async def monitoring_events(node_id: str, limit: int = Query(10, ge=1, le=50)):
"""Recent events for a node."""
return get_events_payload(node_id, limit)
@app.get("/api/monitoring/alerts")
async def monitoring_alerts(node_id: Optional[str] = Query(None)):
"""Active alerts (optionally filtered by node)."""
return get_alerts_payload(node_id)
@app.get("/api/agents")
async def list_agents():
"""Return registry of all agents across nodes."""
return get_agents_registry()
@app.get("/api/agents/{agent_id}")
async def agent_detail(agent_id: str):
"""Detailed profile for a single agent."""
profile = get_agent_profile(agent_id)
if not profile:
raise HTTPException(status_code=404, detail=f"Agent not found: {agent_id}")
return profile
@app.get("/api/stack/services")
async def stack_services():
"""Catalog of services per node."""
return get_stack_services()
@app.get("/api/stack/models")
async def stack_models():
"""Catalog of models per node."""
return get_stack_models()
@app.get("/api/node-connector/report")
async def node_connector_report():
"""Return readiness report for connecting new nodes."""
return get_node_connector_report()
# Root endpoint
@app.get("/")
async def root():
"""Root endpoint - service information"""
@@ -120,54 +319,199 @@ async def root():
"version": VERSION,
"status": "running",
"environment": ENV,
"message": "Node Registry Service (stub implementation)",
"message": "Node Registry Service - Full Implementation",
"endpoints": {
"health": "/health",
"metrics": "/metrics",
"docs": "/docs" if ENV == "development" else "disabled",
"bootstrap": "/bootstrap/node_bootstrap.py",
"api": {
"register": "POST /api/v1/nodes/register",
"heartbeat": "POST /api/v1/nodes/heartbeat",
"list": "GET /api/v1/nodes",
"get": "GET /api/v1/nodes/{node_id}",
"discover": "POST /api/v1/nodes/discover",
}
}
}
# Stub API endpoints (to be implemented by Cursor)
@app.post("/api/v1/nodes/register")
async def register_node():
# ============================================================================
# Bootstrap Download Endpoint
# ============================================================================
@app.get("/bootstrap/node_bootstrap.py")
async def download_bootstrap():
"""
Register a new node in the registry.
Download Bootstrap Agent script
TODO: Implement by Cursor
Users can download and run this script to connect their node
"""
raise HTTPException(status_code=501, detail="Not implemented - stub endpoint")
@app.post("/api/v1/nodes/{node_id}/heartbeat")
async def update_heartbeat(node_id: str):
"""
Update node heartbeat (keep-alive).
import os
TODO: Implement by Cursor
"""
raise HTTPException(status_code=501, detail="Not implemented - stub endpoint")
@app.get("/api/v1/nodes")
async def list_nodes():
"""
List all registered nodes.
bootstrap_path = os.path.join(
os.path.dirname(os.path.dirname(__file__)),
"bootstrap",
"node_bootstrap.py"
)
TODO: Implement by Cursor
"""
raise HTTPException(status_code=501, detail="Not implemented - stub endpoint")
try:
with open(bootstrap_path, 'r') as f:
content = f.read()
from fastapi.responses import Response
return Response(
content=content,
media_type="text/x-python",
headers={
"Content-Disposition": "attachment; filename=node_bootstrap.py"
}
)
except FileNotFoundError:
raise HTTPException(status_code=404, detail="Bootstrap script not found")
@app.get("/api/v1/nodes/{node_id}")
async def get_node(node_id: str):
# ============================================================================
# Node Registration API
# ============================================================================
@app.post("/api/v1/nodes/register", response_model=NodeResponse)
async def register_node(node_data: NodeRegister, db: Session = Depends(get_db)):
"""
Get specific node information.
Register a new node or update existing one
TODO: Implement by Cursor
This endpoint automatically generates a unique node_id based on hostname
and registers the node in the network.
"""
raise HTTPException(status_code=501, detail="Not implemented - stub endpoint")
try:
node = crud.register_node(db, node_data)
logger.info(f"✅ Node registered: {node.node_id}")
return node.to_dict()
except Exception as e:
logger.error(f"❌ Failed to register node: {e}")
raise HTTPException(status_code=500, detail=f"Registration failed: {str(e)}")
# ============================================================================
# Heartbeat API
# ============================================================================
@app.post("/api/v1/nodes/heartbeat", response_model=HeartbeatResponse)
async def update_heartbeat(heartbeat: HeartbeatRequest, db: Session = Depends(get_db)):
"""
Update node heartbeat (keep-alive)
Nodes should send heartbeat every 30 seconds to maintain "online" status.
"""
try:
success = crud.update_heartbeat(db, heartbeat)
if not success:
raise HTTPException(status_code=404, detail=f"Node not found: {heartbeat.node_id}")
return HeartbeatResponse(
success=True,
node_id=heartbeat.node_id,
timestamp=datetime.utcnow(),
message="Heartbeat updated successfully"
)
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Heartbeat update failed: {e}")
raise HTTPException(status_code=500, detail=f"Heartbeat failed: {str(e)}")
# ============================================================================
# Node Query API
# ============================================================================
@app.get("/api/v1/nodes", response_model=NodeListResponse)
async def list_nodes(
role: Optional[str] = Query(None, description="Filter by role"),
status: Optional[str] = Query(None, description="Filter by status"),
limit: int = Query(100, ge=1, le=1000, description="Maximum results"),
offset: int = Query(0, ge=0, description="Results offset"),
db: Session = Depends(get_db)
):
"""List all registered nodes with optional filters"""
try:
nodes = crud.list_nodes(db, role=role, status=status, limit=limit, offset=offset)
return NodeListResponse(
nodes=[node.to_dict() for node in nodes],
total=len(nodes)
)
except Exception as e:
logger.error(f"❌ Failed to list nodes: {e}")
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
@app.get("/api/v1/nodes/{node_id}", response_model=NodeResponse)
async def get_node(node_id: str, db: Session = Depends(get_db)):
"""Get specific node information"""
try:
node = crud.get_node(db, node_id)
if not node:
raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")
return node.to_dict()
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Failed to get node: {e}")
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
# ============================================================================
# Node Discovery API
# ============================================================================
@app.post("/api/v1/nodes/discover", response_model=NodeDiscoveryResponse)
async def discover_nodes(query: NodeDiscoveryQuery, db: Session = Depends(get_db)):
"""
Discover nodes based on criteria
Search for nodes with specific capabilities, roles, or status.
Useful for finding the right node for a specific task.
"""
try:
nodes = crud.discover_nodes(db, query)
return NodeDiscoveryResponse(
nodes=[node.to_dict() for node in nodes],
query=query,
total=len(nodes)
)
except Exception as e:
logger.error(f"❌ Node discovery failed: {e}")
raise HTTPException(status_code=500, detail=f"Discovery failed: {str(e)}")
# ============================================================================
# Maintenance Endpoints
# ============================================================================
@app.post("/api/v1/maintenance/cleanup")
async def cleanup_stale_nodes(
timeout_minutes: int = Query(5, ge=1, le=60),
db: Session = Depends(get_db)
):
"""
Mark nodes as offline if no heartbeat received
Admin endpoint for maintenance
"""
try:
count = crud.cleanup_stale_nodes(db, timeout_minutes)
return {
"success": True,
"nodes_marked_offline": count,
"timeout_minutes": timeout_minutes
}
except Exception as e:
logger.error(f"❌ Cleanup failed: {e}")
raise HTTPException(status_code=500, detail=f"Cleanup failed: {str(e)}")
if __name__ == "__main__":