feat: add node-registry dashboard, matrix-presence-aggregator, ocr-service updates

This commit is contained in:
Apple
2025-11-28 05:23:14 -08:00
parent 04b1174641
commit 776ab3a7ba
15 changed files with 1162 additions and 56 deletions

View File

@@ -0,0 +1,369 @@
"""
Node Dashboard API - Aggregator for node status and metrics
"""
import asyncio
import logging
import httpx
import psutil
from typing import Dict, Any, Optional, List
from datetime import datetime
logger = logging.getLogger(__name__)
# Probe timeout in seconds
PROBE_TIMEOUT = 0.5
PROBE_TIMEOUT_LONG = 1.0
class DashboardAggregator:
    """Aggregates live status and metrics from a node's services for the dashboard.

    Each ``probe_*`` method hits one service over HTTP with a short timeout and
    normalizes the response into a plain dict, so a slow or down service cannot
    stall the whole dashboard build.
    """

    def __init__(self, node_ip: str = "localhost"):
        # node_ip: host/IP where this node's services are reachable.
        self.node_ip = node_ip
        self.client = httpx.AsyncClient(timeout=PROBE_TIMEOUT)

    async def close(self):
        """Release the underlying HTTP connection pool."""
        await self.client.aclose()

    async def _probe(self, url: str, timeout: float = PROBE_TIMEOUT) -> Dict[str, Any]:
        """Execute a single GET probe with a timeout.

        Returns:
            ``{"status": "up", "data": <json>, "latency_ms": int}`` on HTTP 200,
            ``{"status": "degraded", "error": "HTTP <code>"}`` on other status codes,
            ``{"status": "down", "error": ...}`` on timeout / connection failure.
            Never raises.
        """
        try:
            resp = await self.client.get(url, timeout=timeout)
            if resp.status_code == 200:
                return {
                    "status": "up",
                    "data": resp.json(),
                    "latency_ms": int(resp.elapsed.total_seconds() * 1000),
                }
            return {"status": "degraded", "error": f"HTTP {resp.status_code}"}
        except httpx.TimeoutException:
            return {"status": "down", "error": "timeout"}
        except httpx.ConnectError:
            return {"status": "down", "error": "connection refused"}
        except Exception as e:
            return {"status": "down", "error": str(e)}

    def _query_gpus(self) -> List[Dict[str, Any]]:
        """Best-effort GPU inventory via nvidia-smi.

        Returns an empty list when nvidia-smi is absent, times out, or emits
        unparsable output — a CPU-only node is not an error condition.
        """
        import subprocess
        gpus: List[Dict[str, Any]] = []
        try:
            proc = subprocess.run(
                ['nvidia-smi', '--query-gpu=name,memory.total,memory.used,utilization.gpu', '--format=csv,noheader,nounits'],
                capture_output=True, text=True, timeout=2
            )
            if proc.returncode == 0:
                for line in proc.stdout.strip().split('\n'):
                    parts = [p.strip() for p in line.split(',')]
                    if len(parts) >= 4:
                        gpus.append({
                            "name": parts[0],
                            # nvidia-smi reports MiB with nounits; convert to GB.
                            "vram_gb": round(float(parts[1]) / 1024, 1),
                            "used_gb": round(float(parts[2]) / 1024, 1),
                            "sm_util_pct": int(parts[3])
                        })
        except Exception as e:
            # Was a bare `except: pass` — keep the best-effort semantics but
            # leave a trace so parse failures are diagnosable.
            logger.debug(f"GPU query skipped: {e}")
        return gpus

    async def get_infra_metrics(self) -> Dict[str, Any]:
        """Get infrastructure metrics (CPU, RAM, disk, GPUs) using psutil.

        Returns zeroed metrics if psutil itself fails, so the dashboard can
        always render the infra panel.
        """
        try:
            cpu_pct = psutil.cpu_percent(interval=0.1)
            mem = psutil.virtual_memory()
            disk = psutil.disk_usage('/')
            return {
                "cpu_usage_pct": round(cpu_pct, 1),
                "ram": {
                    "total_gb": round(mem.total / (1024**3), 1),
                    "used_gb": round(mem.used / (1024**3), 1)
                },
                "disk": {
                    "total_gb": round(disk.total / (1024**3), 1),
                    "used_gb": round(disk.used / (1024**3), 1)
                },
                "gpus": self._query_gpus()
            }
        except Exception as e:
            logger.error(f"Failed to get infra metrics: {e}")
            return {
                "cpu_usage_pct": 0,
                "ram": {"total_gb": 0, "used_gb": 0},
                "disk": {"total_gb": 0, "used_gb": 0},
                "gpus": []
            }

    async def probe_swapper(self, port: int = 8890) -> Dict[str, Any]:
        """Probe the Swapper service: health plus loaded-model inventory."""
        base_url = f"http://{self.node_ip}:{port}"
        health_result = await self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG)
        models_result = await self._probe(f"{base_url}/models", PROBE_TIMEOUT_LONG)
        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": health_result.get("latency_ms", 0),
            "storage": {"total_gb": 0, "used_gb": 0, "free_gb": 0},
            "models": []
        }
        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["active_model"] = data.get("active_model")
            result["mode"] = data.get("mode")
        if models_result.get("status") == "up":
            data = models_result.get("data", {})
            for m in data.get("models", []):
                result["models"].append({
                    "name": m.get("name"),
                    "size_gb": m.get("size_gb", 0),
                    "device": m.get("device", "disk"),
                    "state": m.get("status", "unloaded")
                })
        return result

    async def probe_router(self, port: int = 9102) -> Dict[str, Any]:
        """Probe the DAGI Router service: health, version, and backend states."""
        base_url = f"http://{self.node_ip}:{port}"
        health_result = await self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG)
        backends_result = await self._probe(f"{base_url}/backends/status", PROBE_TIMEOUT_LONG)
        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "version": "unknown",
            "backends": [],
            # Metrics are placeholders until the router exposes them.
            "metrics": {
                "requests_1m": 0,
                "requests_1h": 0,
                "error_rate_1h": 0,
                "avg_latency_ms_1h": 0
            }
        }
        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["version"] = data.get("version", "unknown")
            result["nats_connected"] = data.get("nats_connected", False)
        if backends_result.get("status") == "up":
            # /backends/status returns a JSON list of backend descriptors.
            for backend in backends_result.get("data", []):
                result["backends"].append({
                    "name": backend.get("name"),
                    "status": backend.get("status"),
                    "latency_ms": backend.get("latency_ms", 0),
                    "error": backend.get("error")
                })
        return result

    async def probe_service(self, name: str, port: int, health_path: str = "/health") -> Dict[str, Any]:
        """Probe a generic AI service: just status, latency, and any error."""
        base_url = f"http://{self.node_ip}:{port}"
        result = await self._probe(f"{base_url}{health_path}")
        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "error": result.get("error")
        }

    async def probe_ollama(self, port: int = 11434) -> Dict[str, Any]:
        """Probe Ollama via /api/tags and list the first 10 installed models."""
        base_url = f"http://{self.node_ip}:{port}"
        result = await self._probe(f"{base_url}/api/tags", PROBE_TIMEOUT_LONG)
        models = []
        if result.get("status") == "up":
            data = result.get("data", {})
            for m in data.get("models", []):
                models.append(m.get("name"))
        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "models": models[:10],  # Limit to 10 models
            "error": result.get("error")
        }

    async def probe_matrix(self, synapse_port: int = 8018, presence_port: int = 8085) -> Dict[str, Any]:
        """Probe Matrix services: Synapse homeserver and the presence bridge."""
        synapse_result = await self._probe(f"http://{self.node_ip}:{synapse_port}/_matrix/client/versions")
        presence_result = await self._probe(f"http://{self.node_ip}:{presence_port}/health")
        return {
            # Matrix is considered enabled iff Synapse answers its versions endpoint.
            "enabled": synapse_result.get("status") == "up",
            "homeserver": f"http://{self.node_ip}:{synapse_port}",
            "synapse": {
                "status": synapse_result.get("status", "unknown"),
                "latency_ms": synapse_result.get("latency_ms", 0)
            },
            "presence_bridge": {
                "status": presence_result.get("status", "unknown"),
                "latency_ms": presence_result.get("latency_ms", 0)
            }
        }

    async def probe_monitoring(self, prometheus_port: int = 9090, grafana_port: int = 3001) -> Dict[str, Any]:
        """Probe monitoring services (Prometheus readiness, Grafana health)."""
        prometheus_result = await self._probe(f"http://{self.node_ip}:{prometheus_port}/-/ready")
        grafana_result = await self._probe(f"http://{self.node_ip}:{grafana_port}/api/health")
        return {
            "prometheus": {
                "url": f"http://{self.node_ip}:{prometheus_port}",
                "status": prometheus_result.get("status", "unknown")
            },
            "grafana": {
                "url": f"http://{self.node_ip}:{grafana_port}",
                "status": grafana_result.get("status", "unknown")
            },
            # Loki probing not implemented yet; reported as unknown.
            "logging": {
                "loki": {"status": "unknown"}
            }
        }

    async def get_agents_summary(self, city_service_port: int = 7001) -> Dict[str, Any]:
        """Summarize agents from the city service: totals, per-kind counts, top 5 online."""
        # City service uses /city/agents endpoint
        result = await self._probe(f"http://{self.node_ip}:{city_service_port}/city/agents", PROBE_TIMEOUT_LONG)
        summary = {
            "total": 0,
            "running": 0,
            "by_kind": {},
            "top": []
        }
        if result.get("status") == "up":
            # NOTE(review): assumes the endpoint returns a JSON list of agent
            # dicts — confirm against the city service API.
            agents = result.get("data", [])
            summary["total"] = len(agents)
            for agent in agents:
                kind = agent.get("kind", "unknown")
                summary["by_kind"][kind] = summary["by_kind"].get(kind, 0) + 1
                if agent.get("status") in ["online", "busy"]:
                    summary["running"] += 1
            # Top 5 agents (first five that are online/busy, input order).
            online_agents = [a for a in agents if a.get("status") in ["online", "busy"]][:5]
            for agent in online_agents:
                summary["top"].append({
                    "agent_id": agent.get("id"),
                    "display_name": agent.get("display_name"),
                    "kind": agent.get("kind"),
                    "status": agent.get("status"),
                    "node_id": agent.get("node_id")
                })
        return summary
async def build_dashboard(node_profile: Dict[str, Any], node_ip: str = "localhost") -> Dict[str, Any]:
    """
    Build complete dashboard from node profile.

    Args:
        node_profile: Node profile from registry (with modules, gpu, roles)
        node_ip: IP address to probe services

    Returns:
        Complete dashboard JSON
    """
    aggregator = DashboardAggregator(node_ip)
    try:
        # Map installed module ids to their ports.
        module_ports: Dict[str, int] = {}
        for module in node_profile.get("modules", []):
            if module.get("port"):
                module_ports[module["id"]] = module["port"]

        # Collect probe coroutines; infra metrics are always gathered.
        tasks = {
            "infra": aggregator.get_infra_metrics(),
        }
        # Add probes based on installed modules.
        if "ai.swapper" in module_ports:
            tasks["swapper"] = aggregator.probe_swapper(module_ports["ai.swapper"])
        if "ai.router" in module_ports:
            tasks["router"] = aggregator.probe_router(module_ports["ai.router"])
        if "ai.ollama" in module_ports:
            tasks["ollama"] = aggregator.probe_ollama(module_ports["ai.ollama"])
        # Generic AI services
        ai_services = ["ai.stt", "ai.tts", "ai.ocr", "ai.memory", "ai.crewai"]
        for svc in ai_services:
            if svc in module_ports:
                svc_name = svc.replace("ai.", "")
                tasks[f"svc_{svc_name}"] = aggregator.probe_service(svc_name, module_ports[svc])
        # Matrix
        synapse_port = module_ports.get("matrix.synapse", 8018)
        presence_port = module_ports.get("matrix.presence", 8085)
        if "matrix.synapse" in module_ports or "matrix.presence" in module_ports:
            tasks["matrix"] = aggregator.probe_matrix(synapse_port, presence_port)
        # Monitoring
        prometheus_port = module_ports.get("monitoring.prometheus", 9090)
        tasks["monitoring"] = aggregator.probe_monitoring(prometheus_port)
        # Agents
        city_port = module_ports.get("daarion.city", 7001)
        if "daarion.city" in module_ports or "daarion.agents" in module_ports:
            tasks["agents"] = aggregator.get_agents_summary(city_port)

        # Execute all probes concurrently. (The previous version awaited them
        # one by one, so worst-case latency was the SUM of all probe timeouts;
        # gather bounds it by the slowest single probe.)
        names = list(tasks)
        outcomes = await asyncio.gather(*tasks.values(), return_exceptions=True)
        results: Dict[str, Any] = {}
        for name, outcome in zip(names, outcomes):
            if isinstance(outcome, BaseException):
                logger.error(f"Probe {name} failed: {outcome}")
                results[name] = {"status": "error", "error": str(outcome)}
            else:
                results[name] = outcome

        # Assemble the dashboard response.
        dashboard = {
            "node": {
                "node_id": node_profile.get("node_id"),
                "name": node_profile.get("name"),
                "roles": node_profile.get("roles", []),
                "status": node_profile.get("status", "unknown"),
                "public_hostname": node_profile.get("ip_address"),
                "environment": node_profile.get("role", "production"),
                "gpu": node_profile.get("gpu"),
                "modules": node_profile.get("modules", []),
                "version": node_profile.get("version", "1.0.0")
            },
            "infra": results.get("infra", {}),
            "ai": {
                "swapper": results.get("swapper", {"status": "not_installed"}),
                "router": results.get("router", {"status": "not_installed"}),
                "ollama": results.get("ollama", {"status": "not_installed"}),
                "services": {}
            },
            "agents": results.get("agents", {"total": 0, "running": 0, "by_kind": {}, "top": []}),
            "matrix": results.get("matrix", {"enabled": False}),
            "monitoring": results.get("monitoring", {})
        }
        # Fold generic AI service probes into ai.services keyed by short name.
        for key, value in results.items():
            if key.startswith("svc_"):
                svc_name = key.replace("svc_", "")
                dashboard["ai"]["services"][svc_name] = value
        return dashboard
    finally:
        # Always release the HTTP pool, even if a probe raised synchronously.
        await aggregator.close()

View File

@@ -1,24 +1,24 @@
"""
SQLite Database connection for local development
Use this for testing without PostgreSQL
PostgreSQL Database connection for Node Registry
"""
import os
from sqlalchemy import create_engine
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, Session
from contextlib import contextmanager
import logging
logger = logging.getLogger(__name__)
# Database URL from environment; defaults to a local PostgreSQL instance.
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/node_registry")

# Create engine with a small connection pool suitable for this service.
engine = create_engine(
    DATABASE_URL,
    pool_pre_ping=True,   # transparently recycle stale pooled connections
    pool_size=5,
    max_overflow=10,
    echo=os.getenv("NODE_REGISTRY_ENV") == "development",  # Log SQL in dev
)
# Create session factory
@@ -65,7 +65,7 @@ def check_db_connection() -> bool:
"""Check if database connection is working"""
try:
with engine.connect() as conn:
conn.execute("SELECT 1")
conn.execute(text("SELECT 1"))
return True
except Exception as e:
logger.error(f"Database connection failed: {e}")
@@ -75,8 +75,7 @@ def check_db_connection() -> bool:
def get_db_info() -> dict:
    """Get database connection information.

    Returns:
        dict with the backend type, a credential-free URL fragment
        (host/database part only), and a live connectivity flag.
    """
    return {
        "type": "postgresql",
        # Strip the "user:password@" prefix so credentials never leak into
        # status endpoints or logs.
        "url": DATABASE_URL.split("@")[-1] if "@" in DATABASE_URL else DATABASE_URL,
        "connected": check_db_connection(),
    }

View File

@@ -488,6 +488,187 @@ async def discover_nodes(query: NodeDiscoveryQuery, db: Session = Depends(get_db
raise HTTPException(status_code=500, detail=f"Discovery failed: {str(e)}")
# ============================================================================
# Node Profile Endpoints (Standard v1)
# ============================================================================
from app.dashboard import build_dashboard
@app.get("/api/v1/nodes/self/dashboard")
async def get_self_dashboard(db: Session = Depends(get_db)):
"""
Get dashboard for current node (self).
Uses the first node in registry as "self" for now.
In production, this would use JWT claims to identify the node.
"""
try:
from sqlalchemy import text
# Get first node as "self" (simplified for v1)
result = db.execute(text("""
SELECT node_id FROM nodes ORDER BY registered_at LIMIT 1
"""))
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail="No nodes registered")
# Delegate to node dashboard
return await get_node_dashboard(row[0], db)
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Failed to get self dashboard: {e}")
raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
@app.get("/api/v1/nodes/{node_id}/dashboard")
async def get_node_dashboard(node_id: str, db: Session = Depends(get_db)):
"""
Get complete node dashboard with live status.
Aggregates:
- Node profile (roles, modules, GPU)
- Infrastructure metrics (CPU, RAM, Disk, GPU)
- AI services status (Swapper, Router, STT, Vision, OCR)
- Agents summary
- Matrix integration status
- Monitoring status
"""
try:
from sqlalchemy import text
# Get node profile
result = db.execute(text("""
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
status, roles, gpu, modules, version, vpn_ip
FROM nodes
WHERE node_id = :node_id
"""), {"node_id": node_id})
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")
profile = {
"node_id": row[0],
"name": row[1],
"role": row[2],
"type": row[3],
"ip_address": row[4],
"hostname": row[5],
"status": row[6],
"roles": list(row[7]) if row[7] else [],
"gpu": row[8],
"modules": row[9] if row[9] else [],
"version": row[10] or "1.0.0",
}
# Build dashboard with probes
# For Docker network, use gateway IP to access host services
import os
# Default to Docker gateway for dagi-network
node_ip = os.getenv("PROBE_HOST", "172.21.0.1")
# For NODE2, use its actual IP (for remote probing)
if node_id == "node-2-macbook-m4max":
node_ip = row[4] or "192.168.1.33"
dashboard = await build_dashboard(profile, node_ip)
return dashboard
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Failed to get node dashboard: {e}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
@app.get("/api/v1/nodes/{node_id}/profile")
async def get_node_profile(node_id: str, db: Session = Depends(get_db)):
"""
Get full node profile including modules, GPU, roles.
Node Profile Standard v1.
"""
try:
from sqlalchemy import text
result = db.execute(text("""
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
status, roles, gpu, modules, version, vpn_ip
FROM nodes
WHERE node_id = :node_id
"""), {"node_id": node_id})
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")
return {
"node_id": row[0],
"name": row[1],
"role": row[2],
"type": row[3],
"ip_address": row[4],
"hostname": row[5],
"status": row[6],
"roles": list(row[7]) if row[7] else [],
"gpu": row[8],
"modules": row[9] if row[9] else [],
"version": row[10] or "1.0.0",
"vpn_ip": str(row[11]) if row[11] else None,
}
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Failed to get node profile: {e}")
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
@app.get("/api/v1/nodes/profiles")
async def get_all_node_profiles(db: Session = Depends(get_db)):
"""
Get all node profiles with modules.
"""
try:
from sqlalchemy import text
result = db.execute(text("""
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
status, roles, gpu, modules, version, vpn_ip
FROM nodes
ORDER BY node_id
"""))
nodes = []
for row in result.fetchall():
nodes.append({
"node_id": row[0],
"name": row[1],
"role": row[2],
"type": row[3],
"ip_address": row[4],
"hostname": row[5],
"status": row[6],
"roles": list(row[7]) if row[7] else [],
"gpu": row[8],
"modules": row[9] if row[9] else [],
"version": row[10] or "1.0.0",
})
return {"nodes": nodes, "total": len(nodes)}
except Exception as e:
logger.error(f"❌ Failed to get node profiles: {e}")
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
# ============================================================================
# Maintenance Endpoints
# ============================================================================
@@ -518,7 +699,7 @@ if __name__ == "__main__":
print(f"🚀 Starting {SERVICE_NAME} v{VERSION}")
print(f"📊 Environment: {ENV}")
print(f"🔌 Port: {HTTP_PORT}")
print(f"🗄️ Database: {DB_USER}@{DB_HOST}:{DB_PORT}/{DB_NAME}")
print(f"🗄️ Database: {os.getenv('DATABASE_URL', 'not configured')}")
print(f"📝 Log level: {LOG_LEVEL}")
print()

View File

@@ -3,8 +3,8 @@ SQLAlchemy ORM Models for Node Registry
"""
from datetime import datetime
from typing import Optional
from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index
from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB
from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index, ARRAY
from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB, ARRAY as PG_ARRAY
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from sqlalchemy.types import TypeDecorator, String as SQLString, Text as SQLText
@@ -90,6 +90,12 @@ class Node(Base):
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
node_metadata = Column(JSONB, default={})
# Node Profile Standard v1 fields
roles = Column(ARRAY(String), default=[]) # ['core', 'gateway', 'matrix', 'agents', 'gpu']
gpu = Column(JSONB, default=None) # {"name": "NVIDIA RTX 4000", "vram_gb": 20}
modules = Column(JSONB, default=[]) # [{"id": "ai.router", "status": "up", "port": 9102}, ...]
version = Column(String(50), default='1.0.0')
# Relationships
profiles = relationship("NodeProfile", back_populates="node", cascade="all, delete-orphan")
heartbeats = relationship("HeartbeatLog", back_populates="node", cascade="all, delete-orphan")
@@ -113,6 +119,11 @@ class Node(Base):
"registered_at": self.registered_at.isoformat() if self.registered_at else None,
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
"metadata": self.node_metadata or {},
# Node Profile Standard v1
"roles": self.roles or [],
"gpu": self.gpu,
"modules": self.modules or [],
"version": self.version or "1.0.0",
}