feat: add node-registry dashboard, matrix-presence-aggregator, ocr-service updates

This commit is contained in:
Apple
2025-11-28 05:23:14 -08:00
parent 04b1174641
commit 776ab3a7ba
15 changed files with 1162 additions and 56 deletions

View File

@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Sync Agents from YAML Config to DAARION City Database
This script reads config/agents_city_mapping.yaml and syncs:
1. Agents to `agents` table
2. Agent-Room bindings to `agent_room_bindings` table
3. Validates node_id against Node Registry
Usage:
python scripts/sync_agents_from_config.py
Environment:
DATABASE_URL - PostgreSQL connection string for DAARION city DB
NODE_REGISTRY_URL - URL for Node Registry API (default: http://localhost:9205)
"""
import os
import sys
import yaml
import logging
import httpx
from datetime import datetime
from pathlib import Path
import psycopg2
from psycopg2.extras import RealDictCursor
# Setup logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s'
)
logger = logging.getLogger(__name__)
# Configuration
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://postgres:postgres@localhost:5432/daarion')
NODE_REGISTRY_URL = os.getenv('NODE_REGISTRY_URL', 'http://localhost:9205')
CONFIG_PATH = Path(__file__).parent.parent / 'config' / 'agents_city_mapping.yaml'
def load_config() -> dict:
    """Load agents configuration from YAML file.

    Returns:
        Parsed config dict. An empty YAML file yields {} rather than None,
        so callers can safely use .get() on the result.

    Raises:
        FileNotFoundError: if CONFIG_PATH does not exist.
    """
    if not CONFIG_PATH.exists():
        raise FileNotFoundError(f"Config file not found: {CONFIG_PATH}")
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        # yaml.safe_load returns None for an empty document; coerce to {}
        # so the .get() calls below (and in callers) do not crash.
        config = yaml.safe_load(f) or {}
    logger.info(f"✅ Loaded config: {len(config.get('agents', []))} agents, {len(config.get('districts', []))} districts")
    return config
def validate_node(node_id: str) -> bool:
    """Best-effort existence check for a node in the Node Registry.

    Any network failure or non-200 status is reported as a warning and
    treated as "not validated" — callers only log the outcome.
    """
    url = f"{NODE_REGISTRY_URL}/api/v1/nodes/{node_id}"
    try:
        response = httpx.get(url, timeout=5.0)
    except Exception as exc:
        logger.warning(f"⚠️ Cannot validate node {node_id}: {exc}")
        return False
    if response.status_code != 200:
        logger.warning(f"⚠️ Node not found in registry: {node_id}")
        return False
    logger.debug(f"✅ Node validated: {node_id}")
    return True
def get_room_id_by_slug(cursor, slug: str) -> str | None:
"""Get room_id by slug from city_rooms."""
cursor.execute("SELECT id FROM city_rooms WHERE slug = %s", (slug,))
row = cursor.fetchone()
return row['id'] if row else None
def sync_agents(config: dict, conn) -> tuple[int, int, int]:
    """
    Sync agents from config to database.

    Upserts each agent into the `agents` table and, when the agent declares
    a primary room, upserts a row in `agent_room_bindings`. Each agent is
    wrapped in a SAVEPOINT: in PostgreSQL any SQL error aborts the whole
    transaction, so without the savepoint one bad agent would make every
    subsequent statement fail with "current transaction is aborted".

    Args:
        config: Parsed YAML config (keys: 'agents', optional 'default_node_id').
        conn: Open psycopg2 connection; committed once at the end.

    Returns:
        Tuple of (created, updated, errors)
    """
    cursor = conn.cursor(cursor_factory=RealDictCursor)
    created = 0
    updated = 0
    errors = 0
    default_node_id = config.get('default_node_id', 'node-2-macbook-m4max')
    for agent in config.get('agents', []):
        agent_id = agent['agent_id']
        node_id = agent.get('node_id', default_node_id)
        try:
            # Isolate this agent's statements so a failure here can be
            # rolled back without losing previously synced agents.
            cursor.execute("SAVEPOINT sync_agent")
            # Validate node (optional - just warning)
            validate_node(node_id)
            # Upsert agent (using 'id' as primary key, which is agent_id)
            cursor.execute("""
                INSERT INTO agents (
                    id, display_name, kind, role, avatar_url, color_hint,
                    is_active, node_id, district, primary_room_slug, model, priority,
                    status, created_at, updated_at
                ) VALUES (
                    %(agent_id)s, %(display_name)s, %(kind)s, %(role)s, %(avatar_url)s, %(color_hint)s,
                    true, %(node_id)s, %(district)s, %(primary_room_slug)s, %(model)s, %(priority)s,
                    'online', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
                )
                ON CONFLICT (id) DO UPDATE SET
                    display_name = EXCLUDED.display_name,
                    kind = EXCLUDED.kind,
                    role = EXCLUDED.role,
                    avatar_url = EXCLUDED.avatar_url,
                    color_hint = EXCLUDED.color_hint,
                    is_active = true,
                    node_id = EXCLUDED.node_id,
                    district = EXCLUDED.district,
                    primary_room_slug = EXCLUDED.primary_room_slug,
                    model = EXCLUDED.model,
                    priority = EXCLUDED.priority,
                    status = 'online',
                    updated_at = CURRENT_TIMESTAMP
                RETURNING (xmax = 0) as is_insert
            """, {
                'agent_id': agent_id,
                'display_name': agent.get('display_name', agent_id),
                'kind': agent.get('kind', 'agent'),
                'role': agent.get('role', ''),
                'avatar_url': agent.get('avatar_url'),
                'color_hint': agent.get('color_hint', '#6366F1'),
                'node_id': node_id,
                'district': agent.get('district'),
                'primary_room_slug': agent.get('primary_room_slug'),
                'model': agent.get('model'),
                'priority': agent.get('priority', 'medium'),
            })
            result = cursor.fetchone()
            # xmax = 0 distinguishes a fresh INSERT from an ON CONFLICT update.
            if result and result['is_insert']:
                created += 1
                logger.info(f"✅ Created agent: {agent_id}")
            else:
                updated += 1
                logger.debug(f"🔄 Updated agent: {agent_id}")
            # Create room binding
            room_slug = agent.get('primary_room_slug')
            if room_slug:
                room_id = get_room_id_by_slug(cursor, room_slug)
                if room_id:
                    cursor.execute("""
                        INSERT INTO agent_room_bindings (agent_id, room_id, role, is_primary)
                        VALUES (%(agent_id)s, %(room_id)s, 'resident', true)
                        ON CONFLICT (agent_id, room_id) DO UPDATE SET
                            is_primary = true,
                            updated_at = CURRENT_TIMESTAMP
                    """, {'agent_id': agent_id, 'room_id': room_id})
                else:
                    logger.warning(f"⚠️ Room not found for agent {agent_id}: {room_slug}")
        except Exception as e:
            errors += 1
            logger.error(f"❌ Error syncing agent {agent_id}: {e}")
            # Undo only this agent's partial work; keep earlier agents intact.
            try:
                cursor.execute("ROLLBACK TO SAVEPOINT sync_agent")
            except Exception as rb_err:
                # Connection itself is broken — no point continuing the loop.
                logger.error(f"❌ Could not roll back savepoint, aborting sync: {rb_err}")
                break
    conn.commit()
    return created, updated, errors
def sync_districts(config: dict, conn) -> int:
    """Sync districts from config to database.

    Upserts each district into `city_districts`. Each district runs inside
    a SAVEPOINT so one failing row does not abort the PostgreSQL transaction
    for the remaining districts (after an SQL error all further statements
    would otherwise fail with "current transaction is aborted").

    Args:
        config: Parsed YAML config (key: 'districts').
        conn: Open psycopg2 connection; committed once at the end.

    Returns:
        Number of districts successfully synced.
    """
    cursor = conn.cursor()
    count = 0
    for district in config.get('districts', []):
        try:
            cursor.execute("SAVEPOINT sync_district")
            cursor.execute("""
                INSERT INTO city_districts (id, name, description, color, icon, room_slug)
                VALUES (%(id)s, %(name)s, %(description)s, %(color)s, %(icon)s, %(room_slug)s)
                ON CONFLICT (id) DO UPDATE SET
                    name = EXCLUDED.name,
                    description = EXCLUDED.description,
                    color = EXCLUDED.color,
                    icon = EXCLUDED.icon,
                    room_slug = EXCLUDED.room_slug,
                    updated_at = CURRENT_TIMESTAMP
            """, {
                'id': district['id'],
                'name': district['name'],
                'description': district.get('description', ''),
                'color': district.get('color', '#6366F1'),
                'icon': district.get('icon', 'building'),
                'room_slug': district.get('room_slug'),
            })
            count += 1
        except Exception as e:
            logger.error(f"❌ Error syncing district {district['id']}: {e}")
            # Discard only this district's failed statement; keep prior work.
            try:
                cursor.execute("ROLLBACK TO SAVEPOINT sync_district")
            except Exception as rb_err:
                logger.error(f"❌ Could not roll back savepoint, aborting sync: {rb_err}")
                break
    conn.commit()
    logger.info(f"✅ Synced {count} districts")
    return count
def main():
    """Main entry point: load config, connect to DB, sync districts then agents.

    Exits with status 1 on any fatal error or when any agent failed to sync.
    The DB connection is always closed, even when sync fails partway
    (the original code leaked it on the exception path).
    """
    logger.info("🚀 Starting Agent-City Sync")
    logger.info(f"📁 Config: {CONFIG_PATH}")
    # Only log the host/db portion of the URL, never the credentials.
    logger.info(f"🗄️ Database: {DATABASE_URL.split('@')[-1] if '@' in DATABASE_URL else DATABASE_URL}")
    logger.info(f"📡 Node Registry: {NODE_REGISTRY_URL}")
    print()
    conn = None
    try:
        # Load config
        config = load_config()
        # Connect to database
        conn = psycopg2.connect(DATABASE_URL)
        logger.info("✅ Connected to database")
        # Sync districts
        sync_districts(config, conn)
        # Sync agents
        created, updated, errors = sync_agents(config, conn)
        # Summary
        print()
        logger.info("=" * 50)
        logger.info("📊 SYNC SUMMARY")
        logger.info("=" * 50)
        logger.info(f"✅ Agents created: {created}")
        logger.info(f"🔄 Agents updated: {updated}")
        logger.info(f"❌ Errors: {errors}")
        logger.info(f"📍 Total agents: {created + updated}")
        logger.info("=" * 50)
        if errors > 0:
            # SystemExit is not an Exception subclass, so this bypasses
            # the handler below.
            sys.exit(1)
    except Exception as e:
        logger.error(f"❌ Fatal error: {e}")
        sys.exit(1)
    finally:
        # Close the connection on every path, including failures.
        if conn is not None:
            conn.close()
# Script entry point: run the sync when executed directly
# (e.g. `python scripts/sync_agents_from_config.py`).
if __name__ == '__main__':
    main()

View File

@@ -21,20 +21,33 @@ class AgentsSource:
- display_name - display_name
- kind - kind
- status - status
- room_id (current_room_id) - room_id
- color - color
- node_id
- district
- model
- role
- avatar_url
""" """
query = text(""" query = text("""
SELECT SELECT
id as agent_id, a.id as agent_id,
display_name, a.display_name,
kind, a.kind,
status, a.status,
current_room_id as room_id, COALESCE(cr.id, a.current_room_id) as room_id,
color COALESCE(a.color_hint, a.color, 'cyan') as color,
FROM agents a.node_id,
WHERE status IN ('online', 'busy') a.district,
ORDER BY display_name a.model,
a.role,
a.avatar_url,
a.primary_room_slug
FROM agents a
LEFT JOIN city_rooms cr ON cr.slug = a.primary_room_slug
WHERE a.status IN ('online', 'busy')
AND (a.is_active = true OR a.is_active IS NULL)
ORDER BY a.display_name
""") """)
try: try:
@@ -69,17 +82,25 @@ class AgentsSource:
return [] return []
def get_all_agents(self) -> List[Dict]: def get_all_agents(self) -> List[Dict]:
"""Get all agents (including offline)""" """Get all active agents (including offline)"""
query = text(""" query = text("""
SELECT SELECT
id as agent_id, a.id as agent_id,
display_name, a.display_name,
kind, a.kind,
status, a.status,
current_room_id as room_id, COALESCE(cr.id, a.current_room_id) as room_id,
color COALESCE(a.color_hint, a.color, 'cyan') as color,
FROM agents a.node_id,
ORDER BY display_name a.district,
a.model,
a.role,
a.avatar_url,
a.primary_room_slug
FROM agents a
LEFT JOIN city_rooms cr ON cr.slug = a.primary_room_slug
WHERE a.is_active = true OR a.is_active IS NULL
ORDER BY a.display_name
""") """)
try: try:

View File

@@ -86,7 +86,12 @@ class PresenceAggregator:
kind=agent.get("kind", "assistant"), kind=agent.get("kind", "assistant"),
status=agent.get("status", "online"), status=agent.get("status", "online"),
room_id=agent.get("room_id"), room_id=agent.get("room_id"),
color=agent.get("color", "cyan") color=agent.get("color", "cyan"),
node_id=agent.get("node_id"),
district=agent.get("district"),
model=agent.get("model"),
role=agent.get("role"),
avatar_url=agent.get("avatar_url"),
) )
all_agents.append(ap) all_agents.append(ap)

View File

@@ -103,6 +103,7 @@ async def health():
@app.get("/presence/summary") @app.get("/presence/summary")
@app.get("/presence/snapshot")
async def get_presence_summary(): async def get_presence_summary():
""" """
Get current presence snapshot. Get current presence snapshot.

View File

@@ -8,10 +8,15 @@ class AgentPresence(BaseModel):
"""Agent presence in a room""" """Agent presence in a room"""
agent_id: str agent_id: str
display_name: str display_name: str
kind: str = "assistant" # assistant, civic, oracle, builder kind: str = "assistant" # assistant, civic, oracle, builder, vision, etc.
status: str = "offline" # online, offline, busy status: str = "offline" # online, offline, busy
room_id: Optional[str] = None room_id: Optional[str] = None
color: Optional[str] = None color: Optional[str] = None
node_id: Optional[str] = None # Node where agent runs (node-1-hetzner, node-2-macbook)
district: Optional[str] = None # City district (leadership, engineering, etc.)
model: Optional[str] = None # LLM model used by agent
role: Optional[str] = None # Agent's role description
avatar_url: Optional[str] = None # Agent avatar URL
class RoomPresence(BaseModel): class RoomPresence(BaseModel):

View File

@@ -0,0 +1,369 @@
"""
Node Dashboard API - Aggregator for node status and metrics
"""
import asyncio
import logging
import httpx
import psutil
from typing import Dict, Any, Optional, List
from datetime import datetime
logger = logging.getLogger(__name__)
# Probe timeout in seconds
PROBE_TIMEOUT = 0.5
PROBE_TIMEOUT_LONG = 1.0
class DashboardAggregator:
    """Aggregates data from multiple services for node dashboard.

    Each probe_* method issues short-timeout HTTP GETs against a service on
    `node_ip` and normalizes the result into a status dict; failures never
    raise, they surface as status "down"/"degraded" with an error string.
    """

    def __init__(self, node_ip: str = "localhost"):
        # Host used in every probe URL; one shared async client gives
        # connection pooling across the parallel probes.
        self.node_ip = node_ip
        self.client = httpx.AsyncClient(timeout=PROBE_TIMEOUT)

    async def close(self):
        """Release the underlying HTTP client. Call once when done."""
        await self.client.aclose()

    async def _probe(self, url: str, timeout: float = PROBE_TIMEOUT) -> Dict[str, Any]:
        """Execute HTTP probe with timeout.

        Returns a dict with "status" of "up" (HTTP 200, includes parsed JSON
        body and latency), "degraded" (non-200), or "down" (timeout /
        connection / other error).
        """
        try:
            resp = await self.client.get(url, timeout=timeout)
            if resp.status_code == 200:
                return {"status": "up", "data": resp.json(), "latency_ms": int(resp.elapsed.total_seconds() * 1000)}
            else:
                return {"status": "degraded", "error": f"HTTP {resp.status_code}"}
        except httpx.TimeoutException:
            return {"status": "down", "error": "timeout"}
        except httpx.ConnectError:
            return {"status": "down", "error": "connection refused"}
        except Exception as e:
            return {"status": "down", "error": str(e)}

    async def get_infra_metrics(self) -> Dict[str, Any]:
        """Get infrastructure metrics (CPU, RAM, disk, GPUs) using psutil.

        GPU info comes from `nvidia-smi` when present; on machines without
        it the "gpus" list is simply empty. Returns zeroed metrics if psutil
        itself fails.
        """
        try:
            cpu_pct = psutil.cpu_percent(interval=0.1)
            mem = psutil.virtual_memory()
            disk = psutil.disk_usage('/')
            result = {
                "cpu_usage_pct": round(cpu_pct, 1),
                "ram": {
                    "total_gb": round(mem.total / (1024**3), 1),
                    "used_gb": round(mem.used / (1024**3), 1)
                },
                "disk": {
                    "total_gb": round(disk.total / (1024**3), 1),
                    "used_gb": round(disk.used / (1024**3), 1)
                },
                "gpus": []
            }
            # Try to get GPU info (nvidia-smi or similar)
            try:
                import subprocess
                nvidia_output = subprocess.run(
                    ['nvidia-smi', '--query-gpu=name,memory.total,memory.used,utilization.gpu', '--format=csv,noheader,nounits'],
                    capture_output=True, text=True, timeout=2
                )
                if nvidia_output.returncode == 0:
                    for line in nvidia_output.stdout.strip().split('\n'):
                        parts = [p.strip() for p in line.split(',')]
                        if len(parts) >= 4:
                            result["gpus"].append({
                                "name": parts[0],
                                "vram_gb": round(float(parts[1]) / 1024, 1),
                                "used_gb": round(float(parts[2]) / 1024, 1),
                                "sm_util_pct": int(parts[3])
                            })
            except Exception:
                # nvidia-smi absent or unparsable output: leave "gpus" empty.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
                pass
            return result
        except Exception as e:
            logger.error(f"Failed to get infra metrics: {e}")
            return {
                "cpu_usage_pct": 0,
                "ram": {"total_gb": 0, "used_gb": 0},
                "disk": {"total_gb": 0, "used_gb": 0},
                "gpus": []
            }

    async def probe_swapper(self, port: int = 8890) -> Dict[str, Any]:
        """Probe Swapper service: /health plus /models inventory."""
        base_url = f"http://{self.node_ip}:{port}"
        health_result = await self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG)
        models_result = await self._probe(f"{base_url}/models", PROBE_TIMEOUT_LONG)
        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": health_result.get("latency_ms", 0),
            "storage": {"total_gb": 0, "used_gb": 0, "free_gb": 0},
            "models": []
        }
        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["active_model"] = data.get("active_model")
            result["mode"] = data.get("mode")
        if models_result.get("status") == "up":
            data = models_result.get("data", {})
            for m in data.get("models", []):
                result["models"].append({
                    "name": m.get("name"),
                    "size_gb": m.get("size_gb", 0),
                    "device": m.get("device", "disk"),
                    "state": m.get("status", "unloaded")
                })
        return result

    async def probe_router(self, port: int = 9102) -> Dict[str, Any]:
        """Probe DAGI Router service: /health plus per-backend status."""
        base_url = f"http://{self.node_ip}:{port}"
        health_result = await self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG)
        backends_result = await self._probe(f"{base_url}/backends/status", PROBE_TIMEOUT_LONG)
        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "version": "unknown",
            "backends": [],
            "metrics": {
                "requests_1m": 0,
                "requests_1h": 0,
                "error_rate_1h": 0,
                "avg_latency_ms_1h": 0
            }
        }
        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["version"] = data.get("version", "unknown")
            result["nats_connected"] = data.get("nats_connected", False)
        if backends_result.get("status") == "up":
            # /backends/status returns a JSON list of backend descriptors.
            for backend in backends_result.get("data", []):
                result["backends"].append({
                    "name": backend.get("name"),
                    "status": backend.get("status"),
                    "latency_ms": backend.get("latency_ms", 0),
                    "error": backend.get("error")
                })
        return result

    async def probe_service(self, name: str, port: int, health_path: str = "/health") -> Dict[str, Any]:
        """Probe a generic AI service's health endpoint."""
        base_url = f"http://{self.node_ip}:{port}"
        result = await self._probe(f"{base_url}{health_path}")
        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "error": result.get("error")
        }

    async def probe_ollama(self, port: int = 11434) -> Dict[str, Any]:
        """Probe Ollama service via /api/tags; reports up to 10 model names."""
        base_url = f"http://{self.node_ip}:{port}"
        result = await self._probe(f"{base_url}/api/tags", PROBE_TIMEOUT_LONG)
        models = []
        if result.get("status") == "up":
            data = result.get("data", {})
            for m in data.get("models", []):
                models.append(m.get("name"))
        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "models": models[:10],  # Limit to 10 models
            "error": result.get("error")
        }

    async def probe_matrix(self, synapse_port: int = 8018, presence_port: int = 8085) -> Dict[str, Any]:
        """Probe Matrix services (Synapse homeserver + presence bridge)."""
        synapse_result = await self._probe(f"http://{self.node_ip}:{synapse_port}/_matrix/client/versions")
        presence_result = await self._probe(f"http://{self.node_ip}:{presence_port}/health")
        return {
            "enabled": synapse_result.get("status") == "up",
            "homeserver": f"http://{self.node_ip}:{synapse_port}",
            "synapse": {
                "status": synapse_result.get("status", "unknown"),
                "latency_ms": synapse_result.get("latency_ms", 0)
            },
            "presence_bridge": {
                "status": presence_result.get("status", "unknown"),
                "latency_ms": presence_result.get("latency_ms", 0)
            }
        }

    async def probe_monitoring(self, prometheus_port: int = 9090, grafana_port: int = 3001) -> Dict[str, Any]:
        """Probe monitoring services (Prometheus readiness, Grafana health)."""
        prometheus_result = await self._probe(f"http://{self.node_ip}:{prometheus_port}/-/ready")
        grafana_result = await self._probe(f"http://{self.node_ip}:{grafana_port}/api/health")
        return {
            "prometheus": {
                "url": f"http://{self.node_ip}:{prometheus_port}",
                "status": prometheus_result.get("status", "unknown")
            },
            "grafana": {
                "url": f"http://{self.node_ip}:{grafana_port}",
                "status": grafana_result.get("status", "unknown")
            },
            "logging": {
                # Loki probing not implemented yet; placeholder status.
                "loki": {"status": "unknown"}
            }
        }

    async def get_agents_summary(self, city_service_port: int = 7001) -> Dict[str, Any]:
        """Get agents summary (totals, per-kind counts, top 5 online) from city service."""
        # City service uses /city/agents endpoint
        result = await self._probe(f"http://{self.node_ip}:{city_service_port}/city/agents", PROBE_TIMEOUT_LONG)
        summary = {
            "total": 0,
            "running": 0,
            "by_kind": {},
            "top": []
        }
        if result.get("status") == "up":
            agents = result.get("data", [])
            summary["total"] = len(agents)
            for agent in agents:
                kind = agent.get("kind", "unknown")
                summary["by_kind"][kind] = summary["by_kind"].get(kind, 0) + 1
                if agent.get("status") in ["online", "busy"]:
                    summary["running"] += 1
            # Top 5 agents
            online_agents = [a for a in agents if a.get("status") in ["online", "busy"]][:5]
            for agent in online_agents:
                summary["top"].append({
                    "agent_id": agent.get("id"),
                    "display_name": agent.get("display_name"),
                    "kind": agent.get("kind"),
                    "status": agent.get("status"),
                    "node_id": agent.get("node_id")
                })
        return summary
async def build_dashboard(node_profile: Dict[str, Any], node_ip: str = "localhost") -> Dict[str, Any]:
    """
    Build complete dashboard from node profile.

    Probes are selected from the profile's installed modules and executed
    concurrently via asyncio.gather (the previous loop awaited them one by
    one, serializing all probes despite the "parallel" intent). A failed
    probe is recorded as {"status": "error", ...} and never aborts the rest.

    Args:
        node_profile: Node profile from registry (with modules, gpu, roles)
        node_ip: IP address to probe services

    Returns:
        Complete dashboard JSON
    """
    aggregator = DashboardAggregator(node_ip)
    try:
        # Build module port map
        module_ports = {}
        for module in node_profile.get("modules", []):
            if module.get("port"):
                module_ports[module["id"]] = module["port"]
        # Parallel probes
        tasks = {
            "infra": aggregator.get_infra_metrics(),
        }
        # Add probes based on modules
        if "ai.swapper" in module_ports:
            tasks["swapper"] = aggregator.probe_swapper(module_ports["ai.swapper"])
        if "ai.router" in module_ports:
            tasks["router"] = aggregator.probe_router(module_ports["ai.router"])
        if "ai.ollama" in module_ports:
            tasks["ollama"] = aggregator.probe_ollama(module_ports["ai.ollama"])
        # Generic AI services
        ai_services = ["ai.stt", "ai.tts", "ai.ocr", "ai.memory", "ai.crewai"]
        for svc in ai_services:
            if svc in module_ports:
                svc_name = svc.replace("ai.", "")
                tasks[f"svc_{svc_name}"] = aggregator.probe_service(svc_name, module_ports[svc])
        # Matrix
        synapse_port = module_ports.get("matrix.synapse", 8018)
        presence_port = module_ports.get("matrix.presence", 8085)
        if "matrix.synapse" in module_ports or "matrix.presence" in module_ports:
            tasks["matrix"] = aggregator.probe_matrix(synapse_port, presence_port)
        # Monitoring
        prometheus_port = module_ports.get("monitoring.prometheus", 9090)
        tasks["monitoring"] = aggregator.probe_monitoring(prometheus_port)
        # Agents
        city_port = module_ports.get("daarion.city", 7001)
        if "daarion.city" in module_ports or "daarion.agents" in module_ports:
            tasks["agents"] = aggregator.get_agents_summary(city_port)
        # Execute all probes concurrently; return_exceptions=True keeps one
        # failure from cancelling the others.
        names = list(tasks)
        outcomes = await asyncio.gather(*tasks.values(), return_exceptions=True)
        results = {}
        for name, outcome in zip(names, outcomes):
            if isinstance(outcome, BaseException):
                logger.error(f"Probe {name} failed: {outcome}")
                results[name] = {"status": "error", "error": str(outcome)}
            else:
                results[name] = outcome
        # Build dashboard response
        dashboard = {
            "node": {
                "node_id": node_profile.get("node_id"),
                "name": node_profile.get("name"),
                "roles": node_profile.get("roles", []),
                "status": node_profile.get("status", "unknown"),
                "public_hostname": node_profile.get("ip_address"),
                "environment": node_profile.get("role", "production"),
                "gpu": node_profile.get("gpu"),
                "modules": node_profile.get("modules", []),
                "version": node_profile.get("version", "1.0.0")
            },
            "infra": results.get("infra", {}),
            "ai": {
                "swapper": results.get("swapper", {"status": "not_installed"}),
                "router": results.get("router", {"status": "not_installed"}),
                "ollama": results.get("ollama", {"status": "not_installed"}),
                "services": {}
            },
            "agents": results.get("agents", {"total": 0, "running": 0, "by_kind": {}, "top": []}),
            "matrix": results.get("matrix", {"enabled": False}),
            "monitoring": results.get("monitoring", {})
        }
        # Add AI services
        for key, value in results.items():
            if key.startswith("svc_"):
                svc_name = key.replace("svc_", "")
                dashboard["ai"]["services"][svc_name] = value
        return dashboard
    finally:
        # Always release the HTTP client, even on failure.
        await aggregator.close()

View File

@@ -1,24 +1,24 @@
""" """
SQLite Database connection for local development PostgreSQL Database connection for Node Registry
Use this for testing without PostgreSQL
""" """
import os import os
from sqlalchemy import create_engine from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, Session from sqlalchemy.orm import sessionmaker, Session
from contextlib import contextmanager from contextlib import contextmanager
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# SQLite database file # Database URL from environment
DB_FILE = os.getenv("NODE_REGISTRY_DB_FILE", "node_registry.db") DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/node_registry")
DATABASE_URL = f"sqlite:///{DB_FILE}"
# Create engine # Create engine
engine = create_engine( engine = create_engine(
DATABASE_URL, DATABASE_URL,
connect_args={"check_same_thread": False}, # Required for SQLite pool_pre_ping=True,
echo=os.getenv("NODE_REGISTRY_ENV") == "development", # Log SQL in dev pool_size=5,
max_overflow=10,
echo=os.getenv("NODE_REGISTRY_ENV") == "development",
) )
# Create session factory # Create session factory
@@ -65,7 +65,7 @@ def check_db_connection() -> bool:
"""Check if database connection is working""" """Check if database connection is working"""
try: try:
with engine.connect() as conn: with engine.connect() as conn:
conn.execute("SELECT 1") conn.execute(text("SELECT 1"))
return True return True
except Exception as e: except Exception as e:
logger.error(f"Database connection failed: {e}") logger.error(f"Database connection failed: {e}")
@@ -75,8 +75,7 @@ def check_db_connection() -> bool:
def get_db_info() -> dict: def get_db_info() -> dict:
"""Get database connection information""" """Get database connection information"""
return { return {
"type": "sqlite", "type": "postgresql",
"database": DB_FILE, "url": DATABASE_URL.split("@")[-1] if "@" in DATABASE_URL else DATABASE_URL,
"connected": check_db_connection(), "connected": check_db_connection(),
} }

View File

@@ -488,6 +488,187 @@ async def discover_nodes(query: NodeDiscoveryQuery, db: Session = Depends(get_db
raise HTTPException(status_code=500, detail=f"Discovery failed: {str(e)}") raise HTTPException(status_code=500, detail=f"Discovery failed: {str(e)}")
# ============================================================================
# Node Profile Endpoints (Standard v1)
# ============================================================================
from app.dashboard import build_dashboard
@app.get("/api/v1/nodes/self/dashboard")
async def get_self_dashboard(db: Session = Depends(get_db)):
"""
Get dashboard for current node (self).
Uses the first node in registry as "self" for now.
In production, this would use JWT claims to identify the node.
"""
try:
from sqlalchemy import text
# Get first node as "self" (simplified for v1)
result = db.execute(text("""
SELECT node_id FROM nodes ORDER BY registered_at LIMIT 1
"""))
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail="No nodes registered")
# Delegate to node dashboard
return await get_node_dashboard(row[0], db)
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Failed to get self dashboard: {e}")
raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
@app.get("/api/v1/nodes/{node_id}/dashboard")
async def get_node_dashboard(node_id: str, db: Session = Depends(get_db)):
"""
Get complete node dashboard with live status.
Aggregates:
- Node profile (roles, modules, GPU)
- Infrastructure metrics (CPU, RAM, Disk, GPU)
- AI services status (Swapper, Router, STT, Vision, OCR)
- Agents summary
- Matrix integration status
- Monitoring status
"""
try:
from sqlalchemy import text
# Get node profile
result = db.execute(text("""
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
status, roles, gpu, modules, version, vpn_ip
FROM nodes
WHERE node_id = :node_id
"""), {"node_id": node_id})
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")
profile = {
"node_id": row[0],
"name": row[1],
"role": row[2],
"type": row[3],
"ip_address": row[4],
"hostname": row[5],
"status": row[6],
"roles": list(row[7]) if row[7] else [],
"gpu": row[8],
"modules": row[9] if row[9] else [],
"version": row[10] or "1.0.0",
}
# Build dashboard with probes
# For Docker network, use gateway IP to access host services
import os
# Default to Docker gateway for dagi-network
node_ip = os.getenv("PROBE_HOST", "172.21.0.1")
# For NODE2, use its actual IP (for remote probing)
if node_id == "node-2-macbook-m4max":
node_ip = row[4] or "192.168.1.33"
dashboard = await build_dashboard(profile, node_ip)
return dashboard
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Failed to get node dashboard: {e}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
@app.get("/api/v1/nodes/{node_id}/profile")
async def get_node_profile(node_id: str, db: Session = Depends(get_db)):
"""
Get full node profile including modules, GPU, roles.
Node Profile Standard v1.
"""
try:
from sqlalchemy import text
result = db.execute(text("""
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
status, roles, gpu, modules, version, vpn_ip
FROM nodes
WHERE node_id = :node_id
"""), {"node_id": node_id})
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")
return {
"node_id": row[0],
"name": row[1],
"role": row[2],
"type": row[3],
"ip_address": row[4],
"hostname": row[5],
"status": row[6],
"roles": list(row[7]) if row[7] else [],
"gpu": row[8],
"modules": row[9] if row[9] else [],
"version": row[10] or "1.0.0",
"vpn_ip": str(row[11]) if row[11] else None,
}
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Failed to get node profile: {e}")
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
@app.get("/api/v1/nodes/profiles")
async def get_all_node_profiles(db: Session = Depends(get_db)):
"""
Get all node profiles with modules.
"""
try:
from sqlalchemy import text
result = db.execute(text("""
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
status, roles, gpu, modules, version, vpn_ip
FROM nodes
ORDER BY node_id
"""))
nodes = []
for row in result.fetchall():
nodes.append({
"node_id": row[0],
"name": row[1],
"role": row[2],
"type": row[3],
"ip_address": row[4],
"hostname": row[5],
"status": row[6],
"roles": list(row[7]) if row[7] else [],
"gpu": row[8],
"modules": row[9] if row[9] else [],
"version": row[10] or "1.0.0",
})
return {"nodes": nodes, "total": len(nodes)}
except Exception as e:
logger.error(f"❌ Failed to get node profiles: {e}")
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
# ============================================================================ # ============================================================================
# Maintenance Endpoints # Maintenance Endpoints
# ============================================================================ # ============================================================================
@@ -518,7 +699,7 @@ if __name__ == "__main__":
print(f"🚀 Starting {SERVICE_NAME} v{VERSION}") print(f"🚀 Starting {SERVICE_NAME} v{VERSION}")
print(f"📊 Environment: {ENV}") print(f"📊 Environment: {ENV}")
print(f"🔌 Port: {HTTP_PORT}") print(f"🔌 Port: {HTTP_PORT}")
print(f"🗄️ Database: {DB_USER}@{DB_HOST}:{DB_PORT}/{DB_NAME}") print(f"🗄️ Database: {os.getenv('DATABASE_URL', 'not configured')}")
print(f"📝 Log level: {LOG_LEVEL}") print(f"📝 Log level: {LOG_LEVEL}")
print() print()

View File

@@ -3,8 +3,8 @@ SQLAlchemy ORM Models for Node Registry
""" """
from datetime import datetime from datetime import datetime
from typing import Optional from typing import Optional
from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index, ARRAY
from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB, ARRAY as PG_ARRAY
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship from sqlalchemy.orm import relationship
from sqlalchemy.types import TypeDecorator, String as SQLString, Text as SQLText from sqlalchemy.types import TypeDecorator, String as SQLString, Text as SQLText
@@ -90,6 +90,12 @@ class Node(Base):
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow) updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
node_metadata = Column(JSONB, default={}) node_metadata = Column(JSONB, default={})
# Node Profile Standard v1 fields
roles = Column(ARRAY(String), default=[]) # ['core', 'gateway', 'matrix', 'agents', 'gpu']
gpu = Column(JSONB, default=None) # {"name": "NVIDIA RTX 4000", "vram_gb": 20}
modules = Column(JSONB, default=[]) # [{"id": "ai.router", "status": "up", "port": 9102}, ...]
version = Column(String(50), default='1.0.0')
# Relationships # Relationships
profiles = relationship("NodeProfile", back_populates="node", cascade="all, delete-orphan") profiles = relationship("NodeProfile", back_populates="node", cascade="all, delete-orphan")
heartbeats = relationship("HeartbeatLog", back_populates="node", cascade="all, delete-orphan") heartbeats = relationship("HeartbeatLog", back_populates="node", cascade="all, delete-orphan")
@@ -113,6 +119,11 @@ class Node(Base):
"registered_at": self.registered_at.isoformat() if self.registered_at else None, "registered_at": self.registered_at.isoformat() if self.registered_at else None,
"updated_at": self.updated_at.isoformat() if self.updated_at else None, "updated_at": self.updated_at.isoformat() if self.updated_at else None,
"metadata": self.node_metadata or {}, "metadata": self.node_metadata or {},
# Node Profile Standard v1
"roles": self.roles or [],
"gpu": self.gpu,
"modules": self.modules or [],
"version": self.version or "1.0.0",
} }

View File

@@ -9,3 +9,5 @@ alembic==1.14.0
python-json-logger==3.2.1 python-json-logger==3.2.1
prometheus-client==0.21.0 prometheus-client==0.21.0
psycopg2-binary>=2.9.0 psycopg2-binary>=2.9.0
psutil>=5.9.0
requests>=2.28.0

View File

@@ -1,6 +1,6 @@
FROM python:3.11-slim FROM python:3.11-slim
# Встановити системні залежності # Встановити системні залежності для Tesseract
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \
tesseract-ocr \ tesseract-ocr \
tesseract-ocr-ukr \ tesseract-ocr-ukr \
@@ -16,16 +16,16 @@ WORKDIR /app
# Копіювати requirements # Копіювати requirements
COPY requirements.txt . COPY requirements.txt .
# Встановити Python залежності # Встановити Python залежності (без EasyOCR для швидкого білду)
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
# Завантажити EasyOCR моделі
RUN python -c "import easyocr; easyocr.Reader(['uk', 'en'], gpu=False)"
# Копіювати код # Копіювати код
COPY app/ ./app/ COPY app/ ./app/
EXPOSE 8896 EXPOSE 8896
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8896"] # Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8896/health || exit 1
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8896"]

View File

@@ -92,11 +92,19 @@ async def root():
@app.get("/health") @app.get("/health")
async def health(): async def health():
"""Health check endpoint""" """Health check endpoint"""
gpu_available = False
if EASYOCR_AVAILABLE:
try:
import torch
gpu_available = torch.cuda.is_available()
except:
pass
return { return {
"status": "healthy" if (TESSERACT_AVAILABLE or EASYOCR_AVAILABLE) else "degraded", "status": "healthy" if (TESSERACT_AVAILABLE or EASYOCR_AVAILABLE) else "degraded",
"tesseract": "available" if TESSERACT_AVAILABLE else "unavailable", "tesseract": "available" if TESSERACT_AVAILABLE else "unavailable",
"easyocr": "available" if EASYOCR_AVAILABLE else "unavailable", "easyocr": "available" if EASYOCR_AVAILABLE else "unavailable",
"gpu": torch.cuda.is_available() if EASYOCR_AVAILABLE else False "gpu": gpu_available
} }
def preprocess_image(img: Image.Image) -> Image.Image: def preprocess_image(img: Image.Image) -> Image.Image:

View File

@@ -1,10 +1,14 @@
# Core
fastapi==0.104.1 fastapi==0.104.1
uvicorn[standard]==0.24.0 uvicorn[standard]==0.24.0
python-multipart==0.0.6 python-multipart==0.0.6
# OCR - Tesseract only (EasyOCR optional)
pytesseract==0.3.10 pytesseract==0.3.10
easyocr==1.7.1
Pillow==10.1.0 Pillow==10.1.0
numpy==1.24.3 numpy==1.24.3
torch==2.1.0
torchvision==0.16.0
# Optional: EasyOCR (uncomment for full support, requires GPU)
# easyocr==1.7.1
# torch==2.1.0
# torchvision==0.16.0

View File

@@ -1,15 +1,27 @@
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from pydantic import BaseModel from pydantic import BaseModel
from typing import Literal, Optional, Dict, Any from typing import Literal, Optional, Dict, Any, List
import asyncio import asyncio
import json import json
import os import os
import yaml import yaml
import httpx
import logging
app = FastAPI(title="DAARION Router", version="1.0.0") logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="DAARION Router", version="2.0.0")
# Configuration # Configuration
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222") NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://192.168.1.33:8890")
STT_URL = os.getenv("STT_URL", "http://192.168.1.33:8895")
VISION_URL = os.getenv("VISION_URL", "http://192.168.1.33:11434")
OCR_URL = os.getenv("OCR_URL", "http://192.168.1.33:8896")
# HTTP client for backend services
http_client: Optional[httpx.AsyncClient] = None
# NATS client # NATS client
nc = None nc = None
@@ -49,25 +61,35 @@ config = load_config()
@app.on_event("startup") @app.on_event("startup")
async def startup_event(): async def startup_event():
"""Initialize NATS connection and subscriptions""" """Initialize NATS connection and subscriptions"""
global nc, nats_available global nc, nats_available, http_client
print("🚀 DAGI Router starting up...") logger.info("🚀 DAGI Router v2.0.0 starting up...")
# Initialize HTTP client
http_client = httpx.AsyncClient(timeout=60.0)
logger.info("✅ HTTP client initialized")
# Try to connect to NATS # Try to connect to NATS
try: try:
import nats import nats
nc = await nats.connect(NATS_URL) nc = await nats.connect(NATS_URL)
nats_available = True nats_available = True
print(f"✅ Connected to NATS at {NATS_URL}") logger.info(f"✅ Connected to NATS at {NATS_URL}")
# Subscribe to filter decisions if enabled # Subscribe to filter decisions if enabled
if config.get("messaging_inbound", {}).get("enabled", True): if config.get("messaging_inbound", {}).get("enabled", True):
asyncio.create_task(subscribe_to_filter_decisions()) asyncio.create_task(subscribe_to_filter_decisions())
else: else:
print("⚠️ Messaging inbound routing disabled in config") logger.warning("⚠️ Messaging inbound routing disabled in config")
except Exception as e: except Exception as e:
print(f"⚠️ NATS not available: {e}") logger.warning(f"⚠️ NATS not available: {e}")
print("⚠️ Running in test mode (HTTP only)") logger.warning("⚠️ Running in test mode (HTTP only)")
nats_available = False nats_available = False
# Log backend URLs
logger.info(f"📡 Swapper URL: {SWAPPER_URL}")
logger.info(f"📡 STT URL: {STT_URL}")
logger.info(f"📡 Vision URL: {VISION_URL}")
logger.info(f"📡 OCR URL: {OCR_URL}")
async def subscribe_to_filter_decisions(): async def subscribe_to_filter_decisions():
"""Subscribe to agent.filter.decision events""" """Subscribe to agent.filter.decision events"""
@@ -201,10 +223,239 @@ async def test_messaging_route(decision: FilterDecision):
@app.on_event("shutdown") @app.on_event("shutdown")
async def shutdown_event(): async def shutdown_event():
"""Clean shutdown""" """Clean shutdown"""
global nc global nc, http_client
if nc: if nc:
await nc.close() await nc.close()
print("✅ NATS connection closed") logger.info("✅ NATS connection closed")
if http_client:
await http_client.aclose()
logger.info("✅ HTTP client closed")
# ============================================================================
# Backend Integration Endpoints
# ============================================================================
class InferRequest(BaseModel):
    """Request body for POST /v1/agents/{agent_id}/infer.

    Carries the prompt plus optional generation parameters that are
    forwarded to the Ollama /api/generate options.
    """
    prompt: str  # user prompt text (required)
    model: Optional[str] = None  # explicit model id; falls back to "gpt-oss:latest" in the handler
    max_tokens: Optional[int] = 2048  # mapped to Ollama "num_predict"
    temperature: Optional[float] = 0.7  # sampling temperature passed through to the backend
    system_prompt: Optional[str] = None  # mapped to Ollama "system"
class InferResponse(BaseModel):
    """Response body for agent inference: generated text plus routing metadata."""
    response: str  # generated text from the backend
    model: str  # model id that actually served the request
    tokens_used: Optional[int] = None  # Ollama "eval_count" when available
    backend: str  # which backend chain handled the request, e.g. "swapper+ollama"
class BackendStatus(BaseModel):
    """Health snapshot of one backend service, returned by GET /backends/status."""
    name: str  # backend identifier: "swapper", "stt", "vision", "ocr"
    url: str  # base URL the probe was sent to
    status: str  # one of: online, offline, error
    active_model: Optional[str] = None  # currently loaded model(s), when the backend reports it
    error: Optional[str] = None  # failure detail (exception text or HTTP status)
async def _check_swapper() -> BackendStatus:
    """Probe the Swapper /health endpoint; report its active model when online."""
    try:
        resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0)
        if resp.status_code == 200:
            data = resp.json()
            return BackendStatus(
                name="swapper",
                url=SWAPPER_URL,
                status="online",
                active_model=data.get("active_model")
            )
        return BackendStatus(
            name="swapper",
            url=SWAPPER_URL,
            status="error",
            error=f"HTTP {resp.status_code}"
        )
    except Exception as e:
        # Connection refused / timeout / DNS failure -> treated as offline
        return BackendStatus(
            name="swapper",
            url=SWAPPER_URL,
            status="offline",
            error=str(e)
        )


async def _check_simple(name: str, url: str) -> BackendStatus:
    """Probe a backend that only exposes a plain /health endpoint (STT, OCR)."""
    try:
        resp = await http_client.get(f"{url}/health", timeout=5.0)
        if resp.status_code == 200:
            return BackendStatus(name=name, url=url, status="online")
        return BackendStatus(
            name=name,
            url=url,
            status="error",
            error=f"HTTP {resp.status_code}"
        )
    except Exception as e:
        return BackendStatus(name=name, url=url, status="offline", error=str(e))


async def _check_vision() -> BackendStatus:
    """Probe the Vision (Ollama) /api/tags endpoint; report up to 3 loaded models."""
    try:
        resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0)
        if resp.status_code == 200:
            data = resp.json()
            models = [m.get("name") for m in data.get("models", [])]
            return BackendStatus(
                name="vision",
                url=VISION_URL,
                status="online",
                # Show at most 3 model names so the field stays readable
                active_model=", ".join(models[:3]) if models else None
            )
        return BackendStatus(
            name="vision",
            url=VISION_URL,
            status="error",
            error=f"HTTP {resp.status_code}"
        )
    except Exception as e:
        return BackendStatus(
            name="vision",
            url=VISION_URL,
            status="offline",
            error=str(e)
        )


@app.get("/backends/status", response_model=List[BackendStatus])
async def get_backends_status():
    """Get status of all backend services.

    Probes Swapper, STT, Vision (Ollama) and OCR sequentially and returns one
    BackendStatus per service. A probe never raises: failures are reported as
    status "offline" (connection error) or "error" (non-200 response).
    """
    return [
        await _check_swapper(),
        await _check_simple("stt", STT_URL),
        await _check_vision(),
        await _check_simple("ocr", OCR_URL),
    ]
@app.post("/v1/agents/{agent_id}/infer", response_model=InferResponse)
async def agent_infer(agent_id: str, request: InferRequest):
    """Route an inference request to the appropriate backend.

    Strategy: verify the Swapper is healthy, ask it to load the requested
    model, then generate via Ollama. Any failure along that chain (including
    network errors, which are logged) falls through to a 503 response.
    """
    logger.info(f"🔀 Inference request for agent: {agent_id}")
    logger.info(f"📝 Prompt: {request.prompt[:100]}...")

    # Default model when the caller did not pick one
    model = request.model or "gpt-oss:latest"

    try:
        health_resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0)
        if health_resp.status_code == 200:
            # Ask the Swapper to ensure the model is resident
            load_resp = await http_client.post(
                f"{SWAPPER_URL}/load",
                json={"model": model},
                timeout=30.0
            )
            if load_resp.status_code == 200:
                # Generation itself goes through Ollama
                generate_resp = await http_client.post(
                    f"{VISION_URL}/api/generate",
                    json={
                        "model": model,
                        "prompt": request.prompt,
                        "system": request.system_prompt,
                        "stream": False,
                        "options": {
                            "num_predict": request.max_tokens,
                            "temperature": request.temperature
                        }
                    },
                    timeout=120.0
                )
                if generate_resp.status_code == 200:
                    payload = generate_resp.json()
                    return InferResponse(
                        response=payload.get("response", ""),
                        model=model,
                        tokens_used=payload.get("eval_count"),
                        backend="swapper+ollama"
                    )
    except Exception as e:
        logger.error(f"❌ Swapper/Ollama error: {e}")

    # Nothing succeeded above — report service unavailable
    raise HTTPException(
        status_code=503,
        detail=f"No backend available for model: {model}"
    )
@app.get("/v1/models")
async def list_available_models():
    """List all available models across backends.

    Merges the Swapper catalog with models already resident in Ollama,
    skipping duplicate ids. Each backend failure is logged and tolerated.
    """
    catalog = []
    seen_ids = set()

    # Swapper-managed models
    try:
        swapper_resp = await http_client.get(f"{SWAPPER_URL}/models", timeout=5.0)
        if swapper_resp.status_code == 200:
            for entry in swapper_resp.json().get("models", []):
                catalog.append({
                    "id": entry.get("name"),
                    "backend": "swapper",
                    "size_gb": entry.get("size_gb"),
                    "status": entry.get("status", "available")
                })
                seen_ids.add(entry.get("name"))
    except Exception as e:
        logger.warning(f"Cannot get Swapper models: {e}")

    # Models loaded directly in Ollama (skip ids already listed)
    try:
        ollama_resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0)
        if ollama_resp.status_code == 200:
            for entry in ollama_resp.json().get("models", []):
                model_name = entry.get("name")
                if model_name not in seen_ids:
                    catalog.append({
                        "id": model_name,
                        "backend": "ollama",
                        "size_gb": round(entry.get("size", 0) / 1e9, 1),
                        "status": "loaded"
                    })
                    seen_ids.add(model_name)
    except Exception as e:
        logger.warning(f"Cannot get Ollama models: {e}")

    return {"models": catalog, "total": len(catalog)}

View File

@@ -3,6 +3,7 @@ uvicorn[standard]==0.24.0
pydantic==2.5.0 pydantic==2.5.0
nats-py==2.6.0 nats-py==2.6.0
PyYAML==6.0.1 PyYAML==6.0.1
httpx>=0.25.0