feat: add node-registry dashboard, matrix-presence-aggregator, ocr-service updates
This commit is contained in:
248
scripts/sync_agents_from_config.py
Normal file
248
scripts/sync_agents_from_config.py
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Sync Agents from YAML Config to DAARION City Database
|
||||||
|
|
||||||
|
This script reads config/agents_city_mapping.yaml and syncs:
|
||||||
|
1. Agents to `agents` table
|
||||||
|
2. Agent-Room bindings to `agent_room_bindings` table
|
||||||
|
3. Validates node_id against Node Registry
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/sync_agents_from_config.py
|
||||||
|
|
||||||
|
Environment:
|
||||||
|
DATABASE_URL - PostgreSQL connection string for DAARION city DB
|
||||||
|
NODE_REGISTRY_URL - URL for Node Registry API (default: http://localhost:9205)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import yaml
|
||||||
|
import logging
|
||||||
|
import httpx
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import RealDictCursor
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s [%(levelname)s] %(message)s'
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://postgres:postgres@localhost:5432/daarion')
|
||||||
|
NODE_REGISTRY_URL = os.getenv('NODE_REGISTRY_URL', 'http://localhost:9205')
|
||||||
|
CONFIG_PATH = Path(__file__).parent.parent / 'config' / 'agents_city_mapping.yaml'
|
||||||
|
|
||||||
|
|
||||||
|
def load_config() -> dict:
    """Load the agent/district mapping from CONFIG_PATH.

    Returns:
        Parsed YAML as a dict. An empty config file yields {} instead of
        None (yaml.safe_load returns None for empty input, which would
        otherwise crash the .get() calls below and in the sync functions).

    Raises:
        FileNotFoundError: if CONFIG_PATH does not exist.
    """
    if not CONFIG_PATH.exists():
        raise FileNotFoundError(f"Config file not found: {CONFIG_PATH}")

    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

    logger.info(f"✅ Loaded config: {len(config.get('agents', []))} agents, {len(config.get('districts', []))} districts")
    return config
|
||||||
|
|
||||||
|
|
||||||
|
def validate_node(node_id: str) -> bool:
    """Return True when *node_id* is known to the Node Registry.

    Best-effort check: any network error or non-200 response is logged
    as a warning and reported as False — it never raises.
    """
    url = f"{NODE_REGISTRY_URL}/api/v1/nodes/{node_id}"
    try:
        response = httpx.get(url, timeout=5.0)
    except Exception as e:
        # Registry unreachable — treat as "not validated", keep going.
        logger.warning(f"⚠️ Cannot validate node {node_id}: {e}")
        return False

    if response.status_code != 200:
        logger.warning(f"⚠️ Node not found in registry: {node_id}")
        return False

    logger.debug(f"✅ Node validated: {node_id}")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def get_room_id_by_slug(cursor, slug: str) -> str | None:
|
||||||
|
"""Get room_id by slug from city_rooms."""
|
||||||
|
cursor.execute("SELECT id FROM city_rooms WHERE slug = %s", (slug,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
return row['id'] if row else None
|
||||||
|
|
||||||
|
|
||||||
|
def sync_agents(config: dict, conn) -> tuple[int, int, int]:
    """
    Sync agents from config to database.

    For each agent: upsert into `agents`, then upsert its primary-room
    binding into `agent_room_bindings` (when the room slug resolves).

    Each agent runs inside a SAVEPOINT. Without it, the first SQL error
    aborts the whole PostgreSQL transaction and every later agent fails
    with "current transaction is aborted" — the savepoint lets one bad
    agent be rolled back while earlier agents' work is preserved.

    Returns:
        Tuple of (created, updated, errors)
    """
    cursor = conn.cursor(cursor_factory=RealDictCursor)
    created = 0
    updated = 0
    errors = 0

    default_node_id = config.get('default_node_id', 'node-2-macbook-m4max')

    for agent in config.get('agents', []):
        agent_id = agent['agent_id']
        node_id = agent.get('node_id', default_node_id)

        try:
            # Validate node (optional - just warning)
            validate_node(node_id)

            # Per-agent savepoint so a failure here can be undone alone.
            cursor.execute("SAVEPOINT sync_agent")

            # Upsert agent (using 'id' as primary key, which is agent_id)
            cursor.execute("""
                INSERT INTO agents (
                    id, display_name, kind, role, avatar_url, color_hint,
                    is_active, node_id, district, primary_room_slug, model, priority,
                    status, created_at, updated_at
                ) VALUES (
                    %(agent_id)s, %(display_name)s, %(kind)s, %(role)s, %(avatar_url)s, %(color_hint)s,
                    true, %(node_id)s, %(district)s, %(primary_room_slug)s, %(model)s, %(priority)s,
                    'online', CURRENT_TIMESTAMP, CURRENT_TIMESTAMP
                )
                ON CONFLICT (id) DO UPDATE SET
                    display_name = EXCLUDED.display_name,
                    kind = EXCLUDED.kind,
                    role = EXCLUDED.role,
                    avatar_url = EXCLUDED.avatar_url,
                    color_hint = EXCLUDED.color_hint,
                    is_active = true,
                    node_id = EXCLUDED.node_id,
                    district = EXCLUDED.district,
                    primary_room_slug = EXCLUDED.primary_room_slug,
                    model = EXCLUDED.model,
                    priority = EXCLUDED.priority,
                    status = 'online',
                    updated_at = CURRENT_TIMESTAMP
                RETURNING (xmax = 0) as is_insert
            """, {
                'agent_id': agent_id,
                'display_name': agent.get('display_name', agent_id),
                'kind': agent.get('kind', 'agent'),
                'role': agent.get('role', ''),
                'avatar_url': agent.get('avatar_url'),
                'color_hint': agent.get('color_hint', '#6366F1'),
                'node_id': node_id,
                'district': agent.get('district'),
                'primary_room_slug': agent.get('primary_room_slug'),
                'model': agent.get('model'),
                'priority': agent.get('priority', 'medium'),
            })

            # (xmax = 0) is true only for freshly inserted rows.
            result = cursor.fetchone()
            if result and result['is_insert']:
                created += 1
                logger.info(f"✅ Created agent: {agent_id}")
            else:
                updated += 1
                logger.debug(f"🔄 Updated agent: {agent_id}")

            # Create room binding
            room_slug = agent.get('primary_room_slug')
            if room_slug:
                room_id = get_room_id_by_slug(cursor, room_slug)
                if room_id:
                    cursor.execute("""
                        INSERT INTO agent_room_bindings (agent_id, room_id, role, is_primary)
                        VALUES (%(agent_id)s, %(room_id)s, 'resident', true)
                        ON CONFLICT (agent_id, room_id) DO UPDATE SET
                            is_primary = true,
                            updated_at = CURRENT_TIMESTAMP
                    """, {'agent_id': agent_id, 'room_id': room_id})
                else:
                    logger.warning(f"⚠️ Room not found for agent {agent_id}: {room_slug}")

            cursor.execute("RELEASE SAVEPOINT sync_agent")

        except Exception as e:
            errors += 1
            logger.error(f"❌ Error syncing agent {agent_id}: {e}")
            try:
                # Undo only this agent's statements; keep earlier work.
                cursor.execute("ROLLBACK TO SAVEPOINT sync_agent")
            except Exception:
                # Savepoint was never created (error before it) or the
                # connection is broken — reset the whole transaction.
                conn.rollback()

    conn.commit()
    return created, updated, errors
|
||||||
|
|
||||||
|
|
||||||
|
def sync_districts(config: dict, conn) -> int:
    """Sync districts from config to database.

    Each district is upserted inside a SAVEPOINT: without it, one SQL
    error aborts the PostgreSQL transaction and every later district
    (and the agents synced afterwards on the same connection) would fail
    with "current transaction is aborted".

    Returns:
        Number of districts successfully upserted.
    """
    cursor = conn.cursor()
    count = 0

    for district in config.get('districts', []):
        try:
            cursor.execute("SAVEPOINT sync_district")
            cursor.execute("""
                INSERT INTO city_districts (id, name, description, color, icon, room_slug)
                VALUES (%(id)s, %(name)s, %(description)s, %(color)s, %(icon)s, %(room_slug)s)
                ON CONFLICT (id) DO UPDATE SET
                    name = EXCLUDED.name,
                    description = EXCLUDED.description,
                    color = EXCLUDED.color,
                    icon = EXCLUDED.icon,
                    room_slug = EXCLUDED.room_slug,
                    updated_at = CURRENT_TIMESTAMP
            """, {
                'id': district['id'],
                'name': district['name'],
                'description': district.get('description', ''),
                'color': district.get('color', '#6366F1'),
                'icon': district.get('icon', 'building'),
                'room_slug': district.get('room_slug'),
            })
            cursor.execute("RELEASE SAVEPOINT sync_district")
            count += 1
        except Exception as e:
            logger.error(f"❌ Error syncing district {district['id']}: {e}")
            try:
                # Roll back only this district; earlier rows survive.
                cursor.execute("ROLLBACK TO SAVEPOINT sync_district")
            except Exception:
                conn.rollback()

    conn.commit()
    logger.info(f"✅ Synced {count} districts")
    return count
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point: load config, connect to the DB, sync districts then agents.

    Exits with status 1 when any agent failed to sync or on a fatal error.
    The connection is closed in a finally block so it is released even
    when a sync step raises (the original leaked it on error).
    """
    logger.info("🚀 Starting Agent-City Sync")
    logger.info(f"📁 Config: {CONFIG_PATH}")
    # Log only host/db, never credentials.
    logger.info(f"🗄️ Database: {DATABASE_URL.split('@')[-1] if '@' in DATABASE_URL else DATABASE_URL}")
    logger.info(f"📡 Node Registry: {NODE_REGISTRY_URL}")
    print()

    try:
        # Load config
        config = load_config()

        # Connect to database
        conn = psycopg2.connect(DATABASE_URL)
        logger.info("✅ Connected to database")

        try:
            # Sync districts first so agents can reference them
            sync_districts(config, conn)

            # Sync agents
            created, updated, errors = sync_agents(config, conn)
        finally:
            conn.close()

        # Summary
        print()
        logger.info("=" * 50)
        logger.info("📊 SYNC SUMMARY")
        logger.info("=" * 50)
        logger.info(f"✅ Agents created: {created}")
        logger.info(f"🔄 Agents updated: {updated}")
        logger.info(f"❌ Errors: {errors}")
        logger.info(f"📍 Total agents: {created + updated}")
        logger.info("=" * 50)

        if errors > 0:
            sys.exit(1)

    except Exception as e:
        logger.error(f"❌ Fatal error: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||||
|
|
||||||
@@ -21,20 +21,33 @@ class AgentsSource:
|
|||||||
- display_name
|
- display_name
|
||||||
- kind
|
- kind
|
||||||
- status
|
- status
|
||||||
- room_id (current_room_id)
|
- room_id
|
||||||
- color
|
- color
|
||||||
|
- node_id
|
||||||
|
- district
|
||||||
|
- model
|
||||||
|
- role
|
||||||
|
- avatar_url
|
||||||
"""
|
"""
|
||||||
query = text("""
|
query = text("""
|
||||||
SELECT
|
SELECT
|
||||||
id as agent_id,
|
a.id as agent_id,
|
||||||
display_name,
|
a.display_name,
|
||||||
kind,
|
a.kind,
|
||||||
status,
|
a.status,
|
||||||
current_room_id as room_id,
|
COALESCE(cr.id, a.current_room_id) as room_id,
|
||||||
color
|
COALESCE(a.color_hint, a.color, 'cyan') as color,
|
||||||
FROM agents
|
a.node_id,
|
||||||
WHERE status IN ('online', 'busy')
|
a.district,
|
||||||
ORDER BY display_name
|
a.model,
|
||||||
|
a.role,
|
||||||
|
a.avatar_url,
|
||||||
|
a.primary_room_slug
|
||||||
|
FROM agents a
|
||||||
|
LEFT JOIN city_rooms cr ON cr.slug = a.primary_room_slug
|
||||||
|
WHERE a.status IN ('online', 'busy')
|
||||||
|
AND (a.is_active = true OR a.is_active IS NULL)
|
||||||
|
ORDER BY a.display_name
|
||||||
""")
|
""")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -69,17 +82,25 @@ class AgentsSource:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
def get_all_agents(self) -> List[Dict]:
|
def get_all_agents(self) -> List[Dict]:
|
||||||
"""Get all agents (including offline)"""
|
"""Get all active agents (including offline)"""
|
||||||
query = text("""
|
query = text("""
|
||||||
SELECT
|
SELECT
|
||||||
id as agent_id,
|
a.id as agent_id,
|
||||||
display_name,
|
a.display_name,
|
||||||
kind,
|
a.kind,
|
||||||
status,
|
a.status,
|
||||||
current_room_id as room_id,
|
COALESCE(cr.id, a.current_room_id) as room_id,
|
||||||
color
|
COALESCE(a.color_hint, a.color, 'cyan') as color,
|
||||||
FROM agents
|
a.node_id,
|
||||||
ORDER BY display_name
|
a.district,
|
||||||
|
a.model,
|
||||||
|
a.role,
|
||||||
|
a.avatar_url,
|
||||||
|
a.primary_room_slug
|
||||||
|
FROM agents a
|
||||||
|
LEFT JOIN city_rooms cr ON cr.slug = a.primary_room_slug
|
||||||
|
WHERE a.is_active = true OR a.is_active IS NULL
|
||||||
|
ORDER BY a.display_name
|
||||||
""")
|
""")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -86,7 +86,12 @@ class PresenceAggregator:
|
|||||||
kind=agent.get("kind", "assistant"),
|
kind=agent.get("kind", "assistant"),
|
||||||
status=agent.get("status", "online"),
|
status=agent.get("status", "online"),
|
||||||
room_id=agent.get("room_id"),
|
room_id=agent.get("room_id"),
|
||||||
color=agent.get("color", "cyan")
|
color=agent.get("color", "cyan"),
|
||||||
|
node_id=agent.get("node_id"),
|
||||||
|
district=agent.get("district"),
|
||||||
|
model=agent.get("model"),
|
||||||
|
role=agent.get("role"),
|
||||||
|
avatar_url=agent.get("avatar_url"),
|
||||||
)
|
)
|
||||||
all_agents.append(ap)
|
all_agents.append(ap)
|
||||||
|
|
||||||
|
|||||||
@@ -103,6 +103,7 @@ async def health():
|
|||||||
|
|
||||||
|
|
||||||
@app.get("/presence/summary")
|
@app.get("/presence/summary")
|
||||||
|
@app.get("/presence/snapshot")
|
||||||
async def get_presence_summary():
|
async def get_presence_summary():
|
||||||
"""
|
"""
|
||||||
Get current presence snapshot.
|
Get current presence snapshot.
|
||||||
|
|||||||
@@ -8,10 +8,15 @@ class AgentPresence(BaseModel):
|
|||||||
"""Agent presence in a room"""
|
"""Agent presence in a room"""
|
||||||
agent_id: str
|
agent_id: str
|
||||||
display_name: str
|
display_name: str
|
||||||
kind: str = "assistant" # assistant, civic, oracle, builder
|
kind: str = "assistant" # assistant, civic, oracle, builder, vision, etc.
|
||||||
status: str = "offline" # online, offline, busy
|
status: str = "offline" # online, offline, busy
|
||||||
room_id: Optional[str] = None
|
room_id: Optional[str] = None
|
||||||
color: Optional[str] = None
|
color: Optional[str] = None
|
||||||
|
node_id: Optional[str] = None # Node where agent runs (node-1-hetzner, node-2-macbook)
|
||||||
|
district: Optional[str] = None # City district (leadership, engineering, etc.)
|
||||||
|
model: Optional[str] = None # LLM model used by agent
|
||||||
|
role: Optional[str] = None # Agent's role description
|
||||||
|
avatar_url: Optional[str] = None # Agent avatar URL
|
||||||
|
|
||||||
|
|
||||||
class RoomPresence(BaseModel):
|
class RoomPresence(BaseModel):
|
||||||
|
|||||||
369
services/node-registry/app/dashboard.py
Normal file
369
services/node-registry/app/dashboard.py
Normal file
@@ -0,0 +1,369 @@
|
|||||||
|
"""
|
||||||
|
Node Dashboard API - Aggregator for node status and metrics
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import httpx
|
||||||
|
import psutil
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Probe timeout in seconds
|
||||||
|
PROBE_TIMEOUT = 0.5
|
||||||
|
PROBE_TIMEOUT_LONG = 1.0
|
||||||
|
|
||||||
|
|
||||||
|
class DashboardAggregator:
    """Aggregates data from multiple services for node dashboard.

    Owns a single shared httpx.AsyncClient — call close() when done.
    All probe_* / get_* methods are best-effort: they return a status
    dict and never raise on network failures.
    """

    def __init__(self, node_ip: str = "localhost"):
        self.node_ip = node_ip
        self.client = httpx.AsyncClient(timeout=PROBE_TIMEOUT)

    async def close(self):
        """Dispose of the shared HTTP client."""
        await self.client.aclose()

    async def _probe(self, url: str, timeout: float = PROBE_TIMEOUT) -> Dict[str, Any]:
        """Execute HTTP probe with timeout.

        Returns:
            {"status": "up", "data": ..., "latency_ms": ...} on HTTP 200,
            {"status": "degraded", "error": ...} on other status codes,
            {"status": "down", "error": ...} on timeout/connect/other errors.
        """
        try:
            resp = await self.client.get(url, timeout=timeout)
            if resp.status_code == 200:
                return {"status": "up", "data": resp.json(), "latency_ms": int(resp.elapsed.total_seconds() * 1000)}
            else:
                return {"status": "degraded", "error": f"HTTP {resp.status_code}"}
        except httpx.TimeoutException:
            return {"status": "down", "error": "timeout"}
        except httpx.ConnectError:
            return {"status": "down", "error": "connection refused"}
        except Exception as e:
            return {"status": "down", "error": str(e)}

    async def get_infra_metrics(self) -> Dict[str, Any]:
        """Get infrastructure metrics (CPU, RAM, disk, GPUs) using psutil.

        GPU info comes from nvidia-smi when available; on any failure the
        GPU list is simply left empty. On total failure, zeroed metrics
        are returned so the dashboard shape stays stable.
        """
        try:
            # interval=0.1 blocks ~100ms to sample CPU usage.
            cpu_pct = psutil.cpu_percent(interval=0.1)
            mem = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            result = {
                "cpu_usage_pct": round(cpu_pct, 1),
                "ram": {
                    "total_gb": round(mem.total / (1024**3), 1),
                    "used_gb": round(mem.used / (1024**3), 1)
                },
                "disk": {
                    "total_gb": round(disk.total / (1024**3), 1),
                    "used_gb": round(disk.used / (1024**3), 1)
                },
                "gpus": []
            }

            # Try to get GPU info (nvidia-smi or similar)
            try:
                import subprocess
                nvidia_output = subprocess.run(
                    ['nvidia-smi', '--query-gpu=name,memory.total,memory.used,utilization.gpu', '--format=csv,noheader,nounits'],
                    capture_output=True, text=True, timeout=2
                )
                if nvidia_output.returncode == 0:
                    for line in nvidia_output.stdout.strip().split('\n'):
                        parts = [p.strip() for p in line.split(',')]
                        if len(parts) >= 4:
                            result["gpus"].append({
                                "name": parts[0],
                                "vram_gb": round(float(parts[1]) / 1024, 1),
                                "used_gb": round(float(parts[2]) / 1024, 1),
                                "sm_util_pct": int(parts[3])
                            })
            except Exception:
                # nvidia-smi missing, timed out, or output unparsable —
                # GPU list stays empty. (Was a bare `except:`, which also
                # swallowed KeyboardInterrupt/SystemExit.)
                pass

            return result
        except Exception as e:
            logger.error(f"Failed to get infra metrics: {e}")
            return {
                "cpu_usage_pct": 0,
                "ram": {"total_gb": 0, "used_gb": 0},
                "disk": {"total_gb": 0, "used_gb": 0},
                "gpus": []
            }

    async def probe_swapper(self, port: int = 8890) -> Dict[str, Any]:
        """Probe Swapper service: health plus its loaded-model inventory."""
        base_url = f"http://{self.node_ip}:{port}"

        health_result = await self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG)
        models_result = await self._probe(f"{base_url}/models", PROBE_TIMEOUT_LONG)

        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": health_result.get("latency_ms", 0),
            "storage": {"total_gb": 0, "used_gb": 0, "free_gb": 0},
            "models": []
        }

        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["active_model"] = data.get("active_model")
            result["mode"] = data.get("mode")

        if models_result.get("status") == "up":
            data = models_result.get("data", {})
            for m in data.get("models", []):
                result["models"].append({
                    "name": m.get("name"),
                    "size_gb": m.get("size_gb", 0),
                    "device": m.get("device", "disk"),
                    "state": m.get("status", "unloaded")
                })

        return result

    async def probe_router(self, port: int = 9102) -> Dict[str, Any]:
        """Probe DAGI Router service: health plus per-backend status."""
        base_url = f"http://{self.node_ip}:{port}"

        health_result = await self._probe(f"{base_url}/health", PROBE_TIMEOUT_LONG)
        backends_result = await self._probe(f"{base_url}/backends/status", PROBE_TIMEOUT_LONG)

        result = {
            "status": health_result.get("status", "unknown"),
            "endpoint": base_url,
            "version": "unknown",
            "backends": [],
            # Placeholder metrics; the router does not expose them yet.
            "metrics": {
                "requests_1m": 0,
                "requests_1h": 0,
                "error_rate_1h": 0,
                "avg_latency_ms_1h": 0
            }
        }

        if health_result.get("status") == "up":
            data = health_result.get("data", {})
            result["version"] = data.get("version", "unknown")
            result["nats_connected"] = data.get("nats_connected", False)

        if backends_result.get("status") == "up":
            # /backends/status returns a JSON list of backend dicts.
            for backend in backends_result.get("data", []):
                result["backends"].append({
                    "name": backend.get("name"),
                    "status": backend.get("status"),
                    "latency_ms": backend.get("latency_ms", 0),
                    "error": backend.get("error")
                })

        return result

    async def probe_service(self, name: str, port: int, health_path: str = "/health") -> Dict[str, Any]:
        """Probe generic AI service via its health endpoint."""
        base_url = f"http://{self.node_ip}:{port}"

        result = await self._probe(f"{base_url}{health_path}")

        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "error": result.get("error")
        }

    async def probe_ollama(self, port: int = 11434) -> Dict[str, Any]:
        """Probe Ollama service and list its installed model names."""
        base_url = f"http://{self.node_ip}:{port}"

        result = await self._probe(f"{base_url}/api/tags", PROBE_TIMEOUT_LONG)

        models = []
        if result.get("status") == "up":
            data = result.get("data", {})
            for m in data.get("models", []):
                models.append(m.get("name"))

        return {
            "status": result.get("status", "unknown"),
            "endpoint": base_url,
            "latency_ms": result.get("latency_ms", 0),
            "models": models[:10],  # Limit to 10 models
            "error": result.get("error")
        }

    async def probe_matrix(self, synapse_port: int = 8018, presence_port: int = 8085) -> Dict[str, Any]:
        """Probe Matrix services (Synapse homeserver + presence bridge)."""
        synapse_result = await self._probe(f"http://{self.node_ip}:{synapse_port}/_matrix/client/versions")
        presence_result = await self._probe(f"http://{self.node_ip}:{presence_port}/health")

        return {
            "enabled": synapse_result.get("status") == "up",
            "homeserver": f"http://{self.node_ip}:{synapse_port}",
            "synapse": {
                "status": synapse_result.get("status", "unknown"),
                "latency_ms": synapse_result.get("latency_ms", 0)
            },
            "presence_bridge": {
                "status": presence_result.get("status", "unknown"),
                "latency_ms": presence_result.get("latency_ms", 0)
            }
        }

    async def probe_monitoring(self, prometheus_port: int = 9090, grafana_port: int = 3001) -> Dict[str, Any]:
        """Probe monitoring services (Prometheus readiness, Grafana health)."""
        prometheus_result = await self._probe(f"http://{self.node_ip}:{prometheus_port}/-/ready")
        grafana_result = await self._probe(f"http://{self.node_ip}:{grafana_port}/api/health")

        return {
            "prometheus": {
                "url": f"http://{self.node_ip}:{prometheus_port}",
                "status": prometheus_result.get("status", "unknown")
            },
            "grafana": {
                "url": f"http://{self.node_ip}:{grafana_port}",
                "status": grafana_result.get("status", "unknown")
            },
            "logging": {
                # Loki probing not implemented yet — reported as unknown.
                "loki": {"status": "unknown"}
            }
        }

    async def get_agents_summary(self, city_service_port: int = 7001) -> Dict[str, Any]:
        """Get agents summary (totals, per-kind counts, top 5 online) from city service."""
        # City service uses /city/agents endpoint
        result = await self._probe(f"http://{self.node_ip}:{city_service_port}/city/agents", PROBE_TIMEOUT_LONG)

        summary = {
            "total": 0,
            "running": 0,
            "by_kind": {},
            "top": []
        }

        if result.get("status") == "up":
            # Endpoint returns a JSON list of agent dicts.
            agents = result.get("data", [])
            summary["total"] = len(agents)

            for agent in agents:
                kind = agent.get("kind", "unknown")
                summary["by_kind"][kind] = summary["by_kind"].get(kind, 0) + 1

                if agent.get("status") in ["online", "busy"]:
                    summary["running"] += 1

            # Top 5 agents
            online_agents = [a for a in agents if a.get("status") in ["online", "busy"]][:5]
            for agent in online_agents:
                summary["top"].append({
                    "agent_id": agent.get("id"),
                    "display_name": agent.get("display_name"),
                    "kind": agent.get("kind"),
                    "status": agent.get("status"),
                    "node_id": agent.get("node_id")
                })

        return summary
|
||||||
|
|
||||||
|
|
||||||
|
async def build_dashboard(node_profile: Dict[str, Any], node_ip: str = "localhost") -> Dict[str, Any]:
    """
    Build complete dashboard from node profile.

    Args:
        node_profile: Node profile from registry (with modules, gpu, roles)
        node_ip: IP address to probe services

    Returns:
        Complete dashboard JSON
    """
    aggregator = DashboardAggregator(node_ip)

    try:
        # Build module port map
        module_ports = {}
        for module in node_profile.get("modules", []):
            if module.get("port"):
                module_ports[module["id"]] = module["port"]

        # Parallel probes
        tasks = {
            "infra": aggregator.get_infra_metrics(),
        }

        # Add probes based on modules
        if "ai.swapper" in module_ports:
            tasks["swapper"] = aggregator.probe_swapper(module_ports["ai.swapper"])

        if "ai.router" in module_ports:
            tasks["router"] = aggregator.probe_router(module_ports["ai.router"])

        if "ai.ollama" in module_ports:
            tasks["ollama"] = aggregator.probe_ollama(module_ports["ai.ollama"])

        # Generic AI services
        ai_services = ["ai.stt", "ai.tts", "ai.ocr", "ai.memory", "ai.crewai"]
        for svc in ai_services:
            if svc in module_ports:
                svc_name = svc.replace("ai.", "")
                tasks[f"svc_{svc_name}"] = aggregator.probe_service(svc_name, module_ports[svc])

        # Matrix
        synapse_port = module_ports.get("matrix.synapse", 8018)
        presence_port = module_ports.get("matrix.presence", 8085)
        if "matrix.synapse" in module_ports or "matrix.presence" in module_ports:
            tasks["matrix"] = aggregator.probe_matrix(synapse_port, presence_port)

        # Monitoring
        prometheus_port = module_ports.get("monitoring.prometheus", 9090)
        tasks["monitoring"] = aggregator.probe_monitoring(prometheus_port)

        # Agents
        city_port = module_ports.get("daarion.city", 7001)
        if "daarion.city" in module_ports or "daarion.agents" in module_ports:
            tasks["agents"] = aggregator.get_agents_summary(city_port)

        # Execute all probes in parallel. The original awaited each task
        # sequentially, so total latency was the SUM of probe timeouts;
        # gather() makes it the max instead.
        names = list(tasks)
        outcomes = await asyncio.gather(
            *(tasks[name] for name in names),
            return_exceptions=True,
        )
        results = {}
        for name, outcome in zip(names, outcomes):
            if isinstance(outcome, BaseException):
                logger.error(f"Probe {name} failed: {outcome}")
                results[name] = {"status": "error", "error": str(outcome)}
            else:
                results[name] = outcome

        # Build dashboard response
        dashboard = {
            "node": {
                "node_id": node_profile.get("node_id"),
                "name": node_profile.get("name"),
                "roles": node_profile.get("roles", []),
                "status": node_profile.get("status", "unknown"),
                "public_hostname": node_profile.get("ip_address"),
                "environment": node_profile.get("role", "production"),
                "gpu": node_profile.get("gpu"),
                "modules": node_profile.get("modules", []),
                "version": node_profile.get("version", "1.0.0")
            },
            "infra": results.get("infra", {}),
            "ai": {
                "swapper": results.get("swapper", {"status": "not_installed"}),
                "router": results.get("router", {"status": "not_installed"}),
                "ollama": results.get("ollama", {"status": "not_installed"}),
                "services": {}
            },
            "agents": results.get("agents", {"total": 0, "running": 0, "by_kind": {}, "top": []}),
            "matrix": results.get("matrix", {"enabled": False}),
            "monitoring": results.get("monitoring", {})
        }

        # Add AI services
        for key, value in results.items():
            if key.startswith("svc_"):
                svc_name = key.replace("svc_", "")
                dashboard["ai"]["services"][svc_name] = value

        return dashboard

    finally:
        await aggregator.close()
|
||||||
|
|
||||||
@@ -1,24 +1,24 @@
|
|||||||
"""
|
"""
|
||||||
SQLite Database connection for local development
|
PostgreSQL Database connection for Node Registry
|
||||||
Use this for testing without PostgreSQL
|
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine, text
|
||||||
from sqlalchemy.orm import sessionmaker, Session
|
from sqlalchemy.orm import sessionmaker, Session
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# SQLite database file
|
# Database URL from environment
|
||||||
DB_FILE = os.getenv("NODE_REGISTRY_DB_FILE", "node_registry.db")
|
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/node_registry")
|
||||||
DATABASE_URL = f"sqlite:///{DB_FILE}"
|
|
||||||
|
|
||||||
# Create engine
|
# Create engine
|
||||||
engine = create_engine(
|
engine = create_engine(
|
||||||
DATABASE_URL,
|
DATABASE_URL,
|
||||||
connect_args={"check_same_thread": False}, # Required for SQLite
|
pool_pre_ping=True,
|
||||||
echo=os.getenv("NODE_REGISTRY_ENV") == "development", # Log SQL in dev
|
pool_size=5,
|
||||||
|
max_overflow=10,
|
||||||
|
echo=os.getenv("NODE_REGISTRY_ENV") == "development",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create session factory
|
# Create session factory
|
||||||
@@ -65,7 +65,7 @@ def check_db_connection() -> bool:
|
|||||||
"""Check if database connection is working"""
|
"""Check if database connection is working"""
|
||||||
try:
|
try:
|
||||||
with engine.connect() as conn:
|
with engine.connect() as conn:
|
||||||
conn.execute("SELECT 1")
|
conn.execute(text("SELECT 1"))
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Database connection failed: {e}")
|
logger.error(f"Database connection failed: {e}")
|
||||||
@@ -75,8 +75,7 @@ def check_db_connection() -> bool:
|
|||||||
def get_db_info() -> dict:
|
def get_db_info() -> dict:
|
||||||
"""Get database connection information"""
|
"""Get database connection information"""
|
||||||
return {
|
return {
|
||||||
"type": "sqlite",
|
"type": "postgresql",
|
||||||
"database": DB_FILE,
|
"url": DATABASE_URL.split("@")[-1] if "@" in DATABASE_URL else DATABASE_URL,
|
||||||
"connected": check_db_connection(),
|
"connected": check_db_connection(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -488,6 +488,187 @@ async def discover_nodes(query: NodeDiscoveryQuery, db: Session = Depends(get_db
|
|||||||
raise HTTPException(status_code=500, detail=f"Discovery failed: {str(e)}")
|
raise HTTPException(status_code=500, detail=f"Discovery failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Node Profile Endpoints (Standard v1)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
from app.dashboard import build_dashboard
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v1/nodes/self/dashboard")
|
||||||
|
async def get_self_dashboard(db: Session = Depends(get_db)):
|
||||||
|
"""
|
||||||
|
Get dashboard for current node (self).
|
||||||
|
|
||||||
|
Uses the first node in registry as "self" for now.
|
||||||
|
In production, this would use JWT claims to identify the node.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
# Get first node as "self" (simplified for v1)
|
||||||
|
result = db.execute(text("""
|
||||||
|
SELECT node_id FROM nodes ORDER BY registered_at LIMIT 1
|
||||||
|
"""))
|
||||||
|
|
||||||
|
row = result.fetchone()
|
||||||
|
if not row:
|
||||||
|
raise HTTPException(status_code=404, detail="No nodes registered")
|
||||||
|
|
||||||
|
# Delegate to node dashboard
|
||||||
|
return await get_node_dashboard(row[0], db)
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Failed to get self dashboard: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v1/nodes/{node_id}/dashboard")
|
||||||
|
async def get_node_dashboard(node_id: str, db: Session = Depends(get_db)):
|
||||||
|
"""
|
||||||
|
Get complete node dashboard with live status.
|
||||||
|
|
||||||
|
Aggregates:
|
||||||
|
- Node profile (roles, modules, GPU)
|
||||||
|
- Infrastructure metrics (CPU, RAM, Disk, GPU)
|
||||||
|
- AI services status (Swapper, Router, STT, Vision, OCR)
|
||||||
|
- Agents summary
|
||||||
|
- Matrix integration status
|
||||||
|
- Monitoring status
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
# Get node profile
|
||||||
|
result = db.execute(text("""
|
||||||
|
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
|
||||||
|
status, roles, gpu, modules, version, vpn_ip
|
||||||
|
FROM nodes
|
||||||
|
WHERE node_id = :node_id
|
||||||
|
"""), {"node_id": node_id})
|
||||||
|
|
||||||
|
row = result.fetchone()
|
||||||
|
if not row:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")
|
||||||
|
|
||||||
|
profile = {
|
||||||
|
"node_id": row[0],
|
||||||
|
"name": row[1],
|
||||||
|
"role": row[2],
|
||||||
|
"type": row[3],
|
||||||
|
"ip_address": row[4],
|
||||||
|
"hostname": row[5],
|
||||||
|
"status": row[6],
|
||||||
|
"roles": list(row[7]) if row[7] else [],
|
||||||
|
"gpu": row[8],
|
||||||
|
"modules": row[9] if row[9] else [],
|
||||||
|
"version": row[10] or "1.0.0",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Build dashboard with probes
|
||||||
|
# For Docker network, use gateway IP to access host services
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Default to Docker gateway for dagi-network
|
||||||
|
node_ip = os.getenv("PROBE_HOST", "172.21.0.1")
|
||||||
|
|
||||||
|
# For NODE2, use its actual IP (for remote probing)
|
||||||
|
if node_id == "node-2-macbook-m4max":
|
||||||
|
node_ip = row[4] or "192.168.1.33"
|
||||||
|
|
||||||
|
dashboard = await build_dashboard(profile, node_ip)
|
||||||
|
|
||||||
|
return dashboard
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Failed to get node dashboard: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
raise HTTPException(status_code=500, detail=f"Dashboard failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v1/nodes/{node_id}/profile")
|
||||||
|
async def get_node_profile(node_id: str, db: Session = Depends(get_db)):
|
||||||
|
"""
|
||||||
|
Get full node profile including modules, GPU, roles.
|
||||||
|
Node Profile Standard v1.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
result = db.execute(text("""
|
||||||
|
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
|
||||||
|
status, roles, gpu, modules, version, vpn_ip
|
||||||
|
FROM nodes
|
||||||
|
WHERE node_id = :node_id
|
||||||
|
"""), {"node_id": node_id})
|
||||||
|
|
||||||
|
row = result.fetchone()
|
||||||
|
if not row:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Node not found: {node_id}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
"node_id": row[0],
|
||||||
|
"name": row[1],
|
||||||
|
"role": row[2],
|
||||||
|
"type": row[3],
|
||||||
|
"ip_address": row[4],
|
||||||
|
"hostname": row[5],
|
||||||
|
"status": row[6],
|
||||||
|
"roles": list(row[7]) if row[7] else [],
|
||||||
|
"gpu": row[8],
|
||||||
|
"modules": row[9] if row[9] else [],
|
||||||
|
"version": row[10] or "1.0.0",
|
||||||
|
"vpn_ip": str(row[11]) if row[11] else None,
|
||||||
|
}
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Failed to get node profile: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/v1/nodes/profiles")
|
||||||
|
async def get_all_node_profiles(db: Session = Depends(get_db)):
|
||||||
|
"""
|
||||||
|
Get all node profiles with modules.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
result = db.execute(text("""
|
||||||
|
SELECT node_id, node_name, node_role, node_type, ip_address, hostname,
|
||||||
|
status, roles, gpu, modules, version, vpn_ip
|
||||||
|
FROM nodes
|
||||||
|
ORDER BY node_id
|
||||||
|
"""))
|
||||||
|
|
||||||
|
nodes = []
|
||||||
|
for row in result.fetchall():
|
||||||
|
nodes.append({
|
||||||
|
"node_id": row[0],
|
||||||
|
"name": row[1],
|
||||||
|
"role": row[2],
|
||||||
|
"type": row[3],
|
||||||
|
"ip_address": row[4],
|
||||||
|
"hostname": row[5],
|
||||||
|
"status": row[6],
|
||||||
|
"roles": list(row[7]) if row[7] else [],
|
||||||
|
"gpu": row[8],
|
||||||
|
"modules": row[9] if row[9] else [],
|
||||||
|
"version": row[10] or "1.0.0",
|
||||||
|
})
|
||||||
|
|
||||||
|
return {"nodes": nodes, "total": len(nodes)}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Failed to get node profiles: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# Maintenance Endpoints
|
# Maintenance Endpoints
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@@ -518,7 +699,7 @@ if __name__ == "__main__":
|
|||||||
print(f"🚀 Starting {SERVICE_NAME} v{VERSION}")
|
print(f"🚀 Starting {SERVICE_NAME} v{VERSION}")
|
||||||
print(f"📊 Environment: {ENV}")
|
print(f"📊 Environment: {ENV}")
|
||||||
print(f"🔌 Port: {HTTP_PORT}")
|
print(f"🔌 Port: {HTTP_PORT}")
|
||||||
print(f"🗄️ Database: {DB_USER}@{DB_HOST}:{DB_PORT}/{DB_NAME}")
|
print(f"🗄️ Database: {os.getenv('DATABASE_URL', 'not configured')}")
|
||||||
print(f"📝 Log level: {LOG_LEVEL}")
|
print(f"📝 Log level: {LOG_LEVEL}")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ SQLAlchemy ORM Models for Node Registry
|
|||||||
"""
|
"""
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index
|
from sqlalchemy import Column, String, DateTime, Boolean, ForeignKey, Text, Index, ARRAY
|
||||||
from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB
|
from sqlalchemy.dialects.postgresql import UUID as PG_UUID, INET, JSONB as PG_JSONB, ARRAY as PG_ARRAY
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
from sqlalchemy.orm import relationship
|
from sqlalchemy.orm import relationship
|
||||||
from sqlalchemy.types import TypeDecorator, String as SQLString, Text as SQLText
|
from sqlalchemy.types import TypeDecorator, String as SQLString, Text as SQLText
|
||||||
@@ -90,6 +90,12 @@ class Node(Base):
|
|||||||
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
|
updated_at = Column(DateTime(timezone=True), default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||||
node_metadata = Column(JSONB, default={})
|
node_metadata = Column(JSONB, default={})
|
||||||
|
|
||||||
|
# Node Profile Standard v1 fields
|
||||||
|
roles = Column(ARRAY(String), default=[]) # ['core', 'gateway', 'matrix', 'agents', 'gpu']
|
||||||
|
gpu = Column(JSONB, default=None) # {"name": "NVIDIA RTX 4000", "vram_gb": 20}
|
||||||
|
modules = Column(JSONB, default=[]) # [{"id": "ai.router", "status": "up", "port": 9102}, ...]
|
||||||
|
version = Column(String(50), default='1.0.0')
|
||||||
|
|
||||||
# Relationships
|
# Relationships
|
||||||
profiles = relationship("NodeProfile", back_populates="node", cascade="all, delete-orphan")
|
profiles = relationship("NodeProfile", back_populates="node", cascade="all, delete-orphan")
|
||||||
heartbeats = relationship("HeartbeatLog", back_populates="node", cascade="all, delete-orphan")
|
heartbeats = relationship("HeartbeatLog", back_populates="node", cascade="all, delete-orphan")
|
||||||
@@ -113,6 +119,11 @@ class Node(Base):
|
|||||||
"registered_at": self.registered_at.isoformat() if self.registered_at else None,
|
"registered_at": self.registered_at.isoformat() if self.registered_at else None,
|
||||||
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
|
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
|
||||||
"metadata": self.node_metadata or {},
|
"metadata": self.node_metadata or {},
|
||||||
|
# Node Profile Standard v1
|
||||||
|
"roles": self.roles or [],
|
||||||
|
"gpu": self.gpu,
|
||||||
|
"modules": self.modules or [],
|
||||||
|
"version": self.version or "1.0.0",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -9,3 +9,5 @@ alembic==1.14.0
|
|||||||
python-json-logger==3.2.1
|
python-json-logger==3.2.1
|
||||||
prometheus-client==0.21.0
|
prometheus-client==0.21.0
|
||||||
psycopg2-binary>=2.9.0
|
psycopg2-binary>=2.9.0
|
||||||
|
psutil>=5.9.0
|
||||||
|
requests>=2.28.0
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
FROM python:3.11-slim
|
FROM python:3.11-slim
|
||||||
|
|
||||||
# Встановити системні залежності
|
# Встановити системні залежності для Tesseract
|
||||||
RUN apt-get update && apt-get install -y \
|
RUN apt-get update && apt-get install -y \
|
||||||
tesseract-ocr \
|
tesseract-ocr \
|
||||||
tesseract-ocr-ukr \
|
tesseract-ocr-ukr \
|
||||||
@@ -16,16 +16,16 @@ WORKDIR /app
|
|||||||
# Копіювати requirements
|
# Копіювати requirements
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
|
|
||||||
# Встановити Python залежності
|
# Встановити Python залежності (без EasyOCR для швидкого білду)
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
# Завантажити EasyOCR моделі
|
|
||||||
RUN python -c "import easyocr; easyocr.Reader(['uk', 'en'], gpu=False)"
|
|
||||||
|
|
||||||
# Копіювати код
|
# Копіювати код
|
||||||
COPY app/ ./app/
|
COPY app/ ./app/
|
||||||
|
|
||||||
EXPOSE 8896
|
EXPOSE 8896
|
||||||
|
|
||||||
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8896"]
|
# Health check
|
||||||
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||||
|
CMD curl -f http://localhost:8896/health || exit 1
|
||||||
|
|
||||||
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8896"]
|
||||||
|
|||||||
@@ -92,11 +92,19 @@ async def root():
|
|||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def health():
|
async def health():
|
||||||
"""Health check endpoint"""
|
"""Health check endpoint"""
|
||||||
|
gpu_available = False
|
||||||
|
if EASYOCR_AVAILABLE:
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
gpu_available = torch.cuda.is_available()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": "healthy" if (TESSERACT_AVAILABLE or EASYOCR_AVAILABLE) else "degraded",
|
"status": "healthy" if (TESSERACT_AVAILABLE or EASYOCR_AVAILABLE) else "degraded",
|
||||||
"tesseract": "available" if TESSERACT_AVAILABLE else "unavailable",
|
"tesseract": "available" if TESSERACT_AVAILABLE else "unavailable",
|
||||||
"easyocr": "available" if EASYOCR_AVAILABLE else "unavailable",
|
"easyocr": "available" if EASYOCR_AVAILABLE else "unavailable",
|
||||||
"gpu": torch.cuda.is_available() if EASYOCR_AVAILABLE else False
|
"gpu": gpu_available
|
||||||
}
|
}
|
||||||
|
|
||||||
def preprocess_image(img: Image.Image) -> Image.Image:
|
def preprocess_image(img: Image.Image) -> Image.Image:
|
||||||
|
|||||||
@@ -1,10 +1,14 @@
|
|||||||
|
# Core
|
||||||
fastapi==0.104.1
|
fastapi==0.104.1
|
||||||
uvicorn[standard]==0.24.0
|
uvicorn[standard]==0.24.0
|
||||||
python-multipart==0.0.6
|
python-multipart==0.0.6
|
||||||
|
|
||||||
|
# OCR - Tesseract only (EasyOCR optional)
|
||||||
pytesseract==0.3.10
|
pytesseract==0.3.10
|
||||||
easyocr==1.7.1
|
|
||||||
Pillow==10.1.0
|
Pillow==10.1.0
|
||||||
numpy==1.24.3
|
numpy==1.24.3
|
||||||
torch==2.1.0
|
|
||||||
torchvision==0.16.0
|
|
||||||
|
|
||||||
|
# Optional: EasyOCR (uncomment for full support, requires GPU)
|
||||||
|
# easyocr==1.7.1
|
||||||
|
# torch==2.1.0
|
||||||
|
# torchvision==0.16.0
|
||||||
|
|||||||
@@ -1,15 +1,27 @@
|
|||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing import Literal, Optional, Dict, Any
|
from typing import Literal, Optional, Dict, Any, List
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import yaml
|
import yaml
|
||||||
|
import httpx
|
||||||
|
import logging
|
||||||
|
|
||||||
app = FastAPI(title="DAARION Router", version="1.0.0")
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
app = FastAPI(title="DAARION Router", version="2.0.0")
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
|
NATS_URL = os.getenv("NATS_URL", "nats://nats:4222")
|
||||||
|
SWAPPER_URL = os.getenv("SWAPPER_URL", "http://192.168.1.33:8890")
|
||||||
|
STT_URL = os.getenv("STT_URL", "http://192.168.1.33:8895")
|
||||||
|
VISION_URL = os.getenv("VISION_URL", "http://192.168.1.33:11434")
|
||||||
|
OCR_URL = os.getenv("OCR_URL", "http://192.168.1.33:8896")
|
||||||
|
|
||||||
|
# HTTP client for backend services
|
||||||
|
http_client: Optional[httpx.AsyncClient] = None
|
||||||
|
|
||||||
# NATS client
|
# NATS client
|
||||||
nc = None
|
nc = None
|
||||||
@@ -49,25 +61,35 @@ config = load_config()
|
|||||||
@app.on_event("startup")
|
@app.on_event("startup")
|
||||||
async def startup_event():
|
async def startup_event():
|
||||||
"""Initialize NATS connection and subscriptions"""
|
"""Initialize NATS connection and subscriptions"""
|
||||||
global nc, nats_available
|
global nc, nats_available, http_client
|
||||||
print("🚀 DAGI Router starting up...")
|
logger.info("🚀 DAGI Router v2.0.0 starting up...")
|
||||||
|
|
||||||
|
# Initialize HTTP client
|
||||||
|
http_client = httpx.AsyncClient(timeout=60.0)
|
||||||
|
logger.info("✅ HTTP client initialized")
|
||||||
|
|
||||||
# Try to connect to NATS
|
# Try to connect to NATS
|
||||||
try:
|
try:
|
||||||
import nats
|
import nats
|
||||||
nc = await nats.connect(NATS_URL)
|
nc = await nats.connect(NATS_URL)
|
||||||
nats_available = True
|
nats_available = True
|
||||||
print(f"✅ Connected to NATS at {NATS_URL}")
|
logger.info(f"✅ Connected to NATS at {NATS_URL}")
|
||||||
|
|
||||||
# Subscribe to filter decisions if enabled
|
# Subscribe to filter decisions if enabled
|
||||||
if config.get("messaging_inbound", {}).get("enabled", True):
|
if config.get("messaging_inbound", {}).get("enabled", True):
|
||||||
asyncio.create_task(subscribe_to_filter_decisions())
|
asyncio.create_task(subscribe_to_filter_decisions())
|
||||||
else:
|
else:
|
||||||
print("⚠️ Messaging inbound routing disabled in config")
|
logger.warning("⚠️ Messaging inbound routing disabled in config")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ NATS not available: {e}")
|
logger.warning(f"⚠️ NATS not available: {e}")
|
||||||
print("⚠️ Running in test mode (HTTP only)")
|
logger.warning("⚠️ Running in test mode (HTTP only)")
|
||||||
nats_available = False
|
nats_available = False
|
||||||
|
|
||||||
|
# Log backend URLs
|
||||||
|
logger.info(f"📡 Swapper URL: {SWAPPER_URL}")
|
||||||
|
logger.info(f"📡 STT URL: {STT_URL}")
|
||||||
|
logger.info(f"📡 Vision URL: {VISION_URL}")
|
||||||
|
logger.info(f"📡 OCR URL: {OCR_URL}")
|
||||||
|
|
||||||
async def subscribe_to_filter_decisions():
|
async def subscribe_to_filter_decisions():
|
||||||
"""Subscribe to agent.filter.decision events"""
|
"""Subscribe to agent.filter.decision events"""
|
||||||
@@ -201,10 +223,239 @@ async def test_messaging_route(decision: FilterDecision):
|
|||||||
@app.on_event("shutdown")
|
@app.on_event("shutdown")
|
||||||
async def shutdown_event():
|
async def shutdown_event():
|
||||||
"""Clean shutdown"""
|
"""Clean shutdown"""
|
||||||
global nc
|
global nc, http_client
|
||||||
if nc:
|
if nc:
|
||||||
await nc.close()
|
await nc.close()
|
||||||
print("✅ NATS connection closed")
|
logger.info("✅ NATS connection closed")
|
||||||
|
if http_client:
|
||||||
|
await http_client.aclose()
|
||||||
|
logger.info("✅ HTTP client closed")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Backend Integration Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class InferRequest(BaseModel):
|
||||||
|
"""Request for agent inference"""
|
||||||
|
prompt: str
|
||||||
|
model: Optional[str] = None
|
||||||
|
max_tokens: Optional[int] = 2048
|
||||||
|
temperature: Optional[float] = 0.7
|
||||||
|
system_prompt: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class InferResponse(BaseModel):
|
||||||
|
"""Response from agent inference"""
|
||||||
|
response: str
|
||||||
|
model: str
|
||||||
|
tokens_used: Optional[int] = None
|
||||||
|
backend: str
|
||||||
|
|
||||||
|
|
||||||
|
class BackendStatus(BaseModel):
|
||||||
|
"""Status of a backend service"""
|
||||||
|
name: str
|
||||||
|
url: str
|
||||||
|
status: str # online, offline, error
|
||||||
|
active_model: Optional[str] = None
|
||||||
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/backends/status", response_model=List[BackendStatus])
|
||||||
|
async def get_backends_status():
|
||||||
|
"""Get status of all backend services"""
|
||||||
|
backends = []
|
||||||
|
|
||||||
|
# Check Swapper
|
||||||
|
try:
|
||||||
|
resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0)
|
||||||
|
if resp.status_code == 200:
|
||||||
|
data = resp.json()
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="swapper",
|
||||||
|
url=SWAPPER_URL,
|
||||||
|
status="online",
|
||||||
|
active_model=data.get("active_model")
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="swapper",
|
||||||
|
url=SWAPPER_URL,
|
||||||
|
status="error",
|
||||||
|
error=f"HTTP {resp.status_code}"
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="swapper",
|
||||||
|
url=SWAPPER_URL,
|
||||||
|
status="offline",
|
||||||
|
error=str(e)
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check STT
|
||||||
|
try:
|
||||||
|
resp = await http_client.get(f"{STT_URL}/health", timeout=5.0)
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="stt",
|
||||||
|
url=STT_URL,
|
||||||
|
status="online" if resp.status_code == 200 else "error"
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="stt",
|
||||||
|
url=STT_URL,
|
||||||
|
status="offline",
|
||||||
|
error=str(e)
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check Vision (Ollama)
|
||||||
|
try:
|
||||||
|
resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0)
|
||||||
|
if resp.status_code == 200:
|
||||||
|
data = resp.json()
|
||||||
|
models = [m.get("name") for m in data.get("models", [])]
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="vision",
|
||||||
|
url=VISION_URL,
|
||||||
|
status="online",
|
||||||
|
active_model=", ".join(models[:3]) if models else None
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="vision",
|
||||||
|
url=VISION_URL,
|
||||||
|
status="error"
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="vision",
|
||||||
|
url=VISION_URL,
|
||||||
|
status="offline",
|
||||||
|
error=str(e)
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check OCR
|
||||||
|
try:
|
||||||
|
resp = await http_client.get(f"{OCR_URL}/health", timeout=5.0)
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="ocr",
|
||||||
|
url=OCR_URL,
|
||||||
|
status="online" if resp.status_code == 200 else "error"
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
backends.append(BackendStatus(
|
||||||
|
name="ocr",
|
||||||
|
url=OCR_URL,
|
||||||
|
status="offline",
|
||||||
|
error=str(e)
|
||||||
|
))
|
||||||
|
|
||||||
|
return backends
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/agents/{agent_id}/infer", response_model=InferResponse)
|
||||||
|
async def agent_infer(agent_id: str, request: InferRequest):
|
||||||
|
"""
|
||||||
|
Route inference request to appropriate backend.
|
||||||
|
|
||||||
|
Router decides which backend to use based on:
|
||||||
|
- Agent configuration (model, capabilities)
|
||||||
|
- Request type (text, vision, audio)
|
||||||
|
- Backend availability
|
||||||
|
"""
|
||||||
|
logger.info(f"🔀 Inference request for agent: {agent_id}")
|
||||||
|
logger.info(f"📝 Prompt: {request.prompt[:100]}...")
|
||||||
|
|
||||||
|
# Determine which backend to use
|
||||||
|
model = request.model or "gpt-oss:latest"
|
||||||
|
|
||||||
|
# Try Swapper first (for LLM models)
|
||||||
|
try:
|
||||||
|
# Check if Swapper is available
|
||||||
|
health_resp = await http_client.get(f"{SWAPPER_URL}/health", timeout=5.0)
|
||||||
|
if health_resp.status_code == 200:
|
||||||
|
# Load model if needed
|
||||||
|
load_resp = await http_client.post(
|
||||||
|
f"{SWAPPER_URL}/load",
|
||||||
|
json={"model": model},
|
||||||
|
timeout=30.0
|
||||||
|
)
|
||||||
|
|
||||||
|
if load_resp.status_code == 200:
|
||||||
|
# Generate response via Ollama
|
||||||
|
generate_resp = await http_client.post(
|
||||||
|
f"{VISION_URL}/api/generate",
|
||||||
|
json={
|
||||||
|
"model": model,
|
||||||
|
"prompt": request.prompt,
|
||||||
|
"system": request.system_prompt,
|
||||||
|
"stream": False,
|
||||||
|
"options": {
|
||||||
|
"num_predict": request.max_tokens,
|
||||||
|
"temperature": request.temperature
|
||||||
|
}
|
||||||
|
},
|
||||||
|
timeout=120.0
|
||||||
|
)
|
||||||
|
|
||||||
|
if generate_resp.status_code == 200:
|
||||||
|
data = generate_resp.json()
|
||||||
|
return InferResponse(
|
||||||
|
response=data.get("response", ""),
|
||||||
|
model=model,
|
||||||
|
tokens_used=data.get("eval_count"),
|
||||||
|
backend="swapper+ollama"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Swapper/Ollama error: {e}")
|
||||||
|
|
||||||
|
# Fallback: return error
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=503,
|
||||||
|
detail=f"No backend available for model: {model}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/v1/models")
|
||||||
|
async def list_available_models():
|
||||||
|
"""List all available models across backends"""
|
||||||
|
models = []
|
||||||
|
|
||||||
|
# Get Swapper models
|
||||||
|
try:
|
||||||
|
resp = await http_client.get(f"{SWAPPER_URL}/models", timeout=5.0)
|
||||||
|
if resp.status_code == 200:
|
||||||
|
data = resp.json()
|
||||||
|
for m in data.get("models", []):
|
||||||
|
models.append({
|
||||||
|
"id": m.get("name"),
|
||||||
|
"backend": "swapper",
|
||||||
|
"size_gb": m.get("size_gb"),
|
||||||
|
"status": m.get("status", "available")
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Cannot get Swapper models: {e}")
|
||||||
|
|
||||||
|
# Get Ollama models
|
||||||
|
try:
|
||||||
|
resp = await http_client.get(f"{VISION_URL}/api/tags", timeout=5.0)
|
||||||
|
if resp.status_code == 200:
|
||||||
|
data = resp.json()
|
||||||
|
for m in data.get("models", []):
|
||||||
|
# Avoid duplicates
|
||||||
|
model_name = m.get("name")
|
||||||
|
if not any(x.get("id") == model_name for x in models):
|
||||||
|
models.append({
|
||||||
|
"id": model_name,
|
||||||
|
"backend": "ollama",
|
||||||
|
"size_gb": round(m.get("size", 0) / 1e9, 1),
|
||||||
|
"status": "loaded"
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Cannot get Ollama models: {e}")
|
||||||
|
|
||||||
|
return {"models": models, "total": len(models)}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ uvicorn[standard]==0.24.0
|
|||||||
pydantic==2.5.0
|
pydantic==2.5.0
|
||||||
nats-py==2.6.0
|
nats-py==2.6.0
|
||||||
PyYAML==6.0.1
|
PyYAML==6.0.1
|
||||||
|
httpx>=0.25.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user