feat: додано Node Registry, GreenFood, Monitoring та Utils

This commit is contained in:
Apple
2025-11-21 00:35:41 -08:00
parent 31f3602047
commit e018b9ab68
74 changed files with 13948 additions and 0 deletions

View File

@@ -0,0 +1,36 @@
FROM python:3.11-slim
# Metadata
LABEL maintainer="DAARION Team <admin@daarion.city>"
LABEL service="node-registry"
LABEL version="0.1.0-stub"
# Flush stdout/stderr immediately so the startup banner printed by app.main
# appears in `docker logs` right away instead of sitting in a buffer.
ENV PYTHONUNBUFFERED=1
# Set working directory
WORKDIR /app
# Install system dependencies.
# curl is only needed by the HEALTHCHECK below; --no-install-recommends keeps
# optional packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is cached independently of
# application code changes, then install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY app/ ./app/
# Create non-root user and hand /app over to it
RUN useradd -m -u 1000 noderegistry && \
    chown -R noderegistry:noderegistry /app
USER noderegistry
# Expose port (documentation only; publishing is decided at run time)
EXPOSE 9205
# Health check: the container is marked unhealthy after 3 consecutive failures
# of GET /health
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:9205/health || exit 1
# Run application
CMD ["python", "-m", "app.main"]

View File

@@ -0,0 +1,404 @@
# Node Registry Service
**Version:** 0.1.0-stub
**Status:** 🟡 Stub Implementation (Infrastructure Ready)
**Port:** 9205 (Internal only)
Central registry for DAGI network nodes (Node #1, Node #2, Node #N).
---
## Overview
Node Registry Service provides:
- **Node Registration** — Register new nodes in DAGI network
- **Heartbeat Tracking** — Monitor node health and availability
- **Node Discovery** — Query available nodes and their capabilities
- **Profile Management** — Store node profiles (LLM configs, services, capabilities)
---
## Current Implementation
### ✅ Completed (Infrastructure)
- FastAPI application with /health and /metrics endpoints
- Docker container configuration
- PostgreSQL database schema
- docker-compose integration
- Deployment script for Node #1
### 🚧 To Be Implemented (by Cursor)
- Full REST API endpoints
- Node registration logic
- Heartbeat mechanism
- Database integration (SQLAlchemy models)
- Prometheus metrics export
- Node discovery algorithms
---
## Quick Start
### Local Development
```bash
# Install dependencies
cd services/node-registry
pip install -r requirements.txt
# Set environment variables
export NODE_REGISTRY_DB_HOST=localhost
export NODE_REGISTRY_DB_PORT=5432
export NODE_REGISTRY_DB_NAME=node_registry
export NODE_REGISTRY_DB_USER=node_registry_user
export NODE_REGISTRY_DB_PASSWORD=your_password
export NODE_REGISTRY_HTTP_PORT=9205
export NODE_REGISTRY_ENV=development
export NODE_REGISTRY_LOG_LEVEL=debug
# Run service
python -m app.main
```
Service will start on http://localhost:9205
### Docker (Recommended)
```bash
# Build image
docker-compose build node-registry
# Start service
docker-compose up -d node-registry
# Check logs
docker-compose logs -f node-registry
# Check health
curl http://localhost:9205/health
```
### Deploy to Node #1 (Production)
```bash
# From Node #2 (MacBook)
./scripts/deploy-node-registry.sh
```
This will:
1. Initialize PostgreSQL database
2. Configure environment variables
3. Build Docker image
4. Start service
5. Configure firewall rules (internal access only)
6. Verify deployment
---
## API Endpoints
### Health & Monitoring
#### GET /health
Health check endpoint (used by Docker, Prometheus, etc.)
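**Response** (illustrative — the current stub always reports `"connected": false` together with `"message": "Not implemented (stub)"` inside `database`, because the real connectivity check is not implemented yet):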
```json
{
"status": "healthy",
"service": "node-registry",
"version": "0.1.0-stub",
"environment": "production",
"uptime_seconds": 3600.5,
"timestamp": "2025-01-17T14:30:00Z",
"database": {
"connected": true,
"host": "postgres",
"port": 5432,
"database": "node_registry"
}
}
```
#### GET /metrics
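Metrics endpoint intended for Prometheus. The stub currently returns the JSON document shown below; the Prometheus text exposition format is not implemented yet.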
**Response:**
```json
{
"service": "node-registry",
"uptime_seconds": 3600.5,
"total_nodes": 2,
"active_nodes": 1,
"timestamp": "2025-01-17T14:30:00Z"
}
```
### Node Management (Stub - To Be Implemented)
#### POST /api/v1/nodes/register
Register a new node
**Status:** 501 Not Implemented (stub)
#### POST /api/v1/nodes/{node_id}/heartbeat
Update node heartbeat
**Status:** 501 Not Implemented (stub)
#### GET /api/v1/nodes
List all registered nodes
**Status:** 501 Not Implemented (stub)
#### GET /api/v1/nodes/{node_id}
Get specific node information
**Status:** 501 Not Implemented (stub)
---
## Database Schema
### Tables
#### `nodes`
Core node registry
| Column | Type | Description |
|--------|------|-------------|
| id | UUID | Primary key |
| node_id | VARCHAR(255) | Unique node identifier (e.g. node-1-hetzner-gex44) |
| node_name | VARCHAR(255) | Human-readable name |
| node_role | VARCHAR(50) | production, development, backup |
| node_type | VARCHAR(50) | router, gateway, worker, etc. |
| ip_address | INET | Public IP |
| local_ip | INET | Local network IP |
| hostname | VARCHAR(255) | DNS hostname |
| status | VARCHAR(50) | online, offline, maintenance, degraded |
| last_heartbeat | TIMESTAMP | Last heartbeat time |
| registered_at | TIMESTAMP | Registration timestamp |
| updated_at | TIMESTAMP | Last update timestamp |
| metadata | JSONB | Additional node metadata |
#### `node_profiles`
Node capabilities and configurations
| Column | Type | Description |
|--------|------|-------------|
| id | UUID | Primary key |
| node_id | UUID | Foreign key to nodes.id |
| profile_name | VARCHAR(255) | Profile identifier |
| profile_type | VARCHAR(50) | llm, service, capability |
| config | JSONB | Profile configuration |
| enabled | BOOLEAN | Profile active status |
| created_at | TIMESTAMP | Creation timestamp |
| updated_at | TIMESTAMP | Last update timestamp |
#### `heartbeat_log`
Historical heartbeat data
| Column | Type | Description |
|--------|------|-------------|
| id | UUID | Primary key |
| node_id | UUID | Foreign key to nodes.id |
| timestamp | TIMESTAMP | Heartbeat timestamp |
| status | VARCHAR(50) | Node status at heartbeat |
| metrics | JSONB | System metrics (CPU, RAM, etc.) |
---
## Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| NODE_REGISTRY_DB_HOST | postgres | PostgreSQL host |
| NODE_REGISTRY_DB_PORT | 5432 | PostgreSQL port |
| NODE_REGISTRY_DB_NAME | node_registry | Database name |
| NODE_REGISTRY_DB_USER | node_registry_user | Database user |
| NODE_REGISTRY_DB_PASSWORD | - | Database password (required) |
| NODE_REGISTRY_HTTP_PORT | 9205 | HTTP server port |
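| NODE_REGISTRY_ENV | development | Environment (development/production); set to `production` in deployments — interactive API docs (/docs, /redoc) are served only in development |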
| NODE_REGISTRY_LOG_LEVEL | info | Log level (debug/info/warning/error) |
---
## Security
### Network Access
- **Port 9205:** Internal network only (Node #1, Node #2, DAGI nodes)
- **Public Access:** Blocked by firewall (UFW rules)
- **Authentication:** To be implemented (API keys, JWT)
### Firewall Rules (Node #1)
```bash
# Allow from local network
ufw allow from 192.168.1.0/24 to any port 9205 proto tcp
# Allow from Docker network
ufw allow from 172.16.0.0/12 to any port 9205 proto tcp
# Deny from external
ufw deny 9205/tcp
```
---
## Database Initialization
### Manual Setup
```bash
# On Node #1
ssh root@144.76.224.179
# Copy SQL script to container
docker cp services/node-registry/migrations/init_node_registry.sql dagi-postgres:/tmp/
# Run initialization
docker exec -i dagi-postgres psql -U postgres -f /tmp/init_node_registry.sql
# Verify
docker exec dagi-postgres psql -U postgres -d node_registry -c "\dt"
```
### Via Deployment Script
The `deploy-node-registry.sh` script automatically:
1. Checks if database exists
2. Creates database and user if needed
3. Generates secure password
4. Saves password to .env
---
## Monitoring & Health
### Docker Health Check
```bash
docker inspect dagi-node-registry | grep -A 5 Health
```
### Prometheus Scraping
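Add to prometheus.yml (scraping will only succeed once `/metrics` emits the Prometheus text format; the current stub returns JSON):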
```yaml
scrape_configs:
- job_name: 'node-registry'
static_configs:
- targets: ['node-registry:9205']
scrape_interval: 30s
```
### Grafana Dashboard
Add panel with query:
```promql
up{job="node-registry"}
```
---
## Development
### Testing Locally
```bash
# Run with development settings
export NODE_REGISTRY_ENV=development
python -m app.main
# Access interactive API docs
open http://localhost:9205/docs
```
### Adding New Endpoints
1. Edit `app/main.py`
2. Add route with `@app.get()` or `@app.post()`
3. Add Pydantic models for request/response
4. Implement database logic (when ready)
5. Test via /docs or curl
6. Update this README
---
## Troubleshooting
### Service won't start
```bash
# Check logs
docker logs dagi-node-registry
# Check database connection
docker exec dagi-postgres pg_isready
# Check environment variables
docker exec dagi-node-registry env | grep NODE_REGISTRY
```
### Database connection error
```bash
# Verify database exists
docker exec dagi-postgres psql -U postgres -l | grep node_registry
# Verify user exists
docker exec dagi-postgres psql -U postgres -c "\du" | grep node_registry_user
# Test connection
docker exec dagi-postgres psql -U node_registry_user -d node_registry -c "SELECT 1"
```
### Port not accessible
```bash
# Check firewall rules
sudo ufw status | grep 9205
# Check if service is listening
netstat -tlnp | grep 9205
# Test from Node #2
curl http://144.76.224.179:9205/health
```
---
## Next Steps (for Cursor)
1. **Implement Database Layer**
- SQLAlchemy models for nodes, profiles, heartbeat
- Database connection pool
- Migration system (Alembic)
2. **Implement API Endpoints**
- Node registration with validation
- Heartbeat updates with metrics
- Node listing with filters
- Profile CRUD operations
3. **Add Authentication**
- API key-based auth
- JWT tokens for inter-node communication
- Rate limiting
4. **Add Monitoring**
- Prometheus metrics export
- Health check improvements
- Performance metrics
5. **Add Tests**
- Unit tests (pytest)
- Integration tests
- API endpoint tests
---
## Links
- [INFRASTRUCTURE.md](../../INFRASTRUCTURE.md) — Infrastructure overview
- [WARP.md](../../WARP.md) — Main developer guide
- [docker-compose.yml](../../docker-compose.yml) — Service configuration
---
**Last Updated:** 2025-01-17
**Maintained by:** Ivan Tytar & DAARION Team
**Status:** 🟡 Infrastructure Ready — Awaiting Cursor implementation

View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Node Registry Service
Central registry for DAGI network nodes (Node #1, Node #2, Node #N)
This is a stub implementation - full API will be implemented by Cursor.
"""
import os
import time
from datetime import datetime, timezone
from typing import Any, Dict

import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
# Runtime settings, read once from the environment when the module is imported.
# HTTP listener and general behaviour
HTTP_PORT = int(os.environ.get("NODE_REGISTRY_HTTP_PORT", "9205"))
ENV = os.environ.get("NODE_REGISTRY_ENV", "development")
LOG_LEVEL = os.environ.get("NODE_REGISTRY_LOG_LEVEL", "info")

# PostgreSQL connection parameters (not used for a real connection yet - stub)
DB_HOST = os.environ.get("NODE_REGISTRY_DB_HOST", "postgres")
DB_PORT = int(os.environ.get("NODE_REGISTRY_DB_PORT", "5432"))
DB_NAME = os.environ.get("NODE_REGISTRY_DB_NAME", "node_registry")
DB_USER = os.environ.get("NODE_REGISTRY_DB_USER", "node_registry_user")
DB_PASSWORD = os.environ.get("NODE_REGISTRY_DB_PASSWORD", "")

# Identity of this service and the moment the process started (epoch seconds);
# START_TIME is the reference point for the uptime reported by the endpoints.
SERVICE_NAME = "node-registry"
VERSION = "0.1.0-stub"
START_TIME = time.time()

# Interactive API documentation is only exposed in the development environment.
_INTERACTIVE_DOCS = ENV == "development"

app = FastAPI(
    title="Node Registry Service",
    description="Central registry for DAGI network nodes",
    version=VERSION,
    docs_url="/docs" if _INTERACTIVE_DOCS else None,
    redoc_url="/redoc" if _INTERACTIVE_DOCS else None,
)
# Models (stub - will be expanded by Cursor)
class HealthResponse(BaseModel):
    # Response body of GET /health.

    # Overall service state as reported by the endpoint
    status: str
    # Service name and version string
    service: str
    version: str
    # Value of the configured environment (NODE_REGISTRY_ENV)
    environment: str
    # Seconds elapsed since the process recorded START_TIME
    uptime_seconds: float
    # UTC time of the response, ISO 8601 with a trailing "Z"
    timestamp: str
    # Free-form database connectivity details (connected flag, host, port, ...)
    database: Dict[str, Any]
class MetricsResponse(BaseModel):
    # Response body of GET /metrics (JSON for now, not Prometheus text format).

    # Service name
    service: str
    # Seconds elapsed since the process recorded START_TIME
    uptime_seconds: float
    # Node counters; the stub endpoint fills both with 0
    total_nodes: int
    active_nodes: int
    # UTC time of the response, ISO 8601 with a trailing "Z"
    timestamp: str
# Health check endpoint
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """
    Health check endpoint for monitoring systems.

    Returns:
        HealthResponse: service name, version, environment, uptime in seconds,
        a UTC ISO 8601 timestamp ending in "Z", and a ``database`` section.

    Note:
        This is still a stub: ``status`` is always "healthy" and the database
        section always reports ``connected: False`` because no real
        connectivity check is performed yet.
    """
    uptime = time.time() - START_TIME

    # datetime.utcnow() is deprecated and returns a naive value; take an aware
    # UTC timestamp instead and render its "+00:00" offset as "Z" so the wire
    # format stays exactly what clients already receive.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # TODO: Implement actual DB health check
    db_status = {
        "connected": False,
        "host": DB_HOST,
        "port": DB_PORT,
        "database": DB_NAME,
        "message": "Not implemented (stub)"
    }

    return HealthResponse(
        status="healthy",
        service=SERVICE_NAME,
        version=VERSION,
        environment=ENV,
        uptime_seconds=uptime,
        timestamp=timestamp,
        database=db_status
    )
# Metrics endpoint (Prometheus-compatible format will be added by Cursor)
@app.get("/metrics", response_model=MetricsResponse)
async def metrics():
    """
    Metrics endpoint intended for Prometheus scraping.

    Returns:
        MetricsResponse: service name, uptime in seconds, node counters and a
        UTC ISO 8601 timestamp ending in "Z".

    Note:
        Stub behaviour: the payload is JSON and both node counters are
        hard-coded to 0.

    TODO: Add proper Prometheus format (prometheus_client library)
    """
    uptime = time.time() - START_TIME

    # datetime.utcnow() is deprecated and returns a naive value; take an aware
    # UTC timestamp instead and render its "+00:00" offset as "Z" so the wire
    # format stays exactly what clients already receive.
    timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # TODO: Implement actual metrics from database
    return MetricsResponse(
        service=SERVICE_NAME,
        uptime_seconds=uptime,
        total_nodes=0,
        active_nodes=0,
        timestamp=timestamp
    )
# Root endpoint
@app.get("/")
async def root():
    """Describe the running service and list its main endpoints."""
    # The docs entry mirrors the FastAPI setup: only advertised in development.
    docs_location = "/docs" if ENV == "development" else "disabled"
    endpoint_map = {
        "health": "/health",
        "metrics": "/metrics",
        "docs": docs_location,
    }
    service_info = {
        "service": SERVICE_NAME,
        "version": VERSION,
        "status": "running",
        "environment": ENV,
        "message": "Node Registry Service (stub implementation)",
        "endpoints": endpoint_map,
    }
    return service_info
# Stub API endpoints (to be implemented by Cursor)
@app.post("/api/v1/nodes/register")
async def register_node():
    """
    Placeholder for registering a new node in the registry.

    Always responds with HTTP 501 until the real logic exists.
    TODO: Implement by Cursor
    """
    not_implemented = HTTPException(
        status_code=501,
        detail="Not implemented - stub endpoint",
    )
    raise not_implemented
@app.post("/api/v1/nodes/{node_id}/heartbeat")
async def update_heartbeat(node_id: str):
    """
    Placeholder for recording a node's heartbeat (keep-alive).

    ``node_id`` is taken from the URL path but not used yet; the endpoint
    always responds with HTTP 501.
    TODO: Implement by Cursor
    """
    not_implemented = HTTPException(
        status_code=501,
        detail="Not implemented - stub endpoint",
    )
    raise not_implemented
@app.get("/api/v1/nodes")
async def list_nodes():
    """
    Placeholder for listing every registered node.

    Always responds with HTTP 501 until the real logic exists.
    TODO: Implement by Cursor
    """
    not_implemented = HTTPException(
        status_code=501,
        detail="Not implemented - stub endpoint",
    )
    raise not_implemented
@app.get("/api/v1/nodes/{node_id}")
async def get_node(node_id: str):
    """
    Placeholder for fetching a single node's information.

    ``node_id`` is taken from the URL path but not used yet; the endpoint
    always responds with HTTP 501.
    TODO: Implement by Cursor
    """
    not_implemented = HTTPException(
        status_code=501,
        detail="Not implemented - stub endpoint",
    )
    raise not_implemented
if __name__ == "__main__":
    # Startup banner: one line per setting, followed by an empty line.
    # The database password is deliberately not part of the banner.
    banner = (
        f"🚀 Starting {SERVICE_NAME} v{VERSION}",
        f"📊 Environment: {ENV}",
        f"🔌 Port: {HTTP_PORT}",
        f"🗄️ Database: {DB_USER}@{DB_HOST}:{DB_PORT}/{DB_NAME}",
        f"📝 Log level: {LOG_LEVEL}",
    )
    print(*banner, sep="\n")
    print()

    # Serve on all interfaces; uvicorn expects the log level in lower case.
    server_options = {
        "host": "0.0.0.0",
        "port": HTTP_PORT,
        "log_level": LOG_LEVEL.lower(),
        "access_log": True,
    }
    uvicorn.run(app, **server_options)

View File

@@ -0,0 +1,112 @@
-- Node Registry Database Initialization Script
-- For DAGI Stack Node Registry Service
-- Version: 0.1.0
-- Date: 2025-01-17
-- Create database (run as postgres superuser).
-- PostgreSQL has no CREATE DATABASE IF NOT EXISTS, so the statement is only
-- generated when the database is missing and then executed via psql's \gexec.
-- This keeps the script re-runnable, like the IF NOT EXISTS / ON CONFLICT
-- statements further down.
SELECT 'CREATE DATABASE node_registry'
WHERE NOT EXISTS (SELECT 1 FROM pg_database WHERE datname = 'node_registry')\gexec
-- Connect to database
\c node_registry;
-- Create user with secure password (skipped when the role already exists)
-- NOTE: Replace 'CHANGE_ME_STRONG_PASSWORD' with actual strong password
DO $$
BEGIN
    IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'node_registry_user') THEN
        CREATE USER node_registry_user WITH ENCRYPTED PASSWORD 'CHANGE_ME_STRONG_PASSWORD';
    END IF;
END
$$;
-- Grant privileges (GRANT is safe to repeat)
GRANT ALL PRIVILEGES ON DATABASE node_registry TO node_registry_user;
GRANT ALL ON SCHEMA public TO node_registry_user;
GRANT CREATE ON SCHEMA public TO node_registry_user;
-- Enable necessary extensions
-- uuid-ossp provides uuid_generate_v4(), used as the default for the id
-- columns of the tables created below.
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- NOTE(review): nothing else in this script references pg_trgm; presumably
-- reserved for later text search - confirm it is still needed.
CREATE EXTENSION IF NOT EXISTS "pg_trgm";
-- Create nodes table (basic schema - will be expanded by Cursor)
-- One row per registered node. "id" is the surrogate key referenced by
-- node_profiles and heartbeat_log; "node_id" is the unique textual identifier
-- (e.g. 'node-1-hetzner-gex44').
CREATE TABLE IF NOT EXISTS nodes (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
node_id VARCHAR(255) UNIQUE NOT NULL,
node_name VARCHAR(255) NOT NULL,
node_role VARCHAR(50) NOT NULL, -- 'production', 'development', 'backup' (not enforced by a constraint)
node_type VARCHAR(50) NOT NULL, -- 'router', 'gateway', 'worker', etc. (not enforced by a constraint)
ip_address INET,
local_ip INET,
hostname VARCHAR(255),
status VARCHAR(50) DEFAULT 'offline', -- 'online', 'offline', 'maintenance', 'degraded' (enforced by valid_status)
last_heartbeat TIMESTAMP WITH TIME ZONE,
registered_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
metadata JSONB DEFAULT '{}',
CONSTRAINT valid_status CHECK (status IN ('online', 'offline', 'maintenance', 'degraded'))
);
-- Create node profiles table
-- Named configuration profiles attached to a node; rows are removed together
-- with their node (ON DELETE CASCADE) and a profile name is unique per node.
-- NOTE(review): node_id is nullable, so profiles without an owning node are
-- accepted - confirm this is intended.
CREATE TABLE IF NOT EXISTS node_profiles (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
node_id UUID REFERENCES nodes(id) ON DELETE CASCADE,
profile_name VARCHAR(255) NOT NULL,
profile_type VARCHAR(50) NOT NULL, -- 'llm', 'service', 'capability' (not enforced by a constraint)
config JSONB NOT NULL DEFAULT '{}',
enabled BOOLEAN DEFAULT true,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
UNIQUE(node_id, profile_name)
);
-- Create heartbeat log table (for monitoring)
-- Append-style history of heartbeats; rows are removed together with their
-- node (ON DELETE CASCADE).
-- NOTE(review): node_id is nullable and status has no CHECK constraint here,
-- unlike nodes.status - confirm this is intended.
CREATE TABLE IF NOT EXISTS heartbeat_log (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
node_id UUID REFERENCES nodes(id) ON DELETE CASCADE,
timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
status VARCHAR(50),
metrics JSONB DEFAULT '{}'
);
-- Create indexes
-- IF NOT EXISTS keeps the script re-runnable, matching the CREATE TABLE
-- IF NOT EXISTS statements above (a plain CREATE INDEX aborts on a second run).
CREATE INDEX IF NOT EXISTS idx_nodes_status ON nodes(status);
CREATE INDEX IF NOT EXISTS idx_nodes_last_heartbeat ON nodes(last_heartbeat DESC);
-- NOTE(review): the UNIQUE constraint on nodes.node_id already creates an
-- index on that column, so this one looks redundant - kept for compatibility.
CREATE INDEX IF NOT EXISTS idx_nodes_node_id ON nodes(node_id);
CREATE INDEX IF NOT EXISTS idx_node_profiles_node_id ON node_profiles(node_id);
CREATE INDEX IF NOT EXISTS idx_node_profiles_enabled ON node_profiles(enabled);
CREATE INDEX IF NOT EXISTS idx_heartbeat_log_node_id ON heartbeat_log(node_id);
CREATE INDEX IF NOT EXISTS idx_heartbeat_log_timestamp ON heartbeat_log(timestamp DESC);
-- Trigger function: stamps the row being written with the current timestamp
-- in its updated_at column and hands the modified row back to the executor.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $func$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$func$;
-- Create triggers
-- Each trigger refreshes updated_at before a row update is stored.
-- CREATE TRIGGER has no IF NOT EXISTS form, so any earlier definition is
-- dropped first; this keeps the script re-runnable like the statements above.
DROP TRIGGER IF EXISTS update_nodes_updated_at ON nodes;
CREATE TRIGGER update_nodes_updated_at
    BEFORE UPDATE ON nodes
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();
DROP TRIGGER IF EXISTS update_node_profiles_updated_at ON node_profiles;
CREATE TRIGGER update_node_profiles_updated_at
    BEFORE UPDATE ON node_profiles
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();
-- Grant table permissions to user
-- Issued after the CREATE TABLE statements on purpose: GRANT ... ON ALL TABLES
-- only covers objects that exist when it runs, so tables added later need
-- their own grant.
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO node_registry_user;
-- NOTE(review): this script defines no sequence-backed columns (ids are UUIDs),
-- so this grant appears to be a no-op for now.
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO node_registry_user;
-- Insert initial nodes (Node #1 and Node #2)
-- Both rows start as 'offline'. ON CONFLICT (node_id) DO NOTHING leaves an
-- already-registered node untouched when the script is run again.
INSERT INTO nodes (node_id, node_name, node_role, node_type, ip_address, local_ip, hostname, status)
VALUES
('node-1-hetzner-gex44', 'Hetzner GEX44 Production', 'production', 'router', '144.76.224.179', NULL, 'gateway.daarion.city', 'offline'),
('node-2-macbook-m4max', 'MacBook Pro M4 Max', 'development', 'router', NULL, '192.168.1.244', 'MacBook-Pro.local', 'offline')
ON CONFLICT (node_id) DO NOTHING;
-- Success message
-- Purely informational: the block only emits NOTICE messages and does not
-- verify that the objects it mentions were actually created by this run.
DO $$
BEGIN
RAISE NOTICE '✅ Node Registry database initialized successfully';
RAISE NOTICE '📊 Tables created: nodes, node_profiles, heartbeat_log';
RAISE NOTICE '👤 User created: node_registry_user';
RAISE NOTICE '⚠️ IMPORTANT: Change default password in production!';
END $$;

View File

@@ -0,0 +1,10 @@
# Web framework, ASGI server and data models (imported by the app/main.py stub)
fastapi==0.115.0
uvicorn[standard]==0.32.0
pydantic==2.12.4
# The packages below are not imported by the app/main.py stub; presumably
# they are pinned ahead of the planned implementation - confirm before pruning.
# Settings management
pydantic-settings==2.7.0
# HTTP client
httpx==0.28.1
# Database access and migrations
asyncpg==0.30.0
sqlalchemy[asyncio]==2.0.36
alembic==1.14.0
# Logging and metrics
python-json-logger==3.2.1
prometheus-client==0.21.0