snapshot: NODE1 production state 2026-02-09

Complete snapshot of /opt/microdao-daarion/ from NODE1 (144.76.224.179).
This represents the actual running production code that has diverged
significantly from the previous main branch.

Key changes from old main:
- Gateway (http_api.py): expanded from ~40KB to 164KB with full agent support
- Router: new /v1/agents/{id}/infer endpoint with vision + DeepSeek routing
- Behavior Policy: SOWA v2.2 (3-level: FULL/ACK/SILENT)
- Agent Registry: config/agent_registry.yml as single source of truth
- 13 agents configured (was 3)
- Memory service integration
- CrewAI teams and roles

Excluded from snapshot: venv/, .env, data/, backups, .tgz archives

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Apple
2026-02-09 08:46:46 -08:00
parent 134c044c21
commit ef3473db21
9473 changed files with 408933 additions and 2769877 deletions

240
shared/idempotency_redis.py Normal file
View File

@@ -0,0 +1,240 @@
"""
Idempotency Middleware for NATS Workers
========================================
Redis-based deduplication for async jobs.
Usage:
from idempotency_redis import check_idempotency, mark_completed, mark_failed
async def process_job(job_id: str, payload: dict):
# Check if already processed
status, result = await check_idempotency(job_id)
if status == "completed":
return result # Return cached result
if status == "in_progress":
raise AlreadyProcessingError("Job already in progress")
# Mark as in progress
await mark_in_progress(job_id)
try:
# Process job
result = await do_work(payload)
await mark_completed(job_id, result)
return result
except Exception as e:
await mark_failed(job_id, str(e))
raise
"""
import os
import json
import logging
from typing import Optional, Tuple, Dict, Any
from datetime import datetime, timedelta
import redis.asyncio as redis
logger = logging.getLogger(__name__)

# Redis connection (REDIS_URL may override the docker-compose default host)
REDIS_URL = os.getenv("REDIS_URL", "redis://dagi-redis:6379")
# Lazily-initialized shared client; created on first call to get_redis()
REDIS_CLIENT = None

# TTLs
# How long completed/failed records (and cached results) are retained.
IDEMPOTENCY_TTL_HOURS = int(os.getenv("IDEMPOTENCY_TTL_HOURS", "24"))
# Safety window after which a crashed worker's in_progress mark expires,
# allowing the job to be retried.
IN_PROGRESS_TTL_MINUTES = int(os.getenv("IN_PROGRESS_TTL_MINUTES", "30"))
async def get_redis() -> redis.Redis:
    """Return the shared Redis client, creating it lazily on first use."""
    global REDIS_CLIENT
    if REDIS_CLIENT is not None:
        return REDIS_CLIENT
    # redis.asyncio clients are awaitable: awaiting runs initialize() once.
    REDIS_CLIENT = await redis.from_url(REDIS_URL, decode_responses=True)
    return REDIS_CLIENT
async def check_idempotency(job_id: str) -> Tuple[str, Optional[Dict[str, Any]]]:
    """
    Check whether job_id was already processed.

    Returns:
        (status, result_ref)
        status: "new" | "in_progress" | "completed" | "failed"
        result_ref: None or a dict with result data
    """
    r = await get_redis()
    value = await r.get(f"idemp:{job_id}")
    if value is None:
        return ("new", None)
    # Narrow try: only the status-record parse falls back to legacy handling.
    # (The original wrapped the result fetch too, so a corrupt cached result
    # masked a perfectly valid status record.)
    try:
        data = json.loads(value)
    except json.JSONDecodeError:
        # Legacy format: the value is a bare status string.
        return (value, None)
    if not isinstance(data, dict):
        # Unexpected JSON scalar/array (would crash .get below) — treat as legacy.
        return (value, None)
    status = data.get("status")
    result_ref = data.get("result_ref")
    if status == "completed":
        # Completed jobs may have a separately stored full result.
        result_data = await r.get(f"idemp:result:{job_id}")
        if result_data:
            try:
                result_ref = json.loads(result_data)
            except json.JSONDecodeError:
                # Keep the reference from the status record rather than
                # discarding the whole record over an unreadable result.
                pass
    return (status, result_ref)
async def mark_in_progress(job_id: str, metadata: Dict[str, Any] = None):
    """Record job_id as in_progress with a short TTL so stalled jobs expire."""
    client = await get_redis()
    record = json.dumps({
        "status": "in_progress",
        "started_at": datetime.utcnow().isoformat(),
        "metadata": metadata or {},
    })
    # TTL bounds how long a crashed worker can block retries.
    await client.setex(
        f"idemp:{job_id}",
        timedelta(minutes=IN_PROGRESS_TTL_MINUTES),
        record,
    )
    logger.info(f"Marked job {job_id} as in_progress")
async def mark_completed(job_id: str, result: Dict[str, Any] = None, result_ref: str = None):
    """
    Record job_id as completed.

    Args:
        job_id: Job identifier
        result: Full result data; stored under a separate key for retrieval
        result_ref: Reference to the result (e.g. NATS subject, file path)
    """
    client = await get_redis()
    ttl = timedelta(hours=IDEMPOTENCY_TTL_HOURS)
    # Persist the full result first (if any) so a reader that sees the
    # "completed" status can immediately fetch it.
    if result:
        await client.setex(f"idemp:result:{job_id}", ttl, json.dumps(result))
    status_record = json.dumps({
        "status": "completed",
        "completed_at": datetime.utcnow().isoformat(),
        "result_ref": result_ref or "stored",
    })
    await client.setex(f"idemp:{job_id}", ttl, status_record)
    logger.info(f"Marked job {job_id} as completed")
async def mark_failed(job_id: str, error: str, allow_retry: bool = True):
    """
    Record a job failure.

    Args:
        job_id: Job identifier
        error: Error message (truncated to 500 chars when stored)
        allow_retry: True deletes the key so the job can run again;
                     False stores a short-lived "failed" record to damp retry spam.
    """
    client = await get_redis()
    key = f"idemp:{job_id}"
    if not allow_retry:
        record = {
            "status": "failed",
            "failed_at": datetime.utcnow().isoformat(),
            "error": error[:500],  # keep stored errors bounded
        }
        # 5-minute TTL: long enough to block an immediate retry storm.
        await client.setex(key, timedelta(minutes=5), json.dumps(record))
        logger.warning(f"Marked job {job_id} as failed (no retry): {error[:100]}")
        return
    # Deleting the key returns the job to "new" so a retry can proceed.
    await client.delete(key)
    logger.info(f"Marked job {job_id} as failed (retry allowed), deleted key")
async def get_job_status(job_id: str) -> Dict[str, Any]:
    """Return the raw idempotency record for job_id (debugging helper)."""
    client = await get_redis()
    raw = await client.get(f"idemp:{job_id}")
    if raw is None:
        return {"status": "not_found"}
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Legacy bare-string record: surface it in both fields.
        return {"status": raw, "raw": raw}
# ==================== Decorator for Workers ====================
def idempotent_job(job_id_field: str = "job_id"):
    """
    Decorator factory that makes an async worker function idempotent.

    Completed jobs return the cached result, concurrent duplicates raise
    RuntimeError, and failures are recorded with retry allowed.

    Usage:
        @idempotent_job("job_id")
        async def process_workflow(payload: dict):
            # payload must contain job_id
            ...

    Args:
        job_id_field: Key in the payload dict that holds the job identifier.
    """
    # Local import keeps this module's top-level imports untouched.
    from functools import wraps

    def decorator(func):
        # wraps preserves func.__name__/__doc__ for logging and introspection
        # (the original wrapper clobbered them).
        @wraps(func)
        async def wrapper(payload: dict, *args, **kwargs):
            job_id = payload.get(job_id_field)
            if not job_id:
                raise ValueError(f"Payload must contain '{job_id_field}'")
            # Fast path: reuse a prior result or refuse a concurrent duplicate.
            status, result = await check_idempotency(job_id)
            if status == "completed":
                logger.info(f"Job {job_id} already completed, returning cached result")
                return result or {"status": "already_completed", "job_id": job_id}
            if status == "in_progress":
                logger.warning(f"Job {job_id} already in progress, skipping")
                raise RuntimeError(f"Job {job_id} already in progress")
            # Claim the job before doing any work.
            await mark_in_progress(job_id, {"function": func.__name__})
            try:
                result = await func(payload, *args, **kwargs)
                await mark_completed(job_id, result=result)
                return result
            except Exception as e:
                # Record failure but delete the key so the job can be retried.
                await mark_failed(job_id, str(e), allow_retry=True)
                raise
        return wrapper
    return decorator

186
shared/service_auth.py Normal file
View File

@@ -0,0 +1,186 @@
"""
Service-to-Service Authentication
=================================
JWT-based authentication between internal services.
Usage:
from service_auth import create_service_token, verify_service_token, require_service_auth
# Create token for service
token = create_service_token("router", "router")
# Verify in endpoint
@app.get("/protected")
@require_service_auth(allowed_roles=["router", "gateway"])
async def protected_endpoint():
return {"status": "ok"}
"""
import os
import jwt
import time
from typing import List, Optional, Dict, Any
from functools import wraps
from fastapi import HTTPException, Header, Request
# Configuration
# NOTE(review): the fallback secret must never reach production — consider
# failing fast when JWT_SECRET is unset in deployed environments.
JWT_SECRET = os.getenv("JWT_SECRET", "change-me-in-production")
JWT_ALGORITHM = "HS256"
# Audience/issuer claims validated on every token (see verify_service_token).
JWT_AUDIENCE = os.getenv("SERVICE_AUD", "microdao-internal")
JWT_ISSUER = os.getenv("SERVICE_ISS", "microdao")

# Service roles and permissions
# Maps each service to the set of roles it may assume.
SERVICE_ROLES = {
    "gateway": ["gateway", "router", "worker", "parser"],
    "router": ["router", "worker"],
    "worker": ["worker"],
    "memory": ["memory"],
    "control-plane": ["control-plane"],
    "parser": ["parser"],
    "ingest": ["ingest"]
}

# Service-to-service access matrix
# Maps a caller service to the services it is allowed to reach.
# NOTE(review): neither table is referenced elsewhere in this module —
# presumably consumed by callers; confirm before removing.
SERVICE_ACCESS = {
    "gateway": ["memory", "control-plane", "router"],
    "router": ["memory", "control-plane", "swapper"],
    "worker": ["memory", "router"],
    "parser": ["memory"],
    "ingest": ["memory"]
}
def create_service_token(service_id: str, service_role: str, expires_in: int = 900) -> str:
    """
    Create a JWT token for service-to-service authentication.

    Args:
        service_id: Unique service identifier (e.g., "router", "gateway")
        service_role: Service role claim checked by access controls
        expires_in: Token expiration in seconds (default: 900 = 15 minutes)

    Returns:
        Encoded JWT token string
    """
    issued_at = int(time.time())
    claims = {
        "sub": service_id,
        "role": service_role,
        "aud": JWT_AUDIENCE,
        "iss": JWT_ISSUER,
        "iat": issued_at,
        "exp": issued_at + expires_in,
        # Explicit duplicates of sub/role kept for consumers reading these keys.
        "service_id": service_id,
        "service_role": service_role
    }
    return jwt.encode(claims, JWT_SECRET, algorithm=JWT_ALGORITHM)
def verify_service_token(token: str) -> Dict[str, Any]:
    """
    Decode and validate a service JWT (signature, audience, issuer, expiry).

    Returns:
        Decoded token payload

    Raises:
        HTTPException: 401 when the token is expired or otherwise invalid
    """
    try:
        return jwt.decode(
            token,
            JWT_SECRET,
            algorithms=[JWT_ALGORITHM],
            audience=JWT_AUDIENCE,
            issuer=JWT_ISSUER,
        )
    except jwt.ExpiredSignatureError:
        raise HTTPException(status_code=401, detail="Token expired")
    except jwt.InvalidTokenError as e:
        raise HTTPException(status_code=401, detail=f"Invalid token: {e}")
def require_service_auth(allowed_roles: List[str] = None, allowed_services: List[str] = None):
    """
    Decorator to require service authentication on a FastAPI endpoint.

    The wrapped endpoint must accept `request: Request` as its first argument;
    on success request.state.service_id / service_role are populated.

    Args:
        allowed_roles: List of allowed service roles (None = any role)
        allowed_services: List of allowed service IDs (None = any service)
    """
    def decorator(func):
        @wraps(func)
        async def wrapper(request: Request, *args, **kwargs):
            # Get Authorization header
            auth_header = request.headers.get("Authorization", "")
            if not auth_header.startswith("Bearer "):
                raise HTTPException(
                    status_code=401,
                    detail="Missing or invalid Authorization header"
                )
            # Slice off the scheme prefix; str.replace("Bearer ", "") would
            # also strip the substring anywhere inside the token itself.
            token = auth_header[len("Bearer "):]
            try:
                payload = verify_service_token(token)
                service_id = payload.get("service_id")
                service_role = payload.get("role")
                # Enforce role/service allow-lists when configured.
                if allowed_roles and service_role not in allowed_roles:
                    raise HTTPException(
                        status_code=403,
                        detail=f"Service role '{service_role}' not allowed"
                    )
                if allowed_services and service_id not in allowed_services:
                    raise HTTPException(
                        status_code=403,
                        detail=f"Service '{service_id}' not allowed"
                    )
                # Expose caller identity to the endpoint.
                request.state.service_id = service_id
                request.state.service_role = service_role
                return await func(request, *args, **kwargs)
            except HTTPException:
                raise
            except Exception as e:
                # Normalize unexpected errors to 401 rather than leaking a 500.
                raise HTTPException(status_code=401, detail=f"Authentication failed: {e}")
        return wrapper
    return decorator
def get_service_token() -> str:
    """Mint a token for the current service using SERVICE_ID / SERVICE_ROLE env vars."""
    service_id = os.getenv("SERVICE_ID")
    if not service_id:
        raise ValueError("SERVICE_ID environment variable not set")
    # Role defaults to the service's own ID when SERVICE_ROLE is unset.
    role = os.getenv("SERVICE_ROLE", service_id)
    return create_service_token(service_id, role)
# FastAPI dependency for service auth
async def verify_service(request: Request, authorization: str = Header(None)):
"""FastAPI dependency for service authentication"""
if not authorization or not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Missing Authorization header")
token = authorization.replace("Bearer ", "")
payload = verify_service_token(token)
request.state.service_id = payload.get("service_id")
request.state.service_role = payload.get("role")
return payload

282
shared/trace_middleware.py Normal file
View File

@@ -0,0 +1,282 @@
"""
Trace Middleware
================
Стандартизована кореляція запитів через всі сервіси.
Headers:
- X-Trace-ID: uuid (весь шлях)
- X-Request-ID: uuid (HTTP request)
- X-Job-ID: uuid (async NATS job)
- X-User-ID: user identifier
- X-Agent-ID: target agent
- X-Mode: public|team|private|confidential
- X-Policy-Version: version hash
- X-Prompt-Version: version hash
Використання:
1. Gateway генерує trace_id
2. Всі сервіси передають у headers
3. NATS messages містять у metadata
4. Logs структуровані з trace_id
"""
import uuid
import logging
from typing import Optional, Dict, Any
from datetime import datetime
from contextvars import ContextVar
from functools import wraps
from fastapi import Request, Response
from starlette.middleware.base import BaseHTTPMiddleware
# Context variables for trace propagation
# NOTE(review): the shared `{}` default is replaced wholesale via .set() and
# never mutated in place in this module, so the mutable default is harmless.
trace_context: ContextVar[Dict[str, str]] = ContextVar('trace_context', default={})
logger = logging.getLogger(__name__)
class TraceContext:
    """Correlation metadata carried across services for a single request.

    Instances are effectively immutable value objects; missing trace/request
    IDs are generated on construction.
    """

    def __init__(
        self,
        trace_id: str = None,
        request_id: str = None,
        job_id: str = None,
        user_id: str = None,
        agent_id: str = None,
        mode: str = "public",
        policy_version: str = None,
        prompt_version: str = None,
        source_service: str = None
    ):
        # Generate fresh identifiers when none were propagated to us.
        self.trace_id = trace_id if trace_id else str(uuid.uuid4())
        self.request_id = request_id if request_id else str(uuid.uuid4())
        self.job_id = job_id
        self.user_id = user_id
        self.agent_id = agent_id
        self.mode = mode
        self.policy_version = policy_version
        self.prompt_version = prompt_version
        self.source_service = source_service
        self.timestamp = datetime.utcnow().isoformat()

    def to_headers(self) -> Dict[str, str]:
        """Serialize to outbound HTTP headers; unset optional fields are omitted."""
        headers = {
            "X-Trace-ID": self.trace_id,
            "X-Request-ID": self.request_id,
        }
        optional = (
            ("X-Job-ID", self.job_id),
            ("X-User-ID", self.user_id),
            ("X-Agent-ID", self.agent_id),
            ("X-Mode", self.mode),
            ("X-Policy-Version", self.policy_version),
            ("X-Prompt-Version", self.prompt_version),
        )
        headers.update({name: value for name, value in optional if value})
        return headers

    def to_nats_headers(self) -> Dict[str, str]:
        """Serialize to NATS message headers (all keys always present)."""
        return {
            "Nats-Trace-ID": self.trace_id,
            # Fall back to the request id so async consumers always get a job key.
            "Nats-Job-ID": self.job_id if self.job_id else self.request_id,
            "Nats-User-ID": self.user_id if self.user_id else "",
            "Nats-Agent-ID": self.agent_id if self.agent_id else "",
            "Nats-Mode": self.mode,
            "Nats-Timestamp": self.timestamp,
        }

    def to_log_context(self) -> Dict[str, Any]:
        """Serialize to a flat dict for structured logging."""
        return dict(
            trace_id=self.trace_id,
            request_id=self.request_id,
            job_id=self.job_id,
            user_id=self.user_id,
            agent_id=self.agent_id,
            mode=self.mode,
            policy_version=self.policy_version,
            prompt_version=self.prompt_version,
            timestamp=self.timestamp,
        )

    @classmethod
    def from_headers(cls, headers: Dict[str, str]) -> "TraceContext":
        """Build a context from inbound HTTP headers (case-tolerant lookup)."""
        def pick(name: str) -> Optional[str]:
            # Starlette lowercases header names; accept either casing.
            return headers.get(name) or headers.get(name.lower())

        return cls(
            trace_id=pick("X-Trace-ID"),
            request_id=pick("X-Request-ID"),
            job_id=pick("X-Job-ID"),
            user_id=pick("X-User-ID"),
            agent_id=pick("X-Agent-ID"),
            mode=pick("X-Mode") or "public",
            policy_version=headers.get("X-Policy-Version"),
            prompt_version=headers.get("X-Prompt-Version")
        )

    @classmethod
    def from_nats(cls, headers: Dict[str, str]) -> "TraceContext":
        """Build a context from NATS message headers."""
        return cls(
            trace_id=headers.get("Nats-Trace-ID"),
            job_id=headers.get("Nats-Job-ID"),
            user_id=headers.get("Nats-User-ID"),
            agent_id=headers.get("Nats-Agent-ID"),
            mode=headers.get("Nats-Mode", "public"),
        )
class TraceMiddleware(BaseHTTPMiddleware):
    """Starlette middleware that propagates trace headers and logs request lifecycle."""

    def __init__(self, app, service_name: str):
        super().__init__(app)
        self.service_name = service_name

    async def dispatch(self, request: Request, call_next):
        """Attach trace context, log start/end, and echo trace IDs on the response."""
        # Reuse inbound correlation IDs or mint new ones.
        ctx = TraceContext.from_headers(dict(request.headers))
        ctx.source_service = self.service_name
        # Make the context visible to loggers/handlers in this task.
        trace_context.set(ctx.to_log_context())

        common = {
            "trace_id": ctx.trace_id,
            "request_id": ctx.request_id,
            "service": self.service_name,
        }
        logger.info(
            "Request started",
            extra={**common, "method": request.method, "path": request.url.path},
        )

        response = await call_next(request)

        # Echo the correlation IDs back so clients can report them.
        response.headers["X-Trace-ID"] = ctx.trace_id
        response.headers["X-Request-ID"] = ctx.request_id
        logger.info(
            "Request completed",
            extra={**common, "status_code": response.status_code},
        )
        return response
def get_current_trace() -> Dict[str, str]:
    """Return the trace context dict stored for the current execution context."""
    return trace_context.get()
def with_trace(func):
    """Decorator that injects the current trace context as a `trace_context` kwarg."""
    @wraps(func)
    async def wrapper(*args, **kwargs):
        current = get_current_trace()
        # Passed explicitly (not merged into kwargs) so an accidental caller-
        # supplied trace_context still raises TypeError, as before.
        return await func(*args, trace_context=current, **kwargs)
    return wrapper
# ==================== Structured Logging ====================
class TraceLogFormatter(logging.Formatter):
    """Render log records as one-line JSON enriched with the current trace context."""

    def format(self, record):
        import json  # local import, mirroring the original module's style

        ctx = trace_context.get()
        entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "service": getattr(record, 'service', 'unknown'),
        }
        # Preserve the original field order: trace, request, user, agent.
        for field in ('trace_id', 'request_id', 'user_id', 'agent_id'):
            entry[field] = ctx.get(field, '')
        # NOTE(review): stdlib logging merges `extra` kwargs into
        # record.__dict__, so `record.extra` only exists when a caller passes
        # extra={"extra": ...}; kept as-is to preserve behavior.
        if hasattr(record, 'extra'):
            entry.update(record.extra)
        return json.dumps(entry)
def setup_trace_logging(service_name: str):
    """Install a JSON trace-aware handler on the root logger and tag every record with the service name."""
    handler = logging.StreamHandler()
    handler.setFormatter(TraceLogFormatter())

    root = logging.getLogger()
    root.handlers = [handler]  # replace, don't append: single JSON sink
    root.setLevel(logging.INFO)

    # Wrap the record factory so each record carries the service name.
    previous_factory = logging.getLogRecordFactory()

    def tagged_factory(*args, **kwargs):
        record = previous_factory(*args, **kwargs)
        record.service = service_name
        return record

    logging.setLogRecordFactory(tagged_factory)
# ==================== NATS Integration ====================
async def publish_with_trace(js, subject: str, payload: bytes, ctx: TraceContext):
    """Publish a NATS JetStream message carrying the trace context as headers."""
    await js.publish(subject, payload, headers=ctx.to_nats_headers())
def extract_trace_from_msg(msg) -> TraceContext:
    """Rebuild a TraceContext from an inbound NATS message's headers."""
    return TraceContext.from_nats(dict(msg.headers or {}))
# ==================== Audit Event ====================
def create_audit_event(
    action: str,
    ctx: TraceContext,
    details: Dict[str, Any] = None
) -> Dict[str, Any]:
    """Build a standardized audit event dict from the given trace context."""
    event = {
        "event_id": str(uuid.uuid4()),
        "event_type": f"audit.action.{action}",
        "timestamp": datetime.utcnow().isoformat(),
    }
    # Kwarg order matches the original literal, preserving key order.
    event.update(
        trace_id=ctx.trace_id,
        request_id=ctx.request_id,
        job_id=ctx.job_id,
        user_id=ctx.user_id,
        agent_id=ctx.agent_id,
        mode=ctx.mode,
        policy_version=ctx.policy_version,
        prompt_version=ctx.prompt_version,
        action=action,
        details=details or {},
    )
    return event