Phase6/7 runtime + Gitea smoke gate setup #1

Merged
daarion-admin merged 214 commits from codex/sync-node1-runtime into main 2026-03-05 10:38:18 -08:00
3 changed files with 641 additions and 1 deletions
Showing only changes of commit 7e82a427e3 - Show all commits

View File

@@ -35,6 +35,10 @@ services:
- STT_SERVICE_UPLOAD_URL=http://swapper-service:8890/stt
- OCR_SERVICE_URL=http://swapper-service:8890
- WEB_SEARCH_SERVICE_URL=http://swapper-service:8890
- REDIS_URL=redis://redis:6379/0
- CREWAI_SERVICE_URL=http://dagi-staging-crewai-service:9010
- NATURE_ID_URL=http://plant-vision-node1:8085
- NATURE_ID_MIN_CONFIDENCE=0.65
- ONEOK_CRM_BASE_URL=http://oneok-crm-adapter:8088
- ONEOK_CALC_BASE_URL=http://oneok-calc-adapter:8089
- ONEOK_DOCS_BASE_URL=http://oneok-docs-adapter:8090
@@ -106,6 +110,28 @@ services:
# Image Generation тепер інтегровано в Swapper Service (lazy loading)
# Endpoint: POST /image/generate на swapper-service:8890
# Plant Vision wrapper (local nature-id CLI -> HTTP)
plant-vision-node1:
build:
context: ./services/plant-vision-node1
dockerfile: Dockerfile
container_name: plant-vision-node1
environment:
- NATURE_ID_CMD=${NATURE_ID_CMD:-python /opt/nature-id/nature_id.py -m plants -l -r 5 -s {image_path}}
- NATURE_ID_TIMEOUT=40
- DOWNLOAD_TIMEOUT=20
networks:
- dagi-network
volumes:
- ${DEPLOY_ROOT:-.}/third_party/nature-id:/opt/nature-id:ro
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8085/health')\""]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
# Crawl4AI - Advanced Web Crawler with JavaScript support
crawl4ai:
image: unclecode/crawl4ai@sha256:4d8b065bf185962733cb5f9701f4122d03383fa1ab6b5f6a9873f04fa0416a84
@@ -191,12 +217,15 @@ services:
- STT_SERVICE_UPLOAD_URL=http://swapper-service:8890/stt
- OCR_SERVICE_URL=http://swapper-service:8890
- WEB_SEARCH_SERVICE_URL=http://swapper-service:8890
- REDIS_URL=redis://redis:6379/0
- CREWAI_SERVICE_URL=http://dagi-staging-crewai-service:9010
volumes:
- ${DEPLOY_ROOT:-.}/gateway-bot:/app/gateway-bot:ro
- ${DEPLOY_ROOT:-.}/logs:/app/logs
depends_on:
- router
- memory-service
- redis
networks:
- dagi-network
restart: unless-stopped
@@ -207,6 +236,61 @@ services:
retries: 3
start_period: 10s
gateway-worker:
build:
context: ./gateway-bot
dockerfile: Dockerfile
container_name: dagi-gateway-worker-node1
command: ["python", "-m", "daarion_facade.worker"]
environment:
- ROUTER_BASE_URL=http://router:8000
- REDIS_URL=redis://redis:6379/0
- ROUTER_WORKER_TIMEOUT=60
volumes:
- ${DEPLOY_ROOT:-.}/gateway-bot:/app/gateway-bot:ro
- ${DEPLOY_ROOT:-.}/logs:/app/logs
depends_on:
- router
- redis
networks:
- dagi-network
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "print(\"ok\")"]
interval: 30s
timeout: 5s
retries: 3
metrics-poller-node1:
build:
context: ./gateway-bot
dockerfile: Dockerfile
container_name: dagi-metrics-poller-node1
command: ["python", "-m", "daarion_facade.metrics_poller"]
environment:
- REDIS_URL=redis://redis:6379/0
- MEMORY_SERVICE_URL=http://memory-service:8000
- DAARION_METRICS_POLL_INTERVAL_SECONDS=${DAARION_METRICS_POLL_INTERVAL_SECONDS:-10}
- DAARION_METRICS_TTL_SECONDS=${DAARION_METRICS_TTL_SECONDS:-60}
- DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS=${DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS:-2}
- DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS=${DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS:-5}
- DAARION_NODE_COUNT=${DAARION_NODE_COUNT:-1}
volumes:
- ${DEPLOY_ROOT:-.}/gateway-bot:/app/gateway-bot:ro
- ${DEPLOY_ROOT:-.}/logs:/app/logs
depends_on:
- redis
- memory-service
networks:
- dagi-network
restart: unless-stopped
healthcheck:
test: ["CMD", "python", "-c", "print(\"ok\")"]
interval: 30s
timeout: 5s
retries: 3
# CLAN Consent Outbox Worker (Postgres event-store applier; no execute)
clan-consent-outbox-worker:
build:
@@ -736,10 +820,11 @@ services:
ports:
- "9108:9108"
environment:
- GATEWAY_URL=http://172.18.0.18:9300
- GATEWAY_URL=http://gateway:9300
- PROBE_INTERVAL=60
- PROBE_TIMEOUT=30
- METRICS_PORT=9108
- SEMANTIC_AGENTS=clan,sofiia,monitor,helion,agromatrix,senpai
networks:
- dagi-network
restart: unless-stopped

View File

@@ -0,0 +1,287 @@
import asyncio
import json
import logging
import os
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
import httpx
from redis.asyncio import Redis
from .registry_api import _load_crewai_roles, _load_district_registry, _load_registry
# Root logging configuration for the standalone poller process.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger("daarion-metrics-poller")

# Runtime configuration, overridable via environment variables.
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")
POLL_INTERVAL_SECONDS = int(os.getenv("DAARION_METRICS_POLL_INTERVAL_SECONDS", "10"))  # seconds between poll cycles
METRICS_TTL_SECONDS = int(os.getenv("DAARION_METRICS_TTL_SECONDS", "60"))  # Redis key expiry for published metrics
HTTP_CONNECT_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_CONNECT_TIMEOUT_SECONDS", "2"))
HTTP_TOTAL_TIMEOUT_SECONDS = float(os.getenv("DAARION_METRICS_HTTP_TOTAL_TIMEOUT_SECONDS", "5"))
NODES_TOTAL = int(os.getenv("DAARION_NODE_COUNT", "1"))
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")

# Redis keys this poller publishes to.
DASHBOARD_KEY = "daarion:metrics:dashboard"
DISTRICT_KEY_PREFIX = "daarion:metrics:district"

# Lazily-created shared Redis client (see redis_client() / close_redis()).
_redis: Optional[Redis] = None
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _ensure_url(value: str) -> str:
value = (value or "").strip()
if not value:
return ""
if value.startswith("http://") or value.startswith("https://"):
return value
return f"https://{value}"
def _health_candidates(district: Dict[str, Any]) -> List[str]:
    """Return the ordered, de-duplicated list of health URLs for *district*.

    An explicit ``health_url`` entry takes priority; otherwise well-known
    health paths are derived from the district's ``domain``.  Empty entries
    are dropped.
    """
    candidates: List[str] = []
    explicit = str(district.get("health_url") or "").strip()
    if explicit:
        candidates.append(_ensure_url(explicit))
    base = _ensure_url(str(district.get("domain") or ""))
    if base:
        candidates.extend(
            [
                f"{base}/.well-known/daarion-health.json",
                f"{base}/health",
                f"{base}/v1/health",
            ]
        )
    # dict.fromkeys de-duplicates while preserving insertion order,
    # replacing the manual seen-set bookkeeping.
    return [url for url in dict.fromkeys(candidates) if url]
def _extract_agents_online(payload: Dict[str, Any], agents_total: int) -> Optional[int]:
raw = payload.get("agents_online")
if isinstance(raw, bool):
return agents_total if raw else 0
if isinstance(raw, int):
return max(0, min(raw, agents_total))
agents = payload.get("agents")
if isinstance(agents, list):
count = 0
for agent in agents:
if not isinstance(agent, dict):
continue
status = str(agent.get("status", "")).lower()
if status in {"online", "active", "ok"}:
count += 1
return min(count, agents_total)
return None
async def redis_client() -> Redis:
    """Return the shared module-level Redis client, creating it on first use."""
    global _redis
    if _redis is not None:
        return _redis
    _redis = Redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
async def close_redis() -> None:
    """Close the shared Redis client, if one exists, then clear the singleton."""
    global _redis
    if _redis is None:
        return
    # The singleton is cleared only after a successful close, matching the
    # original ordering.
    await _redis.close()
    _redis = None
async def _fetch_json_with_latency(
    client: httpx.AsyncClient,
    url: str,
) -> Tuple[bool, Optional[Dict[str, Any]], Optional[float], Optional[str]]:
    """GET *url* and return ``(ok, json_dict, latency_ms, error_message)``.

    ``ok`` is False for transport errors and for HTTP status >= 400.
    ``json_dict`` is None unless the body parsed to a JSON object.
    Latency is always measured, even on failure.
    """
    started = time.perf_counter()

    def _elapsed_ms() -> float:
        return round((time.perf_counter() - started) * 1000, 2)

    try:
        response = await client.get(url)
    except Exception as exc:
        return False, None, _elapsed_ms(), str(exc)
    latency_ms = _elapsed_ms()
    if response.status_code >= 400:
        return False, None, latency_ms, f"HTTP {response.status_code}"
    data: Optional[Dict[str, Any]] = None
    try:
        parsed = response.json()
    except Exception:
        parsed = None
    if isinstance(parsed, dict):
        data = parsed
    # Non-dict JSON (lists, scalars) is treated as "no payload" but still ok.
    return True, data, latency_ms, None
async def _read_memory_vectors(client: httpx.AsyncClient) -> int:
    """Read the stored-vector count from the memory-service health endpoint.

    Returns 0 on any failure or missing field.
    """
    try:
        ok, payload, _, _ = await _fetch_json_with_latency(client, f"{MEMORY_SERVICE_URL}/health")
        if not ok or not payload:
            return 0
        memories = payload.get("vector_store", {}).get("memories", {})
        return int(memories.get("vectors_count", 0) or 0)
    except Exception:
        return 0
async def _registry_snapshot() -> Tuple[List[Dict[str, Any]], Dict[str, List[Dict[str, Any]]], int, int]:
    """Collect registry data for dashboard assembly.

    Returns ``(districts, agents_by_district, agents_total, subagents_total)``
    where districts are the well-formed entries from the district registry and
    agents are grouped by their ``district_id`` (defaulting to "city-core").
    """
    raw_districts = _load_district_registry().get("districts", [])
    districts = [d for d in raw_districts if isinstance(d, dict) and d.get("district_id")]
    agents_map = _load_registry().get("agents", {})
    role_counts = await _load_crewai_roles()
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    subagents_total = 0
    for agent_id, cfg in agents_map.items():
        if not isinstance(cfg, dict):
            continue
        key = str(agent_id)
        subagents_total += int(role_counts.get(key, 0))
        entry = {"agent_id": key, "status": str(cfg.get("status", "active"))}
        grouped.setdefault(str(cfg.get("district_id") or "city-core"), []).append(entry)
    # NOTE: len(agents_map) counts every registry entry, including malformed
    # (non-dict) ones that were skipped above.
    return districts, grouped, len(agents_map), subagents_total
async def build_dashboard() -> Dict[str, Any]:
    """Probe each district's health endpoint and assemble the full dashboard.

    Returns a dict with a ``global`` summary, a ``by_district`` list of
    per-district health samples, and an ``updated_at`` timestamp.
    """
    districts, agents_by_district, agents_total, subagents_total = await _registry_snapshot()
    timeout = httpx.Timeout(timeout=HTTP_TOTAL_TIMEOUT_SECONDS, connect=HTTP_CONNECT_TIMEOUT_SECONDS)
    by_district: List[Dict[str, Any]] = []
    districts_online = 0
    agents_online_total = 0
    async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
        memory_vectors = await _read_memory_vectors(client)
        for district in districts:
            district_id = str(district.get("district_id"))
            title = district.get("title") or district_id
            domain = str(district.get("domain") or "")
            status = district.get("status") or "active"
            members = agents_by_district.get(district_id, [])
            agents_total_district = len(members)
            # Pessimistic sample; fields are overwritten if a probe succeeds.
            sample = {
                "district_id": district_id,
                "title": title,
                "domain": domain,
                "status": status,
                "ok": False,
                "agents_total": agents_total_district,
                "agents_online": 0,
                "latency_ms": None,
                "last_check_ts": _now_iso(),
                "error": None,
            }
            last_error = "No health endpoint configured"
            # Try each candidate URL in priority order; stop at the first success.
            for candidate in _health_candidates(district):
                ok, payload, latency_ms, error_message = await _fetch_json_with_latency(client, candidate)
                # latency_ms reflects the most recent attempt, successful or not.
                sample["latency_ms"] = latency_ms
                if ok:
                    sample["ok"] = True
                    sample["error"] = None
                    inferred = _extract_agents_online(payload or {}, agents_total_district)
                    # No usable agent count in the payload -> assume all online.
                    sample["agents_online"] = inferred if inferred is not None else agents_total_district
                    break
                last_error = error_message or "health check failed"
            if sample["ok"]:
                districts_online += 1
                agents_online_total += int(sample.get("agents_online") or 0)
            else:
                sample["error"] = {"message": last_error}
            by_district.append(sample)
    return {
        "global": {
            "nodes": NODES_TOTAL,
            "districts": len(districts),
            "agents": agents_total,
            "subagents": subagents_total,
            "memory_vectors": memory_vectors,
            "districts_online": districts_online,
            "agents_online": agents_online_total,
        },
        "by_district": by_district,
        "updated_at": _now_iso(),
    }
async def publish_dashboard(dashboard: Dict[str, Any]) -> None:
    """Write the dashboard, plus one key per district, to Redis with a TTL."""
    redis = await redis_client()
    await redis.set(DASHBOARD_KEY, json.dumps(dashboard, ensure_ascii=False), ex=METRICS_TTL_SECONDS)
    for row in dashboard.get("by_district", []):
        district_id = row.get("district_id")
        if not district_id:
            continue
        await redis.set(
            f"{DISTRICT_KEY_PREFIX}:{district_id}",
            json.dumps(row, ensure_ascii=False),
            ex=METRICS_TTL_SECONDS,
        )
async def run_once() -> None:
    """Build one dashboard snapshot, publish it, and log a summary line."""
    dashboard = await build_dashboard()
    await publish_dashboard(dashboard)
    summary = dashboard["global"]
    logger.info(
        "dashboard_updated districts=%s districts_online=%s agents=%s agents_online=%s",
        summary.get("districts"),
        summary.get("districts_online"),
        summary.get("agents"),
        summary.get("agents_online"),
    )
async def worker_loop() -> None:
    """Run the poll cycle forever, once per POLL_INTERVAL_SECONDS.

    Cycle failures are logged and swallowed so the loop survives transient
    errors; cancellation is re-raised so shutdown still works.
    """
    logger.info(
        "metrics_poller_started interval=%ss ttl=%ss redis=%s",
        POLL_INTERVAL_SECONDS,
        METRICS_TTL_SECONDS,
        REDIS_URL,
    )
    while True:
        cycle_started = time.perf_counter()
        try:
            await run_once()
        except asyncio.CancelledError:
            raise
        except Exception:
            logger.exception("metrics_poller_cycle_failed")
        # Sleep out the remainder of the interval, but at least one second.
        elapsed = time.perf_counter() - cycle_started
        await asyncio.sleep(max(1.0, POLL_INTERVAL_SECONDS - elapsed))
if __name__ == "__main__":
    try:
        asyncio.run(worker_loop())
    finally:
        # Best-effort cleanup of the shared Redis client on exit.
        # NOTE(review): this asyncio.run() starts a *second* event loop after
        # the worker's loop has already closed; closing a client created on
        # the first loop may be a no-op or raise — confirm this is intended.
        try:
            asyncio.run(close_redis())
        except Exception:
            pass

View File

@@ -0,0 +1,268 @@
import json
import os
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
import httpx
from fastapi import APIRouter
from redis.asyncio import Redis
# All facade endpoints are served under /v1.
router = APIRouter(prefix="/v1", tags=["daarion-facade"])

# Configuration, overridable via environment variables.
REGISTRY_CACHE_TTL = int(os.getenv("REGISTRY_CACHE_TTL", "30"))  # seconds for the in-process caches below
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
CREWAI_SERVICE_URL = os.getenv("CREWAI_SERVICE_URL", "http://dagi-staging-crewai-service:9010")
REDIS_URL = os.getenv("REDIS_URL", "redis://redis:6379/0")

# Redis key the metrics poller publishes the dashboard to.
METRICS_DASHBOARD_KEY = "daarion:metrics:dashboard"

# TTL-based in-process caches: {"loaded_at": <epoch seconds>, "data": ...}.
_REGISTRY_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
_DISTRICT_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": None}
_CREWAI_CACHE: Dict[str, Any] = {"loaded_at": 0.0, "data": {}}

# Lazily-created shared Redis client (see _redis_client()).
_REDIS: Optional[Redis] = None
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _registry_paths() -> List[Path]:
return [
Path("/app/gateway-bot/agent_registry.json"),
Path("/opt/microdao-daarion/config/agent_registry.json"),
Path(__file__).resolve().parents[1] / "agent_registry.json",
]
def _district_paths() -> List[Path]:
return [
Path("/app/gateway-bot/district_registry.json"),
Path(__file__).resolve().parents[1] / "district_registry.json",
]
def _load_registry() -> Dict[str, Any]:
    """Load the agent registry from the first existing path, TTL-cached.

    Falls back to ``{"agents": {}}`` when no registry file exists.
    """
    now = time.time()
    cached = _REGISTRY_CACHE.get("data")
    if cached and (now - _REGISTRY_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL):
        return cached
    data: Dict[str, Any] = {"agents": {}}
    for path in _registry_paths():
        if path.exists():
            data = json.loads(path.read_text(encoding="utf-8"))
            break
    _REGISTRY_CACHE.update({"loaded_at": now, "data": data})
    return data
def _load_district_registry() -> Dict[str, Any]:
    """Load the district registry from the first existing path, TTL-cached.

    Falls back to ``{"districts": []}`` when no registry file exists.
    """
    now = time.time()
    cached = _DISTRICT_CACHE.get("data")
    if cached and (now - _DISTRICT_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL):
        return cached
    data: Dict[str, Any] = {"districts": []}
    for path in _district_paths():
        if path.exists():
            data = json.loads(path.read_text(encoding="utf-8"))
            break
    _DISTRICT_CACHE.update({"loaded_at": now, "data": data})
    return data
async def _redis_client() -> Redis:
    """Return the shared Redis client, creating it lazily on first use."""
    global _REDIS
    if _REDIS is not None:
        return _REDIS
    _REDIS = Redis.from_url(REDIS_URL, decode_responses=True)
    return _REDIS
async def _load_cached_dashboard() -> Optional[Dict[str, Any]]:
    """Fetch the poller-published dashboard from Redis.

    Returns None when the key is missing, unparseable, or Redis is down.
    """
    try:
        client = await _redis_client()
        raw = await client.get(METRICS_DASHBOARD_KEY)
        if not raw:
            return None
        return json.loads(raw)
    except Exception:
        return None
async def _load_crewai_roles() -> Dict[str, int]:
    """Fetch per-agent default-role counts from the CrewAI service, TTL-cached.

    Any failure (network error, non-200, unexpected payload shape) yields —
    and caches — an empty mapping for the TTL window.
    """
    now = time.time()
    if now - _CREWAI_CACHE.get("loaded_at", 0.0) < REGISTRY_CACHE_TTL:
        return _CREWAI_CACHE.get("data", {})
    role_counts: Dict[str, int] = {}
    try:
        async with httpx.AsyncClient(timeout=8.0) as client:
            resp = await client.get(f"{CREWAI_SERVICE_URL}/crew/agents")
            if resp.status_code == 200:
                payload = resp.json()
                for agent_id, info in payload.items():
                    roles = info.get("default_roles")
                    role_counts[str(agent_id)] = int(roles) if isinstance(roles, int) else 0
    except Exception:
        role_counts = {}
    _CREWAI_CACHE.update({"loaded_at": now, "data": role_counts})
    return role_counts
@router.get("/registry/agents")
async def get_agents() -> Dict[str, Any]:
    """List every registered agent with its team size from CrewAI.

    Returns ``{"items": [...], "total": N}``; malformed (non-dict) registry
    entries are skipped.
    """
    reg = _load_registry()
    agents = reg.get("agents", {}) if isinstance(reg, dict) else {}
    role_counts = await _load_crewai_roles()
    items: List[Dict[str, Any]] = []
    for agent_id, cfg in agents.items():
        if not isinstance(cfg, dict):
            continue
        domains = cfg.get("domains") or []
        items.append(
            {
                "agent_id": agent_id,
                "title": cfg.get("display_name") or agent_id,
                "role": cfg.get("canonical_role") or "",
                # First domain is primary; any remainder become aliases.
                "domain_primary": domains[0] if domains else "general",
                "domain_aliases": domains[1:] if len(domains) > 1 else [],
                "visibility": cfg.get("visibility", "public"),
                "status": cfg.get("status", "active"),
                "team": {"subagents_total": role_counts.get(agent_id, 0)},
                "district_id": cfg.get("district_id") or "city-core",
                "avatar_url": cfg.get("avatar_url"),
                "health_url": cfg.get("health_url"),
            }
        )
    return {"items": items, "total": len(items)}
@router.get("/registry/districts")
async def get_districts() -> Dict[str, Any]:
    """List districts, merging the district catalog with live agent groupings.

    The result covers the union of catalog districts and districts referenced
    by agents; catalog metadata wins, with derived fallbacks for missing
    domain and lead agent.
    """
    agents_payload = await get_agents()
    agents = agents_payload.get("items", [])
    by_district: Dict[str, List[Dict[str, Any]]] = {}
    for a in agents:
        by_district.setdefault(a.get("district_id", "city-core"), []).append(a)
    catalog = _load_district_registry().get("districts", [])
    catalog_by_id: Dict[str, Dict[str, Any]] = {
        str(d.get("district_id")): d for d in catalog if isinstance(d, dict) and d.get("district_id")
    }
    # Union so that districts with agents but no catalog entry (and vice
    # versa) both appear, in stable sorted order.
    district_ids = sorted(set(catalog_by_id.keys()) | set(by_district.keys()))
    items: List[Dict[str, Any]] = []
    for district_id in district_ids:
        members = by_district.get(district_id, [])
        base = catalog_by_id.get(district_id, {})
        # Fallback domain: apex for city-core, subdomain for everything else.
        domain = base.get("domain") or ("daarion.city" if district_id == "city-core" else f"{district_id}.daarion.city")
        # Lead agent: catalog value, else "daarwizz" for city-core when
        # present, else the first member, else None.
        lead_agent_id = base.get("lead_agent_id")
        if not lead_agent_id:
            if district_id == "city-core" and any(m.get("agent_id") == "daarwizz" for m in members):
                lead_agent_id = "daarwizz"
            elif members:
                lead_agent_id = members[0].get("agent_id")
            else:
                lead_agent_id = None
        items.append(
            {
                "district_id": district_id,
                "title": base.get("title") or district_id.replace("-", " ").title(),
                "domain": domain,
                "status": base.get("status", "active"),
                "logo_url": base.get("logo_url"),
                "health_url": base.get("health_url"),
                "well_known": {
                    "manifest": f"https://{domain}/.well-known/daarion-district.json",
                    "health": f"https://{domain}/.well-known/daarion-health.json",
                    "capabilities": f"https://{domain}/.well-known/daarion-capabilities.json",
                },
                "lead_agent_id": lead_agent_id,
                "agents_total": len(members),
            }
        )
    return {"items": items, "total": len(items)}
@router.get("/metrics")
async def get_metrics() -> Dict[str, Any]:
    """Aggregate platform counts: nodes, districts, agents, subagents, vectors.

    The memory-vector count is read from the memory service and degrades to 0
    on any failure.
    """
    agents_payload = await get_agents()
    districts_payload = await get_districts()
    agents = agents_payload.get("items", [])
    memory_vectors = 0
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(f"{MEMORY_SERVICE_URL}/health")
            if resp.status_code == 200:
                stats = resp.json()
                memories = stats.get("vector_store", {}).get("memories", {})
                memory_vectors = int(memories.get("vectors_count", 0))
    except Exception:
        memory_vectors = 0
    subagents = sum(int((a.get("team") or {}).get("subagents_total", 0)) for a in agents)
    return {
        "nodes": 1,
        "districts": districts_payload.get("total", 0),
        "agents": len(agents),
        "subagents": subagents,
        "memory_vectors": memory_vectors,
    }
@router.get("/metrics/dashboard")
async def get_metrics_dashboard() -> Dict[str, Any]:
    """Serve the dashboard: Redis cache if fresh, else a registry fallback.

    The fallback mirrors the poller's payload shape but has no live health
    data (``ok``/``agents_online``/``latency_ms`` are None) and is tagged
    with ``"source": "fallback_registry"``.
    """
    cached = await _load_cached_dashboard()
    if cached:
        return cached
    # Cache miss: rebuild a static view from the registries.
    metrics = await get_metrics()
    districts_payload = await get_districts()
    districts = districts_payload.get("items", [])
    by_district = []
    for d in districts:
        by_district.append(
            {
                "district_id": d.get("district_id"),
                "title": d.get("title"),
                "domain": d.get("domain"),
                "status": d.get("status"),
                "ok": None,
                "agents_total": d.get("agents_total", 0),
                "agents_online": None,
                "latency_ms": None,
                "last_check_ts": None,
            }
        )
    return {
        "global": {
            "nodes": metrics.get("nodes", 1),
            "districts": metrics.get("districts", 0),
            "agents": metrics.get("agents", 0),
            "subagents": metrics.get("subagents", 0),
            "memory_vectors": metrics.get("memory_vectors", 0),
            "districts_online": 0,
            "agents_online": 0,
        },
        "by_district": by_district,
        "updated_at": _now_iso(),
        "source": "fallback_registry",
    }