feat(soak): add --sender-count rotation + --inter-msg-ms; add NODA1 runtime snapshot
Made-with: Cursor
This commit is contained in:
217
docs/ops/node1-runtime-snapshot-20260305.md
Normal file
217
docs/ops/node1-runtime-snapshot-20260305.md
Normal file
@@ -0,0 +1,217 @@
|
||||
# НОДА1 — Runtime Snapshot (2026-03-05)
|
||||
|
||||
**Server:** 144.76.224.179 (node1-daarion)
|
||||
**Captured:** 2026-03-05 ~16:55 UTC+1
|
||||
**Branch on server:** `codex/sync-node1-runtime` @ `e1d73eb`
|
||||
**Hardware:** Hetzner GEX44 · CPU: 32 cores · RAM: 64GB · Disk: 1.7TB NVMe (555GB used, 34%)
|
||||
|
||||
---
|
||||
|
||||
## A) Зрізи контейнерів (що реально запущено)
|
||||
|
||||
| Container | Image | Status | External Ports |
|
||||
|-----------|-------|--------|----------------|
|
||||
| `matrix-bridge-dagi-node1` | microdao-daarion-matrix-bridge-dagi | Up (healthy) | `127.0.0.1:7030` |
|
||||
| `dagi-gateway-node1` | microdao-daarion-gateway | Up (healthy) | `0.0.0.0:9300` |
|
||||
| `dagi-router-node1` | microdao-daarion-router | Up (healthy) | `0.0.0.0:9102→8000` |
|
||||
| `dagi-memory-service-node1` | microdao-daarion-memory-service | Up (healthy) | `0.0.0.0:8000` |
|
||||
| `dagi-sofiia-console-node1` | microdao-daarion-dagi-sofiia-console-node1 | Up (healthy) | `0.0.0.0:8002` |
|
||||
| `dagi-experience-learner-node1` | microdao-daarion-experience-learner | Up (healthy) | `127.0.0.1:9109` |
|
||||
| `dagi-gateway-worker-node1` | microdao-daarion-gateway-worker | Up (healthy) | `9300/tcp` (internal) |
|
||||
| `dagi-gateway-reminder-worker-node1` | microdao-daarion-gateway-reminder-worker | Up (healthy) | internal |
|
||||
| `dagi-staging-router` | dagi-staging-router | Up (healthy) | `8000/tcp` (internal) |
|
||||
| `dagi-staging-crewai-service` | dagi-staging-crewai-service | Up | — |
|
||||
| `dagi-synapse-node1` | matrixdotorg/synapse:latest | Up 2 days (healthy) | `127.0.0.1:8008` |
|
||||
| `dagi-synapse-db-node1` | postgres:15-alpine | Up 2 days (healthy) | `5432/tcp` (internal) |
|
||||
| `dagi-market-data-node1` | microdao-daarion-market-data-service | Up (healthy) | `0.0.0.0:8893→8891` |
|
||||
| `dagi-senpai-md-consumer-node1` | microdao-daarion-senpai-md-consumer | Up (healthy) | `0.0.0.0:8892` |
|
||||
| `dagi-binance-bot-monitor-node1` | microdao-daarion-binance-bot-monitor | Up | — |
|
||||
| `clan-consent-adapter` | microdao-daarion-clan-consent-adapter | Up (healthy) | `0.0.0.0:8111` |
|
||||
| `7e80f50f30e2_clan-visibility-guard` | microdao-daarion-clan-visibility-guard | Up (healthy) | `0.0.0.0:8112` |
|
||||
| `agent-e2e-prober-node1` | microdao-daarion-agent-e2e-prober | Up | `0.0.0.0:9108` |
|
||||
| `dagi-nats-node1` | nats:2.11-alpine | Up (healthy) | `4222, 7422 (leafnode), 8222 (mon)` |
|
||||
| `dagi-redis-node1` | redis:8-alpine | Up (healthy) | `0.0.0.0:6379` |
|
||||
| `dagi-qdrant-node1` | qdrant/qdrant:v1.13.6 | Up (healthy) | `0.0.0.0:6333-6334` |
|
||||
| `dagi-neo4j-node1` | neo4j:5.26-community | Up (healthy) | `0.0.0.0:7474, 0.0.0.0:7687` |
|
||||
| `dagi-postgres` | pgvector/pgvector:pg16 | Up | `0.0.0.0:5432` |
|
||||
| `dagi-farmos-db-node1` | postgres:16-alpine | Up (healthy) | `5432/tcp` (internal) |
|
||||
| `dagi-farmos-node1` | farmos/farmos:4.x-amd64 | Up (healthy) | `127.0.0.1:8088→80` |
|
||||
| `dagi-minio-node1` | minio/minio:latest | Up | `0.0.0.0:9000-9001` |
|
||||
| `dagi-vision-encoder-node1` | microdao-daarion-vision-encoder | Up (healthy) | `0.0.0.0:8001` |
|
||||
| `swapper-service-node1` | microdao-daarion-swapper-service | Up (healthy) | `0.0.0.0:8890-8891` |
|
||||
| `plant-vision-node1` | microdao-daarion-plant-vision-node1 | Up (healthy) | `8085/tcp` (internal) |
|
||||
| `dagi-crawl4ai-node1` | unclecode/crawl4ai:latest | Up (healthy) | `0.0.0.0:11235` |
|
||||
| `ollama` | ollama/ollama:latest | Up 5 days | — (host port 11434) |
|
||||
| `rag-service-node1` | microdao-daarion-rag-service | Up (healthy) | `0.0.0.0:9500` |
|
||||
| `artifact-registry-node1` | microdao-daarion-artifact-registry | Up (healthy) | `0.0.0.0:9220` |
|
||||
| `ingest-service` | microdao-daarion-ingest-service | Up | `0.0.0.0:8100` |
|
||||
| `parser-pipeline` | parser-pipeline:latest | Up | `0.0.0.0:8101` |
|
||||
| `index-doc-worker-node1` | microdao-daarion-index-doc-worker | Up | — |
|
||||
| `render-pptx-worker-node1` | microdao-daarion-render-pptx-worker | Up | — |
|
||||
| `render-pdf-worker-node1` | microdao-daarion-render-pdf-worker | Up | — |
|
||||
| `brand-registry-node1` | microdao-daarion-brand-registry | Up (healthy) | `0.0.0.0:9210` |
|
||||
| `brand-intake-node1` | microdao-daarion-brand-intake | Up (healthy) | `0.0.0.0:9211` |
|
||||
| `presentation-renderer-node1` | microdao-daarion-presentation-renderer | Up (healthy) | `0.0.0.0:9212` |
|
||||
| `crewai-nats-worker` | microdao-daarion-crewai-worker | Up | `0.0.0.0:9011` |
|
||||
| `node-capabilities-node1` | microdao-daarion-node-capabilities | Up | `127.0.0.1:8099` |
|
||||
| `node-worker-node1` | microdao-daarion-node-worker | Up | `127.0.0.1:8109` |
|
||||
| `postgres-backup-node1` | prodrigestivill/postgres-backup-local:16 | Up (healthy) | — |
|
||||
| `clan-consent-outbox-worker` | microdao-daarion-clan-consent-outbox-worker | Up | — |
|
||||
| `dagi-metrics-poller-node1` | microdao-daarion-metrics-poller-node1 | Up (healthy) | — |
|
||||
| `oneok-espocrm-node1` | espocrm/espocrm:latest | Up | `0.0.0.0:9080→80` |
|
||||
| `oneok-espocrm-db-node1` | mariadb:11 | Up | `3306/tcp` (internal) |
|
||||
| `oneok-gotenberg-node1` | gotenberg/gotenberg:8 | Up | `0.0.0.0:3010` |
|
||||
| `oneok-crm-adapter-node1` | microdao-daarion-oneok-crm-adapter | Up | `8088/tcp` (internal) |
|
||||
| `oneok-docs-adapter-node1` | microdao-daarion-oneok-docs-adapter | Up | `8090/tcp` (internal) |
|
||||
| `oneok-calc-adapter-node1` | microdao-daarion-oneok-calc-adapter | Up | `8089/tcp` (internal) |
|
||||
| `oneok-schedule-adapter-node1` | microdao-daarion-oneok-schedule-adapter | Up | `8091/tcp` (internal) |
|
||||
| `prometheus` | prom/prometheus:latest | Up | `0.0.0.0:9090` |
|
||||
| `grafana` | grafana/grafana:latest | Up | `127.0.0.1:3030→3000` |
|
||||
| `control-plane` | control-plane:latest | Up | `9200/tcp` (internal) |
|
||||
| `dagi-nats-js-init-node1` | — | init/done | — |
|
||||
|
||||
**Total containers running: 57**
|
||||
|
||||
---
|
||||
|
||||
## B) Мережа — реально слухає (host)
|
||||
|
||||
| Port | Service | Access |
|
||||
|------|---------|--------|
|
||||
| `22` | sshd | public |
|
||||
| `80/443` | nginx (reverse proxy) | public |
|
||||
| `3010` | gotenberg (PDF) | public |
|
||||
| `4222` | NATS clients | public |
|
||||
| `5432` | PostgreSQL (pgvector) | public |
|
||||
| `6333/6334` | Qdrant HTTP/gRPC | public |
|
||||
| `6379` | Redis | public |
|
||||
| `7422` | NATS leafnode hub | public |
|
||||
| `7474/7687` | Neo4j HTTP/Bolt | public |
|
||||
| `8000` | Memory Service | public |
|
||||
| `8001` | Vision Encoder | public |
|
||||
| `8002` | Sofiia Console | public |
|
||||
| `8100` | Ingest Service | public |
|
||||
| `8101` | Parser Pipeline | public |
|
||||
| `8111/8112` | Clan adapters | public |
|
||||
| `8222` | NATS monitoring | public |
|
||||
| `8890/8891` | Swapper (LLM/metrics) | public |
|
||||
| `8892` | SENPAI MD Consumer | public |
|
||||
| `8893` | Market Data Service | public |
|
||||
| `9000/9001` | MinIO S3 | public |
|
||||
| `9011` | CrewAI NATS worker | public |
|
||||
| `9080` | EspoCRM | public |
|
||||
| `9090` | Prometheus | public |
|
||||
| `9102` | Router (→8000 internal) | public |
|
||||
| `9108` | E2E Prober | public |
|
||||
| `9210/9211/9212` | Brand services | public |
|
||||
| `9220` | Artifact Registry | public |
|
||||
| `9300` | Gateway (Telegram) | public |
|
||||
| `9500` | RAG Service | public |
|
||||
| `11235` | Crawl4AI | public |
|
||||
| `127.0.0.1:3030` | Grafana | localhost only |
|
||||
| `127.0.0.1:6444` | k3s API server | localhost only |
|
||||
| `127.0.0.1:7030` | matrix-bridge-dagi | localhost only |
|
||||
| `127.0.0.1:8008` | Synapse Matrix HS | localhost only |
|
||||
| `127.0.0.1:8088` | FarmOS | localhost only |
|
||||
| `127.0.0.1:8099` | Node Capabilities | localhost only |
|
||||
| `127.0.0.1:8109` | Node Worker | localhost only |
|
||||
| `127.0.0.1:9109` | Experience Learner | localhost only |
|
||||
| `*:6443` | k3s server (Kubernetes) | public |
|
||||
|
||||
> **Note:** k3s (Kubernetes) is running alongside Docker — not reflected in compose files.
|
||||
|
||||
---
|
||||
|
||||
## C) Health (як бачить ops)
|
||||
|
||||
### Gateway `:9300`
|
||||
```
|
||||
status: healthy | agents: 16
|
||||
agents: daarwizz, helion, greenfood, agromatrix, alateya, nutra, druid,
|
||||
clan, eonarch, senpai, oneok, soul, yaromir, sofiia, monitor, aistalk
|
||||
```
|
||||
|
||||
### Router `:9102`
|
||||
```
|
||||
status: ok
|
||||
```
|
||||
|
||||
### Matrix Bridge `:7030`
|
||||
```
|
||||
ok: true | uptime: ~5 min after last restart
|
||||
rate_limiter: room_rpm=20 sender_rpm=10
|
||||
queue: size=6 max=100 workers=2
|
||||
persistent_dedupe: enabled=true db=/app/data/matrix_bridge.db
|
||||
policy_store: ok=true overrides=0
|
||||
sticky_cache: ttl=300s active=0
|
||||
confirm_store: ttl=120s pending=0
|
||||
```
|
||||
|
||||
### Sofiia Console `:8002`
|
||||
```
|
||||
/api/health → status: false (internal issue, not affecting other services)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## D) Ollama моделі (bare-metal, порт 11434)
|
||||
|
||||
| Model | Size | Last Modified |
|
||||
|-------|------|---------------|
|
||||
| `smollm2:135m` | 270 MB | 14 hours ago |
|
||||
| `qwen3.5:0.8b` | 1.0 GB | 2 days ago |
|
||||
| `qwen3.5:2b` | 2.7 GB | 2 days ago |
|
||||
| `qwen3.5:9b` | 6.6 GB | 2 days ago |
|
||||
| `qwen3:8b` | 5.2 GB | 5 days ago |
|
||||
| `qwen3-vl:8b` | 6.1 GB | 6 days ago |
|
||||
| `qwen3.5:27b-q4_K_M` | 17 GB | 6 days ago |
|
||||
| `deepseek-v3.1:671b-cloud` | — (cloud) | 4 weeks ago |
|
||||
|
||||
**Active for Bridge/Router:** `qwen3.5:27b-q4_K_M` (default), `qwen3-vl:8b` (vision)
|
||||
|
||||
---
|
||||
|
||||
## E) Matrix Bridge runtime config (актуальні env)
|
||||
|
||||
```
|
||||
WORKER_CONCURRENCY=2
|
||||
QUEUE_MAX_EVENTS=100
|
||||
RATE_LIMIT_ROOM_RPM=20
|
||||
RATE_LIMIT_SENDER_RPM=10
|
||||
PERSISTENT_DEDUPE=1
|
||||
QUEUE_DRAIN_TIMEOUT_S=5
|
||||
BRIDGE_ROOM_MAP=sofiia:!QwHczWXgefDHBEVkTH:daarion.space
|
||||
BRIDGE_MIXED_ROOM_MAP= (empty)
|
||||
BRIDGE_CONTROL_ROOMS= (empty)
|
||||
CONTROL_ROOM_RPM=60
|
||||
CONTROL_OPERATOR_RPM=30
|
||||
DISCOVERY_RPM=20
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## F) Розбіжності «архітектурна карта vs реальність»
|
||||
|
||||
| У карті | Реальність |
|
||||
|---------|-----------|
|
||||
| Matrix bridge порт `:9400` | Реально: `127.0.0.1:7030` (за nginx на matrix.daarion.space) |
|
||||
| Sofiia console порт `:8002` | Є, але `/api/health` повертає `status: false` |
|
||||
| `dagi-postgres` порт `:5432` | Той самий внутрішній порт `5432` використовують `dagi-synapse-db-node1` і `dagi-farmos-db-node1` — усі підняті; конфлікту на хості немає, бо зовнішній маппінг (`0.0.0.0:5432`) лише у `dagi-postgres` |
|
||||
| `plant-vision` порт `:8085` | Internal only (не проксується назовні) |
|
||||
| Grafana — не було в карті | Запущена на `127.0.0.1:3030` |
|
||||
| Prometheus — не було в карті | Запущений на `0.0.0.0:9090` |
|
||||
| k3s — не було в карті | Kubernetes кластер активний поряд з Docker (`*:6443`) |
|
||||
| `dagi-staging-router` — не було в карті | Додатковий staging router (internal) |
|
||||
| `control-plane` — не було в карті | Окремий control-plane контейнер (internal `:9200`) |
|
||||
| `crewai-nats-worker` — не було в карті | NATS-based CrewAI worker (`:9011`) |
|
||||
| `dagi-staging-crewai-service` — не було в карті | Staging CrewAI service |
|
||||
|
||||
---
|
||||
|
||||
## G) Ресурси
|
||||
|
||||
```
|
||||
Disk: 1.7TB total | 555GB used (34%) | 1.1TB free
|
||||
RAM: 62GB total | 14GB used | 47GB available
|
||||
Swap: 31GB total | 649MB used
|
||||
```
|
||||
@@ -247,10 +247,28 @@ async def run_soak(
|
||||
max_p95_ms: float,
|
||||
max_drop_rate: float,
|
||||
db_path: str = "",
|
||||
sender_count: int = 1,
|
||||
inter_message_ms: float = 0.0,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
sender_count > 1: rotate senders @soak-0000:..., @soak-0001:..., etc.
|
||||
This avoids sender_rpm rate-limiting when testing invoke latency.
|
||||
|
||||
inter_message_ms > 0: sleep between each inject (spread load over time).
|
||||
"""
|
||||
results: List[tuple] = []
|
||||
semaphore = asyncio.Semaphore(concurrency)
|
||||
|
||||
# Build sender pool
|
||||
server = sender.split(":", 1)[-1] if ":" in sender else "daarion.space"
|
||||
sender_pool = (
|
||||
[sender] if sender_count <= 1
|
||||
else [f"@soak-{i:04d}:{server}" for i in range(sender_count)]
|
||||
)
|
||||
if sender_count > 1:
|
||||
print(f"[soak] Sender rotation: {sender_count} senders "
|
||||
f"(@soak-0000:{server} … @soak-{sender_count-1:04d}:{server})")
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
# Pre-check: inject endpoint + health
|
||||
preflight_err = await _preflight_inject(client, url, room_id)
|
||||
@@ -273,15 +291,19 @@ async def run_soak(
|
||||
fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
|
||||
|
||||
print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
|
||||
print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")
|
||||
rl_note = f" (⚠️ rate_limited before={rl_before:.0f}, using {len(sender_pool)} sender(s))"
|
||||
print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...{rl_note}")
|
||||
|
||||
t_start = time.monotonic()
|
||||
|
||||
async def worker(i: int):
|
||||
async with semaphore:
|
||||
if inter_message_ms > 0:
|
||||
await asyncio.sleep(inter_message_ms / 1000.0)
|
||||
msg = f"soak-msg-{i:04d}"
|
||||
current_sender = sender_pool[i % len(sender_pool)]
|
||||
lat, status, err = await _send_one(
|
||||
client, url, agent_id, msg, room_id, sender
|
||||
client, url, agent_id, msg, room_id, current_sender
|
||||
)
|
||||
results.append((lat, status, err))
|
||||
if (i + 1) % max(1, n_messages // 10) == 0:
|
||||
@@ -382,6 +404,15 @@ async def run_soak(
|
||||
f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
|
||||
"— possible SQLite write pressure (Bottleneck #2)"
|
||||
)
|
||||
# Rate-limited warning (not a failure, but surfaced prominently)
|
||||
rl_delta = m.get("rate_limited", 0)
|
||||
if rl_delta > 0:
|
||||
rl_pct = rl_delta / s["total_messages"] * 100
|
||||
report["warnings"] = report.get("warnings", [])
|
||||
report["warnings"].append(
|
||||
f"rate_limited={rl_delta:.0f} ({rl_pct:.0f}% of messages) — "
|
||||
"use --sender-count >= RATE_LIMIT_SENDER_RPM for invoke baseline"
|
||||
)
|
||||
|
||||
report["passed"] = len(failures) == 0
|
||||
report["failures"] = failures
|
||||
@@ -419,6 +450,9 @@ def _print_report(r: Dict[str, Any]) -> None:
|
||||
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
|
||||
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
|
||||
print()
|
||||
if r.get("warnings"):
|
||||
for w in r["warnings"]:
|
||||
print(f" ⚠️ {w}")
|
||||
if r["failures"]:
|
||||
for f in r["failures"]:
|
||||
print(f" ❌ {f}")
|
||||
@@ -445,11 +479,18 @@ def main() -> int:
|
||||
help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
|
||||
parser.add_argument("--max-drop-rate",type=float, default=_DEFAULT_MAX_DROP_RATE,
|
||||
help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
|
||||
parser.add_argument("--report-file", default="",
|
||||
parser.add_argument("--report-file", default="",
|
||||
help="Optional path to write JSON report")
|
||||
parser.add_argument("--db-path", default="",
|
||||
parser.add_argument("--db-path", default="",
|
||||
help="Path to policy_store.db for WAL check "
|
||||
"(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
|
||||
parser.add_argument("--sender-count", type=int, default=1,
|
||||
help="Number of rotating senders (@soak-0001:server, ...). "
|
||||
"Use >= SENDER_RPM_LIMIT to avoid rate-limit during invoke baseline. "
|
||||
"Default: 1 (single sender, tests rate-limit behavior)")
|
||||
parser.add_argument("--inter-msg-ms", type=float, default=0.0,
|
||||
help="Sleep between each injected message (ms). "
|
||||
"Use to spread load over time (e.g. 100ms = ~10 rps). Default: 0")
|
||||
args = parser.parse_args()
|
||||
|
||||
report = asyncio.run(run_soak(
|
||||
@@ -462,6 +503,8 @@ def main() -> int:
|
||||
max_p95_ms=args.max_p95_ms,
|
||||
max_drop_rate=args.max_drop_rate,
|
||||
db_path=args.db_path,
|
||||
sender_count=args.sender_count,
|
||||
inter_message_ms=args.inter_msg_ms,
|
||||
))
|
||||
_print_report(report)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user