feat(soak): add --sender-count rotation + --inter-msg-ms; add NODA1 runtime snapshot

Made-with: Cursor
This commit is contained in:
Apple
2026-03-05 08:06:31 -08:00
parent e1d73ebc98
commit e12c99903d
2 changed files with 264 additions and 4 deletions

View File

@@ -0,0 +1,217 @@
# НОДА1 — Runtime Snapshot (2026-03-05)
**Server:** 144.76.224.179 (node1-daarion)
**Captured:** 2026-03-05 ~16:55 UTC+1
**Branch on server:** `codex/sync-node1-runtime` @ `e1d73eb`
**Hardware:** Hetzner GEX44 · CPU: 32 cores · RAM: 64GB · Disk: 1.7TB NVMe (555GB used, 34%)
---
## A) Зрізи контейнерів (що реально запущено)
| Container | Image | Status | External Ports |
|-----------|-------|--------|----------------|
| `matrix-bridge-dagi-node1` | microdao-daarion-matrix-bridge-dagi | Up (healthy) | `127.0.0.1:7030` |
| `dagi-gateway-node1` | microdao-daarion-gateway | Up (healthy) | `0.0.0.0:9300` |
| `dagi-router-node1` | microdao-daarion-router | Up (healthy) | `0.0.0.0:9102→8000` |
| `dagi-memory-service-node1` | microdao-daarion-memory-service | Up (healthy) | `0.0.0.0:8000` |
| `dagi-sofiia-console-node1` | microdao-daarion-dagi-sofiia-console-node1 | Up (healthy) | `0.0.0.0:8002` |
| `dagi-experience-learner-node1` | microdao-daarion-experience-learner | Up (healthy) | `127.0.0.1:9109` |
| `dagi-gateway-worker-node1` | microdao-daarion-gateway-worker | Up (healthy) | `9300/tcp` (internal) |
| `dagi-gateway-reminder-worker-node1` | microdao-daarion-gateway-reminder-worker | Up (healthy) | internal |
| `dagi-staging-router` | dagi-staging-router | Up (healthy) | `8000/tcp` (internal) |
| `dagi-staging-crewai-service` | dagi-staging-crewai-service | Up | — |
| `dagi-synapse-node1` | matrixdotorg/synapse:latest | Up 2 days (healthy) | `127.0.0.1:8008` |
| `dagi-synapse-db-node1` | postgres:15-alpine | Up 2 days (healthy) | `5432/tcp` (internal) |
| `dagi-market-data-node1` | microdao-daarion-market-data-service | Up (healthy) | `0.0.0.0:8893→8891` |
| `dagi-senpai-md-consumer-node1` | microdao-daarion-senpai-md-consumer | Up (healthy) | `0.0.0.0:8892` |
| `dagi-binance-bot-monitor-node1` | microdao-daarion-binance-bot-monitor | Up | — |
| `clan-consent-adapter` | microdao-daarion-clan-consent-adapter | Up (healthy) | `0.0.0.0:8111` |
| `7e80f50f30e2_clan-visibility-guard` | microdao-daarion-clan-visibility-guard | Up (healthy) | `0.0.0.0:8112` |
| `agent-e2e-prober-node1` | microdao-daarion-agent-e2e-prober | Up | `0.0.0.0:9108` |
| `dagi-nats-node1` | nats:2.11-alpine | Up (healthy) | `4222, 7422 (leafnode), 8222 (mon)` |
| `dagi-redis-node1` | redis:8-alpine | Up (healthy) | `0.0.0.0:6379` |
| `dagi-qdrant-node1` | qdrant/qdrant:v1.13.6 | Up (healthy) | `0.0.0.0:6333-6334` |
| `dagi-neo4j-node1` | neo4j:5.26-community | Up (healthy) | `0.0.0.0:7474, 0.0.0.0:7687` |
| `dagi-postgres` | pgvector/pgvector:pg16 | Up | `0.0.0.0:5432` |
| `dagi-farmos-db-node1` | postgres:16-alpine | Up (healthy) | `5432/tcp` (internal) |
| `dagi-farmos-node1` | farmos/farmos:4.x-amd64 | Up (healthy) | `127.0.0.1:8088→80` |
| `dagi-minio-node1` | minio/minio:latest | Up | `0.0.0.0:9000-9001` |
| `dagi-vision-encoder-node1` | microdao-daarion-vision-encoder | Up (healthy) | `0.0.0.0:8001` |
| `swapper-service-node1` | microdao-daarion-swapper-service | Up (healthy) | `0.0.0.0:8890-8891` |
| `plant-vision-node1` | microdao-daarion-plant-vision-node1 | Up (healthy) | `8085/tcp` (internal) |
| `dagi-crawl4ai-node1` | unclecode/crawl4ai:latest | Up (healthy) | `0.0.0.0:11235` |
| `ollama` | ollama/ollama:latest | Up 5 days | — (host port 11434) |
| `rag-service-node1` | microdao-daarion-rag-service | Up (healthy) | `0.0.0.0:9500` |
| `artifact-registry-node1` | microdao-daarion-artifact-registry | Up (healthy) | `0.0.0.0:9220` |
| `ingest-service` | microdao-daarion-ingest-service | Up | `0.0.0.0:8100` |
| `parser-pipeline` | parser-pipeline:latest | Up | `0.0.0.0:8101` |
| `index-doc-worker-node1` | microdao-daarion-index-doc-worker | Up | — |
| `render-pptx-worker-node1` | microdao-daarion-render-pptx-worker | Up | — |
| `render-pdf-worker-node1` | microdao-daarion-render-pdf-worker | Up | — |
| `brand-registry-node1` | microdao-daarion-brand-registry | Up (healthy) | `0.0.0.0:9210` |
| `brand-intake-node1` | microdao-daarion-brand-intake | Up (healthy) | `0.0.0.0:9211` |
| `presentation-renderer-node1` | microdao-daarion-presentation-renderer | Up (healthy) | `0.0.0.0:9212` |
| `crewai-nats-worker` | microdao-daarion-crewai-worker | Up | `0.0.0.0:9011` |
| `node-capabilities-node1` | microdao-daarion-node-capabilities | Up | `127.0.0.1:8099` |
| `node-worker-node1` | microdao-daarion-node-worker | Up | `127.0.0.1:8109` |
| `postgres-backup-node1` | prodrigestivill/postgres-backup-local:16 | Up (healthy) | — |
| `clan-consent-outbox-worker` | microdao-daarion-clan-consent-outbox-worker | Up | — |
| `dagi-metrics-poller-node1` | microdao-daarion-metrics-poller-node1 | Up (healthy) | — |
| `oneok-espocrm-node1` | espocrm/espocrm:latest | Up | `0.0.0.0:9080→80` |
| `oneok-espocrm-db-node1` | mariadb:11 | Up | `3306/tcp` (internal) |
| `oneok-gotenberg-node1` | gotenberg/gotenberg:8 | Up | `0.0.0.0:3010` |
| `oneok-crm-adapter-node1` | microdao-daarion-oneok-crm-adapter | Up | `8088/tcp` (internal) |
| `oneok-docs-adapter-node1` | microdao-daarion-oneok-docs-adapter | Up | `8090/tcp` (internal) |
| `oneok-calc-adapter-node1` | microdao-daarion-oneok-calc-adapter | Up | `8089/tcp` (internal) |
| `oneok-schedule-adapter-node1` | microdao-daarion-oneok-schedule-adapter | Up | `8091/tcp` (internal) |
| `prometheus` | prom/prometheus:latest | Up | `0.0.0.0:9090` |
| `grafana` | grafana/grafana:latest | Up | `127.0.0.1:3030→3000` |
| `control-plane` | control-plane:latest | Up | `9200/tcp` (internal) |
| `dagi-nats-js-init-node1` | — | init/done | — |
**Total containers running: 57**
---
## B) Мережа — реально слухає (host)
| Port | Service | Access |
|------|---------|--------|
| `22` | sshd | public |
| `80/443` | nginx (reverse proxy) | public |
| `3010` | gotenberg (PDF) | public |
| `4222` | NATS clients | public |
| `5432` | PostgreSQL (pgvector) | public |
| `6333/6334` | Qdrant HTTP/gRPC | public |
| `6379` | Redis | public |
| `7422` | NATS leafnode hub | public |
| `7474/7687` | Neo4j HTTP/Bolt | public |
| `8000` | Memory Service | public |
| `8001` | Vision Encoder | public |
| `8002` | Sofiia Console | public |
| `8100` | Ingest Service | public |
| `8101` | Parser Pipeline | public |
| `8111/8112` | Clan adapters | public |
| `8222` | NATS monitoring | public |
| `8890/8891` | Swapper (LLM/metrics) | public |
| `8892` | SENPAI MD Consumer | public |
| `8893` | Market Data Service | public |
| `9000/9001` | MinIO S3 | public |
| `9011` | CrewAI NATS worker | public |
| `9080` | EspoCRM | public |
| `9090` | Prometheus | public |
| `9102` | Router (→8000 internal) | public |
| `9108` | E2E Prober | public |
| `9210/9211/9212` | Brand services | public |
| `9220` | Artifact Registry | public |
| `9300` | Gateway (Telegram) | public |
| `9500` | RAG Service | public |
| `11235` | Crawl4AI | public |
| `127.0.0.1:3030` | Grafana | localhost only |
| `127.0.0.1:6444` | k3s API server | localhost only |
| `127.0.0.1:7030` | matrix-bridge-dagi | localhost only |
| `127.0.0.1:8008` | Synapse Matrix HS | localhost only |
| `127.0.0.1:8088` | FarmOS | localhost only |
| `127.0.0.1:8099` | Node Capabilities | localhost only |
| `127.0.0.1:8109` | Node Worker | localhost only |
| `127.0.0.1:9109` | Experience Learner | localhost only |
| `*:6443` | k3s server (Kubernetes) | public |
> **Note:** k3s (Kubernetes) is running alongside Docker — not reflected in compose files.
---
## C) Health (як бачить ops)
### Gateway `:9300`
```
status: healthy | agents: 16
agents: daarwizz, helion, greenfood, agromatrix, alateya, nutra, druid,
clan, eonarch, senpai, oneok, soul, yaromir, sofiia, monitor, aistalk
```
### Router `:9102`
```
status: ok
```
### Matrix Bridge `:7030`
```
ok: true | uptime: ~5 min after last restart
rate_limiter: room_rpm=20 sender_rpm=10
queue: size=6 max=100 workers=2
persistent_dedupe: enabled=true db=/app/data/matrix_bridge.db
policy_store: ok=true overrides=0
sticky_cache: ttl=300s active=0
confirm_store: ttl=120s pending=0
```
### Sofiia Console `:8002`
```
/api/health → status: false (internal issue, not affecting other services)
```
---
## D) Ollama моделі (контейнер `ollama`, host-порт 11434)
| Model | Size | Last Modified |
|-------|------|---------------|
| `smollm2:135m` | 270 MB | 14 hours ago |
| `qwen3.5:0.8b` | 1.0 GB | 2 days ago |
| `qwen3.5:2b` | 2.7 GB | 2 days ago |
| `qwen3.5:9b` | 6.6 GB | 2 days ago |
| `qwen3:8b` | 5.2 GB | 5 days ago |
| `qwen3-vl:8b` | 6.1 GB | 6 days ago |
| `qwen3.5:27b-q4_K_M` | 17 GB | 6 days ago |
| `deepseek-v3.1:671b-cloud` | — (cloud) | 4 weeks ago |
**Active for Bridge/Router:** `qwen3.5:27b-q4_K_M` (default), `qwen3-vl:8b` (vision)
---
## E) Matrix Bridge runtime config (актуальні env)
```
WORKER_CONCURRENCY=2
QUEUE_MAX_EVENTS=100
RATE_LIMIT_ROOM_RPM=20
RATE_LIMIT_SENDER_RPM=10
PERSISTENT_DEDUPE=1
QUEUE_DRAIN_TIMEOUT_S=5
BRIDGE_ROOM_MAP=sofiia:!QwHczWXgefDHBEVkTH:daarion.space
BRIDGE_MIXED_ROOM_MAP= (empty)
BRIDGE_CONTROL_ROOMS= (empty)
CONTROL_ROOM_RPM=60
CONTROL_OPERATOR_RPM=30
DISCOVERY_RPM=20
```
---
## F) Розбіжності «архітектурна карта vs реальність»
| У карті | Реальність |
|---------|-----------|
| Matrix bridge порт `:9400` | Реально: `127.0.0.1:7030` (за nginx на matrix.daarion.space) |
| Sofiia console порт `:8002` | Є, але `/api/health` повертає `false` |
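| `dagi-postgres` порт `:5432` | Той самий внутрішній порт `5432` використовують також `dagi-synapse-db-node1` і `dagi-farmos-db-node1`; усі три піднято, але на хост (`0.0.0.0:5432`) змаплено лише `dagi-postgres`, тож реального конфлікту портів немає |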
| `plant-vision` порт `:8085` | Internal only (не проксується назовні) |
| Grafana — не було в карті | Запущена на `127.0.0.1:3030` |
| Prometheus — не було в карті | Запущений на `0.0.0.0:9090` |
| k3s — не було в карті | Kubernetes кластер активний поряд з Docker (`*:6443`) |
| `dagi-staging-router` — не було в карті | Додатковий staging router (internal) |
| `control-plane` — не було в карті | Окремий control-plane контейнер (internal `:9200`) |
| `crewai-nats-worker` — не було в карті | NATS-based CrewAI worker (`:9011`) |
| `dagi-staging-crewai-service` — не було в карті | Staging CrewAI service |
---
## G) Ресурси
```
Disk: 1.7TB total | 555GB used (34%) | 1.1TB free
RAM: 62GB total | 14GB used | 47GB available
Swap: 31GB total | 649MB used
```

View File

@@ -247,10 +247,28 @@ async def run_soak(
max_p95_ms: float,
max_drop_rate: float,
db_path: str = "",
sender_count: int = 1,
inter_message_ms: float = 0.0,
) -> Dict[str, Any]:
"""
sender_count > 1: rotate senders @soak-0000:..., @soak-0001:..., etc. (0-based).
This avoids sender_rpm rate-limiting when testing invoke latency.
inter_message_ms > 0: sleep between each inject (spread load over time).
"""
results: List[tuple] = []
semaphore = asyncio.Semaphore(concurrency)
# Build sender pool
server = sender.split(":", 1)[-1] if ":" in sender else "daarion.space"
sender_pool = (
[sender] if sender_count <= 1
else [f"@soak-{i:04d}:{server}" for i in range(sender_count)]
)
if sender_count > 1:
print(f"[soak] Sender rotation: {sender_count} senders "
f"(@soak-0000:{server} … @soak-{sender_count-1:04d}:{server})")
async with httpx.AsyncClient() as client:
# Pre-check: inject endpoint + health
preflight_err = await _preflight_inject(client, url, room_id)
@@ -273,15 +291,19 @@ async def run_soak(
fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total")
print(f"[soak] Bridge health before: {health_before.get('ok', '?')}")
print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...")
rl_note = f" (⚠️ rate_limited before={rl_before:.0f}, using {len(sender_pool)} sender(s))"
print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...{rl_note}")
t_start = time.monotonic()
async def worker(i: int):
async with semaphore:
if inter_message_ms > 0:
await asyncio.sleep(inter_message_ms / 1000.0)
msg = f"soak-msg-{i:04d}"
current_sender = sender_pool[i % len(sender_pool)]
lat, status, err = await _send_one(
client, url, agent_id, msg, room_id, sender
client, url, agent_id, msg, room_id, current_sender
)
results.append((lat, status, err))
if (i + 1) % max(1, n_messages // 10) == 0:
@@ -382,6 +404,15 @@ async def run_soak(
f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) "
"— possible SQLite write pressure (Bottleneck #2)"
)
# Rate-limited warning (not a failure, but surfaced prominently)
rl_delta = m.get("rate_limited", 0)
if rl_delta > 0:
rl_pct = rl_delta / s["total_messages"] * 100
report["warnings"] = report.get("warnings", [])
report["warnings"].append(
f"rate_limited={rl_delta:.0f} ({rl_pct:.0f}% of messages) — "
"use --sender-count >= RATE_LIMIT_SENDER_RPM for invoke baseline"
)
report["passed"] = len(failures) == 0
report["failures"] = failures
@@ -419,6 +450,9 @@ def _print_report(r: Dict[str, Any]) -> None:
wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else ""
print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}")
print()
if r.get("warnings"):
for w in r["warnings"]:
print(f" ⚠️ {w}")
if r["failures"]:
for f in r["failures"]:
print(f"{f}")
@@ -445,11 +479,18 @@ def main() -> int:
help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})")
parser.add_argument("--max-drop-rate",type=float, default=_DEFAULT_MAX_DROP_RATE,
help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})")
parser.add_argument("--report-file", default="",
parser.add_argument("--report-file", default="",
help="Optional path to write JSON report")
parser.add_argument("--db-path", default="",
parser.add_argument("--db-path", default="",
help="Path to policy_store.db for WAL check "
"(e.g. /opt/microdao-daarion/data/matrix_bridge.db)")
parser.add_argument("--sender-count", type=int, default=1,
help="Number of rotating senders (@soak-0001:server, ...). "
"Use >= SENDER_RPM_LIMIT to avoid rate-limit during invoke baseline. "
"Default: 1 (single sender, tests rate-limit behavior)")
parser.add_argument("--inter-msg-ms", type=float, default=0.0,
help="Sleep between each injected message (ms). "
"Use to spread load over time (e.g. 100ms = ~10 rps). Default: 0")
args = parser.parse_args()
report = asyncio.run(run_soak(
@@ -462,6 +503,8 @@ def main() -> int:
max_p95_ms=args.max_p95_ms,
max_drop_rate=args.max_drop_rate,
db_path=args.db_path,
sender_count=args.sender_count,
inter_message_ms=args.inter_msg_ms,
))
_print_report(report)