From e12c99903da58269fdb4989c9fc5c856a958daa5 Mon Sep 17 00:00:00 2001 From: Apple Date: Thu, 5 Mar 2026 08:06:31 -0800 Subject: [PATCH] feat(soak): add --sender-count rotation + --inter-msg-ms; add NODA1 runtime snapshot Made-with: Cursor --- docs/ops/node1-runtime-snapshot-20260305.md | 217 ++++++++++++++++++++ ops/scripts/matrix_bridge_soak.py | 51 ++++- 2 files changed, 264 insertions(+), 4 deletions(-) create mode 100644 docs/ops/node1-runtime-snapshot-20260305.md diff --git a/docs/ops/node1-runtime-snapshot-20260305.md b/docs/ops/node1-runtime-snapshot-20260305.md new file mode 100644 index 00000000..0df0dd93 --- /dev/null +++ b/docs/ops/node1-runtime-snapshot-20260305.md @@ -0,0 +1,217 @@ +# НОДА1 — Runtime Snapshot (2026-03-05) + +**Server:** 144.76.224.179 (node1-daarion) +**Captured:** 2026-03-05 ~16:55 UTC+1 +**Branch on server:** `codex/sync-node1-runtime` @ `e1d73eb` +**Hardware:** Hetzner GEX44 · CPU: 32 cores · RAM: 64GB · Disk: 1.7TB NVMe (555GB used, 34%) + +--- + +## A) Зрізи контейнерів (що реально запущено) + +| Container | Image | Status | External Ports | +|-----------|-------|--------|----------------| +| `matrix-bridge-dagi-node1` | microdao-daarion-matrix-bridge-dagi | Up (healthy) | `127.0.0.1:7030` | +| `dagi-gateway-node1` | microdao-daarion-gateway | Up (healthy) | `0.0.0.0:9300` | +| `dagi-router-node1` | microdao-daarion-router | Up (healthy) | `0.0.0.0:9102→8000` | +| `dagi-memory-service-node1` | microdao-daarion-memory-service | Up (healthy) | `0.0.0.0:8000` | +| `dagi-sofiia-console-node1` | microdao-daarion-dagi-sofiia-console-node1 | Up (healthy) | `0.0.0.0:8002` | +| `dagi-experience-learner-node1` | microdao-daarion-experience-learner | Up (healthy) | `127.0.0.1:9109` | +| `dagi-gateway-worker-node1` | microdao-daarion-gateway-worker | Up (healthy) | `9300/tcp` (internal) | +| `dagi-gateway-reminder-worker-node1` | microdao-daarion-gateway-reminder-worker | Up (healthy) | internal | +| `dagi-staging-router` | 
dagi-staging-router | Up (healthy) | `8000/tcp` (internal) | +| `dagi-staging-crewai-service` | dagi-staging-crewai-service | Up | — | +| `dagi-synapse-node1` | matrixdotorg/synapse:latest | Up 2 days (healthy) | `127.0.0.1:8008` | +| `dagi-synapse-db-node1` | postgres:15-alpine | Up 2 days (healthy) | `5432/tcp` (internal) | +| `dagi-market-data-node1` | microdao-daarion-market-data-service | Up (healthy) | `0.0.0.0:8893→8891` | +| `dagi-senpai-md-consumer-node1` | microdao-daarion-senpai-md-consumer | Up (healthy) | `0.0.0.0:8892` | +| `dagi-binance-bot-monitor-node1` | microdao-daarion-binance-bot-monitor | Up | — | +| `clan-consent-adapter` | microdao-daarion-clan-consent-adapter | Up (healthy) | `0.0.0.0:8111` | +| `7e80f50f30e2_clan-visibility-guard` | microdao-daarion-clan-visibility-guard | Up (healthy) | `0.0.0.0:8112` | +| `agent-e2e-prober-node1` | microdao-daarion-agent-e2e-prober | Up | `0.0.0.0:9108` | +| `dagi-nats-node1` | nats:2.11-alpine | Up (healthy) | `4222, 7422 (leafnode), 8222 (mon)` | +| `dagi-redis-node1` | redis:8-alpine | Up (healthy) | `0.0.0.0:6379` | +| `dagi-qdrant-node1` | qdrant/qdrant:v1.13.6 | Up (healthy) | `0.0.0.0:6333-6334` | +| `dagi-neo4j-node1` | neo4j:5.26-community | Up (healthy) | `0.0.0.0:7474, 0.0.0.0:7687` | +| `dagi-postgres` | pgvector/pgvector:pg16 | Up | `0.0.0.0:5432` | +| `dagi-farmos-db-node1` | postgres:16-alpine | Up (healthy) | `5432/tcp` (internal) | +| `dagi-farmos-node1` | farmos/farmos:4.x-amd64 | Up (healthy) | `127.0.0.1:8088→80` | +| `dagi-minio-node1` | minio/minio:latest | Up | `0.0.0.0:9000-9001` | +| `dagi-vision-encoder-node1` | microdao-daarion-vision-encoder | Up (healthy) | `0.0.0.0:8001` | +| `swapper-service-node1` | microdao-daarion-swapper-service | Up (healthy) | `0.0.0.0:8890-8891` | +| `plant-vision-node1` | microdao-daarion-plant-vision-node1 | Up (healthy) | `8085/tcp` (internal) | +| `dagi-crawl4ai-node1` | unclecode/crawl4ai:latest | Up (healthy) | `0.0.0.0:11235` | +| `ollama` | 
ollama/ollama:latest | Up 5 days | — (host port 11434) | +| `rag-service-node1` | microdao-daarion-rag-service | Up (healthy) | `0.0.0.0:9500` | +| `artifact-registry-node1` | microdao-daarion-artifact-registry | Up (healthy) | `0.0.0.0:9220` | +| `ingest-service` | microdao-daarion-ingest-service | Up | `0.0.0.0:8100` | +| `parser-pipeline` | parser-pipeline:latest | Up | `0.0.0.0:8101` | +| `index-doc-worker-node1` | microdao-daarion-index-doc-worker | Up | — | +| `render-pptx-worker-node1` | microdao-daarion-render-pptx-worker | Up | — | +| `render-pdf-worker-node1` | microdao-daarion-render-pdf-worker | Up | — | +| `brand-registry-node1` | microdao-daarion-brand-registry | Up (healthy) | `0.0.0.0:9210` | +| `brand-intake-node1` | microdao-daarion-brand-intake | Up (healthy) | `0.0.0.0:9211` | +| `presentation-renderer-node1` | microdao-daarion-presentation-renderer | Up (healthy) | `0.0.0.0:9212` | +| `crewai-nats-worker` | microdao-daarion-crewai-worker | Up | `0.0.0.0:9011` | +| `node-capabilities-node1` | microdao-daarion-node-capabilities | Up | `127.0.0.1:8099` | +| `node-worker-node1` | microdao-daarion-node-worker | Up | `127.0.0.1:8109` | +| `postgres-backup-node1` | prodrigestivill/postgres-backup-local:16 | Up (healthy) | — | +| `clan-consent-outbox-worker` | microdao-daarion-clan-consent-outbox-worker | Up | — | +| `dagi-metrics-poller-node1` | microdao-daarion-metrics-poller-node1 | Up (healthy) | — | +| `oneok-espocrm-node1` | espocrm/espocrm:latest | Up | `0.0.0.0:9080→80` | +| `oneok-espocrm-db-node1` | mariadb:11 | Up | `3306/tcp` (internal) | +| `oneok-gotenberg-node1` | gotenberg/gotenberg:8 | Up | `0.0.0.0:3010` | +| `oneok-crm-adapter-node1` | microdao-daarion-oneok-crm-adapter | Up | `8088/tcp` (internal) | +| `oneok-docs-adapter-node1` | microdao-daarion-oneok-docs-adapter | Up | `8090/tcp` (internal) | +| `oneok-calc-adapter-node1` | microdao-daarion-oneok-calc-adapter | Up | `8089/tcp` (internal) | +| `oneok-schedule-adapter-node1` | 
microdao-daarion-oneok-schedule-adapter | Up | `8091/tcp` (internal) | +| `prometheus` | prom/prometheus:latest | Up | `0.0.0.0:9090` | +| `grafana` | grafana/grafana:latest | Up | `127.0.0.1:3030→3000` | +| `control-plane` | control-plane:latest | Up | `9200/tcp` (internal) | +| `dagi-nats-js-init-node1` | — | init/done | — | + +**Total containers running: 57** + +--- + +## B) Мережа — реально слухає (host) + +| Port | Service | Access | +|------|---------|--------| +| `22` | sshd | public | +| `80/443` | nginx (reverse proxy) | public | +| `3010` | gotenberg (PDF) | public | +| `4222` | NATS clients | public | +| `5432` | PostgreSQL (pgvector) | public | +| `6333/6334` | Qdrant HTTP/gRPC | public | +| `6379` | Redis | public | +| `7422` | NATS leafnode hub | public | +| `7474/7687` | Neo4j HTTP/Bolt | public | +| `8000` | Memory Service | public | +| `8001` | Vision Encoder | public | +| `8002` | Sofiia Console | public | +| `8100` | Ingest Service | public | +| `8101` | Parser Pipeline | public | +| `8111/8112` | Clan adapters | public | +| `8222` | NATS monitoring | public | +| `8890/8891` | Swapper (LLM/metrics) | public | +| `8892` | SENPAI MD Consumer | public | +| `8893` | Market Data Service | public | +| `9000/9001` | MinIO S3 | public | +| `9011` | CrewAI NATS worker | public | +| `9080` | EspoCRM | public | +| `9090` | Prometheus | public | +| `9102` | Router (→8000 internal) | public | +| `9108` | E2E Prober | public | +| `9210/9211/9212` | Brand services | public | +| `9220` | Artifact Registry | public | +| `9300` | Gateway (Telegram) | public | +| `9500` | RAG Service | public | +| `11235` | Crawl4AI | public | +| `127.0.0.1:3030` | Grafana | localhost only | +| `127.0.0.1:6444` | k3s API server | localhost only | +| `127.0.0.1:7030` | matrix-bridge-dagi | localhost only | +| `127.0.0.1:8008` | Synapse Matrix HS | localhost only | +| `127.0.0.1:8088` | FarmOS | localhost only | +| `127.0.0.1:8099` | Node Capabilities | localhost only | +| 
`127.0.0.1:8109` | Node Worker | localhost only | +| `127.0.0.1:9109` | Experience Learner | localhost only | +| `*:6443` | k3s server (Kubernetes) | public | + +> **Note:** k3s (Kubernetes) is running alongside Docker — not reflected in compose files. + +--- + +## C) Health (як бачить ops) + +### Gateway `:9300` +``` +status: healthy | agents: 16 +agents: daarwizz, helion, greenfood, agromatrix, alateya, nutra, druid, + clan, eonarch, senpai, oneok, soul, yaromir, sofiia, monitor, aistalk +``` + +### Router `:9102` +``` +status: ok +``` + +### Matrix Bridge `:7030` +``` +ok: true | uptime: ~5 min after last restart +rate_limiter: room_rpm=20 sender_rpm=10 +queue: size=6 max=100 workers=2 +persistent_dedupe: enabled=true db=/app/data/matrix_bridge.db +policy_store: ok=true overrides=0 +sticky_cache: ttl=300s active=0 +confirm_store: ttl=120s pending=0 +``` + +### Sofiia Console `:8002` +``` +/api/health → status: false (internal issue, not affecting other services) +``` + +--- + +## D) Ollama моделі (bare-metal, порт 11434) + +| Model | Size | Last Modified | +|-------|------|---------------| +| `smollm2:135m` | 270 MB | 14 hours ago | +| `qwen3.5:0.8b` | 1.0 GB | 2 days ago | +| `qwen3.5:2b` | 2.7 GB | 2 days ago | +| `qwen3.5:9b` | 6.6 GB | 2 days ago | +| `qwen3:8b` | 5.2 GB | 5 days ago | +| `qwen3-vl:8b` | 6.1 GB | 6 days ago | +| `qwen3.5:27b-q4_K_M` | 17 GB | 6 days ago | +| `deepseek-v3.1:671b-cloud` | — (cloud) | 4 weeks ago | + +**Active for Bridge/Router:** `qwen3.5:27b-q4_K_M` (default), `qwen3-vl:8b` (vision) + +--- + +## E) Matrix Bridge runtime config (актуальні env) + +``` +WORKER_CONCURRENCY=2 +QUEUE_MAX_EVENTS=100 +RATE_LIMIT_ROOM_RPM=20 +RATE_LIMIT_SENDER_RPM=10 +PERSISTENT_DEDUPE=1 +QUEUE_DRAIN_TIMEOUT_S=5 +BRIDGE_ROOM_MAP=sofiia:!QwHczWXgefDHBEVkTH:daarion.space +BRIDGE_MIXED_ROOM_MAP= (empty) +BRIDGE_CONTROL_ROOMS= (empty) +CONTROL_ROOM_RPM=60 +CONTROL_OPERATOR_RPM=30 +DISCOVERY_RPM=20 +``` + +--- + +## F) Розбіжності «архітектурна карта vs 
реальність» + +| У карті | Реальність | +|---------|-----------| +| Matrix bridge порт `:9400` | Реально: `127.0.0.1:7030` (за nginx на matrix.daarion.space) | +| Sofiia console порт `:8002` | Є, але `/api/health` повертає `false` | +| `dagi-postgres` порт `:5432` | Конфліктує з `dagi-synapse-db-node1:5432` — обидва піднято, зовнішній маппінг лише у `dagi-postgres` | +| `plant-vision` порт `:8085` | Internal only (не проксується назовні) | +| Grafana — не було в карті | Запущена на `127.0.0.1:3030` | +| Prometheus — не було в карті | Запущена на `0.0.0.0:9090` | +| k3s — не було в карті | Kubernetes кластер активний поряд з Docker (`*:6443`) | +| `dagi-staging-router` — не було в карті | Додатковий staging router (internal) | +| `control-plane` — не було в карті | Окремий control-plane контейнер (internal `:9200`) | +| `crewai-nats-worker` — не було в карті | NATS-based CrewAI worker (`:9011`) | +| `dagi-staging-crewai-service` — не було | Staging CrewAI service | + +--- + +## G) Ресурси + +``` +Disk: 1.7TB total | 555GB used (34%) | 1.1TB free +RAM: 62GB total | 14GB used | 47GB available +Swap: 31GB total | 649MB used +``` diff --git a/ops/scripts/matrix_bridge_soak.py b/ops/scripts/matrix_bridge_soak.py index 10ba73db..ec64d40f 100644 --- a/ops/scripts/matrix_bridge_soak.py +++ b/ops/scripts/matrix_bridge_soak.py @@ -247,10 +247,28 @@ async def run_soak( max_p95_ms: float, max_drop_rate: float, db_path: str = "", + sender_count: int = 1, + inter_message_ms: float = 0.0, ) -> Dict[str, Any]: + """ + sender_count > 1: rotate senders @soak-0000:..., @soak-0001:..., etc. + This avoids sender_rpm rate-limiting when testing invoke latency. + + inter_message_ms > 0: sleep between each inject (spread load over time). 
+ """ results: List[tuple] = [] semaphore = asyncio.Semaphore(concurrency) + # Build sender pool + server = sender.split(":", 1)[-1] if ":" in sender else "daarion.space" + sender_pool = ( + [sender] if sender_count <= 1 + else [f"@soak-{i:04d}:{server}" for i in range(sender_count)] + ) + if sender_count > 1: + print(f"[soak] Sender rotation: {sender_count} senders " + f"(@soak-0000:{server} … @soak-{sender_count-1:04d}:{server})") + async with httpx.AsyncClient() as client: # Pre-check: inject endpoint + health preflight_err = await _preflight_inject(client, url, room_id) @@ -273,15 +291,19 @@ async def run_soak( fo_before = _parse_counter(metrics_before, "matrix_bridge_failover_total") print(f"[soak] Bridge health before: {health_before.get('ok', '?')}") - print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...") + rl_note = f" (⚠️ rate_limited before={rl_before:.0f}, using {len(sender_pool)} sender(s))" + print(f"[soak] Starting {n_messages} messages (concurrency={concurrency}) ...{rl_note}") t_start = time.monotonic() async def worker(i: int): async with semaphore: + if inter_message_ms > 0: + await asyncio.sleep(inter_message_ms / 1000.0) msg = f"soak-msg-{i:04d}" + current_sender = sender_pool[i % len(sender_pool)] lat, status, err = await _send_one( - client, url, agent_id, msg, room_id, sender + client, url, agent_id, msg, room_id, current_sender ) results.append((lat, status, err)) if (i + 1) % max(1, n_messages // 10) == 0: @@ -382,6 +404,15 @@ async def run_soak( f"WAL grew {wal_delta:.1f}MB (threshold {report['wal']['threshold_mb']}MB) " "— possible SQLite write pressure (Bottleneck #2)" ) + # Rate-limited warning (not a failure, but surfaced prominently) + rl_delta = m.get("rate_limited", 0) + if rl_delta > 0: + rl_pct = rl_delta / s["total_messages"] * 100 + report["warnings"] = report.get("warnings", []) + report["warnings"].append( + f"rate_limited={rl_delta:.0f} ({rl_pct:.0f}% of messages) — " + "use --sender-count >= 
RATE_LIMIT_SENDER_RPM for invoke baseline" + ) report["passed"] = len(failures) == 0 report["failures"] = failures @@ -419,6 +450,9 @@ def _print_report(r: Dict[str, Any]) -> None: wal_warn = " ⚠️" if (w.get("delta_mb") or 0) > w.get("threshold_mb", 10) else "" print(f" WAL: {w['before_mb']}MB → {w['after_mb']}MB {wal_delta_str}{wal_warn}") print() + if r.get("warnings"): + for w in r["warnings"]: + print(f" ⚠️ {w}") if r["failures"]: for f in r["failures"]: print(f" ❌ {f}") @@ -445,11 +479,18 @@ def main() -> int: help=f"Max p95 latency ms (default: {_DEFAULT_MAX_P95_MS})") parser.add_argument("--max-drop-rate",type=float, default=_DEFAULT_MAX_DROP_RATE, help=f"Max queue drop rate 0..1 (default: {_DEFAULT_MAX_DROP_RATE})") - parser.add_argument("--report-file", default="", + parser.add_argument("--report-file", default="", help="Optional path to write JSON report") - parser.add_argument("--db-path", default="", + parser.add_argument("--db-path", default="", help="Path to policy_store.db for WAL check " "(e.g. /opt/microdao-daarion/data/matrix_bridge.db)") + parser.add_argument("--sender-count", type=int, default=1, + help="Number of rotating senders (@soak-0001:server, ...). " + "Use >= SENDER_RPM_LIMIT to avoid rate-limit during invoke baseline. " + "Default: 1 (single sender, tests rate-limit behavior)") + parser.add_argument("--inter-msg-ms", type=float, default=0.0, + help="Sleep between each injected message (ms). " + "Use to spread load over time (e.g. 100ms = ~10 rps). Default: 0") args = parser.parse_args() report = asyncio.run(run_soak( @@ -462,6 +503,8 @@ def main() -> int: max_p95_ms=args.max_p95_ms, max_drop_rate=args.max_drop_rate, db_path=args.db_path, + sender_count=args.sender_count, + inter_message_ms=args.inter_msg_ms, )) _print_report(report)