feat(production): sync all modified production files to git

Includes updates across gateway, router, node-worker, memory-service,
aurora-service, swapper, sofiia-console UI and node2 infrastructure:

- gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service
- services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval,
  offload_client, prompt_builder
- services/node-worker: worker.py, main.py, config.py, fabric_metrics
- services/memory-service: Dockerfile, database.py, main.py, requirements
- services/aurora-service: main.py (+399), kling.py, quality_report.py
- services/swapper-service: main.py, swapper_config_node2.yaml
- services/sofiia-console: static/index.html (console UI update)
- config: agent_registry, crewai_agents/teams, router_agents
- ops/fabric_preflight.sh: updated preflight checks
- router-config.yml, docker-compose.node2.yml: infra updates
- docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:13:29 -08:00
parent 9aac835882
commit e9dedffa48
35 changed files with 3317 additions and 805 deletions

View File

@@ -14,3 +14,19 @@ STT_PROVIDER = os.getenv("STT_PROVIDER", "none")
TTS_PROVIDER = os.getenv("TTS_PROVIDER", "none")
OCR_PROVIDER = os.getenv("OCR_PROVIDER", "vision_prompted")
IMAGE_PROVIDER = os.getenv("IMAGE_PROVIDER", "none")
# Memory Service URL (used by memory_service STT/TTS providers)
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
# ── Voice HA: dedicated concurrency limits (separate from generic stt/tts/llm) ──
# These control semaphores for node.{id}.voice.*.request subjects.
# Independent from MAX_CONCURRENCY so voice never starves generic inference.
VOICE_MAX_CONCURRENT_TTS = int(os.getenv("VOICE_MAX_CONCURRENT_TTS", "4"))
VOICE_MAX_CONCURRENT_LLM = int(os.getenv("VOICE_MAX_CONCURRENT_LLM", "2"))
VOICE_MAX_CONCURRENT_STT = int(os.getenv("VOICE_MAX_CONCURRENT_STT", "2"))
# Timeouts for voice subjects (milliseconds). Router uses these as defaults.
# NOTE(review): "fast" vs "quality" presumably map to two LLM routing tiers
# downstream — confirm against the router config before tuning.
VOICE_TTS_DEADLINE_MS = int(os.getenv("VOICE_TTS_DEADLINE_MS", "3000"))
VOICE_LLM_FAST_MS = int(os.getenv("VOICE_LLM_FAST_MS", "9000"))
VOICE_LLM_QUALITY_MS = int(os.getenv("VOICE_LLM_QUALITY_MS", "12000"))
VOICE_STT_DEADLINE_MS = int(os.getenv("VOICE_STT_DEADLINE_MS", "6000"))

View File

@@ -8,6 +8,7 @@ try:
PROM_AVAILABLE = True
REGISTRY = CollectorRegistry()
# Generic job metrics
jobs_total = Counter(
"node_worker_jobs_total", "Jobs processed",
["type", "status"], registry=REGISTRY,
@@ -23,6 +24,26 @@ try:
registry=REGISTRY,
)
# ── Voice HA metrics (separate labels from generic) ───────────────────────
# cap label: "voice.tts" | "voice.llm" | "voice.stt"
voice_jobs_total = Counter(
"node_worker_voice_jobs_total",
"Voice HA jobs processed (node.{id}.voice.*.request)",
["cap", "status"], registry=REGISTRY,
)
voice_inflight_gauge = Gauge(
"node_worker_voice_inflight",
"Voice HA inflight jobs per capability",
["cap"], registry=REGISTRY,
)
voice_latency_hist = Histogram(
"node_worker_voice_latency_ms",
"Voice HA job latency in ms",
["cap"],
buckets=[100, 250, 500, 1000, 1500, 2000, 3000, 5000, 9000, 12000],
registry=REGISTRY,
)
except ImportError:
PROM_AVAILABLE = False
REGISTRY = None
@@ -44,6 +65,21 @@ def observe_latency(req_type: str, model: str, latency_ms: int):
latency_hist.labels(type=req_type, model=model).observe(latency_ms)
def inc_voice_job(cap: str, status: str):
    """Increment the voice-job counter for *cap* labelled with *status*.

    Silently does nothing when prometheus_client could not be imported.
    """
    if not PROM_AVAILABLE:
        return
    voice_jobs_total.labels(cap=cap, status=status).inc()
def set_voice_inflight(cap: str, count: int):
    """Record *count* as the current inflight gauge value for *cap*.

    No-op when prometheus_client is unavailable.
    """
    if not PROM_AVAILABLE:
        return
    voice_inflight_gauge.labels(cap=cap).set(count)
def observe_voice_latency(cap: str, latency_ms: int):
    """Feed one latency sample (milliseconds) into the voice histogram for *cap*.

    No-op when prometheus_client is unavailable.
    """
    if not PROM_AVAILABLE:
        return
    voice_latency_hist.labels(cap=cap).observe(latency_ms)
def get_metrics_text():
    """Return the Prometheus exposition payload, or ``None`` if metrics are off.

    Requires both a successful prometheus_client import and a registry.
    """
    if not (PROM_AVAILABLE and REGISTRY):
        return None
    return generate_latest(REGISTRY)

View File

@@ -43,7 +43,30 @@ async def prom_metrics():
@app.get("/caps")
async def caps():
"""Capability flags for NCS to aggregate."""
"""Capability flags for NCS to aggregate.
Semantic vs operational separation (contract):
- capabilities.voice_* = semantic availability (provider configured).
True as long as the provider is configured, regardless of NATS state.
Routing decisions are based on this.
- runtime.nats_subscriptions.voice_* = operational (NATS sub active).
Used for health/telemetry only — NOT for routing.
This prevents false-negatives during reconnects / restart races.
"""
import worker as _w
nid = config.NODE_ID.lower()
# Semantic: provider configured → capability is available
voice_tts_cap = config.TTS_PROVIDER != "none"
voice_stt_cap = config.STT_PROVIDER != "none"
voice_llm_cap = True # LLM always available when node-worker is up
# Operational: actual NATS subscription state (health/telemetry only)
nats_voice_tts_active = f"node.{nid}.voice.tts.request" in _w._VOICE_SUBJECTS
nats_voice_stt_active = f"node.{nid}.voice.stt.request" in _w._VOICE_SUBJECTS
nats_voice_llm_active = f"node.{nid}.voice.llm.request" in _w._VOICE_SUBJECTS
return {
"node_id": config.NODE_ID,
"capabilities": {
@@ -53,6 +76,10 @@ async def caps():
"tts": config.TTS_PROVIDER != "none",
"ocr": config.OCR_PROVIDER != "none",
"image": config.IMAGE_PROVIDER != "none",
# Voice HA semantic capability flags (provider-based, not NATS-based)
"voice_tts": voice_tts_cap,
"voice_llm": voice_llm_cap,
"voice_stt": voice_stt_cap,
},
"providers": {
"stt": config.STT_PROVIDER,
@@ -65,6 +92,19 @@ async def caps():
"vision": config.DEFAULT_VISION,
},
"concurrency": config.MAX_CONCURRENCY,
"voice_concurrency": {
"voice_tts": config.VOICE_MAX_CONCURRENT_TTS,
"voice_llm": config.VOICE_MAX_CONCURRENT_LLM,
"voice_stt": config.VOICE_MAX_CONCURRENT_STT,
},
# Operational NATS subscription state — for health/monitoring only
"runtime": {
"nats_subscriptions": {
"voice_tts_active": nats_voice_tts_active,
"voice_stt_active": nats_voice_stt_active,
"voice_llm_active": nats_voice_llm_active,
}
},
}

View File

@@ -11,24 +11,44 @@ from models import JobRequest, JobResponse, JobError
from idempotency import IdempotencyStore
from providers import ollama, ollama_vision
from providers import stt_mlx_whisper, tts_mlx_kokoro
from providers import stt_memory_service, tts_memory_service
import fabric_metrics as fm
# Module-level worker state: logger, idempotency store, concurrency gates.
logger = logging.getLogger("node-worker")
# De-duplicates replayed job_ids (implementation lives in idempotency.py).
_idem = IdempotencyStore()
# Generic inference gate — sized by MAX_CONCURRENCY.
_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)
# Voice-dedicated semaphores — independent from generic MAX_CONCURRENCY.
# Prevents voice requests from starving generic inference and vice versa.
_voice_sem_tts: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_TTS)
_voice_sem_llm: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_LLM)
_voice_sem_stt: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_STT)
# Lookup from capability key ("voice.tts" | "voice.llm" | "voice.stt") to its gate.
_VOICE_SEMAPHORES = {
"voice.tts": _voice_sem_tts,
"voice.llm": _voice_sem_llm,
"voice.stt": _voice_sem_stt,
}
# Set by start(); used by _reply() and friends to publish responses.
_nats_client = None
# Generic inflight counter (voice traffic is tracked separately below).
_inflight_count: int = 0
# Per-capability inflight counts, mirrored into the Prometheus gauge.
_voice_inflight: Dict[str, int] = {"voice.tts": 0, "voice.llm": 0, "voice.stt": 0}
# Rolling latency samples for health reporting; capped at _LATENCY_BUFFER entries.
_latencies_llm: list = []
_latencies_vision: list = []
_LATENCY_BUFFER = 50
# Set of subjects that use the voice handler path
_VOICE_SUBJECTS: set = set()
async def start(nats_client):
global _nats_client
_nats_client = nats_client
nid = config.NODE_ID.lower()
# Generic subjects (unchanged — backward compatible)
subjects = [
f"node.{nid}.llm.request",
f"node.{nid}.vision.request",
@@ -41,6 +61,31 @@ async def start(nats_client):
await nats_client.subscribe(subj, cb=_handle_request)
logger.info(f"✅ Subscribed: {subj}")
# Voice HA subjects — separate semaphores, own metrics, own deadlines
# Only subscribe if the relevant provider is configured (preflight-first)
voice_subjects_to_caps = {
f"node.{nid}.voice.tts.request": ("tts", _voice_sem_tts, "voice.tts"),
f"node.{nid}.voice.llm.request": ("llm", _voice_sem_llm, "voice.llm"),
f"node.{nid}.voice.stt.request": ("stt", _voice_sem_stt, "voice.stt"),
}
for subj, (required_cap, sem, cap_key) in voice_subjects_to_caps.items():
if required_cap == "tts" and config.TTS_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: TTS_PROVIDER=none")
continue
if required_cap == "stt" and config.STT_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: STT_PROVIDER=none")
continue
# LLM always available on this node
_VOICE_SUBJECTS.add(subj)
async def _make_voice_handler(s=sem, k=cap_key):
async def _voice_handler(msg):
await _handle_voice_request(msg, voice_sem=s, cap_key=k)
return _voice_handler
await nats_client.subscribe(subj, cb=await _make_voice_handler())
logger.info(f"✅ Voice subscribed: {subj}")
async def _handle_request(msg):
t0 = time.time()
@@ -136,6 +181,103 @@ async def _handle_request(msg):
pass
async def _handle_voice_request(msg, voice_sem: asyncio.Semaphore, cap_key: str):
    """Voice-dedicated NATS handler: separate semaphore, metrics, retry hints.

    Maps voice.{tts|llm|stt} onto the same _execute() path but with:
      - Its own concurrency limit (VOICE_MAX_CONCURRENT_{TTS|LLM|STT})
      - TOO_BUSY replies carrying a retry_after_ms hint so the client can
        retry immediately on another node
      - Voice-specific Prometheus labels (cap="voice.tts", etc.)
      - A WARNING log on every non-ok result (contract: no silent fallback)

    Args:
        msg: inbound NATS message; ``msg.data`` is a JSON-encoded JobRequest.
        voice_sem: the capability-specific semaphore gating concurrency.
        cap_key: one of ``"voice.tts" | "voice.llm" | "voice.stt"``.
    """
    t0 = time.time()
    # _execute() dispatches on the base type: "voice.tts" -> "tts", etc.
    base_type = cap_key.split(".")[-1]  # "tts", "llm", "stt"
    try:
        raw = msg.data
        if len(raw) > config.MAX_PAYLOAD_BYTES:
            # Job not parsed yet, so job_id/trace_id are unavailable here.
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="PAYLOAD_TOO_LARGE", message=f"max {config.MAX_PAYLOAD_BYTES} bytes"),
            ))
            return
        data = json.loads(raw)
        job = JobRequest(**data)
        job.trace_id = job.trace_id or job.job_id
        remaining = job.remaining_ms()
        if remaining <= 0:
            # Deadline already blown before we even started — reply, don't work.
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="busy" if False else "timeout",  # explicit: timeout, not busy
                error=JobError(code="DEADLINE_EXCEEDED"),
            ))
            return
        # Fast-fail when every voice slot is taken so the Router can fail over
        # instead of this request queueing behind the semaphore.
        # FIX: use the public Semaphore.locked() API instead of peeking at the
        # private _value attribute, which is an implementation detail.
        if voice_sem.locked():
            limit = {
                "voice.tts": config.VOICE_MAX_CONCURRENT_TTS,
                "voice.llm": config.VOICE_MAX_CONCURRENT_LLM,
                "voice.stt": config.VOICE_MAX_CONCURRENT_STT,
            }.get(cap_key, "?")
            logger.warning(
                "[voice.busy] cap=%s node=%s — all %d slots occupied. "
                "WARNING: request turned away, Router should failover.",
                cap_key, config.NODE_ID, limit,
            )
            fm.inc_voice_job(cap_key, "busy")
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="busy",
                error=JobError(
                    code="TOO_BUSY",
                    message=f"voice {cap_key} at capacity",
                    details={"retry_after_ms": 500, "cap": cap_key},
                ),
            ))
            return
        # Inflight bookkeeping around execution. (No `global` needed: we only
        # mutate the dict, never rebind the module name.)
        _voice_inflight[cap_key] = _voice_inflight.get(cap_key, 0) + 1
        fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        try:
            async with voice_sem:
                # Route to _execute with the base type
                job.required_type = base_type
                resp = await _execute(job, remaining)
        finally:
            # Never let the gauge go negative, even on unexpected errors.
            _voice_inflight[cap_key] = max(0, _voice_inflight.get(cap_key, 1) - 1)
            fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        resp.latency_ms = int((time.time() - t0) * 1000)
        fm.inc_voice_job(cap_key, resp.status)
        if resp.status == "ok" and resp.latency_ms > 0:
            fm.observe_voice_latency(cap_key, resp.latency_ms)
        # Contract: log WARNING on any non-ok voice result (no silent fallback).
        if resp.status != "ok":
            logger.warning(
                "[voice.fallback] cap=%s node=%s status=%s error=%s trace=%s",
                cap_key, config.NODE_ID, resp.status,
                resp.error.code if resp.error else "?", job.trace_id,
            )
        await _reply(msg, resp)
    except Exception as e:
        # Catch-all boundary: count, log with traceback, best-effort error reply.
        logger.exception(f"Voice handler error cap={cap_key}: {e}")
        fm.inc_voice_job(cap_key, "error")
        try:
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="INTERNAL", message=str(e)[:200]),
            ))
        except Exception:
            # Reply channel itself failed; nothing more we can do.
            pass
async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
payload = job.payload
hints = job.hints
@@ -184,9 +326,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="STT not configured on this node"),
)
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
if config.STT_PROVIDER == "memory_service":
result = await asyncio.wait_for(
stt_memory_service.transcribe(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
elif job.required_type == "tts":
if config.TTS_PROVIDER == "none":
return JobResponse(
@@ -194,9 +341,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="TTS not configured on this node"),
)
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
if config.TTS_PROVIDER == "memory_service":
result = await asyncio.wait_for(
tts_memory_service.synthesize(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
elif job.required_type == "ocr":
if config.OCR_PROVIDER == "none":
return JobResponse(