feat(production): sync all modified production files to git

Includes updates across gateway, router, node-worker, memory-service,
aurora-service, swapper, sofiia-console UI and node2 infrastructure:

- gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service
- services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval,
  offload_client, prompt_builder
- services/node-worker: worker.py, main.py, config.py, fabric_metrics
- services/memory-service: Dockerfile, database.py, main.py, requirements
- services/aurora-service: main.py (+399), kling.py, quality_report.py
- services/swapper-service: main.py, swapper_config_node2.yaml
- services/sofiia-console: static/index.html (console UI update)
- config: agent_registry, crewai_agents/teams, router_agents
- ops/fabric_preflight.sh: updated preflight checks
- router-config.yml, docker-compose.node2.yml: infra updates
- docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:13:29 -08:00
parent 9aac835882
commit e9dedffa48
35 changed files with 3317 additions and 805 deletions

View File

@@ -11,24 +11,44 @@ from models import JobRequest, JobResponse, JobError
from idempotency import IdempotencyStore
from providers import ollama, ollama_vision
from providers import stt_mlx_whisper, tts_mlx_kokoro
from providers import stt_memory_service, tts_memory_service
import fabric_metrics as fm
logger = logging.getLogger("node-worker")
# De-duplicates retried jobs (store keyed by job id — see IdempotencyStore).
_idem = IdempotencyStore()
# Generic concurrency gate for non-voice inference requests.
_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)
# Voice-dedicated semaphores — independent from generic MAX_CONCURRENCY.
# Prevents voice requests from starving generic inference and vice versa.
_voice_sem_tts: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_TTS)
_voice_sem_llm: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_LLM)
_voice_sem_stt: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_STT)
# Voice capability label → its dedicated semaphore.
_VOICE_SEMAPHORES = {
    "voice.tts": _voice_sem_tts,
    "voice.llm": _voice_sem_llm,
    "voice.stt": _voice_sem_stt,
}
# Live NATS connection; None until start() stores it.
_nats_client = None
# NOTE(review): not referenced in the visible code — presumably a gauge of
# generic jobs currently executing; confirm against the full file.
_inflight_count: int = 0
# Per-capability count of voice jobs currently executing
# (incremented/decremented by _handle_voice_request, exported via fm.set_voice_inflight).
_voice_inflight: Dict[str, int] = {"voice.tts": 0, "voice.llm": 0, "voice.stt": 0}
# NOTE(review): rolling latency sample buffers — usage not visible in this
# chunk; presumably capped at _LATENCY_BUFFER entries. Confirm in full file.
_latencies_llm: list = []
_latencies_vision: list = []
_LATENCY_BUFFER = 50
# Set of subjects that use the voice handler path
_VOICE_SUBJECTS: set = set()
async def start(nats_client):
global _nats_client
_nats_client = nats_client
nid = config.NODE_ID.lower()
# Generic subjects (unchanged — backward compatible)
subjects = [
f"node.{nid}.llm.request",
f"node.{nid}.vision.request",
@@ -41,6 +61,31 @@ async def start(nats_client):
await nats_client.subscribe(subj, cb=_handle_request)
logger.info(f"✅ Subscribed: {subj}")
# Voice HA subjects — separate semaphores, own metrics, own deadlines
# Only subscribe if the relevant provider is configured (preflight-first)
voice_subjects_to_caps = {
f"node.{nid}.voice.tts.request": ("tts", _voice_sem_tts, "voice.tts"),
f"node.{nid}.voice.llm.request": ("llm", _voice_sem_llm, "voice.llm"),
f"node.{nid}.voice.stt.request": ("stt", _voice_sem_stt, "voice.stt"),
}
for subj, (required_cap, sem, cap_key) in voice_subjects_to_caps.items():
if required_cap == "tts" and config.TTS_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: TTS_PROVIDER=none")
continue
if required_cap == "stt" and config.STT_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: STT_PROVIDER=none")
continue
# LLM always available on this node
_VOICE_SUBJECTS.add(subj)
async def _make_voice_handler(s=sem, k=cap_key):
async def _voice_handler(msg):
await _handle_voice_request(msg, voice_sem=s, cap_key=k)
return _voice_handler
await nats_client.subscribe(subj, cb=await _make_voice_handler())
logger.info(f"✅ Voice subscribed: {subj}")
async def _handle_request(msg):
t0 = time.time()
@@ -136,6 +181,103 @@ async def _handle_request(msg):
pass
async def _handle_voice_request(msg, voice_sem: asyncio.Semaphore, cap_key: str):
    """Voice-dedicated handler: separate semaphore, metrics, retry hints.

    Maps voice.{tts|llm|stt} to the same _execute() but with:
    - Own concurrency limit (VOICE_MAX_CONCURRENT_{TTS|LLM|STT})
    - TOO_BUSY includes retry_after_ms hint (client can retry immediately elsewhere)
    - Voice-specific Prometheus labels (type=voice.tts, etc.)
    - WARNING log on fallback (contract: no silent fallback)

    Args:
        msg: inbound NATS message whose payload is a JSON-encoded JobRequest.
        voice_sem: the capability-specific semaphore bounding concurrent jobs.
        cap_key: voice capability label ("voice.tts" | "voice.llm" | "voice.stt").
    """
    t0 = time.time()
    # Extract the base type for _execute (voice.tts → tts)
    base_type = cap_key.split(".")[-1]  # "tts", "llm", "stt"
    try:
        raw = msg.data
        if len(raw) > config.MAX_PAYLOAD_BYTES:
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="PAYLOAD_TOO_LARGE", message=f"max {config.MAX_PAYLOAD_BYTES} bytes"),
            ))
            return
        data = json.loads(raw)
        job = JobRequest(**data)
        job.trace_id = job.trace_id or job.job_id
        remaining = job.remaining_ms()
        if remaining <= 0:
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="timeout", error=JobError(code="DEADLINE_EXCEEDED"),
            ))
            return
        # Voice concurrency check — TOO_BUSY includes retry hint.
        # FIX: use the public asyncio.Semaphore.locked() API (True exactly when
        # no slot is free) instead of reading the private _value attribute.
        if voice_sem.locked():
            # FIX: %s instead of %d — the dict lookup falls back to the string
            # "?" for an unknown cap_key, which would make %d raise a logging
            # formatting error. In practice cap_key is always one of the three
            # known labels, so the rendered message is unchanged.
            logger.warning(
                "[voice.busy] cap=%s node=%s — all %s slots occupied. "
                "WARNING: request turned away, Router should failover.",
                cap_key, config.NODE_ID, {
                    "voice.tts": config.VOICE_MAX_CONCURRENT_TTS,
                    "voice.llm": config.VOICE_MAX_CONCURRENT_LLM,
                    "voice.stt": config.VOICE_MAX_CONCURRENT_STT,
                }.get(cap_key, "?"),
            )
            fm.inc_voice_job(cap_key, "busy")
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="busy",
                error=JobError(
                    code="TOO_BUSY",
                    message=f"voice {cap_key} at capacity",
                    details={"retry_after_ms": 500, "cap": cap_key},
                ),
            ))
            return
        # Track per-capability inflight for metrics. The dict is mutated in
        # place (never rebound), so no `global` declaration is needed.
        _voice_inflight[cap_key] = _voice_inflight.get(cap_key, 0) + 1
        fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        try:
            async with voice_sem:
                # Route to _execute with the base type
                job.required_type = base_type
                resp = await _execute(job, remaining)
        finally:
            # Decrement even if _execute raises; clamp at 0 defensively.
            _voice_inflight[cap_key] = max(0, _voice_inflight.get(cap_key, 1) - 1)
            fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        resp.latency_ms = int((time.time() - t0) * 1000)
        fm.inc_voice_job(cap_key, resp.status)
        if resp.status == "ok" and resp.latency_ms > 0:
            fm.observe_voice_latency(cap_key, resp.latency_ms)
        # Contract: log WARNING on any non-ok voice result
        if resp.status != "ok":
            logger.warning(
                "[voice.fallback] cap=%s node=%s status=%s error=%s trace=%s",
                cap_key, config.NODE_ID, resp.status,
                resp.error.code if resp.error else "?", job.trace_id,
            )
        await _reply(msg, resp)
    except Exception as e:
        logger.exception(f"Voice handler error cap={cap_key}: {e}")
        fm.inc_voice_job(cap_key, "error")
        try:
            # Best-effort error reply; job may not have parsed, so only the
            # node-level fields are guaranteed here.
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="INTERNAL", message=str(e)[:200]),
            ))
        except Exception:
            # Reply channel itself failed — nothing more we can do.
            pass
async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
payload = job.payload
hints = job.hints
@@ -184,9 +326,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="STT not configured on this node"),
)
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
if config.STT_PROVIDER == "memory_service":
result = await asyncio.wait_for(
stt_memory_service.transcribe(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
elif job.required_type == "tts":
if config.TTS_PROVIDER == "none":
return JobResponse(
@@ -194,9 +341,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="TTS not configured on this node"),
)
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
if config.TTS_PROVIDER == "memory_service":
result = await asyncio.wait_for(
tts_memory_service.synthesize(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
elif job.required_type == "ocr":
if config.OCR_PROVIDER == "none":
return JobResponse(