feat(production): sync all modified production files to git

Includes updates across gateway, router, node-worker, memory-service,
aurora-service, swapper, sofiia-console UI and node2 infrastructure:

- gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service
- services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval,
  offload_client, prompt_builder
- services/node-worker: worker.py, main.py, config.py, fabric_metrics
- services/memory-service: Dockerfile, database.py, main.py, requirements
- services/aurora-service: main.py (+399), kling.py, quality_report.py
- services/swapper-service: main.py, swapper_config_node2.yaml
- services/sofiia-console: static/index.html (console UI update)
- config: agent_registry, crewai_agents/teams, router_agents
- ops/fabric_preflight.sh: updated preflight checks
- router-config.yml, docker-compose.node2.yml: infra updates
- docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:13:29 -08:00
parent 9aac835882
commit e9dedffa48
35 changed files with 3317 additions and 805 deletions

View File

@@ -11,24 +11,44 @@ from models import JobRequest, JobResponse, JobError
from idempotency import IdempotencyStore
from providers import ollama, ollama_vision
from providers import stt_mlx_whisper, tts_mlx_kokoro
from providers import stt_memory_service, tts_memory_service
import fabric_metrics as fm
logger = logging.getLogger("node-worker")
# De-duplicates retried jobs (store keyed by job id — see IdempotencyStore).
_idem = IdempotencyStore()
# Generic concurrency gate for non-voice inference requests.
_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)
# Voice-dedicated semaphores — independent from generic MAX_CONCURRENCY.
# Prevents voice requests from starving generic inference and vice versa.
_voice_sem_tts: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_TTS)
_voice_sem_llm: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_LLM)
_voice_sem_stt: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_STT)
# Voice capability label → its dedicated semaphore.
_VOICE_SEMAPHORES = {
    "voice.tts": _voice_sem_tts,
    "voice.llm": _voice_sem_llm,
    "voice.stt": _voice_sem_stt,
}
# Live NATS connection; None until start() stores it.
_nats_client = None
# NOTE(review): not referenced in the visible code — presumably a gauge of
# generic jobs currently executing; confirm against the full file.
_inflight_count: int = 0
# Per-capability count of voice jobs currently executing
# (incremented/decremented by _handle_voice_request, exported via fm.set_voice_inflight).
_voice_inflight: Dict[str, int] = {"voice.tts": 0, "voice.llm": 0, "voice.stt": 0}
# NOTE(review): rolling latency sample buffers — usage not visible in this
# chunk; presumably capped at _LATENCY_BUFFER entries. Confirm in full file.
_latencies_llm: list = []
_latencies_vision: list = []
_LATENCY_BUFFER = 50
# Set of subjects that use the voice handler path
_VOICE_SUBJECTS: set = set()
async def start(nats_client):
global _nats_client
_nats_client = nats_client
nid = config.NODE_ID.lower()
# Generic subjects (unchanged — backward compatible)
subjects = [
f"node.{nid}.llm.request",
f"node.{nid}.vision.request",
@@ -41,6 +61,31 @@ async def start(nats_client):
await nats_client.subscribe(subj, cb=_handle_request)
logger.info(f"✅ Subscribed: {subj}")
# Voice HA subjects — separate semaphores, own metrics, own deadlines
# Only subscribe if the relevant provider is configured (preflight-first)
voice_subjects_to_caps = {
f"node.{nid}.voice.tts.request": ("tts", _voice_sem_tts, "voice.tts"),
f"node.{nid}.voice.llm.request": ("llm", _voice_sem_llm, "voice.llm"),
f"node.{nid}.voice.stt.request": ("stt", _voice_sem_stt, "voice.stt"),
}
for subj, (required_cap, sem, cap_key) in voice_subjects_to_caps.items():
if required_cap == "tts" and config.TTS_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: TTS_PROVIDER=none")
continue
if required_cap == "stt" and config.STT_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: STT_PROVIDER=none")
continue
# LLM always available on this node
_VOICE_SUBJECTS.add(subj)
async def _make_voice_handler(s=sem, k=cap_key):
async def _voice_handler(msg):
await _handle_voice_request(msg, voice_sem=s, cap_key=k)
return _voice_handler
await nats_client.subscribe(subj, cb=await _make_voice_handler())
logger.info(f"✅ Voice subscribed: {subj}")
async def _handle_request(msg):
t0 = time.time()
@@ -136,6 +181,103 @@ async def _handle_request(msg):
pass
async def _handle_voice_request(msg, voice_sem: asyncio.Semaphore, cap_key: str):
    """Voice-dedicated handler: separate semaphore, metrics, retry hints.

    Maps voice.{tts|llm|stt} to the same _execute() but with:
    - Own concurrency limit (VOICE_MAX_CONCURRENT_{TTS|LLM|STT})
    - TOO_BUSY includes retry_after_ms hint (client can retry immediately elsewhere)
    - Voice-specific Prometheus labels (type=voice.tts, etc.)
    - WARNING log on fallback (contract: no silent fallback)

    Args:
        msg: inbound NATS message whose payload is a JSON-encoded JobRequest.
        voice_sem: the capability-specific semaphore bounding concurrent jobs.
        cap_key: voice capability label ("voice.tts" | "voice.llm" | "voice.stt").
    """
    t0 = time.time()
    # Extract the base type for _execute (voice.tts → tts)
    base_type = cap_key.split(".")[-1]  # "tts", "llm", "stt"
    try:
        raw = msg.data
        if len(raw) > config.MAX_PAYLOAD_BYTES:
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="PAYLOAD_TOO_LARGE", message=f"max {config.MAX_PAYLOAD_BYTES} bytes"),
            ))
            return
        data = json.loads(raw)
        job = JobRequest(**data)
        job.trace_id = job.trace_id or job.job_id
        remaining = job.remaining_ms()
        if remaining <= 0:
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="timeout", error=JobError(code="DEADLINE_EXCEEDED"),
            ))
            return
        # Voice concurrency check — TOO_BUSY includes retry hint.
        # FIX: use the public asyncio.Semaphore.locked() API (True exactly when
        # no slot is free) instead of reading the private _value attribute.
        if voice_sem.locked():
            # FIX: %s instead of %d — the dict lookup falls back to the string
            # "?" for an unknown cap_key, which would make %d raise a logging
            # formatting error. In practice cap_key is always one of the three
            # known labels, so the rendered message is unchanged.
            logger.warning(
                "[voice.busy] cap=%s node=%s — all %s slots occupied. "
                "WARNING: request turned away, Router should failover.",
                cap_key, config.NODE_ID, {
                    "voice.tts": config.VOICE_MAX_CONCURRENT_TTS,
                    "voice.llm": config.VOICE_MAX_CONCURRENT_LLM,
                    "voice.stt": config.VOICE_MAX_CONCURRENT_STT,
                }.get(cap_key, "?"),
            )
            fm.inc_voice_job(cap_key, "busy")
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="busy",
                error=JobError(
                    code="TOO_BUSY",
                    message=f"voice {cap_key} at capacity",
                    details={"retry_after_ms": 500, "cap": cap_key},
                ),
            ))
            return
        # Track per-capability inflight for metrics. The dict is mutated in
        # place (never rebound), so no `global` declaration is needed.
        _voice_inflight[cap_key] = _voice_inflight.get(cap_key, 0) + 1
        fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        try:
            async with voice_sem:
                # Route to _execute with the base type
                job.required_type = base_type
                resp = await _execute(job, remaining)
        finally:
            # Decrement even if _execute raises; clamp at 0 defensively.
            _voice_inflight[cap_key] = max(0, _voice_inflight.get(cap_key, 1) - 1)
            fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        resp.latency_ms = int((time.time() - t0) * 1000)
        fm.inc_voice_job(cap_key, resp.status)
        if resp.status == "ok" and resp.latency_ms > 0:
            fm.observe_voice_latency(cap_key, resp.latency_ms)
        # Contract: log WARNING on any non-ok voice result
        if resp.status != "ok":
            logger.warning(
                "[voice.fallback] cap=%s node=%s status=%s error=%s trace=%s",
                cap_key, config.NODE_ID, resp.status,
                resp.error.code if resp.error else "?", job.trace_id,
            )
        await _reply(msg, resp)
    except Exception as e:
        logger.exception(f"Voice handler error cap={cap_key}: {e}")
        fm.inc_voice_job(cap_key, "error")
        try:
            # Best-effort error reply; job may not have parsed, so only the
            # node-level fields are guaranteed here.
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="INTERNAL", message=str(e)[:200]),
            ))
        except Exception:
            # Reply channel itself failed — nothing more we can do.
            pass
async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
payload = job.payload
hints = job.hints
@@ -184,9 +326,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="STT not configured on this node"),
)
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
if config.STT_PROVIDER == "memory_service":
result = await asyncio.wait_for(
stt_memory_service.transcribe(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
elif job.required_type == "tts":
if config.TTS_PROVIDER == "none":
return JobResponse(
@@ -194,9 +341,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="TTS not configured on this node"),
)
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
if config.TTS_PROVIDER == "memory_service":
result = await asyncio.wait_for(
tts_memory_service.synthesize(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
elif job.required_type == "ocr":
if config.OCR_PROVIDER == "none":
return JobResponse(