feat(production): sync all modified production files to git

Includes updates across gateway, router, node-worker, memory-service,
aurora-service, swapper, sofiia-console UI and node2 infrastructure:

- gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service
- services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval,
  offload_client, prompt_builder
- services/node-worker: worker.py, main.py, config.py, fabric_metrics
- services/memory-service: Dockerfile, database.py, main.py, requirements
- services/aurora-service: main.py (+399), kling.py, quality_report.py
- services/swapper-service: main.py, swapper_config_node2.yaml
- services/sofiia-console: static/index.html (console UI update)
- config: agent_registry, crewai_agents/teams, router_agents
- ops/fabric_preflight.sh: updated preflight checks
- router-config.yml, docker-compose.node2.yml: infra updates
- docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated

Made-with: Cursor
This commit is contained in:
Apple
2026-03-03 07:13:29 -08:00
parent 9aac835882
commit e9dedffa48
35 changed files with 3317 additions and 805 deletions

View File

@@ -14,3 +14,19 @@ STT_PROVIDER = os.getenv("STT_PROVIDER", "none")
TTS_PROVIDER = os.getenv("TTS_PROVIDER", "none")
OCR_PROVIDER = os.getenv("OCR_PROVIDER", "vision_prompted")
IMAGE_PROVIDER = os.getenv("IMAGE_PROVIDER", "none")
# Memory Service URL (used by memory_service STT/TTS providers)
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")
# ── Voice HA: dedicated concurrency limits (separate from generic stt/tts/llm) ──
# These control semaphores for node.{id}.voice.*.request subjects.
# Independent from MAX_CONCURRENCY so voice never starves generic inference.
VOICE_MAX_CONCURRENT_TTS = int(os.getenv("VOICE_MAX_CONCURRENT_TTS", "4"))
VOICE_MAX_CONCURRENT_LLM = int(os.getenv("VOICE_MAX_CONCURRENT_LLM", "2"))
VOICE_MAX_CONCURRENT_STT = int(os.getenv("VOICE_MAX_CONCURRENT_STT", "2"))
# Timeouts for voice subjects (milliseconds). Router uses these as defaults.
# NOTE(review): "fast" vs "quality" presumably map to two LLM routing tiers
# downstream — confirm against the router config before tuning.
VOICE_TTS_DEADLINE_MS = int(os.getenv("VOICE_TTS_DEADLINE_MS", "3000"))
VOICE_LLM_FAST_MS = int(os.getenv("VOICE_LLM_FAST_MS", "9000"))
VOICE_LLM_QUALITY_MS = int(os.getenv("VOICE_LLM_QUALITY_MS", "12000"))
VOICE_STT_DEADLINE_MS = int(os.getenv("VOICE_STT_DEADLINE_MS", "6000"))

View File

@@ -8,6 +8,7 @@ try:
PROM_AVAILABLE = True
REGISTRY = CollectorRegistry()
# Generic job metrics
jobs_total = Counter(
"node_worker_jobs_total", "Jobs processed",
["type", "status"], registry=REGISTRY,
@@ -23,6 +24,26 @@ try:
registry=REGISTRY,
)
# ── Voice HA metrics (separate labels from generic) ───────────────────────
# cap label: "voice.tts" | "voice.llm" | "voice.stt"
voice_jobs_total = Counter(
"node_worker_voice_jobs_total",
"Voice HA jobs processed (node.{id}.voice.*.request)",
["cap", "status"], registry=REGISTRY,
)
voice_inflight_gauge = Gauge(
"node_worker_voice_inflight",
"Voice HA inflight jobs per capability",
["cap"], registry=REGISTRY,
)
voice_latency_hist = Histogram(
"node_worker_voice_latency_ms",
"Voice HA job latency in ms",
["cap"],
buckets=[100, 250, 500, 1000, 1500, 2000, 3000, 5000, 9000, 12000],
registry=REGISTRY,
)
except ImportError:
PROM_AVAILABLE = False
REGISTRY = None
@@ -44,6 +65,21 @@ def observe_latency(req_type: str, model: str, latency_ms: int):
latency_hist.labels(type=req_type, model=model).observe(latency_ms)
def inc_voice_job(cap: str, status: str):
    """Increment the voice-job counter for *cap* labelled with *status*.

    Silently does nothing when prometheus_client could not be imported.
    """
    if not PROM_AVAILABLE:
        return
    voice_jobs_total.labels(cap=cap, status=status).inc()
def set_voice_inflight(cap: str, count: int):
    """Record *count* as the current inflight gauge value for *cap*.

    No-op when prometheus_client is unavailable.
    """
    if not PROM_AVAILABLE:
        return
    voice_inflight_gauge.labels(cap=cap).set(count)
def observe_voice_latency(cap: str, latency_ms: int):
    """Feed one latency sample (milliseconds) into the voice histogram for *cap*.

    No-op when prometheus_client is unavailable.
    """
    if not PROM_AVAILABLE:
        return
    voice_latency_hist.labels(cap=cap).observe(latency_ms)
def get_metrics_text():
    """Return the Prometheus exposition payload, or ``None`` if metrics are off.

    Requires both a successful prometheus_client import and a registry.
    """
    if not (PROM_AVAILABLE and REGISTRY):
        return None
    return generate_latest(REGISTRY)

View File

@@ -43,7 +43,30 @@ async def prom_metrics():
@app.get("/caps")
async def caps():
"""Capability flags for NCS to aggregate."""
"""Capability flags for NCS to aggregate.
Semantic vs operational separation (contract):
- capabilities.voice_* = semantic availability (provider configured).
True as long as the provider is configured, regardless of NATS state.
Routing decisions are based on this.
- runtime.nats_subscriptions.voice_* = operational (NATS sub active).
Used for health/telemetry only — NOT for routing.
This prevents false-negatives during reconnects / restart races.
"""
import worker as _w
nid = config.NODE_ID.lower()
# Semantic: provider configured → capability is available
voice_tts_cap = config.TTS_PROVIDER != "none"
voice_stt_cap = config.STT_PROVIDER != "none"
voice_llm_cap = True # LLM always available when node-worker is up
# Operational: actual NATS subscription state (health/telemetry only)
nats_voice_tts_active = f"node.{nid}.voice.tts.request" in _w._VOICE_SUBJECTS
nats_voice_stt_active = f"node.{nid}.voice.stt.request" in _w._VOICE_SUBJECTS
nats_voice_llm_active = f"node.{nid}.voice.llm.request" in _w._VOICE_SUBJECTS
return {
"node_id": config.NODE_ID,
"capabilities": {
@@ -53,6 +76,10 @@ async def caps():
"tts": config.TTS_PROVIDER != "none",
"ocr": config.OCR_PROVIDER != "none",
"image": config.IMAGE_PROVIDER != "none",
# Voice HA semantic capability flags (provider-based, not NATS-based)
"voice_tts": voice_tts_cap,
"voice_llm": voice_llm_cap,
"voice_stt": voice_stt_cap,
},
"providers": {
"stt": config.STT_PROVIDER,
@@ -65,6 +92,19 @@ async def caps():
"vision": config.DEFAULT_VISION,
},
"concurrency": config.MAX_CONCURRENCY,
"voice_concurrency": {
"voice_tts": config.VOICE_MAX_CONCURRENT_TTS,
"voice_llm": config.VOICE_MAX_CONCURRENT_LLM,
"voice_stt": config.VOICE_MAX_CONCURRENT_STT,
},
# Operational NATS subscription state — for health/monitoring only
"runtime": {
"nats_subscriptions": {
"voice_tts_active": nats_voice_tts_active,
"voice_stt_active": nats_voice_stt_active,
"voice_llm_active": nats_voice_llm_active,
}
},
}

View File

@@ -11,24 +11,44 @@ from models import JobRequest, JobResponse, JobError
from idempotency import IdempotencyStore
from providers import ollama, ollama_vision
from providers import stt_mlx_whisper, tts_mlx_kokoro
from providers import stt_memory_service, tts_memory_service
import fabric_metrics as fm
# Module-level worker state: logger, idempotency store, concurrency gates.
logger = logging.getLogger("node-worker")
# De-duplicates replayed job_ids (implementation lives in idempotency.py).
_idem = IdempotencyStore()
# Generic inference gate — sized by MAX_CONCURRENCY.
_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)
# Voice-dedicated semaphores — independent from generic MAX_CONCURRENCY.
# Prevents voice requests from starving generic inference and vice versa.
_voice_sem_tts: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_TTS)
_voice_sem_llm: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_LLM)
_voice_sem_stt: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_STT)
# Lookup from capability key ("voice.tts" | "voice.llm" | "voice.stt") to its gate.
_VOICE_SEMAPHORES = {
"voice.tts": _voice_sem_tts,
"voice.llm": _voice_sem_llm,
"voice.stt": _voice_sem_stt,
}
# Set by start(); used by _reply() and friends to publish responses.
_nats_client = None
# Generic inflight counter (voice traffic is tracked separately below).
_inflight_count: int = 0
# Per-capability inflight counts, mirrored into the Prometheus gauge.
_voice_inflight: Dict[str, int] = {"voice.tts": 0, "voice.llm": 0, "voice.stt": 0}
# Rolling latency samples for health reporting; capped at _LATENCY_BUFFER entries.
_latencies_llm: list = []
_latencies_vision: list = []
_LATENCY_BUFFER = 50
# Set of subjects that use the voice handler path
_VOICE_SUBJECTS: set = set()
async def start(nats_client):
global _nats_client
_nats_client = nats_client
nid = config.NODE_ID.lower()
# Generic subjects (unchanged — backward compatible)
subjects = [
f"node.{nid}.llm.request",
f"node.{nid}.vision.request",
@@ -41,6 +61,31 @@ async def start(nats_client):
await nats_client.subscribe(subj, cb=_handle_request)
logger.info(f"✅ Subscribed: {subj}")
# Voice HA subjects — separate semaphores, own metrics, own deadlines
# Only subscribe if the relevant provider is configured (preflight-first)
voice_subjects_to_caps = {
f"node.{nid}.voice.tts.request": ("tts", _voice_sem_tts, "voice.tts"),
f"node.{nid}.voice.llm.request": ("llm", _voice_sem_llm, "voice.llm"),
f"node.{nid}.voice.stt.request": ("stt", _voice_sem_stt, "voice.stt"),
}
for subj, (required_cap, sem, cap_key) in voice_subjects_to_caps.items():
if required_cap == "tts" and config.TTS_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: TTS_PROVIDER=none")
continue
if required_cap == "stt" and config.STT_PROVIDER == "none":
logger.info(f"⏭ Skipping {subj}: STT_PROVIDER=none")
continue
# LLM always available on this node
_VOICE_SUBJECTS.add(subj)
async def _make_voice_handler(s=sem, k=cap_key):
async def _voice_handler(msg):
await _handle_voice_request(msg, voice_sem=s, cap_key=k)
return _voice_handler
await nats_client.subscribe(subj, cb=await _make_voice_handler())
logger.info(f"✅ Voice subscribed: {subj}")
async def _handle_request(msg):
t0 = time.time()
@@ -136,6 +181,103 @@ async def _handle_request(msg):
pass
async def _handle_voice_request(msg, voice_sem: asyncio.Semaphore, cap_key: str):
    """Voice-dedicated NATS handler: separate semaphore, metrics, retry hints.

    Maps voice.{tts|llm|stt} onto the same _execute() path but with:
      - Its own concurrency limit (VOICE_MAX_CONCURRENT_{TTS|LLM|STT})
      - TOO_BUSY replies carrying a retry_after_ms hint so the client can
        retry immediately on another node
      - Voice-specific Prometheus labels (cap="voice.tts", etc.)
      - A WARNING log on every non-ok result (contract: no silent fallback)

    Args:
        msg: inbound NATS message; ``msg.data`` is a JSON-encoded JobRequest.
        voice_sem: the capability-specific semaphore gating concurrency.
        cap_key: one of ``"voice.tts" | "voice.llm" | "voice.stt"``.
    """
    t0 = time.time()
    # _execute() dispatches on the base type: "voice.tts" -> "tts", etc.
    base_type = cap_key.split(".")[-1]  # "tts", "llm", "stt"
    try:
        raw = msg.data
        if len(raw) > config.MAX_PAYLOAD_BYTES:
            # Job not parsed yet, so job_id/trace_id are unavailable here.
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="PAYLOAD_TOO_LARGE", message=f"max {config.MAX_PAYLOAD_BYTES} bytes"),
            ))
            return
        data = json.loads(raw)
        job = JobRequest(**data)
        job.trace_id = job.trace_id or job.job_id
        remaining = job.remaining_ms()
        if remaining <= 0:
            # Deadline already blown before we even started — reply, don't work.
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="busy" if False else "timeout",  # explicit: timeout, not busy
                error=JobError(code="DEADLINE_EXCEEDED"),
            ))
            return
        # Fast-fail when every voice slot is taken so the Router can fail over
        # instead of this request queueing behind the semaphore.
        # FIX: use the public Semaphore.locked() API instead of peeking at the
        # private _value attribute, which is an implementation detail.
        if voice_sem.locked():
            limit = {
                "voice.tts": config.VOICE_MAX_CONCURRENT_TTS,
                "voice.llm": config.VOICE_MAX_CONCURRENT_LLM,
                "voice.stt": config.VOICE_MAX_CONCURRENT_STT,
            }.get(cap_key, "?")
            logger.warning(
                "[voice.busy] cap=%s node=%s — all %d slots occupied. "
                "WARNING: request turned away, Router should failover.",
                cap_key, config.NODE_ID, limit,
            )
            fm.inc_voice_job(cap_key, "busy")
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="busy",
                error=JobError(
                    code="TOO_BUSY",
                    message=f"voice {cap_key} at capacity",
                    details={"retry_after_ms": 500, "cap": cap_key},
                ),
            ))
            return
        # Inflight bookkeeping around execution. (No `global` needed: we only
        # mutate the dict, never rebind the module name.)
        _voice_inflight[cap_key] = _voice_inflight.get(cap_key, 0) + 1
        fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        try:
            async with voice_sem:
                # Route to _execute with the base type
                job.required_type = base_type
                resp = await _execute(job, remaining)
        finally:
            # Never let the gauge go negative, even on unexpected errors.
            _voice_inflight[cap_key] = max(0, _voice_inflight.get(cap_key, 1) - 1)
            fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])
        resp.latency_ms = int((time.time() - t0) * 1000)
        fm.inc_voice_job(cap_key, resp.status)
        if resp.status == "ok" and resp.latency_ms > 0:
            fm.observe_voice_latency(cap_key, resp.latency_ms)
        # Contract: log WARNING on any non-ok voice result (no silent fallback).
        if resp.status != "ok":
            logger.warning(
                "[voice.fallback] cap=%s node=%s status=%s error=%s trace=%s",
                cap_key, config.NODE_ID, resp.status,
                resp.error.code if resp.error else "?", job.trace_id,
            )
        await _reply(msg, resp)
    except Exception as e:
        # Catch-all boundary: count, log with traceback, best-effort error reply.
        logger.exception(f"Voice handler error cap={cap_key}: {e}")
        fm.inc_voice_job(cap_key, "error")
        try:
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="INTERNAL", message=str(e)[:200]),
            ))
        except Exception:
            # Reply channel itself failed; nothing more we can do.
            pass
async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
payload = job.payload
hints = job.hints
@@ -184,9 +326,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="STT not configured on this node"),
)
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
if config.STT_PROVIDER == "memory_service":
result = await asyncio.wait_for(
stt_memory_service.transcribe(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
)
elif job.required_type == "tts":
if config.TTS_PROVIDER == "none":
return JobResponse(
@@ -194,9 +341,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
status="error",
error=JobError(code="NOT_AVAILABLE", message="TTS not configured on this node"),
)
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
if config.TTS_PROVIDER == "memory_service":
result = await asyncio.wait_for(
tts_memory_service.synthesize(payload), timeout=timeout_s,
)
else:
result = await asyncio.wait_for(
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
)
elif job.required_type == "ocr":
if config.OCR_PROVIDER == "none":
return JobResponse(