feat(production): sync all modified production files to git
Includes updates across gateway, router, node-worker, memory-service, aurora-service, swapper, sofiia-console UI and node2 infrastructure: - gateway-bot: Dockerfile, http_api.py, druid/aistalk prompts, doc_service - services/router: main.py, router-config.yml, fabric_metrics, memory_retrieval, offload_client, prompt_builder - services/node-worker: worker.py, main.py, config.py, fabric_metrics - services/memory-service: Dockerfile, database.py, main.py, requirements - services/aurora-service: main.py (+399), kling.py, quality_report.py - services/swapper-service: main.py, swapper_config_node2.yaml - services/sofiia-console: static/index.html (console UI update) - config: agent_registry, crewai_agents/teams, router_agents - ops/fabric_preflight.sh: updated preflight checks - router-config.yml, docker-compose.node2.yml: infra updates - docs: NODA1-AGENT-ARCHITECTURE, fabric_contract updated Made-with: Cursor
This commit is contained in:
@@ -14,3 +14,19 @@ STT_PROVIDER = os.getenv("STT_PROVIDER", "none")
|
||||
# Provider selection flags: "none" disables the capability on this node.
TTS_PROVIDER = os.getenv("TTS_PROVIDER", "none")
OCR_PROVIDER = os.getenv("OCR_PROVIDER", "vision_prompted")
IMAGE_PROVIDER = os.getenv("IMAGE_PROVIDER", "none")

# Memory Service URL (used by memory_service STT/TTS providers)
MEMORY_SERVICE_URL = os.getenv("MEMORY_SERVICE_URL", "http://memory-service:8000")

# ── Voice HA: dedicated concurrency limits (separate from generic stt/tts/llm) ──
# These control semaphores for node.{id}.voice.*.request subjects.
# Independent from MAX_CONCURRENCY so voice never starves generic inference.
VOICE_MAX_CONCURRENT_TTS = int(os.getenv("VOICE_MAX_CONCURRENT_TTS", "4"))
VOICE_MAX_CONCURRENT_LLM = int(os.getenv("VOICE_MAX_CONCURRENT_LLM", "2"))
VOICE_MAX_CONCURRENT_STT = int(os.getenv("VOICE_MAX_CONCURRENT_STT", "2"))

# Timeouts for voice subjects (milliseconds). Router uses these as defaults.
VOICE_TTS_DEADLINE_MS = int(os.getenv("VOICE_TTS_DEADLINE_MS", "3000"))
VOICE_LLM_FAST_MS = int(os.getenv("VOICE_LLM_FAST_MS", "9000"))
VOICE_LLM_QUALITY_MS = int(os.getenv("VOICE_LLM_QUALITY_MS", "12000"))
VOICE_STT_DEADLINE_MS = int(os.getenv("VOICE_STT_DEADLINE_MS", "6000"))
|
||||
|
||||
@@ -8,6 +8,7 @@ try:
|
||||
PROM_AVAILABLE = True
|
||||
REGISTRY = CollectorRegistry()
|
||||
|
||||
# Generic job metrics
|
||||
jobs_total = Counter(
|
||||
"node_worker_jobs_total", "Jobs processed",
|
||||
["type", "status"], registry=REGISTRY,
|
||||
@@ -23,6 +24,26 @@ try:
|
||||
registry=REGISTRY,
|
||||
)
|
||||
|
||||
# ── Voice HA metrics (separate labels from generic) ───────────────────────
|
||||
# cap label: "voice.tts" | "voice.llm" | "voice.stt"
|
||||
voice_jobs_total = Counter(
|
||||
"node_worker_voice_jobs_total",
|
||||
"Voice HA jobs processed (node.{id}.voice.*.request)",
|
||||
["cap", "status"], registry=REGISTRY,
|
||||
)
|
||||
voice_inflight_gauge = Gauge(
|
||||
"node_worker_voice_inflight",
|
||||
"Voice HA inflight jobs per capability",
|
||||
["cap"], registry=REGISTRY,
|
||||
)
|
||||
voice_latency_hist = Histogram(
|
||||
"node_worker_voice_latency_ms",
|
||||
"Voice HA job latency in ms",
|
||||
["cap"],
|
||||
buckets=[100, 250, 500, 1000, 1500, 2000, 3000, 5000, 9000, 12000],
|
||||
registry=REGISTRY,
|
||||
)
|
||||
|
||||
except ImportError:
|
||||
PROM_AVAILABLE = False
|
||||
REGISTRY = None
|
||||
@@ -44,6 +65,21 @@ def observe_latency(req_type: str, model: str, latency_ms: int):
|
||||
latency_hist.labels(type=req_type, model=model).observe(latency_ms)
|
||||
|
||||
|
||||
def inc_voice_job(cap: str, status: str) -> None:
    """Bump the voice-job counter for *cap* with the given terminal *status*.

    No-op when prometheus_client is not installed.
    """
    if not PROM_AVAILABLE:
        return
    voice_jobs_total.labels(cap=cap, status=status).inc()
|
||||
|
||||
|
||||
def set_voice_inflight(cap: str, count: int) -> None:
    """Publish the current inflight voice-job count for *cap* to the gauge.

    No-op when prometheus_client is not installed.
    """
    if not PROM_AVAILABLE:
        return
    voice_inflight_gauge.labels(cap=cap).set(count)
|
||||
|
||||
|
||||
def observe_voice_latency(cap: str, latency_ms: int) -> None:
    """Record one voice-job latency sample (milliseconds) for *cap*.

    No-op when prometheus_client is not installed.
    """
    if not PROM_AVAILABLE:
        return
    voice_latency_hist.labels(cap=cap).observe(latency_ms)
|
||||
|
||||
|
||||
def get_metrics_text():
|
||||
if PROM_AVAILABLE and REGISTRY:
|
||||
return generate_latest(REGISTRY)
|
||||
|
||||
@@ -43,7 +43,30 @@ async def prom_metrics():
|
||||
|
||||
@app.get("/caps")
|
||||
async def caps():
|
||||
"""Capability flags for NCS to aggregate."""
|
||||
"""Capability flags for NCS to aggregate.
|
||||
|
||||
Semantic vs operational separation (contract):
|
||||
- capabilities.voice_* = semantic availability (provider configured).
|
||||
True as long as the provider is configured, regardless of NATS state.
|
||||
Routing decisions are based on this.
|
||||
- runtime.nats_subscriptions.voice_* = operational (NATS sub active).
|
||||
Used for health/telemetry only — NOT for routing.
|
||||
|
||||
This prevents false-negatives during reconnects / restart races.
|
||||
"""
|
||||
import worker as _w
|
||||
nid = config.NODE_ID.lower()
|
||||
|
||||
# Semantic: provider configured → capability is available
|
||||
voice_tts_cap = config.TTS_PROVIDER != "none"
|
||||
voice_stt_cap = config.STT_PROVIDER != "none"
|
||||
voice_llm_cap = True # LLM always available when node-worker is up
|
||||
|
||||
# Operational: actual NATS subscription state (health/telemetry only)
|
||||
nats_voice_tts_active = f"node.{nid}.voice.tts.request" in _w._VOICE_SUBJECTS
|
||||
nats_voice_stt_active = f"node.{nid}.voice.stt.request" in _w._VOICE_SUBJECTS
|
||||
nats_voice_llm_active = f"node.{nid}.voice.llm.request" in _w._VOICE_SUBJECTS
|
||||
|
||||
return {
|
||||
"node_id": config.NODE_ID,
|
||||
"capabilities": {
|
||||
@@ -53,6 +76,10 @@ async def caps():
|
||||
"tts": config.TTS_PROVIDER != "none",
|
||||
"ocr": config.OCR_PROVIDER != "none",
|
||||
"image": config.IMAGE_PROVIDER != "none",
|
||||
# Voice HA semantic capability flags (provider-based, not NATS-based)
|
||||
"voice_tts": voice_tts_cap,
|
||||
"voice_llm": voice_llm_cap,
|
||||
"voice_stt": voice_stt_cap,
|
||||
},
|
||||
"providers": {
|
||||
"stt": config.STT_PROVIDER,
|
||||
@@ -65,6 +92,19 @@ async def caps():
|
||||
"vision": config.DEFAULT_VISION,
|
||||
},
|
||||
"concurrency": config.MAX_CONCURRENCY,
|
||||
"voice_concurrency": {
|
||||
"voice_tts": config.VOICE_MAX_CONCURRENT_TTS,
|
||||
"voice_llm": config.VOICE_MAX_CONCURRENT_LLM,
|
||||
"voice_stt": config.VOICE_MAX_CONCURRENT_STT,
|
||||
},
|
||||
# Operational NATS subscription state — for health/monitoring only
|
||||
"runtime": {
|
||||
"nats_subscriptions": {
|
||||
"voice_tts_active": nats_voice_tts_active,
|
||||
"voice_stt_active": nats_voice_stt_active,
|
||||
"voice_llm_active": nats_voice_llm_active,
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -11,24 +11,44 @@ from models import JobRequest, JobResponse, JobError
|
||||
from idempotency import IdempotencyStore
|
||||
from providers import ollama, ollama_vision
|
||||
from providers import stt_mlx_whisper, tts_mlx_kokoro
|
||||
from providers import stt_memory_service, tts_memory_service
|
||||
import fabric_metrics as fm
|
||||
|
||||
logger = logging.getLogger("node-worker")

# Answers redelivered/duplicate jobs once, keyed by job id.
_idem = IdempotencyStore()
# Concurrency gate for the generic (non-voice) request subjects.
_semaphore: asyncio.Semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY)

# Voice-dedicated semaphores — independent from generic MAX_CONCURRENCY.
# Prevents voice requests from starving generic inference and vice versa.
_voice_sem_tts: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_TTS)
_voice_sem_llm: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_LLM)
_voice_sem_stt: asyncio.Semaphore = asyncio.Semaphore(config.VOICE_MAX_CONCURRENT_STT)

# cap_key ("voice.tts" | "voice.llm" | "voice.stt") → its dedicated semaphore.
_VOICE_SEMAPHORES = {
    "voice.tts": _voice_sem_tts,
    "voice.llm": _voice_sem_llm,
    "voice.stt": _voice_sem_stt,
}

_nats_client = None  # NATS connection handle; assigned in start()
_inflight_count: int = 0  # generic jobs currently in flight
# Voice jobs currently in flight per capability; mirrored to a Prometheus gauge.
_voice_inflight: Dict[str, int] = {"voice.tts": 0, "voice.llm": 0, "voice.stt": 0}
_latencies_llm: list = []  # rolling latency samples, bounded by _LATENCY_BUFFER
_latencies_vision: list = []
_LATENCY_BUFFER = 50

# Set of subjects that use the voice handler path
_VOICE_SUBJECTS: set = set()
|
||||
|
||||
|
||||
async def start(nats_client):
|
||||
global _nats_client
|
||||
_nats_client = nats_client
|
||||
|
||||
nid = config.NODE_ID.lower()
|
||||
|
||||
# Generic subjects (unchanged — backward compatible)
|
||||
subjects = [
|
||||
f"node.{nid}.llm.request",
|
||||
f"node.{nid}.vision.request",
|
||||
@@ -41,6 +61,31 @@ async def start(nats_client):
|
||||
await nats_client.subscribe(subj, cb=_handle_request)
|
||||
logger.info(f"✅ Subscribed: {subj}")
|
||||
|
||||
# Voice HA subjects — separate semaphores, own metrics, own deadlines
|
||||
# Only subscribe if the relevant provider is configured (preflight-first)
|
||||
voice_subjects_to_caps = {
|
||||
f"node.{nid}.voice.tts.request": ("tts", _voice_sem_tts, "voice.tts"),
|
||||
f"node.{nid}.voice.llm.request": ("llm", _voice_sem_llm, "voice.llm"),
|
||||
f"node.{nid}.voice.stt.request": ("stt", _voice_sem_stt, "voice.stt"),
|
||||
}
|
||||
for subj, (required_cap, sem, cap_key) in voice_subjects_to_caps.items():
|
||||
if required_cap == "tts" and config.TTS_PROVIDER == "none":
|
||||
logger.info(f"⏭ Skipping {subj}: TTS_PROVIDER=none")
|
||||
continue
|
||||
if required_cap == "stt" and config.STT_PROVIDER == "none":
|
||||
logger.info(f"⏭ Skipping {subj}: STT_PROVIDER=none")
|
||||
continue
|
||||
# LLM always available on this node
|
||||
_VOICE_SUBJECTS.add(subj)
|
||||
|
||||
async def _make_voice_handler(s=sem, k=cap_key):
|
||||
async def _voice_handler(msg):
|
||||
await _handle_voice_request(msg, voice_sem=s, cap_key=k)
|
||||
return _voice_handler
|
||||
|
||||
await nats_client.subscribe(subj, cb=await _make_voice_handler())
|
||||
logger.info(f"✅ Voice subscribed: {subj}")
|
||||
|
||||
|
||||
async def _handle_request(msg):
|
||||
t0 = time.time()
|
||||
@@ -136,6 +181,103 @@ async def _handle_request(msg):
|
||||
pass
|
||||
|
||||
|
||||
async def _handle_voice_request(msg, voice_sem: asyncio.Semaphore, cap_key: str):
    """Voice-dedicated handler: separate semaphore, metrics, retry hints.

    Maps voice.{tts|llm|stt} to the same _execute() but with:
    - Own concurrency limit (VOICE_MAX_CONCURRENT_{TTS|LLM|STT})
    - TOO_BUSY includes retry_after_ms hint (client can retry immediately elsewhere)
    - Voice-specific Prometheus labels (type=voice.tts, etc.)
    - WARNING log on fallback (contract: no silent fallback)

    Args:
        msg: inbound NATS message (payload is a JSON-encoded JobRequest).
        voice_sem: the per-capability semaphore for this subject.
        cap_key: "voice.tts" | "voice.llm" | "voice.stt".
    """
    t0 = time.time()
    # Extract the base type for _execute (voice.tts → tts)
    base_type = cap_key.split(".")[-1]  # "tts", "llm", "stt"

    try:
        raw = msg.data
        if len(raw) > config.MAX_PAYLOAD_BYTES:
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="PAYLOAD_TOO_LARGE", message=f"max {config.MAX_PAYLOAD_BYTES} bytes"),
            ))
            return

        data = json.loads(raw)
        job = JobRequest(**data)
        job.trace_id = job.trace_id or job.job_id

        remaining = job.remaining_ms()
        if remaining <= 0:
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="timeout", error=JobError(code="DEADLINE_EXCEEDED"),
            ))
            return

        # Voice concurrency check — TOO_BUSY includes retry hint.
        # FIX: use the public Semaphore.locked() instead of reading the private
        # ._value attribute (locked() is True exactly when no slot is free).
        # A waiter could still race in before `async with` below, but the fast
        # turn-away here is intentionally best-effort.
        if voice_sem.locked():
            logger.warning(
                "[voice.busy] cap=%s node=%s — all %d slots occupied. "
                "WARNING: request turned away, Router should failover.",
                cap_key, config.NODE_ID, {
                    "voice.tts": config.VOICE_MAX_CONCURRENT_TTS,
                    "voice.llm": config.VOICE_MAX_CONCURRENT_LLM,
                    "voice.stt": config.VOICE_MAX_CONCURRENT_STT,
                }.get(cap_key, "?"),
            )
            fm.inc_voice_job(cap_key, "busy")
            await _reply(msg, JobResponse(
                job_id=job.job_id, trace_id=job.trace_id, node_id=config.NODE_ID,
                status="busy",
                error=JobError(
                    code="TOO_BUSY",
                    message=f"voice {cap_key} at capacity",
                    details={"retry_after_ms": 500, "cap": cap_key},
                ),
            ))
            return

        # Track inflight (counts any brief wait on the semaphore as well).
        # FIX: dropped the needless `global _voice_inflight` — the dict is
        # mutated in place, never rebound.
        _voice_inflight[cap_key] = _voice_inflight.get(cap_key, 0) + 1
        fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])

        try:
            async with voice_sem:
                # Route to _execute with the base type
                job.required_type = base_type
                resp = await _execute(job, remaining)
        finally:
            # Decrement even if _execute raises; clamp at 0 defensively.
            _voice_inflight[cap_key] = max(0, _voice_inflight.get(cap_key, 1) - 1)
            fm.set_voice_inflight(cap_key, _voice_inflight[cap_key])

        resp.latency_ms = int((time.time() - t0) * 1000)
        fm.inc_voice_job(cap_key, resp.status)
        if resp.status == "ok" and resp.latency_ms > 0:
            fm.observe_voice_latency(cap_key, resp.latency_ms)

        # Contract: log WARNING on any non-ok voice result
        if resp.status != "ok":
            logger.warning(
                "[voice.fallback] cap=%s node=%s status=%s error=%s trace=%s",
                cap_key, config.NODE_ID, resp.status,
                resp.error.code if resp.error else "?", job.trace_id,
            )

        await _reply(msg, resp)

    except Exception as e:
        # Last-resort boundary: log with traceback and try to answer the caller.
        logger.exception(f"Voice handler error cap={cap_key}: {e}")
        fm.inc_voice_job(cap_key, "error")
        try:
            await _reply(msg, JobResponse(
                node_id=config.NODE_ID, status="error",
                error=JobError(code="INTERNAL", message=str(e)[:200]),
            ))
        except Exception:
            pass  # reply channel itself failed; nothing more we can do
|
||||
|
||||
|
||||
async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
|
||||
payload = job.payload
|
||||
hints = job.hints
|
||||
@@ -184,9 +326,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
|
||||
status="error",
|
||||
error=JobError(code="NOT_AVAILABLE", message="STT not configured on this node"),
|
||||
)
|
||||
result = await asyncio.wait_for(
|
||||
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
|
||||
)
|
||||
if config.STT_PROVIDER == "memory_service":
|
||||
result = await asyncio.wait_for(
|
||||
stt_memory_service.transcribe(payload), timeout=timeout_s,
|
||||
)
|
||||
else:
|
||||
result = await asyncio.wait_for(
|
||||
stt_mlx_whisper.transcribe(payload), timeout=timeout_s,
|
||||
)
|
||||
elif job.required_type == "tts":
|
||||
if config.TTS_PROVIDER == "none":
|
||||
return JobResponse(
|
||||
@@ -194,9 +341,14 @@ async def _execute(job: JobRequest, remaining_ms: int) -> JobResponse:
|
||||
status="error",
|
||||
error=JobError(code="NOT_AVAILABLE", message="TTS not configured on this node"),
|
||||
)
|
||||
result = await asyncio.wait_for(
|
||||
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
|
||||
)
|
||||
if config.TTS_PROVIDER == "memory_service":
|
||||
result = await asyncio.wait_for(
|
||||
tts_memory_service.synthesize(payload), timeout=timeout_s,
|
||||
)
|
||||
else:
|
||||
result = await asyncio.wait_for(
|
||||
tts_mlx_kokoro.synthesize(payload), timeout=timeout_s,
|
||||
)
|
||||
elif job.required_type == "ocr":
|
||||
if config.OCR_PROVIDER == "none":
|
||||
return JobResponse(
|
||||
|
||||
Reference in New Issue
Block a user