P3.5-P3.7: 2-layer inventory, capability routing, STT/TTS adapters, Dev Contract

NCS:
- _collect_worker_caps() fetches capability flags from node-worker /caps
- _derive_capabilities() merges served model types + worker provider flags
- installed_artifacts replaces inventory_only (disk scan with DISK_SCAN_PATHS env)
- New endpoints: /capabilities/caps, /capabilities/installed

Node Worker:
- STT_PROVIDER, TTS_PROVIDER, OCR_PROVIDER, IMAGE_PROVIDER env flags
- /caps endpoint returns capabilities + providers for NCS aggregation
- STT adapter (providers/stt_mlx_whisper.py) — remote + local mode
- TTS adapter (providers/tts_mlx_kokoro.py) — remote + local mode
- OCR handler via vision_prompted (ollama_vision with OCR prompt)
- NATS subjects: node.{id}.{stt,tts,ocr,image}.request (one request subject per capability)

Router:
- POST /v1/capability/{stt,tts,ocr,image} — capability-based offload routing
- GET /v1/capabilities — global view with capabilities_by_node
- require_fresh_caps(ttl) preflight guard
- find_nodes_with_capability(cap) + load-based node selection

Ops:
- ops/fabric_snapshot.py — full runtime snapshot collector
- ops/fabric_preflight.sh — quick check + snapshot save + diff
- docs/fabric_contract.md — Dev Contract v0.1 (preflight-first)
- tests/test_fabric_contract.py — CI enforcement (6 tests)

Made-with: Cursor
This commit is contained in:
Apple
2026-02-27 05:24:09 -08:00
parent 194c87f53c
commit 9a36020316
17 changed files with 1352 additions and 21 deletions

View File

@@ -100,8 +100,8 @@ async def _discover_remote_nodes() -> List[Dict[str, Any]]:
sub = await _nats_client.subscribe(inbox)
try:
await _nats_client.publish_request(
"node.*.capabilities.get", inbox, b""
await _nats_client.publish(
CAPS_DISCOVERY_SUBJECT, b"", reply=inbox,
)
await _nats_client.flush()
@@ -183,6 +183,7 @@ async def get_global_capabilities(force: bool = False) -> Dict[str, Any]:
def _build_global_view() -> Dict[str, Any]:
"""Build a unified view from all cached node capabilities."""
all_served: List[Dict[str, Any]] = []
global_caps: Dict[str, Dict[str, Any]] = {}
for node_id, caps in _node_cache.items():
is_local = (node_id.lower() == LOCAL_NODE_ID.lower())
@@ -194,16 +195,27 @@ def _build_global_view() -> Dict[str, Any]:
"local": is_local,
"node_age_s": round(age, 1),
})
node_caps = caps.get("capabilities", {})
if node_caps:
global_caps[node_id] = {
k: v for k, v in node_caps.items() if k != "providers"
}
all_served.sort(key=lambda m: (0 if m.get("local") else 1, m.get("name", "")))
return {
"local_node": LOCAL_NODE_ID,
"nodes": {nid: {"node_id": nid, "served_count": len(c.get("served_models", [])),
"age_s": round(time.time() - _node_timestamps.get(nid, 0), 1)}
for nid, c in _node_cache.items()},
"nodes": {nid: {
"node_id": nid,
"served_count": len(c.get("served_models", [])),
"installed_count": c.get("installed_count", 0),
"capabilities": c.get("capabilities", {}),
"node_load": c.get("node_load", {}),
"age_s": round(time.time() - _node_timestamps.get(nid, 0), 1),
} for nid, c in _node_cache.items()},
"served_models": all_served,
"served_count": len(all_served),
"capabilities_by_node": global_caps,
"node_count": len(_node_cache),
"updated_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
}
@@ -214,6 +226,44 @@ def get_cached_global() -> Dict[str, Any]:
return _build_global_view()
async def require_fresh_caps(ttl: int = 30) -> Optional[Dict[str, Any]]:
    """Preflight guard: hand back the global capability view only when fresh.

    Freshness is judged by the *oldest* entry in ``_node_timestamps``: if
    that entry is older than ``ttl`` seconds, one forced refresh is
    attempted via ``get_global_capabilities(force=True)``.

    Args:
        ttl: Maximum tolerated age, in seconds, of the oldest node timestamp.

    Returns:
        The global capabilities dict, or ``None`` when no node timestamps
        exist after a refresh or the data is still older than ``ttl`` after
        a refresh. On ``None`` the caller should take a safe fallback
        rather than route on outdated information.
    """
    # Cold start: nothing cached yet, so a forced refresh is the only option.
    if not _node_timestamps:
        refreshed = await get_global_capabilities(force=True)
        return refreshed if _node_timestamps else None

    oldest = min(_node_timestamps.values())
    # Cache is within the TTL: serve the view built from cached data.
    if not ((time.time() - oldest) > ttl):
        return _build_global_view()

    # Cache is stale: refresh once, then re-evaluate the age.
    refreshed = await get_global_capabilities(force=True)
    # An empty timestamp map is treated as epoch 0, i.e. always stale.
    oldest = min(_node_timestamps.values()) if _node_timestamps else 0
    if (time.time() - oldest) > ttl:
        logger.warning("[preflight] caps stale after refresh, age=%ds", int(time.time() - oldest))
        return None
    return refreshed
def find_nodes_with_capability(cap: str) -> List[str]:
    """List the IDs of cached nodes that report capability ``cap`` as enabled.

    A node qualifies when the ``"capabilities"`` mapping in its
    ``_node_cache`` entry holds a truthy value under ``cap``. Nodes with no
    ``"capabilities"`` key, or without ``cap`` in it, are left out. The
    result follows the iteration order of ``_node_cache``.
    """
    return [
        node_id
        for node_id, entry in _node_cache.items()
        if entry.get("capabilities", {}).get(cap, False)
    ]
def get_node_load(node_id: str) -> Dict[str, Any]:
    """Look up the cached ``"node_load"`` entry for ``node_id``.

    Returns an empty dict when the node is not in ``_node_cache`` or its
    cached entry has no ``"node_load"`` key.
    """
    return _node_cache.get(node_id, {}).get("node_load", {})
async def send_offload_request(
node_id: str,
request_type: str,