- Vision Encoder Service (OpenCLIP ViT-L/14, GPU-accelerated)
  - FastAPI app with text/image embedding endpoints (768-dim)
  - Docker support with NVIDIA GPU runtime
  - Port 8001, health checks, model info API
- Qdrant Vector Database integration
  - Port 6333/6334 (HTTP/gRPC)
  - Image embeddings storage (768-dim, Cosine distance)
  - Auto collection creation
- Vision RAG implementation
  - VisionEncoderClient (Python client for API)
  - Image Search module (text-to-image, image-to-image)
  - Vision RAG routing in DAGI Router (mode: image_search)
  - VisionEncoderProvider integration
- Documentation (5000+ lines)
  - SYSTEM-INVENTORY.md - Complete system inventory
  - VISION-ENCODER-STATUS.md - Service status
  - VISION-RAG-IMPLEMENTATION.md - Implementation details
  - vision_encoder_deployment_task.md - Deployment checklist
  - services/vision-encoder/README.md - Deployment guide
  - Updated WARP.md, INFRASTRUCTURE.md, Jupyter Notebook
- Testing
  - test-vision-encoder.sh - Smoke tests (6 tests)
  - Unit tests for client, image search, routing
- Services: 17 total (added Vision Encoder + Qdrant)
- AI Models: 3 (qwen3:8b, OpenCLIP ViT-L/14, BAAI/bge-m3)
- GPU Services: 2 (Vision Encoder, Ollama)
- VRAM Usage: ~10 GB (concurrent)
Status: Production Ready ✅
203 lines
7.2 KiB
Python
203 lines
7.2 KiB
Python
"""
Vision Encoder Provider

Calls the Vision Encoder service for text and image embeddings using OpenCLIP.

Service endpoints:
- /embed/text - Generate text embedding
- /embed/image - Generate image embedding (from URL)
- /embed/image/upload - Generate image embedding (from file upload);
  listed for reference, this provider only calls the two endpoints above
"""
|
|
import logging
|
|
from typing import Dict, Any, Optional
|
|
import httpx
|
|
|
|
from providers.base import Provider
|
|
from router_models import RouterRequest, RouterResponse
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class VisionEncoderProvider(Provider):
    """
    Provider that routes embedding requests to the Vision Encoder service.

    Supports:
    - Text embeddings (for text-to-image search)
    - Image embeddings (for image-to-text search or image similarity)
    - Normalized embeddings (cosine similarity ready)

    Failures (bad payload, HTTP errors, transport errors, malformed service
    responses) are reported as ``RouterResponse(ok=False, error=...)`` rather
    than raised to the caller of ``call``.
    """

    def __init__(
        self,
        provider_id: str,
        base_url: str,
        timeout: int = 60,
        **kwargs
    ):
        """
        Args:
            provider_id: Identifier passed to the base ``Provider``.
                NOTE(review): responses use ``self.id``, which is presumably
                set by ``Provider.__init__`` - confirm against the base class.
            base_url: Base URL of the Vision Encoder service; trailing
                slashes are stripped so endpoint paths can be appended.
            timeout: Timeout handed to ``httpx.AsyncClient`` for each request.
            **kwargs: Accepted for configuration compatibility and ignored.
        """
        super().__init__(provider_id)
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        logger.info(f"VisionEncoderProvider initialized: {provider_id} → {base_url}")

    async def call(self, request: RouterRequest) -> RouterResponse:
        """
        Route request to Vision Encoder service.

        Expected request.payload format:
        {
            "operation": "embed_text" | "embed_image",
            "text": "...",        # for embed_text
            "image_url": "...",   # for embed_image
            "normalize": true     # optional, default true
        }

        Returns:
            A ``RouterResponse``; ``ok=False`` with an ``error`` message when
            the operation is missing/unknown or anything goes wrong.
        """
        try:
            payload = request.payload
            operation = payload.get("operation") if payload else None
            if not operation:
                return self._encoder_error(
                    "Missing 'operation' in request payload. Expected 'embed_text' or 'embed_image'"
                )

            normalize = payload.get("normalize", True)

            if operation == "embed_text":
                return await self._embed_text(request, normalize)
            if operation == "embed_image":
                return await self._embed_image(request, normalize)

            return self._encoder_error(
                f"Unknown operation: {operation}. Available: embed_text, embed_image"
            )

        except Exception as e:
            # Last-resort boundary: keep the traceback in the log so that
            # unexpected failures (e.g. a malformed payload) can be diagnosed.
            logger.exception(f"VisionEncoder error: {e}")
            return self._encoder_error(str(e))

    async def _embed_text(self, request: RouterRequest, normalize: bool) -> RouterResponse:
        """Generate a text embedding via ``POST /embed/text``."""
        text = request.payload.get("text") if request.payload else None
        if not text:
            return self._encoder_error("Missing 'text' in request payload")

        # Only the first 100 characters are logged to keep log lines short.
        logger.info(f"VisionEncoder embed_text: {text[:100]}...")

        return await self._request_embedding(
            endpoint="/embed/text",
            body={"text": text, "normalize": normalize},
            operation="embed_text",
            extra_metadata={"text_length": len(text)},
        )

    async def _embed_image(self, request: RouterRequest, normalize: bool) -> RouterResponse:
        """Generate an image embedding from a URL via ``POST /embed/image``."""
        image_url = request.payload.get("image_url") if request.payload else None
        if not image_url:
            return self._encoder_error("Missing 'image_url' in request payload")

        logger.info(f"VisionEncoder embed_image: {image_url}")

        return await self._request_embedding(
            endpoint="/embed/image",
            body={"image_url": image_url, "normalize": normalize},
            operation="embed_image",
            extra_metadata={"image_url": image_url},
        )

    async def _request_embedding(
        self,
        endpoint: str,
        body: Dict[str, Any],
        operation: str,
        extra_metadata: Dict[str, Any],
    ) -> RouterResponse:
        """
        POST ``body`` to ``endpoint`` and convert the reply to a RouterResponse.

        Shared by the text and image operations so that both handle HTTP,
        transport and malformed-response failures identically.

        Args:
            endpoint: Path appended to ``self.base_url`` (starts with "/").
            body: JSON body sent to the service.
            operation: Operation name recorded in the response metadata.
            extra_metadata: Operation-specific metadata entries, placed
                between ``operation`` and ``status_code``.
        """
        url = f"{self.base_url}{endpoint}"

        try:
            async with httpx.AsyncClient(timeout=self.timeout) as client:
                response = await client.post(url, json=body)
                response.raise_for_status()
        except httpx.HTTPStatusError as e:
            logger.error(f"VisionEncoder HTTP error: {e}")
            return self._encoder_error(f"HTTP {e.response.status_code}: {e.response.text}")
        except httpx.RequestError as e:
            logger.error(f"VisionEncoder request error: {e}")
            return self._encoder_error(f"Request failed: {str(e)}")

        # A 2xx status does not guarantee a usable body: report a bad body as
        # an explicit failure instead of returning ok=True without a vector.
        try:
            data = response.json()
        except ValueError as e:  # json.JSONDecodeError / UnicodeDecodeError
            logger.error(f"VisionEncoder invalid JSON response: {e}")
            return self._encoder_error(f"Invalid JSON in Vision Encoder response: {e}")

        if not isinstance(data, dict):
            logger.error(f"VisionEncoder unexpected response type: {type(data).__name__}")
            return self._encoder_error(
                f"Unexpected Vision Encoder response: expected JSON object, got {type(data).__name__}"
            )

        if data.get("embedding") is None:
            logger.error("VisionEncoder response is missing 'embedding'")
            return self._encoder_error("Vision Encoder response is missing 'embedding'")

        metadata: Dict[str, Any] = {
            "provider_type": "vision_encoder",
            "operation": operation,
        }
        metadata.update(extra_metadata)
        metadata["status_code"] = response.status_code

        return RouterResponse(
            ok=True,
            provider_id=self.id,
            data={
                "embedding": data.get("embedding"),
                "dimension": data.get("dimension"),
                "model": data.get("model"),
                "normalized": data.get("normalized"),
            },
            metadata=metadata,
        )

    def _encoder_error(self, message: str) -> RouterResponse:
        """Build the failure response used by every error path of this provider."""
        return RouterResponse(
            ok=False,
            provider_id=self.id,
            error=message,
        )
|