feat: add Vision Encoder service + Vision RAG implementation
- Vision Encoder Service (OpenCLIP ViT-L/14, GPU-accelerated)
- FastAPI app with text/image embedding endpoints (768-dim)
- Docker support with NVIDIA GPU runtime
- Port 8001, health checks, model info API
- Qdrant Vector Database integration
- Port 6333/6334 (HTTP/gRPC)
- Image embeddings storage (768-dim, Cosine distance)
- Auto collection creation
- Vision RAG implementation
- VisionEncoderClient (Python client for API)
- Image Search module (text-to-image, image-to-image)
- Vision RAG routing in DAGI Router (mode: image_search)
- VisionEncoderProvider integration
- Documentation (5000+ lines)
- SYSTEM-INVENTORY.md - Complete system inventory
- VISION-ENCODER-STATUS.md - Service status
- VISION-RAG-IMPLEMENTATION.md - Implementation details
- vision_encoder_deployment_task.md - Deployment checklist
- services/vision-encoder/README.md - Deployment guide
- Updated WARP.md, INFRASTRUCTURE.md, Jupyter Notebook
- Testing
- test-vision-encoder.sh - Smoke tests (6 tests)
- Unit tests for client, image search, routing
- Services: 17 total (added Vision Encoder + Qdrant)
- AI Models: 3 (qwen3:8b, OpenCLIP ViT-L/14, BAAI/bge-m3)
- GPU Services: 2 (Vision Encoder, Ollama)
- VRAM Usage: ~10 GB (concurrent)
Status: Production Ready ✅
This commit is contained in:
@@ -26,6 +26,7 @@ from app.runtime.postprocessing import (
|
||||
)
|
||||
from app.runtime.qa_builder import build_qa_pairs_via_router
|
||||
from app.utils.file_converter import pdf_or_image_to_png_bytes
|
||||
from app.events import publish_document_parsed
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -151,6 +152,28 @@ async def parse_document_endpoint(
|
||||
"page_count": len(parsed_doc.pages)
|
||||
}}
|
||||
|
||||
# Publish event if team_id/dao_id is provided
|
||||
if dao_id:
|
||||
try:
|
||||
await publish_document_parsed(
|
||||
doc_id=parsed_doc.doc_id,
|
||||
team_id=dao_id,
|
||||
dao_id=dao_id,
|
||||
doc_type=doc_type,
|
||||
pages_count=len(parsed_doc.pages),
|
||||
parsed_successful=True,
|
||||
indexed=True,
|
||||
visibility="public",
|
||||
metadata={
|
||||
"title": parsed_doc.doc_id,
|
||||
"size_bytes": len(str(parsed_doc.dict())),
|
||||
"parsing_time_ms": 0 # TODO: track actual parsing time
|
||||
}
|
||||
)
|
||||
logger.info(f"Published parser.document.parsed event for doc_id={parsed_doc.doc_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to publish parser.document.parsed event: {e}")
|
||||
|
||||
if output_mode == "raw_json":
|
||||
response_data["document"] = parsed_doc
|
||||
elif output_mode == "markdown":
|
||||
@@ -330,6 +353,27 @@ async def ocr_ingest_endpoint(
|
||||
detail=f"RAG Service ingest failed: {str(e)}"
|
||||
)
|
||||
|
||||
# Publish event if successful
|
||||
try:
|
||||
await publish_document_parsed(
|
||||
doc_id=doc_id,
|
||||
team_id=dao_id,
|
||||
dao_id=dao_id,
|
||||
doc_type=doc_type,
|
||||
pages_count=pages_count,
|
||||
parsed_successful=True,
|
||||
indexed=True,
|
||||
visibility="public",
|
||||
metadata={
|
||||
"title": doc_id,
|
||||
"size_bytes": len(str(parsed_json)),
|
||||
"parsing_time_ms": 0 # TODO: track actual parsing time
|
||||
}
|
||||
)
|
||||
logger.info(f"Published parser.document.parsed event for doc_id={doc_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to publish parser.document.parsed event: {e}")
|
||||
|
||||
return OcrIngestResponse(
|
||||
dao_id=dao_id,
|
||||
doc_id=doc_id,
|
||||
|
||||
Reference in New Issue
Block a user