Files
microdao-daarion/services/vector-db-service/app/main.py

367 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Vector DB Service - Векторна база даних для DAARION Knowledge Base
ChromaDB + Sentence Transformers для embeddings та RAG
"""
from fastapi import FastAPI, HTTPException, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import logging
import os
from typing import Optional, List, Dict, Any
from datetime import datetime
import hashlib
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(
title="Vector DB Service",
description="Vector Database для DAARION (ChromaDB + Sentence Transformers)",
version="1.0.0"
)
# CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Lazy import vector DB
try:
import chromadb
from chromadb.config import Settings
CHROMADB_AVAILABLE = True
except ImportError:
CHROMADB_AVAILABLE = False
logger.warning("⚠️ ChromaDB not available")
try:
from sentence_transformers import SentenceTransformer
EMBEDDINGS_AVAILABLE = True
except ImportError:
EMBEDDINGS_AVAILABLE = False
logger.warning("⚠️ Sentence Transformers not available")
# Конфігурація
CHROMA_PERSIST_DIR = os.getenv("CHROMA_PERSIST_DIR", "./chroma_data")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
# Global variables
_chroma_client = None
_embedding_model = None
def get_chroma_client():
"""Lazy initialization of ChromaDB client"""
global _chroma_client
if _chroma_client is None and CHROMADB_AVAILABLE:
_chroma_client = chromadb.Client(Settings(
chroma_db_impl="duckdb+parquet",
persist_directory=CHROMA_PERSIST_DIR
))
return _chroma_client
def get_embedding_model():
"""Lazy initialization of embedding model"""
global _embedding_model
if _embedding_model is None and EMBEDDINGS_AVAILABLE:
_embedding_model = SentenceTransformer(EMBEDDING_MODEL)
return _embedding_model
class Document(BaseModel):
id: str
text: str
metadata: Optional[Dict[str, Any]] = {}
class AddDocumentsRequest(BaseModel):
collection: str
documents: List[Document]
class SearchRequest(BaseModel):
collection: str
query: str
n_results: Optional[int] = 5
filter: Optional[Dict[str, Any]] = None
class SearchResult(BaseModel):
id: str
text: str
score: float
metadata: Dict[str, Any]
class SearchResponse(BaseModel):
query: str
results: List[SearchResult]
collection: str
total: int
timestamp: str
@app.get("/")
async def root():
"""Health check"""
return {
"service": "Vector DB Service",
"status": "running",
"chromadb": CHROMADB_AVAILABLE,
"embeddings": EMBEDDINGS_AVAILABLE,
"embedding_model": EMBEDDING_MODEL,
"persist_dir": CHROMA_PERSIST_DIR,
"version": "1.0.0"
}
@app.get("/health")
async def health():
"""Health check endpoint"""
return {
"status": "healthy" if (CHROMADB_AVAILABLE and EMBEDDINGS_AVAILABLE) else "degraded",
"chromadb": "available" if CHROMADB_AVAILABLE else "unavailable",
"embeddings": "available" if EMBEDDINGS_AVAILABLE else "unavailable"
}
@app.post("/api/collections")
async def create_collection(collection_name: str):
"""
Створює нову колекцію
"""
try:
client = get_chroma_client()
if not client:
raise HTTPException(status_code=503, detail="ChromaDB not available")
# Створити або отримати колекцію
collection = client.get_or_create_collection(
name=collection_name,
metadata={"created_at": datetime.utcnow().isoformat()}
)
logger.info(f"✅ Created/got collection: {collection_name}")
return {
"collection": collection_name,
"created": True,
"count": collection.count()
}
except Exception as e:
logger.error(f"❌ Create collection error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/api/collections")
async def list_collections():
"""
Список всіх колекцій
"""
try:
client = get_chroma_client()
if not client:
raise HTTPException(status_code=503, detail="ChromaDB not available")
collections = client.list_collections()
return {
"collections": [
{
"name": col.name,
"count": col.count(),
"metadata": col.metadata
}
for col in collections
],
"total": len(collections)
}
except Exception as e:
logger.error(f"❌ List collections error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/documents")
async def add_documents(request: AddDocumentsRequest):
"""
Додає документи в колекцію
Body:
{
"collection": "knowledge_base",
"documents": [
{
"id": "doc1",
"text": "DAARION is a decentralized platform...",
"metadata": {"source": "website", "date": "2025-01-01"}
}
]
}
"""
try:
client = get_chroma_client()
if not client:
raise HTTPException(status_code=503, detail="ChromaDB not available")
model = get_embedding_model()
if not model:
raise HTTPException(status_code=503, detail="Embeddings not available")
# Отримати колекцію
collection = client.get_or_create_collection(name=request.collection)
# Підготувати дані
ids = [doc.id for doc in request.documents]
documents = [doc.text for doc in request.documents]
metadatas = [doc.metadata for doc in request.documents]
# Згенерувати embeddings
embeddings = model.encode(documents, convert_to_numpy=True).tolist()
# Додати в ChromaDB
collection.add(
ids=ids,
documents=documents,
embeddings=embeddings,
metadatas=metadatas
)
logger.info(f"✅ Added {len(documents)} documents to {request.collection}")
return {
"collection": request.collection,
"added": len(documents),
"total": collection.count()
}
except Exception as e:
logger.error(f"❌ Add documents error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@app.post("/api/search", response_model=SearchResponse)
async def search_documents(request: SearchRequest):
"""
Пошук документів в колекції
Body:
{
"collection": "knowledge_base",
"query": "What is DAARION?",
"n_results": 5
}
"""
try:
client = get_chroma_client()
if not client:
raise HTTPException(status_code=503, detail="ChromaDB not available")
model = get_embedding_model()
if not model:
raise HTTPException(status_code=503, detail="Embeddings not available")
# Отримати колекцію
try:
collection = client.get_collection(name=request.collection)
except:
raise HTTPException(status_code=404, detail=f"Collection '{request.collection}' not found")
# Згенерувати embedding для запиту
query_embedding = model.encode([request.query], convert_to_numpy=True).tolist()[0]
# Пошук
results = collection.query(
query_embeddings=[query_embedding],
n_results=request.n_results,
where=request.filter
)
# Форматувати результати
search_results = []
for idx in range(len(results['ids'][0])):
search_results.append(SearchResult(
id=results['ids'][0][idx],
text=results['documents'][0][idx],
score=1 - results['distances'][0][idx], # Convert distance to similarity
metadata=results['metadatas'][0][idx]
))
logger.info(f"✅ Found {len(search_results)} results for '{request.query}'")
return SearchResponse(
query=request.query,
results=search_results,
collection=request.collection,
total=len(search_results),
timestamp=datetime.utcnow().isoformat()
)
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Search error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@app.delete("/api/documents")
async def delete_documents(collection: str, ids: List[str]):
"""
Видаляє документи з колекції
"""
try:
client = get_chroma_client()
if not client:
raise HTTPException(status_code=503, detail="ChromaDB not available")
# Отримати колекцію
try:
coll = client.get_collection(name=collection)
except:
raise HTTPException(status_code=404, detail=f"Collection '{collection}' not found")
# Видалити документи
coll.delete(ids=ids)
logger.info(f"✅ Deleted {len(ids)} documents from {collection}")
return {
"collection": collection,
"deleted": len(ids),
"remaining": coll.count()
}
except HTTPException:
raise
except Exception as e:
logger.error(f"❌ Delete documents error: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.delete("/api/collections/{collection_name}")
async def delete_collection(collection_name: str):
"""
Видаляє колекцію
"""
try:
client = get_chroma_client()
if not client:
raise HTTPException(status_code=503, detail="ChromaDB not available")
# Видалити колекцію
client.delete_collection(name=collection_name)
logger.info(f"✅ Deleted collection: {collection_name}")
return {
"collection": collection_name,
"deleted": True
}
except Exception as e:
logger.error(f"❌ Delete collection error: {e}")
raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8898)