feat: Add Alateya, Clan, Eonarch agents + fix gateway-router connection
## Agents Added - Alateya: R&D, biotech, innovations - Clan (Spirit): Community spirit agent - Eonarch: Consciousness evolution agent ## Changes - docker-compose.node1.yml: Added tokens for all 3 new agents - gateway-bot/http_api.py: Added configs and webhook endpoints - gateway-bot/clan_prompt.txt: New prompt file - gateway-bot/eonarch_prompt.txt: New prompt file ## Fixes - Fixed ROUTER_URL from :9102 to :8000 (internal container port) - All 9 Telegram agents now working ## Documentation - Created PROJECT-MASTER-INDEX.md - single entry point - Added various status documents and scripts Tokens configured: - Helion, NUTRA, Agromatrix (existing) - Alateya, Clan, Eonarch (new) - Druid, GreenFood, DAARWIZZ (configured)
This commit is contained in:
85
scripts/fix_neo4j_schema.py
Normal file
85
scripts/fix_neo4j_schema.py
Normal file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fix Neo4j schema - add missing relationships ASKED_ABOUT, WORKS_ON"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
|
||||
# Neo4j HTTP transaction-commit endpoint plus basic-auth credentials.
# NOTE(review): the password is hardcoded in source — move it to an
# environment variable and rotate the credential.
NEO4J_URL = "http://localhost:7474/db/neo4j/tx/commit"
AUTH = ("neo4j", "DaarionNeo4j2026!")
HEADERS = {"Content-Type": "application/json"}
|
||||
|
||||
def run_query(statement):
    """Run one Cypher statement via Neo4j's HTTP transaction endpoint.

    Returns the parsed JSON response, or None when Neo4j reports errors
    (the errors are printed to stdout).
    """
    body = {"statements": [{"statement": statement}]}
    response = requests.post(NEO4J_URL, auth=AUTH, headers=HEADERS, json=body)
    result = response.json()
    errors = result.get("errors")
    if errors:
        print(f"ERROR: {errors}")
        return None
    return result
|
||||
|
||||
# Step 1: Add telegram_user_id alias to all User nodes
# (copies telegram_id so both property names resolve).
print("1. Adding telegram_user_id to User nodes...")
run_query("""
MATCH (u:User)
WHERE u.telegram_id IS NOT NULL AND u.telegram_user_id IS NULL
SET u.telegram_user_id = u.telegram_id
RETURN count(u) as updated
""")

# Step 2: Create Topics
# Names come from a hardcoded constant list, so f-string interpolation into
# Cypher is acceptable here; switch to query parameters if names ever become
# user-supplied input.
print("2. Creating Topics...")
topics = ["EcoMiner", "BioMiner", "DAO Governance", "Tokenomics", "Staking", "Infrastructure"]
for topic in topics:
    run_query(f"MERGE (t:Topic {{name: '{topic}'}}) RETURN t")
print(f" Created {len(topics)} topics")

# Step 3: Create Projects (MERGE keeps the script idempotent on re-runs).
print("3. Creating Projects...")
projects = ["Energy Union", "MicroDAO Daarion", "Helion Agent"]
for project in projects:
    run_query(f"MERGE (p:Project {{name: '{project}', status: 'active'}}) RETURN p")
print(f" Created {len(projects)} projects")

# Step 4: Create ASKED_ABOUT relationships from the founder to selected topics.
print("4. Creating ASKED_ABOUT relationships...")
run_query("""
MATCH (u:User {username: 'ivantytar'})
MATCH (t:Topic) WHERE t.name IN ['EcoMiner', 'BioMiner', 'Tokenomics']
MERGE (u)-[:ASKED_ABOUT {count: 1, last_asked: datetime()}]->(t)
RETURN count(*) as created
""")

# Step 5: Create WORKS_ON relationships from the founder to selected projects.
print("5. Creating WORKS_ON relationships...")
run_query("""
MATCH (u:User {username: 'ivantytar'})
MATCH (p:Project) WHERE p.name IN ['Energy Union', 'MicroDAO Daarion']
MERGE (u)-[:WORKS_ON {role: 'founder', since: datetime()}]->(p)
RETURN count(*) as created
""")

# Step 6: Create lookup indexes (IF NOT EXISTS makes this safe to repeat).
print("6. Creating indexes...")
run_query("CREATE INDEX topic_name IF NOT EXISTS FOR (t:Topic) ON (t.name)")
run_query("CREATE INDEX project_name IF NOT EXISTS FOR (p:Project) ON (p.name)")
run_query("CREATE INDEX user_telegram_id IF NOT EXISTS FOR (u:User) ON (u.telegram_user_id)")

# Verify: list relationship types and the founder's outgoing edges.
print("\n=== Verification ===")
result = run_query("CALL db.relationshipTypes()")
if result:
    types = [row["row"][0] for row in result.get("results", [{}])[0].get("data", [])]
    print(f"Relationship types: {types}")

result = run_query("""
MATCH (u:User {username: 'ivantytar'})-[r]->(n)
RETURN type(r) as rel, labels(n)[0] as node, n.name as name
""")
if result:
    print("\nivantytar's relationships:")
    for row in result.get("results", [{}])[0].get("data", []):
        r = row["row"]
        print(f" -{r[0]}-> {r[1]}: {r[2]}")

print("\n✅ Neo4j schema updated!")
|
||||
98
scripts/get-admin-chat-id.py
Normal file
98
scripts/get-admin-chat-id.py
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Отримати chat_id для адмін-сповіщень
|
||||
Запустити бота і надіслати йому /start від адміна
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import httpx
|
||||
|
||||
# Token for the Sofia admin bot; the env var takes precedence over the fallback.
# NOTE(review): a real bot token is hardcoded here — rotate it and remove the
# in-source fallback (fail fast when the env var is unset instead).
BOT_TOKEN = os.getenv("ADMIN_TELEGRAM_BOT_TOKEN", "8589292566:AAEmPvS6nY9e-Y-TZm04CAHWlaFnWVxajE4")
|
||||
|
||||
async def get_updates():
    """Fetch recent Telegram updates for the bot and print candidate chat_ids.

    Prints the last few messages seen by the bot, then suggests the most
    recent chat_id together with a ready-to-run `sed` command for .env.
    """
    endpoint = f"https://api.telegram.org/bot{BOT_TOKEN}/getUpdates"

    async with httpx.AsyncClient() as client:
        resp = await client.get(endpoint)
        resp.raise_for_status()
        payload = resp.json()

    if not payload.get("ok"):
        print(f"❌ Помилка: {payload}")
        return

    updates = payload.get("result", [])

    if not updates:
        # Nobody has messaged the bot yet — explain how to produce an update.
        print("⚠️ Немає повідомлень.")
        print("\n📱 Щоб отримати chat_id:")
        print("1. Знайдіть бота Sofia в Telegram")
        print("2. Надішліть йому /start")
        print("3. Запустіть цей скрипт знову")
        return

    print(f"\n✅ Знайдено {len(updates)} повідомлень:\n")

    # Show at most the five most recent messages.
    for update in updates[-5:]:
        msg = update.get("message", {})
        sender = msg.get("from", {})
        chat_id = msg.get("chat", {}).get("id")
        first_name = sender.get("first_name", "")
        username = sender.get("username", "")
        text = msg.get("text", "")

        print(f"Chat ID: {chat_id}")
        print(f"From: {first_name} (@{username})")
        print(f"Message: {text}")
        print("-" * 50)

    # Surface the newest chat_id plus the .env update command.
    last_chat_id = updates[-1].get("message", {}).get("chat", {}).get("id")

    print(f"\n✅ Використовуйте цей chat_id: {last_chat_id}")
    print(f"\n📝 Команда для оновлення .env:")
    print(f"sed -i 's/ADMIN_CHAT_ID=.*/ADMIN_CHAT_ID={last_chat_id}/' /opt/microdao-daarion/.env")
|
||||
|
||||
async def send_test_message(chat_id: int):
    """Send a Markdown-formatted test alert to the given Telegram chat."""
    endpoint = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"

    text = (
        "🤖 *Sofia Monitoring Bot Active*\n\n"
        "✅ Система моніторингу підключена!\n"
        "Ви будете отримувати алерти при критичних проблемах:\n\n"
        "• Порожні Qdrant колекції\n"
        "• Втрата даних (>10%)\n"
        "• Проблеми з бекапами\n"
        "• Критичні помилки сервісів\n\n"
        "_Моніторинг запускається кожні 6 годин_"
    )

    body = {"chat_id": chat_id, "text": text, "parse_mode": "Markdown"}

    async with httpx.AsyncClient() as client:
        resp = await client.post(endpoint, json=body)
        resp.raise_for_status()
        print("✅ Тестове повідомлення надіслано!")
|
||||
|
||||
async def main():
    """Entry point.

    With a chat_id argument: send the test alert to that chat.
    Without arguments: list recent updates so the admin chat_id can be found.
    """
    if len(sys.argv) > 1:
        await send_test_message(int(sys.argv[1]))
    else:
        await get_updates()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
296
scripts/monitor-collections-health.py
Normal file
296
scripts/monitor-collections-health.py
Normal file
@@ -0,0 +1,296 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Collections Health Monitor
|
||||
Перевіряє здоров'я колекцій і відправляє сповіщення при проблемах
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
import httpx
|
||||
|
||||
# Configuration (all overridable via environment variables)
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
TELEGRAM_BOT_TOKEN = os.getenv("ADMIN_TELEGRAM_BOT_TOKEN", "")
ADMIN_CHAT_ID = os.getenv("ADMIN_CHAT_ID", "")
# Collections with fewer points than this are flagged as warnings.
MIN_POINTS_THRESHOLD = int(os.getenv("MIN_POINTS_THRESHOLD", "10"))
# Snapshot from the previous run, used to detect shrinking collections.
STATE_FILE = "/opt/backups/collections-state.json"

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CollectionsHealthMonitor:
    """Health monitor for Qdrant collections.

    One pass fetches every collection, compares point counts against the
    snapshot persisted by the previous run (STATE_FILE), prints a console
    report and sends Telegram alerts for critical problems.
    """

    def __init__(self):
        self.http_client = httpx.AsyncClient(timeout=30.0)
        # Snapshot from the previous run: {collection_name: info dict}.
        self.previous_state = self.load_state()
        self.alerts: List[str] = []

    def load_state(self) -> Dict:
        """Load the previous run's snapshot from STATE_FILE; {} if missing/unreadable."""
        try:
            if os.path.exists(STATE_FILE):
                with open(STATE_FILE, 'r') as f:
                    return json.load(f)
        except Exception as e:
            logger.error(f"Помилка завантаження стану: {e}")
        return {}

    def save_state(self, state: Dict):
        """Persist the current snapshot to STATE_FILE (best-effort, errors logged)."""
        try:
            os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
            with open(STATE_FILE, 'w') as f:
                json.dump(state, f, indent=2)
        except Exception as e:
            logger.error(f"Помилка збереження стану: {e}")

    async def get_all_collections(self) -> List[Dict]:
        """Return the list of collections reported by Qdrant; [] on any error."""
        try:
            url = f"{QDRANT_URL}/collections"
            response = await self.http_client.get(url)
            response.raise_for_status()
            data = response.json()

            collections = data.get("result", {}).get("collections", [])
            logger.info(f"Знайдено {len(collections)} колекцій")
            return collections

        except Exception as e:
            logger.error(f"Помилка отримання колекцій: {e}")
            return []

    async def get_collection_info(self, collection_name: str) -> Optional[Dict]:
        """Fetch point/segment counts and status for one collection; None on error."""
        try:
            url = f"{QDRANT_URL}/collections/{collection_name}"
            response = await self.http_client.get(url)
            response.raise_for_status()
            data = response.json()

            result = data.get("result", {})
            return {
                "name": collection_name,
                "points_count": result.get("points_count", 0),
                "segments_count": result.get("segments_count", 0),
                "status": result.get("status", "unknown"),
                "vectors_count": result.get("vectors_count", 0),
                "indexed_vectors_count": result.get("indexed_vectors_count", 0),
            }

        except Exception as e:
            logger.error(f"Помилка отримання інфо про {collection_name}: {e}")
            return None

    async def check_collection_health(self, collection: Dict) -> Dict:
        """Evaluate one collection and classify it as healthy / warning / critical."""
        name = collection.get("name")
        info = await self.get_collection_info(name)

        if not info:
            # FIX: previously this returned status="error" with no "info"/"warnings"
            # keys; monitor() then crashed on result["info"] and the counters
            # classified an unreachable collection as healthy. Treat it as
            # critical and keep the same dict shape as the success path.
            return {
                "name": name,
                "status": "critical",
                "info": None,
                "issues": ["Не вдалося отримати інформацію"],
                "warnings": [],
            }

        issues = []
        warnings = []

        # Check 1: empty collection.
        if info["points_count"] == 0:
            issues.append("Колекція порожня (0 точок)")

        # Check 2: suspiciously few points.
        elif info["points_count"] < MIN_POINTS_THRESHOLD:
            warnings.append(f"Мало даних ({info['points_count']} точок, мінімум {MIN_POINTS_THRESHOLD})")

        # Check 3: point count dropped by more than 10% since the previous run.
        prev_count = self.previous_state.get(name, {}).get("points_count", 0)
        if prev_count > 0 and info["points_count"] < prev_count * 0.9:
            decrease = prev_count - info["points_count"]
            issues.append(f"Втрата даних: було {prev_count}, зараз {info['points_count']} (-{decrease})")

        # Check 4: collection status reported by Qdrant.
        if info["status"] != "green":
            issues.append(f"Статус: {info['status']} (очікується green)")

        # Overall classification: any issue is critical, else warnings, else healthy.
        if issues:
            status = "critical"
        elif warnings:
            status = "warning"
        else:
            status = "healthy"

        return {
            "name": name,
            "status": status,
            "info": info,
            "issues": issues,
            "warnings": warnings,
            "previous_count": prev_count
        }

    async def send_telegram_alert(self, message: str):
        """Send a Markdown message to the admin chat; no-op if credentials are unset."""
        if not TELEGRAM_BOT_TOKEN or not ADMIN_CHAT_ID:
            logger.warning("Telegram credentials not configured, skipping alert")
            return

        try:
            url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
            payload = {
                "chat_id": ADMIN_CHAT_ID,
                "text": message,
                "parse_mode": "Markdown"
            }

            response = await self.http_client.post(url, json=payload)
            response.raise_for_status()
            logger.info("Telegram alert sent successfully")

        except Exception as e:
            logger.error(f"Помилка відправки Telegram сповіщення: {e}")

    async def monitor(self):
        """Run one full monitoring pass; returns the per-collection results."""
        logger.info("🔍 Початок моніторингу Qdrant колекцій...")

        collections = await self.get_all_collections()

        if not collections:
            alert = "⚠️ *Qdrant Collections Monitor*\n\nНе знайдено жодної колекції!"
            self.alerts.append(alert)
            await self.send_telegram_alert(alert)
            return

        results = []
        critical_count = 0
        warning_count = 0
        healthy_count = 0

        # Check every collection and tally statuses.
        for collection in collections:
            health = await self.check_collection_health(collection)
            results.append(health)

            if health["status"] == "critical":
                critical_count += 1
            elif health["status"] == "warning":
                warning_count += 1
            else:
                healthy_count += 1

        logger.info(f"✅ Healthy: {healthy_count}, ⚠️ Warnings: {warning_count}, 🔴 Critical: {critical_count}")

        # Persist the snapshot for the next run's "data loss" comparison.
        # FIX: use .get() — results for unreachable collections carry info=None,
        # and the old result["info"] raised KeyError on the error-shaped dict.
        new_state = {}
        for result in results:
            if result.get("info"):
                new_state[result["name"]] = result["info"]
        self.save_state(new_state)

        # Alert only on critical problems.
        if critical_count > 0:
            await self.send_critical_alerts(results)

        self.print_report(results, critical_count, warning_count, healthy_count)

        return results

    async def send_critical_alerts(self, results: List[Dict]):
        """Aggregate all critical findings into a single Telegram alert."""
        critical_issues = [r for r in results if r["status"] == "critical"]

        if not critical_issues:
            return

        message = "🔴 *Qdrant Collections Alert*\n\n"
        message += f"Виявлено {len(critical_issues)} критичних проблем:\n\n"

        for issue in critical_issues:
            message += f"*{issue['name']}*\n"
            for problem in issue["issues"]:
                message += f" • {problem}\n"
            message += "\n"

        message += f"_Час: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}_"

        await self.send_telegram_alert(message)

    def print_report(self, results: List[Dict], critical: int, warning: int, healthy: int):
        """Print a console report of all results, grouped by status."""
        print("\n" + "="*80)
        print("📊 QDRANT COLLECTIONS HEALTH REPORT")
        print("="*80)
        print(f"Час: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Всього колекцій: {len(results)}")
        print(f"✅ Здорові: {healthy}")
        print(f"⚠️ Попередження: {warning}")
        print(f"🔴 Критичні: {critical}")
        print("="*80)

        # Group by status, worst first.
        for status_type in ["critical", "warning", "healthy"]:
            items = [r for r in results if r["status"] == status_type]

            if not items:
                continue

            icon = {"critical": "🔴", "warning": "⚠️", "healthy": "✅"}[status_type]
            print(f"\n{icon} {status_type.upper()}")
            print("-"*80)

            for item in items:
                # "or {}" guards the info=None shape of unreachable collections.
                info = item.get("info") or {}
                print(f"\n{item['name']}:")
                print(f" Points: {info.get('points_count', 0):,}")
                print(f" Segments: {info.get('segments_count', 0)}")
                print(f" Status: {info.get('status', 'unknown')}")

                if item.get("issues"):
                    print(f" Issues:")
                    for issue in item["issues"]:
                        print(f" • {issue}")

                if item.get("warnings"):
                    print(f" Warnings:")
                    for warn in item["warnings"]:
                        print(f" • {warn}")

        print("\n" + "="*80 + "\n")

    async def close(self):
        """Dispose of the shared HTTP client."""
        await self.http_client.aclose()
|
||||
|
||||
|
||||
async def main():
    """Run one monitoring pass; return 0 on success, 1 on failure."""
    monitor = CollectionsHealthMonitor()
    try:
        await monitor.monitor()
        return 0
    except Exception as e:
        logger.error(f"Помилка моніторингу: {e}", exc_info=True)
        return 1
    finally:
        # Always release the HTTP client, even on failure.
        await monitor.close()


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
|
||||
441
scripts/qdrant_migrate_rest.py
Normal file
441
scripts/qdrant_migrate_rest.py
Normal file
@@ -0,0 +1,441 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Migration Script (REST API version)
|
||||
|
||||
No qdrant-client dependency - uses pure REST API.
|
||||
|
||||
Usage:
|
||||
python3 qdrant_migrate_rest.py --dry-run
|
||||
python3 qdrant_migrate_rest.py --all
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)


# Configuration
# Defaults for the migration; the --url and --dim CLI flags override
# qdrant_url and text_dim in main().
DEFAULT_CONFIG = {
    "qdrant_url": "http://localhost:6333",
    "tenant_id": "t_daarion",
    "team_id": "team_core",
    "text_dim": 1024,
    "text_metric": "Cosine",  # Qdrant uses capitalized
    "default_visibility": "confidential",
    "default_owner_kind": "agent",
    "batch_size": 100,
}

# Collection patterns
# Each entry is (regex, agent capture-group index or None, scope, extra tags);
# consumed by parse_collection_name(), first match wins.
COLLECTION_PATTERNS = [
    (r"^([a-z]+)_docs$", 1, "docs", []),
    (r"^([a-z]+)_messages$", 1, "messages", []),
    (r"^([a-z]+)_memory_items$", 1, "memory", []),
    (r"^([a-z]+)_artifacts$", 1, "artifacts", []),
    (r"^druid_legal_kb$", None, "docs", ["legal_kb"]),
    (r"^nutra_food_knowledge$", None, "docs", ["food_kb"]),
    (r"^memories$", None, "memory", []),
    (r"^messages$", None, "messages", []),
]

# Known agent slugs -> canonical agent ids (unknown slugs fall back to "agt_<slug>").
AGENT_SLUGS = {
    "helion": "agt_helion",
    "nutra": "agt_nutra",
    "druid": "agt_druid",
    "greenfood": "agt_greenfood",
    "agromatrix": "agt_agromatrix",
    "daarwizz": "agt_daarwizz",
    "alateya": "agt_alateya",
}
|
||||
|
||||
|
||||
def qdrant_request(url: str, method: str = "GET", data: Optional[Dict] = None) -> Dict:
    """Issue an HTTP request to Qdrant and return the decoded JSON response.

    Raises Exception carrying the HTTP status and response body on a
    non-2xx reply.
    """
    request = urllib.request.Request(url, method=method)
    request.add_header("Content-Type", "application/json")

    payload = json.dumps(data).encode("utf-8") if data else None

    try:
        with urllib.request.urlopen(request, payload, timeout=30) as response:
            return json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as err:
        detail = err.read().decode("utf-8") if err.fp else ""
        raise Exception(f"Qdrant error {err.code}: {detail}")
|
||||
|
||||
|
||||
def get_collections(base_url: str) -> List[str]:
    """Return the names of every collection on the server."""
    listing = qdrant_request(f"{base_url}/collections")
    entries = listing.get("result", {}).get("collections", [])
    return [entry["name"] for entry in entries]
|
||||
|
||||
|
||||
def get_collection_info(base_url: str, name: str) -> Dict:
    """Return name, point count, vector dimension and distance metric for a collection."""
    resp = qdrant_request(f"{base_url}/collections/{name}")
    result = resp.get("result", {})
    vectors = result.get("config", {}).get("params", {}).get("vectors", {})
    return {
        "name": name,
        "points_count": result.get("points_count", 0),
        "dim": vectors.get("size"),
        "metric": vectors.get("distance"),
    }
|
||||
|
||||
|
||||
def create_collection(base_url: str, name: str, dim: int, metric: str) -> bool:
    """Create a collection; True if created, False if it already existed."""
    spec = {
        "vectors": {
            "size": dim,
            "distance": metric,
        }
    }
    try:
        qdrant_request(f"{base_url}/collections/{name}", "PUT", spec)
    except Exception as e:
        # "already exists" is not a failure for an idempotent migration.
        if "already exists" in str(e).lower():
            return False
        raise
    return True
|
||||
|
||||
|
||||
def scroll_points(base_url: str, collection: str, limit: int = 100, offset: Optional[str] = None) -> Tuple[List, Optional[str]]:
    """Fetch one page of points (payload + vector); return (points, next_offset)."""
    body = {
        "limit": limit,
        "with_payload": True,
        "with_vector": True,
    }
    if offset:
        body["offset"] = offset

    resp = qdrant_request(f"{base_url}/collections/{collection}/points/scroll", "POST", body)
    page = resp.get("result", {})
    return page.get("points", []), page.get("next_page_offset")
|
||||
|
||||
|
||||
def upsert_points(base_url: str, collection: str, points: List[Dict]) -> None:
    """Upsert a batch of points into the target collection."""
    qdrant_request(f"{base_url}/collections/{collection}/points", "PUT", {"points": points})
|
||||
|
||||
|
||||
def compute_deterministic_id(source_collection: str, legacy_id: str, source_id: str, chunk_idx: int) -> str:
    """Derive a stable 32-hex-char point id from the migration coordinates."""
    key = "|".join((source_collection, legacy_id, source_id, str(chunk_idx)))
    return hashlib.sha256(key.encode()).hexdigest()[:32]
|
||||
|
||||
|
||||
def compute_fingerprint(source_id: str, chunk_idx: int, text: str = "") -> str:
    """Stable content fingerprint: 'sha256:' plus the first 32 hex chars of the digest."""
    digest = hashlib.sha256(f"{source_id}:{chunk_idx}:{text}".encode()).hexdigest()
    return f"sha256:{digest[:32]}"
|
||||
|
||||
|
||||
def parse_collection_name(name: str) -> Optional[Dict]:
    """Map a legacy collection name to {agent_id, scope, tags}; None if unrecognized."""
    lowered = name.lower()
    for pattern, agent_group, scope, tags in COLLECTION_PATTERNS:
        match = re.match(pattern, lowered)
        if not match:
            continue

        agent_id = None
        if agent_group is not None:
            # Agent slug captured by the pattern; unknown slugs get "agt_<slug>".
            slug = match.group(agent_group).lower()
            agent_id = AGENT_SLUGS.get(slug, f"agt_{slug}")
        elif "druid" in lowered:
            agent_id = "agt_druid"
        elif "nutra" in lowered:
            agent_id = "agt_nutra"

        # Copy the tags so callers can't mutate the shared pattern table.
        return {"agent_id": agent_id, "scope": scope, "tags": tags.copy()}
    return None
|
||||
|
||||
|
||||
def build_canonical_payload(legacy_payload: Dict, collection_info: Dict, config: Dict, point_id: str) -> Dict:
    """Translate a legacy point payload into the canonical cm_payload_v1 shape."""
    agent_id = collection_info.get("agent_id")
    scope = collection_info.get("scope", "docs")
    tags = collection_info.get("tags", [])

    # First available source identifier, falling back to the legacy point id.
    source_id = (
        legacy_payload.get("source_id")
        or legacy_payload.get("document_id")
        or legacy_payload.get("doc_id")
        or f"doc_{point_id}"
    )
    # Canonical source ids carry a kind prefix; add one if missing.
    if not re.match(r"^(doc|msg|art|web|code)_", source_id):
        kind_prefix = "msg" if scope == "messages" else "doc"
        source_id = f"{kind_prefix}_{source_id}"

    chunk_idx = legacy_payload.get("chunk_idx", legacy_payload.get("chunk_index", 0))
    chunk_id = legacy_payload.get("chunk_id") or f"chk_{point_id}"
    if not chunk_id.startswith("chk_"):
        chunk_id = f"chk_{chunk_id}"

    text = legacy_payload.get("text", legacy_payload.get("content", ""))
    # Keep an existing fingerprint; otherwise derive one from the normalized ids.
    fingerprint = legacy_payload.get("fingerprint") or compute_fingerprint(source_id, chunk_idx, text)

    created_at = (
        legacy_payload.get("created_at")
        or legacy_payload.get("timestamp")
        or datetime.utcnow().isoformat() + "Z"
    )

    scope_to_kind = {"docs": "document", "messages": "message", "memory": "document", "artifacts": "artifact"}
    source_kind = scope_to_kind.get(scope, "document")

    payload = {
        "schema_version": "cm_payload_v1",
        "tenant_id": config["tenant_id"],
        "team_id": config.get("team_id"),
        "project_id": legacy_payload.get("project_id"),
        "agent_id": agent_id,
        "owner_kind": config["default_owner_kind"],
        "owner_id": agent_id or config["team_id"],
        "scope": scope,
        "visibility": legacy_payload.get("visibility", config["default_visibility"]),
        "indexed": legacy_payload.get("indexed", True),
        "source_kind": source_kind,
        "source_id": source_id,
        "chunk": {"chunk_id": chunk_id, "chunk_idx": chunk_idx},
        "fingerprint": fingerprint,
        "created_at": created_at,
        "_legacy_collection": collection_info.get("_collection"),
        "_legacy_point_id": point_id,
    }

    # Merge pattern-derived tags with any tags already on the legacy payload.
    if tags:
        payload["tags"] = tags
    if legacy_payload.get("tags"):
        payload["tags"] = list(set(payload.get("tags", []) + legacy_payload["tags"]))

    # Optional passthrough fields.
    for field in ["lang", "importance", "channel_id"]:
        if field in legacy_payload:
            payload[field] = legacy_payload[field]

    return payload
|
||||
|
||||
|
||||
class MigrationStats:
    """Accumulates counters and errors across the whole migration run."""

    def __init__(self):
        self.collections_processed = 0
        self.points_read = 0
        self.points_migrated = 0
        self.points_skipped = 0
        self.errors = []          # list of {"collection": ..., "error": ...} dicts
        self.dim_mismatches = []  # collections whose vector dim didn't match the target

    def summary(self) -> Dict:
        """Return a JSON-serializable snapshot of the counters."""
        snapshot = {
            "collections_processed": self.collections_processed,
            "points_read": self.points_read,
            "points_migrated": self.points_migrated,
            "points_skipped": self.points_skipped,
        }
        snapshot["errors_count"] = len(self.errors)
        snapshot["dim_mismatches"] = self.dim_mismatches
        return snapshot
|
||||
|
||||
|
||||
def migrate_collection(
    base_url: str,
    source_collection: str,
    target_collection: str,
    config: Dict,
    stats: MigrationStats,
    dry_run: bool = True,
) -> None:
    """Migrate a single legacy collection into the canonical target collection.

    Scrolls the source in batches of config["batch_size"], rewrites each
    point's payload via build_canonical_payload, re-keys it with a
    deterministic id, and upserts the batch into target_collection.
    Progress and errors are accumulated on *stats*. With dry_run=True only
    the would-be counts are recorded; no points are read or written.
    """
    logger.info(f"Migrating: {source_collection}")

    # Get source info
    source_info = get_collection_info(base_url, source_collection)
    logger.info(f" Source: dim={source_info['dim']}, metric={source_info['metric']}, points={source_info['points_count']}")

    # Check dim/metric: a dimension mismatch is fatal for this collection
    # (vectors can't be upserted into a differently-sized target).
    if source_info["dim"] != config["text_dim"]:
        msg = f"Dim mismatch: {source_collection} has {source_info['dim']}, target expects {config['text_dim']}"
        logger.error(f" ❌ {msg}")
        stats.dim_mismatches.append(source_collection)
        stats.errors.append({"collection": source_collection, "error": msg})
        return

    # A metric mismatch is only warned about — migration still proceeds.
    if source_info["metric"] and source_info["metric"] != config["text_metric"]:
        msg = f"Metric mismatch: {source_collection} has {source_info['metric']}, target expects {config['text_metric']}"
        logger.warning(f" ⚠️ {msg}")

    # Parse collection name into agent/scope/tags; unknown names default to docs.
    collection_info = parse_collection_name(source_collection) or {"agent_id": None, "scope": "docs", "tags": []}
    collection_info["_collection"] = source_collection
    logger.info(f" Mapped: agent={collection_info['agent_id']}, scope={collection_info['scope']}")

    if dry_run:
        logger.info(f" [DRY RUN] Would migrate {source_info['points_count']} points")
        stats.points_read += source_info['points_count']
        stats.points_migrated += source_info['points_count']
        stats.collections_processed += 1
        return

    # Scroll and migrate, batch by batch.
    offset = None
    batch_count = 0
    collection_migrated = 0

    while True:
        points, next_offset = scroll_points(base_url, source_collection, config["batch_size"], offset)

        if not points:
            break

        batch_count += 1
        stats.points_read += len(points)

        canonical_points = []
        for point in points:
            try:
                legacy_payload = point.get("payload", {})
                vector = point.get("vector", [])
                point_id = str(point.get("id", ""))

                canonical_payload = build_canonical_payload(legacy_payload, collection_info, config, point_id)

                # Re-key with a deterministic id so re-running the migration
                # overwrites rather than duplicates points.
                source_id = canonical_payload.get("source_id", "")
                chunk_idx = canonical_payload.get("chunk", {}).get("chunk_idx", 0)
                det_id = compute_deterministic_id(source_collection, point_id, source_id, chunk_idx)

                canonical_points.append({
                    "id": det_id,
                    "vector": vector,
                    "payload": canonical_payload,
                })
            except Exception as e:
                # Record and skip the bad point; the rest of the batch proceeds.
                stats.errors.append({"collection": source_collection, "point": point_id, "error": str(e)})
                stats.points_skipped += 1

        if canonical_points:
            upsert_points(base_url, target_collection, canonical_points)
            collection_migrated += len(canonical_points)
            stats.points_migrated += len(canonical_points)

        # Periodic progress log (every 10 batches).
        if batch_count % 10 == 0:
            logger.info(f" Progress: {stats.points_read} read, {collection_migrated} migrated")

        offset = next_offset
        if not offset:
            break

    stats.collections_processed += 1
    logger.info(f" Completed: {collection_migrated} points migrated")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: discover legacy Qdrant collections and migrate them
    into the canonical ``cm_text_<dim>_v1`` collection via the REST API.

    Exits with status 1 when dimension mismatches blocked the migration.
    """
    parser = argparse.ArgumentParser(description="Qdrant Migration (REST API)")
    parser.add_argument("--url", default=DEFAULT_CONFIG["qdrant_url"], help="Qdrant URL")
    parser.add_argument("--collections", help="Comma-separated collections")
    parser.add_argument("--all", action="store_true", help="Migrate all legacy collections")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be migrated")
    parser.add_argument("--dim", type=int, default=DEFAULT_CONFIG["text_dim"], help="Target dimension")
    parser.add_argument("--continue-on-error", action="store_true", help="Continue on errors")
    args = parser.parse_args()

    config = DEFAULT_CONFIG.copy()
    config["qdrant_url"] = args.url
    config["text_dim"] = args.dim

    base_url = config["qdrant_url"]

    # Get collections
    logger.info(f"Connecting to Qdrant at {base_url}")
    all_collections = get_collections(base_url)
    logger.info(f"Found {len(all_collections)} collections")

    # Determine which to migrate; "cm_" is the canonical prefix, so anything
    # else counts as legacy.
    if args.collections:
        collections = [c.strip() for c in args.collections.split(",")]
    elif args.all:
        collections = [c for c in all_collections if not c.startswith("cm_")]
        logger.info(f"Legacy collections to migrate: {len(collections)}")
    else:
        # No selection given: list the candidates and exit without migrating.
        print("\nLegacy collections:")
        for col in all_collections:
            if not col.startswith("cm_"):
                info = parse_collection_name(col)
                info_str = f"→ agent={info['agent_id']}, scope={info['scope']}" if info else "→ (unknown)"
                print(f"  - {col} {info_str}")
        print("\nUse --collections <names> or --all to migrate")
        return

    # Target collection
    target_collection = f"cm_text_{config['text_dim']}_v1"
    logger.info(f"Target collection: {target_collection}")

    # Create target if needed (skipped entirely in dry-run so nothing is written)
    if not args.dry_run:
        if target_collection not in all_collections:
            logger.info(f"Creating target collection: {target_collection}")
            create_collection(base_url, target_collection, config["text_dim"], config["text_metric"])
        else:
            logger.info(f"Target collection exists: {target_collection}")

    # Migrate
    stats = MigrationStats()
    logger.info(f"Starting migration ({'DRY RUN' if args.dry_run else 'LIVE'})")

    for collection in collections:
        try:
            migrate_collection(base_url, collection, target_collection, config, stats, args.dry_run)
        except Exception as e:
            logger.error(f"Failed: {collection}: {e}")
            stats.errors.append({"collection": collection, "error": str(e)})
            if not args.continue_on_error:
                break

    # Summary
    print("\n" + "=" * 60)
    print("MIGRATION SUMMARY")
    print("=" * 60)
    summary = stats.summary()
    print(f"Target collection: {target_collection}")
    print(f"Collections processed: {summary['collections_processed']}/{len(collections)}")
    print(f"Points read: {summary['points_read']}")
    print(f"Points migrated: {summary['points_migrated']}")
    print(f"Points skipped: {summary['points_skipped']}")
    print(f"Errors: {summary['errors_count']}")

    if summary['dim_mismatches']:
        print(f"\n❌ Dim mismatches ({len(summary['dim_mismatches'])}):")
        for col in summary['dim_mismatches']:
            print(f"  - {col}")

    if stats.errors[:5]:
        print("\nFirst 5 errors:")
        for err in stats.errors[:5]:
            print(f"  - {err}")

    if args.dry_run:
        print("\n[DRY RUN] No changes were made")

    # Non-zero exit so callers/CI can detect a blocked migration.
    if summary['dim_mismatches']:
        print("\n❌ MIGRATION BLOCKED due to dim mismatches")
        sys.exit(1)
|
||||
669
scripts/qdrant_migrate_to_canonical.py
Executable file
669
scripts/qdrant_migrate_to_canonical.py
Executable file
@@ -0,0 +1,669 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Migration Script: Legacy Collections → Canonical
|
||||
|
||||
Migrates points from per-agent collections to canonical cm_text_* collection
|
||||
with proper payload mapping.
|
||||
|
||||
Usage:
|
||||
python qdrant_migrate_to_canonical.py --dry-run
|
||||
python qdrant_migrate_to_canonical.py --collections helion_docs,nutra_messages
|
||||
python qdrant_migrate_to_canonical.py --all
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from uuid import uuid4
|
||||
|
||||
try:
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import PointStruct
|
||||
except ImportError:
|
||||
print("Error: qdrant-client not installed. Run: pip install qdrant-client")
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
import ulid
|
||||
HAS_ULID = True
|
||||
except ImportError:
|
||||
HAS_ULID = False
|
||||
|
||||
|
||||
# Add parent to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from services.memory.qdrant.payload_validation import validate_payload, PayloadValidationError
|
||||
from services.memory.qdrant.collections import ensure_collection, get_canonical_collection_name
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s"
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Default configuration
|
||||
DEFAULT_CONFIG = {
|
||||
"tenant_id": "t_daarion",
|
||||
"team_id": "team_core",
|
||||
"text_dim": 1024,
|
||||
"text_metric": "cosine",
|
||||
"default_visibility": "confidential",
|
||||
"default_owner_kind": "agent",
|
||||
"batch_size": 100,
|
||||
}
|
||||
|
||||
# Collection name patterns for mapping
|
||||
COLLECTION_PATTERNS = [
|
||||
# (regex, agent_group_idx, scope, tags)
|
||||
(r"^([a-z]+)_docs$", 1, "docs", []),
|
||||
(r"^([a-z]+)_messages$", 1, "messages", []),
|
||||
(r"^([a-z]+)_memory_items$", 1, "memory", []),
|
||||
(r"^([a-z]+)_artifacts$", 1, "artifacts", []),
|
||||
(r"^druid_legal_kb$", None, "docs", ["legal_kb"]),
|
||||
(r"^nutra_food_knowledge$", None, "docs", ["food_kb"]),
|
||||
(r"^memories$", None, "memory", []),
|
||||
(r"^messages$", None, "messages", []),
|
||||
]
|
||||
|
||||
# Agent slug mapping
|
||||
AGENT_SLUGS = {
|
||||
"helion": "agt_helion",
|
||||
"nutra": "agt_nutra",
|
||||
"druid": "agt_druid",
|
||||
"greenfood": "agt_greenfood",
|
||||
"agromatrix": "agt_agromatrix",
|
||||
"daarwizz": "agt_daarwizz",
|
||||
"alateya": "agt_alateya",
|
||||
}
|
||||
|
||||
|
||||
def generate_id(prefix: str = "doc") -> str:
    """Build a fresh prefixed identifier: ULID when available, UUID4 otherwise."""
    if not HAS_ULID:
        suffix = uuid4().hex[:24].upper()
        return f"{prefix}_{suffix}"
    return f"{prefix}_{ulid.new().str}"
|
||||
|
||||
|
||||
def compute_deterministic_id(
    source_collection: str,
    legacy_point_id: str,
    source_id: str,
    chunk_idx: int,
) -> str:
    """
    Compute deterministic point ID for migration (rerun-safe).

    The ID is a pure function of its inputs, so re-running the migration
    produces the same IDs and upserts overwrite instead of duplicating.

    32 hex chars (128 bits) keep the collision probability negligible even
    for very large collections (10M+ points).
    """
    key = "|".join((source_collection, legacy_point_id, source_id, str(chunk_idx)))
    return hashlib.sha256(key.encode()).hexdigest()[:32]
|
||||
|
||||
|
||||
def compute_fingerprint(source_id: str, chunk_idx: int, text: str = "") -> str:
    """
    Compute a stable content fingerprint for deduplication.

    Deliberately excludes timestamps and random values so the same content
    always yields the same fingerprint (idempotent across reruns).
    """
    digest = hashlib.sha256(f"{source_id}:{chunk_idx}:{text}".encode()).hexdigest()
    return "sha256:" + digest[:32]
|
||||
|
||||
|
||||
def get_collection_vector_config(client: QdrantClient, collection_name: str) -> Dict[str, Any]:
    """
    Get vector configuration from a collection.

    Works for both unnamed vectors (a single params object with a ``size``)
    and named vectors (a mapping of name -> params); for named vectors only
    the first entry (mapping iteration order) is reported.

    Returns:
        Dict with 'dim' and 'metric' keys; both None when the collection is
        missing or its config can't be read (failure is logged, not raised).
    """
    try:
        info = client.get_collection(collection_name)
        vectors_config = info.config.params.vectors

        # Handle both named and unnamed vectors
        if hasattr(vectors_config, 'size'):
            # Unnamed vectors
            return {
                "dim": vectors_config.size,
                "metric": vectors_config.distance.value.lower(),
            }
        elif hasattr(vectors_config, '__iter__'):
            # Named vectors - get first one
            # NOTE(review): if a collection has several named vectors, which one
            # is "first" depends on mapping order — confirm that's acceptable.
            for name, config in vectors_config.items():
                return {
                    "dim": config.size,
                    "metric": config.distance.value.lower(),
                }

        return {"dim": None, "metric": None}
    except Exception as e:
        logger.warning(f"Could not get vector config for {collection_name}: {e}")
        return {"dim": None, "metric": None}
|
||||
|
||||
|
||||
class DimMismatchError(Exception):
    """Raised when a source collection's vector dimension or distance metric
    doesn't match the migration target (see migrate_collection)."""
    pass
|
||||
|
||||
|
||||
def parse_collection_name(name: str) -> Optional[Dict[str, Any]]:
    """
    Map a legacy collection name to its canonical attributes.

    Tries each entry in COLLECTION_PATTERNS in order; the first regex that
    matches wins. The agent is resolved either from a capture group (via
    AGENT_SLUGS, falling back to ``agt_<slug>``) or, for fixed-name patterns,
    from a "druid"/"nutra" substring.

    Returns:
        Dict with agent_id, scope, tags — or None when no pattern matches.
    """
    lowered = name.lower()
    for pattern, agent_group, scope, tags in COLLECTION_PATTERNS:
        matched = re.match(pattern, lowered)
        if not matched:
            continue

        agent_id = None
        if agent_group is not None:
            slug = matched.group(agent_group).lower()
            agent_id = AGENT_SLUGS.get(slug, f"agt_{slug}")
        elif "druid" in lowered:
            agent_id = "agt_druid"
        elif "nutra" in lowered:
            agent_id = "agt_nutra"

        return {"agent_id": agent_id, "scope": scope, "tags": tags.copy()}

    return None
|
||||
|
||||
|
||||
def build_canonical_payload(
    legacy_payload: Dict[str, Any],
    collection_info: Dict[str, Any],
    config: Dict[str, Any],
    point_id: str,
) -> Dict[str, Any]:
    """
    Build canonical cm_payload_v1 from legacy payload.

    Args:
        legacy_payload: Original payload from legacy collection
        collection_info: Parsed collection name info (agent_id, scope, tags)
        config: Migration configuration
        point_id: Original point ID (kept for API symmetry; the legacy ID is
            attached to the payload by the caller)

    Returns:
        Canonical payload
    """
    # Extract values from legacy payload or use defaults
    agent_id = collection_info.get("agent_id")
    scope = collection_info.get("scope", "docs")
    tags = collection_info.get("tags", [])

    # Try to get source_id from legacy payload
    source_id = (
        legacy_payload.get("source_id") or
        legacy_payload.get("document_id") or
        legacy_payload.get("doc_id") or
        legacy_payload.get("message_id") or
        generate_id("doc")
    )
    # FIX: legacy stores may hold numeric IDs (e.g. an int message_id); coerce
    # to str so the regex/prefix handling below can't raise TypeError.
    source_id = str(source_id)

    # Ensure source_id has proper prefix
    if not re.match(r"^(doc|msg|art|web|code)_", source_id):
        prefix = "msg" if scope == "messages" else "doc"
        source_id = f"{prefix}_{source_id}"

    # Chunk info (same str-coercion rationale as source_id)
    chunk_idx = legacy_payload.get("chunk_idx", legacy_payload.get("chunk_index", 0))
    chunk_id = str(legacy_payload.get("chunk_id") or generate_id("chk"))
    if not chunk_id.startswith("chk_"):
        chunk_id = f"chk_{chunk_id}"

    # Fingerprint: reuse one stored on the legacy point when present, else
    # compute a stable (idempotent) one from source_id/chunk_idx/text.
    text = legacy_payload.get("text", legacy_payload.get("content", ""))
    fingerprint = (
        legacy_payload.get("fingerprint") or
        legacy_payload.get("hash") or
        compute_fingerprint(source_id, chunk_idx, text)
    )

    # Timestamp: prefer any legacy timestamp; fall back to "now" in UTC.
    created_at = (
        legacy_payload.get("created_at") or
        legacy_payload.get("timestamp") or
        legacy_payload.get("indexed_at") or
        datetime.utcnow().isoformat() + "Z"
    )

    # Build canonical payload
    payload = {
        "schema_version": "cm_payload_v1",
        "tenant_id": config["tenant_id"],
        "team_id": config.get("team_id"),
        "project_id": legacy_payload.get("project_id"),
        "agent_id": agent_id,
        "owner_kind": config["default_owner_kind"],
        "owner_id": agent_id or config["team_id"],
        "scope": scope,
        "visibility": legacy_payload.get("visibility", config["default_visibility"]),
        "indexed": legacy_payload.get("indexed", True),
        "source_kind": _infer_source_kind(scope),
        "source_id": source_id,
        "chunk": {
            "chunk_id": chunk_id,
            "chunk_idx": chunk_idx,
        },
        "fingerprint": fingerprint,
        "created_at": created_at,
    }

    # Merge pattern-derived tags with any tags stored on the legacy point.
    # NOTE: list(set(...)) does not preserve tag order.
    if tags:
        payload["tags"] = tags
    legacy_tags = legacy_payload.get("tags", [])
    if legacy_tags:
        payload["tags"] = list(set(payload.get("tags", []) + legacy_tags))

    # Preserve additional fields
    for field in ["lang", "importance", "ttl_days", "channel_id"]:
        if field in legacy_payload:
            payload[field] = legacy_payload[field]

    # Preserve text/content for debugging (optional)
    if text:
        payload["_text"] = text[:500]  # Truncate for safety

    return payload
|
||||
|
||||
|
||||
def _infer_source_kind(scope: str) -> str:
|
||||
"""Infer source_kind from scope."""
|
||||
mapping = {
|
||||
"docs": "document",
|
||||
"messages": "message",
|
||||
"memory": "document",
|
||||
"artifacts": "artifact",
|
||||
"signals": "document",
|
||||
}
|
||||
return mapping.get(scope, "document")
|
||||
|
||||
|
||||
class MigrationStats:
    """Mutable counters and an error log accumulated over a migration run."""

    def __init__(self):
        self.errors = []
        self.collections_processed = 0
        self.points_read = 0
        self.points_migrated = 0
        self.points_skipped = 0

    def add_error(self, collection: str, point_id: str, error: str):
        """Record one failed point (or a collection-level failure)."""
        entry = {
            "collection": collection,
            "point_id": point_id,
            "error": error,
        }
        self.errors.append(entry)

    def summary(self) -> Dict[str, Any]:
        """Snapshot of all counters; includes at most the first 10 errors."""
        first_errors = self.errors[:10] if self.errors else []
        return {
            "collections_processed": self.collections_processed,
            "points_read": self.points_read,
            "points_migrated": self.points_migrated,
            "points_skipped": self.points_skipped,
            "errors_count": len(self.errors),
            "errors": first_errors,
        }
|
||||
|
||||
|
||||
def migrate_collection(
    client: QdrantClient,
    source_collection: str,
    target_collection: str,
    config: Dict[str, Any],
    stats: MigrationStats,
    dry_run: bool = True,
    skip_dim_check: bool = False,
) -> None:
    """
    Migrate a single legacy collection to canonical collection.

    Flow: validate dim/metric compatibility, map the collection name to
    agent/scope/tags, then scroll the source in batches, converting each
    point to the canonical payload, validating it, assigning a deterministic
    (rerun-safe) ID, and upserting the batch into the target.

    Args:
        client: Qdrant client
        source_collection: Legacy collection name
        target_collection: Canonical collection name
        config: Migration configuration
        stats: Statistics tracker
        dry_run: If True, don't actually write
        skip_dim_check: Skip dimension/metric validation (dangerous!)

    Raises:
        DimMismatchError: If source dim/metric doesn't match target
    """
    logger.info(f"Migrating collection: {source_collection}")

    # === SECURITY: Verify dim/metric match ===
    source_config = get_collection_vector_config(client, source_collection)
    target_dim = config.get("text_dim")
    target_metric = config.get("text_metric", "cosine")

    logger.info(f"  Source: dim={source_config['dim']}, metric={source_config['metric']}")
    logger.info(f"  Target: dim={target_dim}, metric={target_metric}")

    # A falsy source dim (config unreadable) skips the check instead of failing.
    if source_config["dim"] and source_config["dim"] != target_dim:
        msg = (
            f"Dimension mismatch: {source_collection} has dim={source_config['dim']}, "
            f"target {target_collection} expects dim={target_dim}"
        )
        if skip_dim_check:
            logger.warning(f"  WARNING: {msg} (skipping due to --skip-dim-check)")
        else:
            logger.error(f"  ERROR: {msg}")
            stats.add_error(source_collection, "", f"DimMismatch: {msg}")
            raise DimMismatchError(msg)

    if source_config["metric"] and source_config["metric"] != target_metric:
        msg = (
            f"Metric mismatch: {source_collection} has metric={source_config['metric']}, "
            f"target {target_collection} expects metric={target_metric}"
        )
        if skip_dim_check:
            logger.warning(f"  WARNING: {msg} (skipping due to --skip-dim-check)")
        else:
            logger.error(f"  ERROR: {msg}")
            stats.add_error(source_collection, "", f"MetricMismatch: {msg}")
            raise DimMismatchError(msg)

    # Parse collection name; unknown patterns still migrate, with defaults.
    collection_info = parse_collection_name(source_collection)
    if not collection_info:
        logger.warning(f"  Could not parse collection name pattern: {source_collection}")
        collection_info = {"agent_id": None, "scope": "docs", "tags": []}

    logger.info(f"  Mapped to: agent={collection_info['agent_id']}, scope={collection_info['scope']}")

    # Scroll through all points
    batch_size = config.get("batch_size", 100)
    offset = None
    batch_count = 0
    collection_points_read = 0
    collection_points_migrated = 0

    while True:
        # Scroll points (vectors included — they're copied verbatim)
        scroll_result = client.scroll(
            collection_name=source_collection,
            offset=offset,
            limit=batch_size,
            with_payload=True,
            with_vectors=True,
        )

        points, next_offset = scroll_result

        if not points:
            break

        batch_count += 1
        stats.points_read += len(points)
        collection_points_read += len(points)

        # Convert points; per-point failures are recorded and skipped so one
        # bad point doesn't abort the whole collection.
        canonical_points = []
        for point in points:
            try:
                legacy_payload = point.payload or {}

                # Build canonical payload
                canonical_payload = build_canonical_payload(
                    legacy_payload=legacy_payload,
                    collection_info=collection_info,
                    config=config,
                    point_id=str(point.id),
                )

                # Validate payload
                try:
                    validate_payload(canonical_payload)
                except PayloadValidationError as e:
                    stats.add_error(source_collection, str(point.id), str(e))
                    stats.points_skipped += 1
                    continue

                # Compute deterministic ID (rerun-safe)
                source_id = canonical_payload.get("source_id", "")
                chunk_idx = canonical_payload.get("chunk", {}).get("chunk_idx", 0)

                deterministic_id = compute_deterministic_id(
                    source_collection=source_collection,
                    legacy_point_id=str(point.id),
                    source_id=source_id,
                    chunk_idx=chunk_idx,
                )

                # Store original legacy ID in payload for traceability
                canonical_payload["_legacy_collection"] = source_collection
                canonical_payload["_legacy_point_id"] = str(point.id)

                # Create new point with deterministic ID
                canonical_points.append(PointStruct(
                    id=deterministic_id,
                    vector=point.vector,
                    payload=canonical_payload,
                ))

            except Exception as e:
                stats.add_error(source_collection, str(point.id), str(e))
                stats.points_skipped += 1

        # Upsert to canonical collection (writes only when not a dry run)
        if canonical_points and not dry_run:
            client.upsert(
                collection_name=target_collection,
                points=canonical_points,
            )

        # NOTE: migrated counters are bumped even in dry-run mode so the final
        # summary shows what WOULD be migrated.
        stats.points_migrated += len(canonical_points)
        collection_points_migrated += len(canonical_points)

        if batch_count % 10 == 0:
            logger.info(f"  Progress: {collection_points_read} read, {collection_points_migrated} migrated")

        # Continue scrolling
        offset = next_offset
        if offset is None:
            break

    stats.collections_processed += 1
    logger.info(f"  Completed: {collection_points_read} read, {collection_points_migrated} migrated")
|
||||
|
||||
|
||||
def discover_legacy_collections(client: QdrantClient) -> List[str]:
    """Return every non-canonical collection name (anything not ``cm_*``), sorted."""
    existing = client.get_collections().collections
    return sorted(col.name for col in existing if not col.name.startswith("cm_"))
|
||||
|
||||
|
||||
def main():
    """CLI entry point: migrate legacy Qdrant collections into the canonical
    schema using the qdrant-client library.

    With no --collections/--all it only lists the candidates. Exits with
    status 1 when a collection failed and --continue-on-error was not given.
    """
    parser = argparse.ArgumentParser(
        description="Migrate Qdrant collections to canonical schema"
    )
    parser.add_argument(
        "--host",
        default=os.getenv("QDRANT_HOST", "localhost"),
        help="Qdrant host"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=int(os.getenv("QDRANT_PORT", "6333")),
        help="Qdrant port"
    )
    parser.add_argument(
        "--collections",
        help="Comma-separated list of collections to migrate"
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Migrate all legacy collections"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be migrated without writing"
    )
    parser.add_argument(
        "--tenant-id",
        default=DEFAULT_CONFIG["tenant_id"],
        help="Tenant ID for payloads"
    )
    parser.add_argument(
        "--team-id",
        default=DEFAULT_CONFIG["team_id"],
        help="Team ID for payloads"
    )
    parser.add_argument(
        "--dim",
        type=int,
        default=DEFAULT_CONFIG["text_dim"],
        help="Vector dimension"
    )
    parser.add_argument(
        "--skip-dim-check",
        action="store_true",
        help="Skip dimension/metric validation (DANGEROUS - may corrupt data)"
    )
    parser.add_argument(
        "--continue-on-error",
        action="store_true",
        help="Continue migrating other collections if one fails"
    )

    args = parser.parse_args()

    # Configuration
    config = DEFAULT_CONFIG.copy()
    config["tenant_id"] = args.tenant_id
    config["team_id"] = args.team_id
    config["text_dim"] = args.dim

    # Connect to Qdrant
    logger.info(f"Connecting to Qdrant at {args.host}:{args.port}")
    client = QdrantClient(host=args.host, port=args.port)

    # Determine collections to migrate
    if args.collections:
        collections = [c.strip() for c in args.collections.split(",")]
    elif args.all:
        collections = discover_legacy_collections(client)
        logger.info(f"Discovered {len(collections)} legacy collections")
    else:
        # No selection given: list candidates and exit without migrating.
        legacy = discover_legacy_collections(client)
        print("\nLegacy collections available for migration:")
        for col in legacy:
            info = parse_collection_name(col)
            info_str = f"→ agent={info['agent_id']}, scope={info['scope']}" if info else "→ (unknown pattern)"
            print(f"  - {col} {info_str}")
        print("\nUse --collections <names> or --all to migrate")
        return

    # Target collection
    target_collection = get_canonical_collection_name("text", config["text_dim"])

    # Ensure target collection exists (skipped in dry-run so nothing is written)
    if not args.dry_run:
        logger.info(f"Ensuring target collection: {target_collection}")
        ensure_collection(
            client,
            target_collection,
            config["text_dim"],
            config["text_metric"],
        )

    # Migrate
    stats = MigrationStats()

    logger.info(f"Starting migration ({'DRY RUN' if args.dry_run else 'LIVE'})")
    logger.info(f"Target collection: {target_collection}")

    failed_collections = []

    for collection in collections:
        try:
            migrate_collection(
                client=client,
                source_collection=collection,
                target_collection=target_collection,
                config=config,
                stats=stats,
                dry_run=args.dry_run,
                skip_dim_check=args.skip_dim_check,
            )
        except DimMismatchError as e:
            # Handled separately from generic failures so the operator gets an
            # actionable hint about --continue-on-error.
            logger.error(f"Failed to migrate {collection}: {e}")
            stats.add_error(collection, "", str(e))
            failed_collections.append(collection)
            if not args.continue_on_error:
                logger.error("Stopping migration. Use --continue-on-error to skip failed collections.")
                break
        except Exception as e:
            logger.error(f"Failed to migrate {collection}: {e}")
            stats.add_error(collection, "", str(e))
            failed_collections.append(collection)
            if not args.continue_on_error:
                break

    # Summary
    print("\n" + "=" * 50)
    print("MIGRATION SUMMARY")
    print("=" * 50)
    summary = stats.summary()
    print(f"Collections processed: {summary['collections_processed']}/{len(collections)}")
    print(f"Points read: {summary['points_read']}")
    print(f"Points migrated: {summary['points_migrated']}")
    print(f"Points skipped: {summary['points_skipped']}")
    print(f"Errors: {summary['errors_count']}")

    if failed_collections:
        print(f"\nFailed collections ({len(failed_collections)}):")
        for col in failed_collections:
            print(f"  - {col}")

    if summary['errors']:
        print("\nFirst 10 errors:")
        for err in summary['errors']:
            print(f"  - {err['collection']}:{err['point_id']} - {err['error'][:100]}")

    if args.dry_run:
        print("\n[DRY RUN] No changes were made")

    # Exit with error code if there were failures
    if failed_collections and not args.continue_on_error:
        sys.exit(1)
|
||||
360
scripts/qdrant_parity_check.py
Executable file
360
scripts/qdrant_parity_check.py
Executable file
@@ -0,0 +1,360 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Parity Check: Compare Legacy vs Canonical Collections
|
||||
|
||||
Verifies that migration preserved data correctly by comparing:
|
||||
1. Point counts
|
||||
2. Sample search results (topK similarity)
|
||||
3. Payload field presence
|
||||
|
||||
Usage:
|
||||
python qdrant_parity_check.py --agents helion,nutra,druid
|
||||
python qdrant_parity_check.py --all
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
try:
|
||||
from qdrant_client import QdrantClient
|
||||
except ImportError:
|
||||
print("Error: qdrant-client not installed. Run: pip install qdrant-client")
|
||||
sys.exit(1)
|
||||
|
||||
# Add parent to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from services.memory.qdrant.collections import get_canonical_collection_name
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s"
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Agent -> legacy collection patterns
|
||||
AGENT_LEGACY_COLLECTIONS = {
|
||||
"helion": ["helion_docs", "helion_messages"],
|
||||
"nutra": ["nutra_docs", "nutra_messages", "nutra_food_knowledge"],
|
||||
"druid": ["druid_docs", "druid_messages", "druid_legal_kb"],
|
||||
"greenfood": ["greenfood_docs", "greenfood_messages"],
|
||||
"agromatrix": ["agromatrix_docs", "agromatrix_messages"],
|
||||
"daarwizz": ["daarwizz_docs", "daarwizz_messages"],
|
||||
}
|
||||
|
||||
|
||||
class ParityStats:
    """Accumulates parity-check results: pass/fail counters plus messages."""

    def __init__(self):
        self.warnings = []
        self.errors = []
        self.checks_passed = 0
        self.checks_failed = 0

    def add_warning(self, msg: str):
        """Record and log a non-fatal finding."""
        self.warnings.append(msg)
        logger.warning(msg)

    def add_error(self, msg: str):
        """Record a failed check: bump the counter, store the message, log it."""
        self.checks_failed += 1
        self.errors.append(msg)
        logger.error(msg)

    def add_pass(self, msg: str):
        """Record a successful check and log it with a check mark."""
        self.checks_passed += 1
        logger.info(f"✓ {msg}")

    def summary(self) -> Dict[str, Any]:
        """Return counters plus (at most) the first 10 error messages."""
        return {
            "passed": self.checks_passed,
            "failed": self.checks_failed,
            "warnings": len(self.warnings),
            "errors": self.errors[:10],
        }
|
||||
|
||||
|
||||
def get_collection_count(client: QdrantClient, collection_name: str) -> Optional[int]:
    """Return the collection's point count, or None when the lookup fails."""
    try:
        return client.get_collection(collection_name).points_count
    except Exception:
        return None
|
||||
|
||||
|
||||
def get_sample_vectors(
    client: QdrantClient,
    collection_name: str,
    limit: int = 5
) -> List[Tuple[str, List[float]]]:
    """Fetch up to *limit* (id, vector) pairs to use as parity probe queries.

    Returns an empty list (with a logged warning) on any failure.
    """
    try:
        batch, _next_offset = client.scroll(
            collection_name=collection_name,
            limit=limit,
            with_vectors=True,
            with_payload=False,
        )
        samples = [(str(point.id), point.vector) for point in batch]
    except Exception as e:
        logger.warning(f"Could not get samples from {collection_name}: {e}")
        return []
    return samples
|
||||
|
||||
|
||||
def search_in_collection(
    client: QdrantClient,
    collection_name: str,
    query_vector: List[float],
    limit: int = 10,
) -> List[Dict[str, Any]]:
    """Run a vector search and condense hits to id / score / payload-key dicts.

    Returns an empty list (with a logged warning) on any failure.
    """
    try:
        hits = client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=limit,
            with_payload=True,
        )
        condensed = []
        for hit in hits:
            keys = list(hit.payload.keys()) if hit.payload else []
            condensed.append({
                "id": str(hit.id),
                "score": hit.score,
                "payload_keys": keys,
            })
        return condensed
    except Exception as e:
        logger.warning(f"Search failed in {collection_name}: {e}")
        return []
|
||||
|
||||
|
||||
def check_point_counts(
    client: QdrantClient,
    agent: str,
    canonical_collection: str,
    stats: ParityStats,
) -> None:
    """Check that point counts match between legacy and canonical.

    NOTE(review): currently only totals the legacy side; *canonical_collection*
    is accepted but never queried (counting this agent's points there would
    require a filtered scroll, per the inline comment below) — confirm whether
    a canonical-side count should be added.
    """
    legacy_collections = AGENT_LEGACY_COLLECTIONS.get(agent, [])

    if not legacy_collections:
        stats.add_warning(f"No known legacy collections for agent: {agent}")
        return

    # Count legacy points; missing collections are warnings, not failures.
    legacy_total = 0
    for legacy_col in legacy_collections:
        count = get_collection_count(client, legacy_col)
        if count is not None:
            legacy_total += count
            logger.info(f"  Legacy {legacy_col}: {count} points")
        else:
            stats.add_warning(f"  Legacy collection not found: {legacy_col}")

    # Search canonical for this agent's points
    # We can't easily count without scrolling through all, so we'll do a sample check
    logger.info(f"  Legacy total: {legacy_total} points")

    if legacy_total > 0:
        stats.add_pass(f"{agent}: {legacy_total} points in legacy collections")
|
||||
|
||||
|
||||
def check_search_parity(
    client: QdrantClient,
    agent: str,
    canonical_collection: str,
    stats: ParityStats,
    num_samples: int = 3,
    topk: int = 5,
) -> None:
    """Compare top-1 search scores between legacy and canonical collections.

    For each legacy collection of *agent*, a few stored vectors are replayed
    as queries against both the legacy and the canonical collection.  A top-1
    score gap above 0.1 becomes a warning; a canonical search with no results
    at all is an error.
    """
    for legacy_col in AGENT_LEGACY_COLLECTIONS.get(agent, []):
        samples = get_sample_vectors(client, legacy_col, limit=num_samples)
        if not samples:
            continue

        logger.info(f"  Checking {legacy_col} with {len(samples)} sample queries")

        for point_id, query_vector in samples:
            legacy_hits = search_in_collection(
                client, legacy_col, query_vector, limit=topk
            )
            # Canonical search (would need an agent filter in production).
            canonical_hits = search_in_collection(
                client, canonical_collection, query_vector, limit=topk
            )

            if not legacy_hits:
                stats.add_warning(f"  No results from legacy for point {point_id}")
                continue
            if not canonical_hits:
                stats.add_error(f"  No results from canonical for point {point_id}")
                continue

            # A top-1 score drift beyond 0.1 suggests the canonical copy differs.
            delta = abs(legacy_hits[0]["score"] - canonical_hits[0]["score"])
            if delta > 0.1:
                stats.add_warning(
                    f"  Score difference for {point_id}: "
                    f"legacy={legacy_hits[0]['score']:.4f}, canonical={canonical_hits[0]['score']:.4f}"
                )
            else:
                stats.add_pass(
                    f"{legacy_col} point {point_id}: score diff {delta:.4f}"
                )
|
||||
|
||||
|
||||
def check_payload_schema(
    client: QdrantClient,
    canonical_collection: str,
    stats: ParityStats,
) -> None:
    """Check that canonical payloads carry all required cm_payload_v1 fields.

    Scrolls a sample of up to 10 points (payload only, no vectors) and records
    an error for each point missing a required field or carrying a wrong
    schema_version, a pass otherwise.

    Fix: the previous implementation first called get_sample_vectors (which
    fetched full vectors it never used) merely to gate an unguarded second
    scroll.  A single guarded payload-only scroll does the same job without
    the wasted vector transfer.
    """
    required_fields = [
        "schema_version", "tenant_id", "scope", "visibility",
        "indexed", "source_id", "chunk", "fingerprint", "created_at"
    ]

    try:
        points, _ = client.scroll(
            collection_name=canonical_collection,
            limit=10,
            with_payload=True,
            with_vectors=False,
        )
    except Exception as e:
        stats.add_warning(
            f"Could not sample canonical collection for schema check: {e}"
        )
        return

    if not points:
        stats.add_warning("Could not sample canonical collection for schema check")
        return

    for point in points:
        payload = point.payload or {}
        missing = [f for f in required_fields if f not in payload]

        if missing:
            stats.add_error(
                f"Point {point.id} missing required fields: {missing}"
            )
        elif payload.get("schema_version") != "cm_payload_v1":
            # All fields present but the version marker is wrong.
            stats.add_error(
                f"Point {point.id} has invalid schema_version: "
                f"{payload.get('schema_version')}"
            )
        else:
            stats.add_pass(f"Point {point.id} has valid schema")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run legacy↔canonical parity checks and print a summary.

    Exits with status 1 when the canonical collection is missing or when any
    check fails.
    """
    parser = argparse.ArgumentParser(
        description="Check parity between legacy and canonical Qdrant collections"
    )
    parser.add_argument(
        "--host",
        default=os.getenv("QDRANT_HOST", "localhost"),
        help="Qdrant host",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=int(os.getenv("QDRANT_PORT", "6333")),
        help="Qdrant port",
    )
    parser.add_argument("--agents", help="Comma-separated list of agents to check")
    parser.add_argument("--all", action="store_true", help="Check all known agents")
    parser.add_argument(
        "--dim",
        type=int,
        default=1024,
        help="Vector dimension for canonical collection",
    )
    args = parser.parse_args()

    logger.info(f"Connecting to Qdrant at {args.host}:{args.port}")
    client = QdrantClient(host=args.host, port=args.port)

    # Resolve which agents to check; with no selection, list and bail out.
    if args.agents:
        agents = [name.strip().lower() for name in args.agents.split(",")]
    elif args.all:
        agents = list(AGENT_LEGACY_COLLECTIONS.keys())
    else:
        print("Available agents:", ", ".join(AGENT_LEGACY_COLLECTIONS.keys()))
        print("\nUse --agents <names> or --all to run parity check")
        return

    canonical_collection = get_canonical_collection_name("text", args.dim)
    logger.info(f"Canonical collection: {canonical_collection}")

    # The canonical collection must exist before anything else is checked.
    canonical_count = get_collection_count(client, canonical_collection)
    if canonical_count is None:
        logger.error(f"Canonical collection {canonical_collection} not found!")
        sys.exit(1)
    logger.info(f"Canonical collection has {canonical_count} points")

    stats = ParityStats()
    for agent in agents:
        logger.info(f"\n=== Checking agent: {agent} ===")
        check_point_counts(client, agent, canonical_collection, stats)
        check_search_parity(client, agent, canonical_collection, stats)

    # Schema check runs once, not per agent.
    logger.info("\n=== Checking payload schema ===")
    check_payload_schema(client, canonical_collection, stats)

    banner = "=" * 50
    print("\n" + banner)
    print("PARITY CHECK SUMMARY")
    print(banner)
    summary = stats.summary()
    print(f"Checks passed: {summary['passed']}")
    print(f"Checks failed: {summary['failed']}")
    print(f"Warnings: {summary['warnings']}")

    if summary['errors']:
        print("\nErrors:")
        for err in summary['errors']:
            print(f"  - {err}")

    if summary['failed'] > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
307
scripts/qdrant_smoke_test.py
Normal file
307
scripts/qdrant_smoke_test.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Security Smoke Test
|
||||
|
||||
Verifies security invariants for canonical collection filters.
|
||||
|
||||
Usage:
|
||||
python qdrant_smoke_test.py --host dagi-qdrant-node1
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent to path
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from services.memory.qdrant.filters import (
|
||||
AccessContext,
|
||||
FilterSecurityError,
|
||||
build_qdrant_filter,
|
||||
build_multi_agent_filter,
|
||||
build_agent_only_filter,
|
||||
)
|
||||
|
||||
|
||||
def test_multi_agent_unauthorized_raises():
    """Test: non-admin requesting unauthorized agent_ids → error"""
    print("\n[TEST 1] Multi-agent unauthorized access...")

    ctx = AccessContext(
        tenant_id="t_daarion",
        team_id="team_core",
        allowed_agent_ids=["agt_helion", "agt_nutra"],  # Only these allowed
    )

    try:
        # agt_druid is deliberately outside the allowlist.
        build_multi_agent_filter(ctx, agent_ids=["agt_helion", "agt_druid"])
    except FilterSecurityError as e:
        message = str(e)
        if "agt_druid" in message and "Unauthorized" in message:
            print(f"  ✅ PASS: Correctly raised error: {e}")
            return True
        print(f"  ❌ FAIL: Wrong error message: {e}")
        return False

    print("  ❌ FAIL: Should have raised FilterSecurityError")
    return False
|
||||
|
||||
|
||||
def test_multi_agent_requires_allowlist():
    """Test: non-admin without allowed_agent_ids → error"""
    print("\n[TEST 2] Multi-agent requires allowlist...")

    ctx = AccessContext(
        tenant_id="t_daarion",
        team_id="team_core",
        # No allowed_agent_ids!
    )

    try:
        build_multi_agent_filter(ctx, agent_ids=["agt_helion"])
    except FilterSecurityError as e:
        # The error must point at the missing allowlist, not something else.
        if "allowed_agent_ids" in str(e):
            print(f"  ✅ PASS: Correctly raised error: {e}")
            return True
        print(f"  ❌ FAIL: Wrong error message: {e}")
        return False

    print("  ❌ FAIL: Should have raised FilterSecurityError")
    return False
|
||||
|
||||
|
||||
def test_admin_default_no_private():
    """Test: admin default does NOT include private"""
    print("\n[TEST 3] Admin default excludes private...")

    ctx = AccessContext(
        tenant_id="t_daarion",
        is_admin=True,
        # No visibility specified, no include_private
    )

    result = build_qdrant_filter(ctx)

    if "should" not in result:
        print("  ❌ FAIL: No should in result")
        return False

    clauses = result["should"]
    # The first clause's first condition is expected to be the visibility gate.
    gate = clauses[0].get("must", [{}])[0]

    if gate.get("key") == "visibility":
        allowed = gate.get("match", {}).get("any", [])
        if "private" in allowed:
            print(f"  ❌ FAIL: Admin default includes private: {allowed}")
            return False
        if "public" in allowed and "confidential" in allowed:
            print(f"  ✅ PASS: Admin default is public+confidential: {allowed}")
            return True

    print(f"  ❌ FAIL: Unexpected filter structure: {clauses}")
    return False
|
||||
|
||||
|
||||
def test_admin_can_request_private():
    """Test: admin with include_private=True gets private"""
    print("\n[TEST 4] Admin can explicitly request private...")

    ctx = AccessContext(
        tenant_id="t_daarion",
        is_admin=True,
    )

    result = build_qdrant_filter(ctx, include_private=True)

    clauses = result.get("should", [])
    # First condition of the first clause should be the visibility gate.
    gate = clauses[0].get("must", [{}])[0] if clauses else {}

    if gate.get("key") == "visibility":
        allowed = gate.get("match", {}).get("any", [])
        if "private" in allowed:
            print(f"  ✅ PASS: Admin with include_private gets private: {allowed}")
            return True

    print(f"  ❌ FAIL: Admin with include_private should see private: {result}")
    return False
|
||||
|
||||
|
||||
def test_owner_gets_private():
    """Test: owner with include_private=True gets own private"""
    print("\n[TEST 5] Owner can access own private...")

    ctx = AccessContext(
        tenant_id="t_daarion",
        team_id="team_core",
        agent_id="agt_helion",
    )

    result = build_agent_only_filter(ctx, agent_id="agt_helion")
    should = result.get("should", [])

    def _grants_owner_private(conditions):
        # A clause grants private access when it both pins owner_id to the
        # agent AND restricts visibility to a set containing "private".
        owner_ok = any(
            c.get("key") == "owner_id" and c.get("match", {}).get("value") == "agt_helion"
            for c in conditions
        )
        private_ok = any(
            c.get("key") == "visibility" and "private" in str(c.get("match", {}))
            for c in conditions
        )
        return owner_ok and private_ok

    if any(_grants_owner_private(clause.get("must", [])) for clause in should):
        print("  ✅ PASS: Owner can access own private content")
        return True

    print(f"  ❌ FAIL: Owner should be able to access private: {should}")
    return False
|
||||
|
||||
|
||||
def test_tenant_always_required():
    """Test: tenant_id is always required"""
    print("\n[TEST 6] Tenant ID always required...")

    ctx = AccessContext(
        tenant_id="",  # Empty!
        team_id="team_core",
    )

    try:
        build_qdrant_filter(ctx)
    except FilterSecurityError as e:
        # The rejection must mention the offending field.
        if "tenant_id" in str(e):
            print(f"  ✅ PASS: Correctly raised error: {e}")
            return True
        print(f"  ❌ FAIL: Wrong error: {e}")
        return False

    print("  ❌ FAIL: Should have raised FilterSecurityError for empty tenant_id")
    return False
|
||||
|
||||
|
||||
def test_qdrant_filter_format(host: str, port: int):
    """Test: generated filters work with actual Qdrant.

    Returns True/False for pass/fail, or None when the test must be skipped
    (no qdrant-client installed, or no collections on the server).

    Fixes:
    - dropped the unused MatchAny import;
    - the vector dimension lookup now handles named-vector collections,
      where `info.config.params.vectors` is a dict rather than a single
      VectorParams (the old `.size` access crashed on such collections).

    NOTE(review): the generated filter is exercised for construction only;
    the actual Qdrant query uses a hand-built Filter because there is no
    dict→model converter here — confirm whether the generated filter should
    also be sent to the server.
    """
    print(f"\n[TEST 7] Qdrant filter format smoke test ({host}:{port})...")

    try:
        from qdrant_client import QdrantClient
    except ImportError:
        print("  ⚠️ SKIP: qdrant-client not installed")
        return None

    try:
        client = QdrantClient(host=host, port=port, timeout=5)

        # Get list of collections
        collections = client.get_collections().collections
        if not collections:
            print("  ⚠️ SKIP: No collections in Qdrant")
            return None

        # Use first collection for smoke test
        collection_name = collections[0].name
        print(f"  Using collection: {collection_name}")

        # Exercise filter generation (validates allowlist logic end-to-end).
        ctx = AccessContext(
            tenant_id="t_daarion",
            team_id="team_core",
            allowed_agent_ids=["agt_helion", "agt_nutra"],
        )
        build_multi_agent_filter(
            ctx,
            agent_ids=["agt_helion", "agt_nutra"],
            scope="docs"
        )

        # Build a dummy vector of the right dimension.  With named vectors,
        # the config is a dict of name -> VectorParams; take the first.
        info = client.get_collection(collection_name)
        vectors_cfg = info.config.params.vectors
        if isinstance(vectors_cfg, dict):
            vectors_cfg = next(iter(vectors_cfg.values()))
        dummy_vector = [0.0] * vectors_cfg.size

        from qdrant_client.models import Filter, FieldCondition, MatchValue

        # Manual filter conversion for test: we only verify that Qdrant
        # accepts the filter structure, not the search results themselves.
        results = client.search(
            collection_name=collection_name,
            query_vector=dummy_vector,
            limit=1,
            query_filter=Filter(
                must=[
                    FieldCondition(key="tenant_id", match=MatchValue(value="t_daarion")),
                ]
            )
        )

        print(f"  ✅ PASS: Qdrant accepts filter format (returned {len(results)} results)")
        return True

    except Exception as e:
        print(f"  ❌ FAIL: Qdrant error: {e}")
        return False
|
||||
|
||||
|
||||
def main():
    """Run all security smoke tests; exit 1 on any failure, 0 otherwise."""
    parser = argparse.ArgumentParser(description="Qdrant Security Smoke Test")
    parser.add_argument("--host", default=os.getenv("QDRANT_HOST", "localhost"))
    parser.add_argument("--port", type=int, default=int(os.getenv("QDRANT_PORT", "6333")))
    args = parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("QDRANT SECURITY SMOKE TEST")
    print(banner)

    # Unit tests (no Qdrant needed)
    results = [
        ("Multi-agent unauthorized", test_multi_agent_unauthorized_raises()),
        ("Multi-agent requires allowlist", test_multi_agent_requires_allowlist()),
        ("Admin default no private", test_admin_default_no_private()),
        ("Admin can request private", test_admin_can_request_private()),
        ("Owner gets private", test_owner_gets_private()),
        ("Tenant always required", test_tenant_always_required()),
    ]

    # Integration test (needs Qdrant); None means it was skipped.
    qdrant_result = test_qdrant_filter_format(args.host, args.port)
    if qdrant_result is not None:
        results.append(("Qdrant filter format", qdrant_result))

    print("\n" + banner)
    print("SUMMARY")
    print(banner)

    passed = failed = skipped = 0
    for name, outcome in results:
        if outcome is True:
            status = "✅ PASS"
            passed += 1
        elif outcome is False:
            status = "❌ FAIL"
            failed += 1
        else:
            status = "⚠️ SKIP"
            skipped += 1
        print(f"  {status}: {name}")

    print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped")

    if failed > 0:
        print("\n❌ SMOKE TEST FAILED")
        sys.exit(1)
    else:
        print("\n✅ SMOKE TEST PASSED - Ready for cutover")
        sys.exit(0)


if __name__ == "__main__":
    main()
|
||||
221
scripts/setup_qdrant_collections.py
Normal file
221
scripts/setup_qdrant_collections.py
Normal file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Qdrant Vector Collections Setup for Helion Memory v3.0
|
||||
|
||||
Collections:
|
||||
1. helion_memory_items - Long-term memory facts (preferences, decisions, lessons)
|
||||
2. helion_artifacts - Documents, specs, whitepaper embeddings
|
||||
3. helion_messages - Recent message embeddings for context retrieval
|
||||
|
||||
Run: python setup_qdrant_collections.py [--host HOST] [--port PORT]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as qmodels
|
||||
|
||||
# Cohere embed-multilingual-v3.0 produces 1024-dimensional vectors
|
||||
EMBEDDING_DIMENSIONS = 1024
|
||||
|
||||
|
||||
def _ensure_collection(client, name, indexing_threshold=None):
    """Create collection *name* (1024-dim Cosine, on-disk payload) if absent.

    indexing_threshold: optional OptimizersConfigDiff threshold; None keeps
    the server default (used by helion_artifacts).
    """
    print(f"\n📦 Setting up collection: {name}")

    if client.collection_exists(name):
        print(f"  ℹ️ Collection already exists: {name}")
        return

    optimizers = (
        qmodels.OptimizersConfigDiff(indexing_threshold=indexing_threshold)
        if indexing_threshold is not None
        else None
    )
    client.create_collection(
        collection_name=name,
        vectors_config=qmodels.VectorParams(
            size=EMBEDDING_DIMENSIONS,
            distance=qmodels.Distance.COSINE,
        ),
        optimizers_config=optimizers,
        # On-disk storage for large collections
        on_disk_payload=True,
    )
    print(f"  ✅ Created collection: {name}")


def _create_payload_indexes(client, name, indexes):
    """Create payload indexes on *name*; already-existing indexes are reported, not fatal."""
    print(f"  📇 Creating payload indexes...")
    for field_name, field_type in indexes:
        try:
            client.create_payload_index(
                collection_name=name,
                field_name=field_name,
                field_schema=field_type,
                wait=False,
            )
            print(f"    ✅ Index: {field_name} ({field_type.value})")
        except Exception as e:
            if "already exists" in str(e).lower():
                print(f"    ℹ️ Index exists: {field_name}")
            else:
                print(f"    ⚠️ Index {field_name}: {e}")


def setup_collections(host: str = "localhost", port: int = 6333):
    """Create and configure Qdrant collections for Helion Memory.

    Collections (all 1024-dim Cosine, matching Cohere embed-multilingual-v3.0):
      1. helion_memory_items - long-term memory facts
      2. helion_artifacts    - document/spec/whitepaper embeddings
      3. helion_messages     - recent message embeddings for context retrieval

    Idempotent: existing collections and indexes are left in place.  Exits
    the process if Qdrant is unreachable.  Returns True on success.

    Refactor: the create-collection + create-indexes sequence was previously
    triplicated verbatim; it is now factored into _ensure_collection and
    _create_payload_indexes.
    """
    print(f"🔌 Connecting to Qdrant at {host}:{port}...")
    client = QdrantClient(host=host, port=port)

    # Fail fast if the server is unreachable.
    try:
        collections = client.get_collections()
        print(f"✅ Connected. Existing collections: {[c.name for c in collections.collections]}")
    except Exception as e:
        print(f"❌ Failed to connect: {e}")
        sys.exit(1)

    # 1. Long-term memory facts; higher indexing threshold for bulk writes.
    _ensure_collection(client, "helion_memory_items", indexing_threshold=10000)
    _create_payload_indexes(client, "helion_memory_items", [
        ("platform_user_id", qmodels.PayloadSchemaType.KEYWORD),
        ("type", qmodels.PayloadSchemaType.KEYWORD),
        ("category", qmodels.PayloadSchemaType.KEYWORD),
        ("visibility", qmodels.PayloadSchemaType.KEYWORD),
        ("scope_ref", qmodels.PayloadSchemaType.KEYWORD),
        ("confidence", qmodels.PayloadSchemaType.FLOAT),
        ("created_at", qmodels.PayloadSchemaType.DATETIME),
        ("expires_at", qmodels.PayloadSchemaType.DATETIME),
        ("archived", qmodels.PayloadSchemaType.BOOL),
    ])

    # 2. Documents, specs, whitepaper embeddings (default optimizer config).
    _ensure_collection(client, "helion_artifacts")
    _create_payload_indexes(client, "helion_artifacts", [
        ("artifact_id", qmodels.PayloadSchemaType.KEYWORD),
        ("project_id", qmodels.PayloadSchemaType.KEYWORD),
        ("source", qmodels.PayloadSchemaType.KEYWORD),
        ("source_type", qmodels.PayloadSchemaType.KEYWORD),  # whitepaper, spec, landing, faq
        ("language", qmodels.PayloadSchemaType.KEYWORD),
        ("version", qmodels.PayloadSchemaType.KEYWORD),
        ("chunk_index", qmodels.PayloadSchemaType.INTEGER),
        ("created_at", qmodels.PayloadSchemaType.DATETIME),
    ])

    # 3. Recent messages; lower threshold so new points index sooner.
    _ensure_collection(client, "helion_messages", indexing_threshold=5000)
    _create_payload_indexes(client, "helion_messages", [
        ("conversation_id", qmodels.PayloadSchemaType.KEYWORD),
        ("platform_user_id", qmodels.PayloadSchemaType.KEYWORD),
        ("channel", qmodels.PayloadSchemaType.KEYWORD),
        ("chat_id", qmodels.PayloadSchemaType.KEYWORD),
        ("role", qmodels.PayloadSchemaType.KEYWORD),  # user, assistant, system
        ("timestamp", qmodels.PayloadSchemaType.DATETIME),
    ])

    # Summary of every collection now present.
    print("\n" + "=" * 60)
    print("📊 Qdrant Collections Summary")
    print("=" * 60)

    for coll in client.get_collections().collections:
        info = client.get_collection(coll.name)
        print(f"\n{coll.name}:")
        print(f"  Points: {info.points_count}")
        print(f"  Vectors: {info.vectors_count}")
        print(f"  Status: {info.status}")

    print("\n✅ Qdrant setup complete!")

    return True
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments and run the Qdrant collection setup."""
    parser = argparse.ArgumentParser(description="Setup Qdrant collections for Helion Memory")
    parser.add_argument("--host", default="localhost", help="Qdrant host")
    parser.add_argument("--port", type=int, default=6333, help="Qdrant port")
    args = parser.parse_args()

    setup_collections(host=args.host, port=args.port)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user