feat: Add Alateya, Clan, Eonarch agents + fix gateway-router connection

## Agents Added
- Alateya: R&D, biotech, innovations
- Clan (Spirit): Community spirit agent
- Eonarch: Consciousness evolution agent

## Changes
- docker-compose.node1.yml: Added tokens for all 3 new agents
- gateway-bot/http_api.py: Added configs and webhook endpoints
- gateway-bot/clan_prompt.txt: New prompt file
- gateway-bot/eonarch_prompt.txt: New prompt file

## Fixes
- Fixed ROUTER_URL from :9102 to :8000 (the router's internal container port); see the sketch below
- All 9 Telegram agents now working
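
A minimal sketch of the corrected setting, assuming the bot resolves the router by its compose service name (the name `gateway-router` here is hypothetical):

```python
import os

# Inside the compose network, containers reach each other on the
# container port (8000), not the host-published port (9102).
ROUTER_URL = os.getenv("ROUTER_URL", "http://gateway-router:8000")
```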

## Documentation
- Created PROJECT-MASTER-INDEX.md - single entry point
- Added various status documents and scripts

Tokens configured:
- Helion, NUTRA, Agromatrix (existing)
- Alateya, Clan, Eonarch (new)
- Druid, GreenFood, DAARWIZZ (configured)
Author: Apple
Date: 2026-01-28 06:40:34 -08:00
Parent: 4aeb69e7ae
Commit: 0c8bef82f4
120 changed files with 21905 additions and 425 deletions


@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Fix Neo4j schema - add missing relationships ASKED_ABOUT, WORKS_ON"""
import requests
import json
NEO4J_URL = "http://localhost:7474/db/neo4j/tx/commit"
AUTH = ("neo4j", "DaarionNeo4j2026!")
HEADERS = {"Content-Type": "application/json"}
def run_query(statement):
"""Execute a single Cypher statement"""
payload = {"statements": [{"statement": statement}]}
resp = requests.post(NEO4J_URL, auth=AUTH, headers=HEADERS, json=payload)
data = resp.json()
if data.get("errors"):
print(f"ERROR: {data['errors']}")
return None
return data
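# Hedged note: the queries below interpolate values into Cypher via f-strings,
# which is acceptable for the fixed lists in this script. For arbitrary input,
# the same tx/commit endpoint accepts query parameters, e.g.:
#   {"statements": [{"statement": "MERGE (t:Topic {name: $name}) RETURN t",
#                    "parameters": {"name": topic}}]}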
# Step 1: Add telegram_user_id alias to all User nodes
print("1. Adding telegram_user_id to User nodes...")
run_query("""
MATCH (u:User)
WHERE u.telegram_id IS NOT NULL AND u.telegram_user_id IS NULL
SET u.telegram_user_id = u.telegram_id
RETURN count(u) as updated
""")
# Step 2: Create Topics
print("2. Creating Topics...")
topics = ["EcoMiner", "BioMiner", "DAO Governance", "Tokenomics", "Staking", "Infrastructure"]
for topic in topics:
run_query(f"MERGE (t:Topic {{name: '{topic}'}}) RETURN t")
print(f" Created {len(topics)} topics")
# Step 3: Create Projects
print("3. Creating Projects...")
projects = ["Energy Union", "MicroDAO Daarion", "Helion Agent"]
for project in projects:
run_query(f"MERGE (p:Project {{name: '{project}', status: 'active'}}) RETURN p")
print(f" Created {len(projects)} projects")
# Step 4: Create ASKED_ABOUT relationships
print("4. Creating ASKED_ABOUT relationships...")
run_query("""
MATCH (u:User {username: 'ivantytar'})
MATCH (t:Topic) WHERE t.name IN ['EcoMiner', 'BioMiner', 'Tokenomics']
MERGE (u)-[:ASKED_ABOUT {count: 1, last_asked: datetime()}]->(t)
RETURN count(*) as created
""")
# Step 5: Create WORKS_ON relationships
print("5. Creating WORKS_ON relationships...")
run_query("""
MATCH (u:User {username: 'ivantytar'})
MATCH (p:Project) WHERE p.name IN ['Energy Union', 'MicroDAO Daarion']
MERGE (u)-[:WORKS_ON {role: 'founder', since: datetime()}]->(p)
RETURN count(*) as created
""")
# Step 6: Create indexes
print("6. Creating indexes...")
run_query("CREATE INDEX topic_name IF NOT EXISTS FOR (t:Topic) ON (t.name)")
run_query("CREATE INDEX project_name IF NOT EXISTS FOR (p:Project) ON (p.name)")
run_query("CREATE INDEX user_telegram_id IF NOT EXISTS FOR (u:User) ON (u.telegram_user_id)")
# Verify
print("\n=== Verification ===")
result = run_query("CALL db.relationshipTypes()")
if result:
types = [row["row"][0] for row in result.get("results", [{}])[0].get("data", [])]
print(f"Relationship types: {types}")
result = run_query("""
MATCH (u:User {username: 'ivantytar'})-[r]->(n)
RETURN type(r) as rel, labels(n)[0] as node, n.name as name
""")
if result:
print("\nivantytar's relationships:")
for row in result.get("results", [{}])[0].get("data", []):
r = row["row"]
print(f" -{r[0]}-> {r[1]}: {r[2]}")
print("\n✅ Neo4j schema updated!")


@@ -0,0 +1,98 @@
#!/usr/bin/env python3
"""
Отримати chat_id для адмін-сповіщень
Запустити бота і надіслати йому /start від адміна
"""
import os
import sys
import asyncio
import httpx
BOT_TOKEN = os.getenv("ADMIN_TELEGRAM_BOT_TOKEN", "8589292566:AAEmPvS6nY9e-Y-TZm04CAHWlaFnWVxajE4")
async def get_updates():
"""Отримати останні updates від бота"""
url = f"https://api.telegram.org/bot{BOT_TOKEN}/getUpdates"
async with httpx.AsyncClient() as client:
response = await client.get(url)
response.raise_for_status()
data = response.json()
if not data.get("ok"):
print(f"❌ Помилка: {data}")
return
updates = data.get("result", [])
if not updates:
print("⚠️ Немає повідомлень.")
print("\n📱 Щоб отримати chat_id:")
print("1. Знайдіть бота Sofia в Telegram")
print("2. Надішліть йому /start")
print("3. Запустіть цей скрипт знову")
return
print(f"\n✅ Знайдено {len(updates)} повідомлень:\n")
for update in updates[-5:]: # Показати останні 5
message = update.get("message", {})
from_user = message.get("from", {})
chat = message.get("chat", {})
text = message.get("text", "")
chat_id = chat.get("id")
username = from_user.get("username", "")
first_name = from_user.get("first_name", "")
print(f"Chat ID: {chat_id}")
print(f"From: {first_name} (@{username})")
print(f"Message: {text}")
print("-" * 50)
# Show the most recent chat_id
last_update = updates[-1]
last_chat_id = last_update.get("message", {}).get("chat", {}).get("id")
print(f"\n✅ Використовуйте цей chat_id: {last_chat_id}")
print(f"\n📝 Команда для оновлення .env:")
print(f"sed -i 's/ADMIN_CHAT_ID=.*/ADMIN_CHAT_ID={last_chat_id}/' /opt/microdao-daarion/.env")
async def send_test_message(chat_id: int):
"""Відправити тестове повідомлення"""
url = f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage"
message = (
"🤖 *Sofia Monitoring Bot Active*\n\n"
"✅ Система моніторингу підключена!\n"
"Ви будете отримувати алерти при критичних проблемах:\n\n"
"• Порожні Qdrant колекції\n"
"• Втрата даних (>10%)\n"
"• Проблеми з бекапами\n"
"• Критичні помилки сервісів\n\n"
"_Моніторинг запускається кожні 6 годин_"
)
payload = {
"chat_id": chat_id,
"text": message,
"parse_mode": "Markdown"
}
async with httpx.AsyncClient() as client:
response = await client.post(url, json=payload)
response.raise_for_status()
print("✅ Тестове повідомлення надіслано!")
async def main():
"""Головна функція"""
if len(sys.argv) > 1:
# If a chat_id was passed, send a test message
chat_id = int(sys.argv[1])
await send_test_message(chat_id)
else:
# Otherwise, fetch the chat_id
await get_updates()
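# Usage sketch (the filename get_admin_chat_id.py is hypothetical; it matches
# the argv handling in main() above):
#   python3 get_admin_chat_id.py              # list recent updates and chat_ids
#   python3 get_admin_chat_id.py 123456789    # send a test message to that chat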
if __name__ == "__main__":
asyncio.run(main())


@@ -0,0 +1,296 @@
#!/usr/bin/env python3
"""
Qdrant Collections Health Monitor
Checks collection health and sends alerts when problems are found.
"""
import asyncio
import json
import logging
import os
import sys
from datetime import datetime
from typing import Dict, List, Optional
import httpx
# Configuration
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
TELEGRAM_BOT_TOKEN = os.getenv("ADMIN_TELEGRAM_BOT_TOKEN", "")
ADMIN_CHAT_ID = os.getenv("ADMIN_CHAT_ID", "")
MIN_POINTS_THRESHOLD = int(os.getenv("MIN_POINTS_THRESHOLD", "10"))
STATE_FILE = "/opt/backups/collections-state.json"
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class CollectionsHealthMonitor:
"""Моніторинг здоров'я Qdrant колекцій"""
def __init__(self):
self.http_client = httpx.AsyncClient(timeout=30.0)
self.previous_state = self.load_state()
self.alerts: List[str] = []
def load_state(self) -> Dict:
"""Завантажити попередній стан з файлу"""
try:
if os.path.exists(STATE_FILE):
with open(STATE_FILE, 'r') as f:
return json.load(f)
except Exception as e:
logger.error(f"Помилка завантаження стану: {e}")
return {}
def save_state(self, state: Dict):
"""Зберегти поточний стан у файл"""
try:
os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
with open(STATE_FILE, 'w') as f:
json.dump(state, f, indent=2)
except Exception as e:
logger.error(f"Помилка збереження стану: {e}")
async def get_all_collections(self) -> List[Dict]:
"""Отримати список всіх колекцій"""
try:
url = f"{QDRANT_URL}/collections"
response = await self.http_client.get(url)
response.raise_for_status()
data = response.json()
collections = data.get("result", {}).get("collections", [])
logger.info(f"Знайдено {len(collections)} колекцій")
return collections
except Exception as e:
logger.error(f"Помилка отримання колекцій: {e}")
return []
async def get_collection_info(self, collection_name: str) -> Optional[Dict]:
"""Отримати детальну інформацію про колекцію"""
try:
url = f"{QDRANT_URL}/collections/{collection_name}"
response = await self.http_client.get(url)
response.raise_for_status()
data = response.json()
result = data.get("result", {})
return {
"name": collection_name,
"points_count": result.get("points_count", 0),
"segments_count": result.get("segments_count", 0),
"status": result.get("status", "unknown"),
"vectors_count": result.get("vectors_count", 0),
"indexed_vectors_count": result.get("indexed_vectors_count", 0),
}
except Exception as e:
logger.error(f"Помилка отримання інфо про {collection_name}: {e}")
return None
async def check_collection_health(self, collection: Dict) -> Dict:
"""Перевірити здоров'я колекції"""
name = collection.get("name")
info = await self.get_collection_info(name)
if not info:
return {
"name": name,
"status": "error",
"issues": ["Не вдалося отримати інформацію"]
}
issues = []
warnings = []
# Check 1: empty collection
if info["points_count"] == 0:
issues.append("Колекція порожня (0 точок)")
# Перевірка 2: Дуже мало даних
elif info["points_count"] < MIN_POINTS_THRESHOLD:
warnings.append(f"Мало даних ({info['points_count']} точок, мінімум {MIN_POINTS_THRESHOLD})")
# Перевірка 3: Зменшення кількості точок
prev_count = self.previous_state.get(name, {}).get("points_count", 0)
if prev_count > 0 and info["points_count"] < prev_count * 0.9: # Зменшення більше ніж на 10%
decrease = prev_count - info["points_count"]
issues.append(f"Втрата даних: було {prev_count}, зараз {info['points_count']} (-{decrease})")
# Перевірка 4: Статус колекції
if info["status"] != "green":
issues.append(f"Статус: {info['status']} (очікується green)")
# Визначити загальний стан
if issues:
status = "critical"
elif warnings:
status = "warning"
else:
status = "healthy"
return {
"name": name,
"status": status,
"info": info,
"issues": issues,
"warnings": warnings,
"previous_count": prev_count
}
async def send_telegram_alert(self, message: str):
"""Відправити сповіщення в Telegram"""
if not TELEGRAM_BOT_TOKEN or not ADMIN_CHAT_ID:
logger.warning("Telegram credentials not configured, skipping alert")
return
try:
url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
payload = {
"chat_id": ADMIN_CHAT_ID,
"text": message,
"parse_mode": "Markdown"
}
response = await self.http_client.post(url, json=payload)
response.raise_for_status()
logger.info("Telegram alert sent successfully")
except Exception as e:
logger.error(f"Помилка відправки Telegram сповіщення: {e}")
async def monitor(self):
"""Виконати моніторинг всіх колекцій"""
logger.info("🔍 Початок моніторингу Qdrant колекцій...")
collections = await self.get_all_collections()
if not collections:
alert = "⚠️ *Qdrant Collections Monitor*\n\nНе знайдено жодної колекції!"
self.alerts.append(alert)
await self.send_telegram_alert(alert)
return
results = []
critical_count = 0
warning_count = 0
healthy_count = 0
# Check each collection
for collection in collections:
health = await self.check_collection_health(collection)
results.append(health)
if health["status"] == "critical":
critical_count += 1
elif health["status"] == "warning":
warning_count += 1
else:
healthy_count += 1
# Build the report
logger.info(f"✅ Healthy: {healthy_count}, ⚠️ Warnings: {warning_count}, 🔴 Critical: {critical_count}")
# Save the current state
new_state = {}
for result in results:
if result["info"]:
new_state[result["name"]] = result["info"]
self.save_state(new_state)
# Send alerts for critical problems
if critical_count > 0:
await self.send_critical_alerts(results)
# Print the detailed report
self.print_report(results, critical_count, warning_count, healthy_count)
return results
async def send_critical_alerts(self, results: List[Dict]):
"""Відправити критичні алерти"""
critical_issues = [r for r in results if r["status"] == "critical"]
if not critical_issues:
return
message = "🔴 *Qdrant Collections Alert*\n\n"
message += f"Виявлено {len(critical_issues)} критичних проблем:\n\n"
for issue in critical_issues:
message += f"*{issue['name']}*\n"
for problem in issue["issues"]:
message += f"{problem}\n"
message += "\n"
message += f"ас: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}_"
await self.send_telegram_alert(message)
def print_report(self, results: List[Dict], critical: int, warning: int, healthy: int):
"""Вивести детальний звіт"""
print("\n" + "="*80)
print("📊 QDRANT COLLECTIONS HEALTH REPORT")
print("="*80)
print(f"Час: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Всього колекцій: {len(results)}")
print(f"✅ Здорові: {healthy}")
print(f"⚠️ Попередження: {warning}")
print(f"🔴 Критичні: {critical}")
print("="*80)
# Group by status
for status_type in ["critical", "warning", "healthy"]:
items = [r for r in results if r["status"] == status_type]
if not items:
continue
icon = {"critical": "🔴", "warning": "⚠️", "healthy": ""}[status_type]
print(f"\n{icon} {status_type.upper()}")
print("-"*80)
for item in items:
info = item.get("info", {})
print(f"\n{item['name']}:")
print(f" Points: {info.get('points_count', 0):,}")
print(f" Segments: {info.get('segments_count', 0)}")
print(f" Status: {info.get('status', 'unknown')}")
if item.get("issues"):
print(f" Issues:")
for issue in item["issues"]:
print(f"{issue}")
if item.get("warnings"):
print(f" Warnings:")
for warn in item["warnings"]:
print(f"{warn}")
print("\n" + "="*80 + "\n")
async def close(self):
"""Закрити HTTP клієнт"""
await self.http_client.aclose()
async def main():
"""Головна функція"""
monitor = CollectionsHealthMonitor()
try:
await monitor.monitor()
return 0
except Exception as e:
logger.error(f"Помилка моніторингу: {e}", exc_info=True)
return 1
finally:
await monitor.close()
if __name__ == "__main__":
exit_code = asyncio.run(main())
sys.exit(exit_code)


@@ -0,0 +1,441 @@
#!/usr/bin/env python3
"""
Qdrant Migration Script (REST API version)
No qdrant-client dependency - uses pure REST API.
Usage:
python3 qdrant_migrate_rest.py --dry-run
python3 qdrant_migrate_rest.py --all
"""
import argparse
import hashlib
import json
import logging
import os
import re
import sys
import urllib.request
import urllib.error
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)
# Configuration
DEFAULT_CONFIG = {
"qdrant_url": "http://localhost:6333",
"tenant_id": "t_daarion",
"team_id": "team_core",
"text_dim": 1024,
"text_metric": "Cosine", # Qdrant uses capitalized
"default_visibility": "confidential",
"default_owner_kind": "agent",
"batch_size": 100,
}
# Collection patterns
COLLECTION_PATTERNS = [
(r"^([a-z]+)_docs$", 1, "docs", []),
(r"^([a-z]+)_messages$", 1, "messages", []),
(r"^([a-z]+)_memory_items$", 1, "memory", []),
(r"^([a-z]+)_artifacts$", 1, "artifacts", []),
(r"^druid_legal_kb$", None, "docs", ["legal_kb"]),
(r"^nutra_food_knowledge$", None, "docs", ["food_kb"]),
(r"^memories$", None, "memory", []),
(r"^messages$", None, "messages", []),
]
AGENT_SLUGS = {
"helion": "agt_helion",
"nutra": "agt_nutra",
"druid": "agt_druid",
"greenfood": "agt_greenfood",
"agromatrix": "agt_agromatrix",
"daarwizz": "agt_daarwizz",
"alateya": "agt_alateya",
}
def qdrant_request(url: str, method: str = "GET", data: Optional[Dict] = None) -> Dict:
"""Make HTTP request to Qdrant."""
req = urllib.request.Request(url, method=method)
req.add_header("Content-Type", "application/json")
body = None
if data:
body = json.dumps(data).encode("utf-8")
try:
with urllib.request.urlopen(req, body, timeout=30) as resp:
return json.loads(resp.read().decode("utf-8"))
except urllib.error.HTTPError as e:
error_body = e.read().decode("utf-8") if e.fp else ""
raise Exception(f"Qdrant error {e.code}: {error_body}")
def get_collections(base_url: str) -> List[str]:
"""Get list of all collections."""
resp = qdrant_request(f"{base_url}/collections")
return [c["name"] for c in resp.get("result", {}).get("collections", [])]
def get_collection_info(base_url: str, name: str) -> Dict:
"""Get collection info including vector config."""
resp = qdrant_request(f"{base_url}/collections/{name}")
result = resp.get("result", {})
config = result.get("config", {}).get("params", {}).get("vectors", {})
return {
"name": name,
"points_count": result.get("points_count", 0),
"dim": config.get("size"),
"metric": config.get("distance"),
}
def create_collection(base_url: str, name: str, dim: int, metric: str) -> bool:
"""Create a new collection."""
data = {
"vectors": {
"size": dim,
"distance": metric,
}
}
try:
qdrant_request(f"{base_url}/collections/{name}", "PUT", data)
return True
except Exception as e:
if "already exists" in str(e).lower():
return False
raise
def scroll_points(base_url: str, collection: str, limit: int = 100, offset: Optional[str] = None) -> Tuple[List, Optional[str]]:
"""Scroll through collection points."""
data = {
"limit": limit,
"with_payload": True,
"with_vector": True,
}
if offset:
data["offset"] = offset
resp = qdrant_request(f"{base_url}/collections/{collection}/points/scroll", "POST", data)
result = resp.get("result", {})
points = result.get("points", [])
next_offset = result.get("next_page_offset")
return points, next_offset
def upsert_points(base_url: str, collection: str, points: List[Dict]) -> None:
"""Upsert points to collection."""
data = {"points": points}
qdrant_request(f"{base_url}/collections/{collection}/points", "PUT", data)
def compute_deterministic_id(source_collection: str, legacy_id: str, source_id: str, chunk_idx: int) -> str:
"""Compute deterministic point ID (32 hex chars)."""
content = f"{source_collection}|{legacy_id}|{source_id}|{chunk_idx}"
return hashlib.sha256(content.encode()).hexdigest()[:32]
def compute_fingerprint(source_id: str, chunk_idx: int, text: str = "") -> str:
"""Compute stable fingerprint."""
content = f"{source_id}:{chunk_idx}:{text}"
return f"sha256:{hashlib.sha256(content.encode()).hexdigest()[:32]}"
def parse_collection_name(name: str) -> Optional[Dict]:
"""Parse legacy collection name."""
for pattern, agent_group, scope, tags in COLLECTION_PATTERNS:
match = re.match(pattern, name.lower())
if match:
agent_id = None
if agent_group is not None:
agent_slug = match.group(agent_group).lower()
agent_id = AGENT_SLUGS.get(agent_slug, f"agt_{agent_slug}")
elif "druid" in name.lower():
agent_id = "agt_druid"
elif "nutra" in name.lower():
agent_id = "agt_nutra"
return {"agent_id": agent_id, "scope": scope, "tags": tags.copy()}
return None
def build_canonical_payload(legacy_payload: Dict, collection_info: Dict, config: Dict, point_id: str) -> Dict:
"""Build canonical payload from legacy."""
agent_id = collection_info.get("agent_id")
scope = collection_info.get("scope", "docs")
tags = collection_info.get("tags", [])
source_id = (
legacy_payload.get("source_id") or
legacy_payload.get("document_id") or
legacy_payload.get("doc_id") or
f"doc_{point_id}"
)
if not re.match(r"^(doc|msg|art|web|code)_", source_id):
prefix = "msg" if scope == "messages" else "doc"
source_id = f"{prefix}_{source_id}"
chunk_idx = legacy_payload.get("chunk_idx", legacy_payload.get("chunk_index", 0))
chunk_id = legacy_payload.get("chunk_id") or f"chk_{point_id}"
if not chunk_id.startswith("chk_"):
chunk_id = f"chk_{chunk_id}"
text = legacy_payload.get("text", legacy_payload.get("content", ""))
fingerprint = legacy_payload.get("fingerprint") or compute_fingerprint(source_id, chunk_idx, text)
created_at = (
legacy_payload.get("created_at") or
legacy_payload.get("timestamp") or
datetime.utcnow().isoformat() + "Z"
)
source_kind = {"docs": "document", "messages": "message", "memory": "document", "artifacts": "artifact"}.get(scope, "document")
payload = {
"schema_version": "cm_payload_v1",
"tenant_id": config["tenant_id"],
"team_id": config.get("team_id"),
"project_id": legacy_payload.get("project_id"),
"agent_id": agent_id,
"owner_kind": config["default_owner_kind"],
"owner_id": agent_id or config["team_id"],
"scope": scope,
"visibility": legacy_payload.get("visibility", config["default_visibility"]),
"indexed": legacy_payload.get("indexed", True),
"source_kind": source_kind,
"source_id": source_id,
"chunk": {"chunk_id": chunk_id, "chunk_idx": chunk_idx},
"fingerprint": fingerprint,
"created_at": created_at,
"_legacy_collection": collection_info.get("_collection"),
"_legacy_point_id": point_id,
}
if tags:
payload["tags"] = tags
if legacy_payload.get("tags"):
payload["tags"] = list(set(payload.get("tags", []) + legacy_payload["tags"]))
for field in ["lang", "importance", "channel_id"]:
if field in legacy_payload:
payload[field] = legacy_payload[field]
return payload
class MigrationStats:
def __init__(self):
self.collections_processed = 0
self.points_read = 0
self.points_migrated = 0
self.points_skipped = 0
self.errors = []
self.dim_mismatches = []
def summary(self) -> Dict:
return {
"collections_processed": self.collections_processed,
"points_read": self.points_read,
"points_migrated": self.points_migrated,
"points_skipped": self.points_skipped,
"errors_count": len(self.errors),
"dim_mismatches": self.dim_mismatches,
}
def migrate_collection(
base_url: str,
source_collection: str,
target_collection: str,
config: Dict,
stats: MigrationStats,
dry_run: bool = True,
) -> None:
"""Migrate a single collection."""
logger.info(f"Migrating: {source_collection}")
# Get source info
source_info = get_collection_info(base_url, source_collection)
logger.info(f" Source: dim={source_info['dim']}, metric={source_info['metric']}, points={source_info['points_count']}")
# Check dim/metric
if source_info["dim"] != config["text_dim"]:
msg = f"Dim mismatch: {source_collection} has {source_info['dim']}, target expects {config['text_dim']}"
logger.error(f"{msg}")
stats.dim_mismatches.append(source_collection)
stats.errors.append({"collection": source_collection, "error": msg})
return
if source_info["metric"] and source_info["metric"] != config["text_metric"]:
msg = f"Metric mismatch: {source_collection} has {source_info['metric']}, target expects {config['text_metric']}"
logger.warning(f" ⚠️ {msg}")
# Parse collection name
collection_info = parse_collection_name(source_collection) or {"agent_id": None, "scope": "docs", "tags": []}
collection_info["_collection"] = source_collection
logger.info(f" Mapped: agent={collection_info['agent_id']}, scope={collection_info['scope']}")
if dry_run:
logger.info(f" [DRY RUN] Would migrate {source_info['points_count']} points")
stats.points_read += source_info['points_count']
stats.points_migrated += source_info['points_count']
stats.collections_processed += 1
return
# Scroll and migrate
offset = None
batch_count = 0
collection_migrated = 0
while True:
points, next_offset = scroll_points(base_url, source_collection, config["batch_size"], offset)
if not points:
break
batch_count += 1
stats.points_read += len(points)
canonical_points = []
for point in points:
try:
legacy_payload = point.get("payload", {})
vector = point.get("vector", [])
point_id = str(point.get("id", ""))
canonical_payload = build_canonical_payload(legacy_payload, collection_info, config, point_id)
source_id = canonical_payload.get("source_id", "")
chunk_idx = canonical_payload.get("chunk", {}).get("chunk_idx", 0)
det_id = compute_deterministic_id(source_collection, point_id, source_id, chunk_idx)
canonical_points.append({
"id": det_id,
"vector": vector,
"payload": canonical_payload,
})
except Exception as e:
stats.errors.append({"collection": source_collection, "point": point_id, "error": str(e)})
stats.points_skipped += 1
if canonical_points:
upsert_points(base_url, target_collection, canonical_points)
collection_migrated += len(canonical_points)
stats.points_migrated += len(canonical_points)
if batch_count % 10 == 0:
logger.info(f" Progress: {stats.points_read} read, {collection_migrated} migrated")
offset = next_offset
if not offset:
break
stats.collections_processed += 1
logger.info(f" Completed: {collection_migrated} points migrated")
def main():
parser = argparse.ArgumentParser(description="Qdrant Migration (REST API)")
parser.add_argument("--url", default=DEFAULT_CONFIG["qdrant_url"], help="Qdrant URL")
parser.add_argument("--collections", help="Comma-separated collections")
parser.add_argument("--all", action="store_true", help="Migrate all legacy collections")
parser.add_argument("--dry-run", action="store_true", help="Show what would be migrated")
parser.add_argument("--dim", type=int, default=DEFAULT_CONFIG["text_dim"], help="Target dimension")
parser.add_argument("--continue-on-error", action="store_true", help="Continue on errors")
args = parser.parse_args()
config = DEFAULT_CONFIG.copy()
config["qdrant_url"] = args.url
config["text_dim"] = args.dim
base_url = config["qdrant_url"]
# Get collections
logger.info(f"Connecting to Qdrant at {base_url}")
all_collections = get_collections(base_url)
logger.info(f"Found {len(all_collections)} collections")
# Determine which to migrate
if args.collections:
collections = [c.strip() for c in args.collections.split(",")]
elif args.all:
collections = [c for c in all_collections if not c.startswith("cm_")]
logger.info(f"Legacy collections to migrate: {len(collections)}")
else:
print("\nLegacy collections:")
for col in all_collections:
if not col.startswith("cm_"):
info = parse_collection_name(col)
info_str = f"→ agent={info['agent_id']}, scope={info['scope']}" if info else "→ (unknown)"
print(f" - {col} {info_str}")
print("\nUse --collections <names> or --all to migrate")
return
# Target collection
target_collection = f"cm_text_{config['text_dim']}_v1"
logger.info(f"Target collection: {target_collection}")
# Create target if needed
if not args.dry_run:
if target_collection not in all_collections:
logger.info(f"Creating target collection: {target_collection}")
create_collection(base_url, target_collection, config["text_dim"], config["text_metric"])
else:
logger.info(f"Target collection exists: {target_collection}")
# Migrate
stats = MigrationStats()
logger.info(f"Starting migration ({'DRY RUN' if args.dry_run else 'LIVE'})")
for collection in collections:
try:
migrate_collection(base_url, collection, target_collection, config, stats, args.dry_run)
except Exception as e:
logger.error(f"Failed: {collection}: {e}")
stats.errors.append({"collection": collection, "error": str(e)})
if not args.continue_on_error:
break
# Summary
print("\n" + "=" * 60)
print("MIGRATION SUMMARY")
print("=" * 60)
summary = stats.summary()
print(f"Target collection: {target_collection}")
print(f"Collections processed: {summary['collections_processed']}/{len(collections)}")
print(f"Points read: {summary['points_read']}")
print(f"Points migrated: {summary['points_migrated']}")
print(f"Points skipped: {summary['points_skipped']}")
print(f"Errors: {summary['errors_count']}")
if summary['dim_mismatches']:
print(f"\n❌ Dim mismatches ({len(summary['dim_mismatches'])}):")
for col in summary['dim_mismatches']:
print(f" - {col}")
if stats.errors[:5]:
print("\nFirst 5 errors:")
for err in stats.errors[:5]:
print(f" - {err}")
if args.dry_run:
print("\n[DRY RUN] No changes were made")
if summary['dim_mismatches']:
print("\n❌ MIGRATION BLOCKED due to dim mismatches")
sys.exit(1)
if __name__ == "__main__":
main()


@@ -0,0 +1,669 @@
#!/usr/bin/env python3
"""
Qdrant Migration Script: Legacy Collections → Canonical
Migrates points from per-agent collections to canonical cm_text_* collection
with proper payload mapping.
Usage:
python qdrant_migrate_to_canonical.py --dry-run
python qdrant_migrate_to_canonical.py --collections helion_docs,nutra_messages
python qdrant_migrate_to_canonical.py --all
"""
import argparse
import hashlib
import json
import logging
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from uuid import uuid4
try:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
except ImportError:
print("Error: qdrant-client not installed. Run: pip install qdrant-client")
sys.exit(1)
try:
import ulid
HAS_ULID = True
except ImportError:
HAS_ULID = False
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from services.memory.qdrant.payload_validation import validate_payload, PayloadValidationError
from services.memory.qdrant.collections import ensure_collection, get_canonical_collection_name
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)
# Default configuration
DEFAULT_CONFIG = {
"tenant_id": "t_daarion",
"team_id": "team_core",
"text_dim": 1024,
"text_metric": "cosine",
"default_visibility": "confidential",
"default_owner_kind": "agent",
"batch_size": 100,
}
# Collection name patterns for mapping
COLLECTION_PATTERNS = [
# (regex, agent_group_idx, scope, tags)
(r"^([a-z]+)_docs$", 1, "docs", []),
(r"^([a-z]+)_messages$", 1, "messages", []),
(r"^([a-z]+)_memory_items$", 1, "memory", []),
(r"^([a-z]+)_artifacts$", 1, "artifacts", []),
(r"^druid_legal_kb$", None, "docs", ["legal_kb"]),
(r"^nutra_food_knowledge$", None, "docs", ["food_kb"]),
(r"^memories$", None, "memory", []),
(r"^messages$", None, "messages", []),
]
# Agent slug mapping
AGENT_SLUGS = {
"helion": "agt_helion",
"nutra": "agt_nutra",
"druid": "agt_druid",
"greenfood": "agt_greenfood",
"agromatrix": "agt_agromatrix",
"daarwizz": "agt_daarwizz",
"alateya": "agt_alateya",
}
def generate_id(prefix: str = "doc") -> str:
"""Generate a unique ID with prefix."""
if HAS_ULID:
return f"{prefix}_{ulid.new().str}"
return f"{prefix}_{uuid4().hex[:24].upper()}"
def compute_deterministic_id(
source_collection: str,
legacy_point_id: str,
source_id: str,
chunk_idx: int,
) -> str:
"""
Compute deterministic point ID for migration (rerun-safe).
ID is stable across reruns: same input = same output.
This allows safe re-migration without duplicates.
Uses 32 hex chars (128 bits) to minimize collision risk
for large collections (10M+ points).
"""
# Create stable hash from all identifying components
content = f"{source_collection}|{legacy_point_id}|{source_id}|{chunk_idx}"
hash_hex = hashlib.sha256(content.encode()).hexdigest()[:32]
return hash_hex
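# Hedged example: identical inputs always hash to the same ID, so a rerun
# upserts over the existing point instead of creating a duplicate:
#   compute_deterministic_id("helion_docs", "42", "doc_abc", 0) returns the
#   same 32-hex string on every call.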
def compute_fingerprint(source_id: str, chunk_idx: int, text: str = "") -> str:
"""
Compute stable fingerprint for deduplication.
IMPORTANT: Does NOT use timestamp or random values.
Same content = same fingerprint (idempotent).
"""
content = f"{source_id}:{chunk_idx}:{text}"
return f"sha256:{hashlib.sha256(content.encode()).hexdigest()[:32]}"
def get_collection_vector_config(client: QdrantClient, collection_name: str) -> Dict[str, Any]:
"""
Get vector configuration from a collection.
Returns:
Dict with 'dim' and 'metric' keys
"""
try:
info = client.get_collection(collection_name)
vectors_config = info.config.params.vectors
# Handle both named and unnamed vectors
if hasattr(vectors_config, 'size'):
# Unnamed vectors
return {
"dim": vectors_config.size,
"metric": vectors_config.distance.value.lower(),
}
elif hasattr(vectors_config, '__iter__'):
# Named vectors - get first one
for name, config in vectors_config.items():
return {
"dim": config.size,
"metric": config.distance.value.lower(),
}
return {"dim": None, "metric": None}
except Exception as e:
logger.warning(f"Could not get vector config for {collection_name}: {e}")
return {"dim": None, "metric": None}
class DimMismatchError(Exception):
"""Raised when source and target collection dimensions don't match."""
pass
def parse_collection_name(name: str) -> Optional[Dict[str, Any]]:
"""
Parse legacy collection name to extract agent_id, scope, tags.
Returns:
Dict with agent_id, scope, tags or None if no match
"""
for pattern, agent_group, scope, tags in COLLECTION_PATTERNS:
match = re.match(pattern, name.lower())
if match:
agent_id = None
if agent_group is not None:
agent_slug = match.group(agent_group).lower()
agent_id = AGENT_SLUGS.get(agent_slug, f"agt_{agent_slug}")
elif "druid" in name.lower():
agent_id = "agt_druid"
elif "nutra" in name.lower():
agent_id = "agt_nutra"
return {
"agent_id": agent_id,
"scope": scope,
"tags": tags.copy(),
}
return None
def build_canonical_payload(
legacy_payload: Dict[str, Any],
collection_info: Dict[str, Any],
config: Dict[str, Any],
point_id: str,
) -> Dict[str, Any]:
"""
Build canonical cm_payload_v1 from legacy payload.
Args:
legacy_payload: Original payload from legacy collection
collection_info: Parsed collection name info (agent_id, scope, tags)
config: Migration configuration
point_id: Original point ID
Returns:
Canonical payload
"""
# Extract values from legacy payload or use defaults
agent_id = collection_info.get("agent_id")
scope = collection_info.get("scope", "docs")
tags = collection_info.get("tags", [])
# Try to get source_id from legacy payload
source_id = (
legacy_payload.get("source_id") or
legacy_payload.get("document_id") or
legacy_payload.get("doc_id") or
legacy_payload.get("message_id") or
generate_id("doc")
)
# Ensure source_id has proper prefix
if not re.match(r"^(doc|msg|art|web|code)_", source_id):
prefix = "msg" if scope == "messages" else "doc"
source_id = f"{prefix}_{source_id}"
# Chunk info
chunk_idx = legacy_payload.get("chunk_idx", legacy_payload.get("chunk_index", 0))
chunk_id = legacy_payload.get("chunk_id") or generate_id("chk")
if not chunk_id.startswith("chk_"):
chunk_id = f"chk_{chunk_id}"
# Fingerprint
text = legacy_payload.get("text", legacy_payload.get("content", ""))
fingerprint = (
legacy_payload.get("fingerprint") or
legacy_payload.get("hash") or
compute_fingerprint(source_id, chunk_idx, text)
)
# Timestamp
created_at = (
legacy_payload.get("created_at") or
legacy_payload.get("timestamp") or
legacy_payload.get("indexed_at") or
datetime.utcnow().isoformat() + "Z"
)
# Build canonical payload
payload = {
"schema_version": "cm_payload_v1",
"tenant_id": config["tenant_id"],
"team_id": config.get("team_id"),
"project_id": legacy_payload.get("project_id"),
"agent_id": agent_id,
"owner_kind": config["default_owner_kind"],
"owner_id": agent_id or config["team_id"],
"scope": scope,
"visibility": legacy_payload.get("visibility", config["default_visibility"]),
"indexed": legacy_payload.get("indexed", True),
"source_kind": _infer_source_kind(scope),
"source_id": source_id,
"chunk": {
"chunk_id": chunk_id,
"chunk_idx": chunk_idx,
},
"fingerprint": fingerprint,
"created_at": created_at,
}
# Add tags
if tags:
payload["tags"] = tags
legacy_tags = legacy_payload.get("tags", [])
if legacy_tags:
payload["tags"] = list(set(payload.get("tags", []) + legacy_tags))
# Preserve additional fields
for field in ["lang", "importance", "ttl_days", "channel_id"]:
if field in legacy_payload:
payload[field] = legacy_payload[field]
# Preserve text/content for debugging (optional)
if text:
payload["_text"] = text[:500] # Truncate for safety
return payload
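# Hedged example: a legacy point from helion_docs with payload
#   {"document_id": "abc", "chunk_index": 2, "text": "..."}
# maps to source_id="doc_abc", chunk={"chunk_id": "chk_...", "chunk_idx": 2},
# agent_id="agt_helion", owner_id="agt_helion", scope="docs".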
def _infer_source_kind(scope: str) -> str:
"""Infer source_kind from scope."""
mapping = {
"docs": "document",
"messages": "message",
"memory": "document",
"artifacts": "artifact",
"signals": "document",
}
return mapping.get(scope, "document")
class MigrationStats:
"""Track migration statistics."""
def __init__(self):
self.collections_processed = 0
self.points_read = 0
self.points_migrated = 0
self.points_skipped = 0
self.errors = []
def add_error(self, collection: str, point_id: str, error: str):
self.errors.append({
"collection": collection,
"point_id": point_id,
"error": error,
})
def summary(self) -> Dict[str, Any]:
return {
"collections_processed": self.collections_processed,
"points_read": self.points_read,
"points_migrated": self.points_migrated,
"points_skipped": self.points_skipped,
"errors_count": len(self.errors),
"errors": self.errors[:10] if self.errors else [],
}
def migrate_collection(
client: QdrantClient,
source_collection: str,
target_collection: str,
config: Dict[str, Any],
stats: MigrationStats,
dry_run: bool = True,
skip_dim_check: bool = False,
) -> None:
"""
Migrate a single legacy collection to canonical collection.
Args:
client: Qdrant client
source_collection: Legacy collection name
target_collection: Canonical collection name
config: Migration configuration
stats: Statistics tracker
dry_run: If True, don't actually write
skip_dim_check: Skip dimension/metric validation (dangerous!)
Raises:
DimMismatchError: If source dim/metric doesn't match target
"""
logger.info(f"Migrating collection: {source_collection}")
# === SECURITY: Verify dim/metric match ===
source_config = get_collection_vector_config(client, source_collection)
target_dim = config.get("text_dim")
target_metric = config.get("text_metric", "cosine")
logger.info(f" Source: dim={source_config['dim']}, metric={source_config['metric']}")
logger.info(f" Target: dim={target_dim}, metric={target_metric}")
if source_config["dim"] and source_config["dim"] != target_dim:
msg = (
f"Dimension mismatch: {source_collection} has dim={source_config['dim']}, "
f"target {target_collection} expects dim={target_dim}"
)
if skip_dim_check:
logger.warning(f" WARNING: {msg} (skipping due to --skip-dim-check)")
else:
logger.error(f" ERROR: {msg}")
stats.add_error(source_collection, "", f"DimMismatch: {msg}")
raise DimMismatchError(msg)
if source_config["metric"] and source_config["metric"] != target_metric:
msg = (
f"Metric mismatch: {source_collection} has metric={source_config['metric']}, "
f"target {target_collection} expects metric={target_metric}"
)
if skip_dim_check:
logger.warning(f" WARNING: {msg} (skipping due to --skip-dim-check)")
else:
logger.error(f" ERROR: {msg}")
stats.add_error(source_collection, "", f"MetricMismatch: {msg}")
raise DimMismatchError(msg)
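# Rationale (hedged): vectors of a different dimension cannot be upserted into
# the target collection at all, and a different distance metric would silently
# change similarity-score semantics, so both mismatches are blocked unless
# --skip-dim-check is passed explicitly.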
# Parse collection name
collection_info = parse_collection_name(source_collection)
if not collection_info:
logger.warning(f" Could not parse collection name pattern: {source_collection}")
collection_info = {"agent_id": None, "scope": "docs", "tags": []}
logger.info(f" Mapped to: agent={collection_info['agent_id']}, scope={collection_info['scope']}")
# Scroll through all points
batch_size = config.get("batch_size", 100)
offset = None
batch_count = 0
collection_points_read = 0
collection_points_migrated = 0
while True:
# Scroll points
scroll_result = client.scroll(
collection_name=source_collection,
offset=offset,
limit=batch_size,
with_payload=True,
with_vectors=True,
)
points, next_offset = scroll_result
if not points:
break
batch_count += 1
stats.points_read += len(points)
collection_points_read += len(points)
# Convert points
canonical_points = []
for point in points:
try:
legacy_payload = point.payload or {}
# Build canonical payload
canonical_payload = build_canonical_payload(
legacy_payload=legacy_payload,
collection_info=collection_info,
config=config,
point_id=str(point.id),
)
# Validate payload
try:
validate_payload(canonical_payload)
except PayloadValidationError as e:
stats.add_error(source_collection, str(point.id), str(e))
stats.points_skipped += 1
continue
# Compute deterministic ID (rerun-safe)
source_id = canonical_payload.get("source_id", "")
chunk_idx = canonical_payload.get("chunk", {}).get("chunk_idx", 0)
deterministic_id = compute_deterministic_id(
source_collection=source_collection,
legacy_point_id=str(point.id),
source_id=source_id,
chunk_idx=chunk_idx,
)
# Store original legacy ID in payload for traceability
canonical_payload["_legacy_collection"] = source_collection
canonical_payload["_legacy_point_id"] = str(point.id)
# Create new point with deterministic ID
canonical_points.append(PointStruct(
id=deterministic_id,
vector=point.vector,
payload=canonical_payload,
))
except Exception as e:
stats.add_error(source_collection, str(point.id), str(e))
stats.points_skipped += 1
# Upsert to canonical collection
if canonical_points and not dry_run:
client.upsert(
collection_name=target_collection,
points=canonical_points,
)
stats.points_migrated += len(canonical_points)
collection_points_migrated += len(canonical_points)
if batch_count % 10 == 0:
logger.info(f" Progress: {collection_points_read} read, {collection_points_migrated} migrated")
# Continue scrolling
offset = next_offset
if offset is None:
break
stats.collections_processed += 1
logger.info(f" Completed: {collection_points_read} read, {collection_points_migrated} migrated")
def discover_legacy_collections(client: QdrantClient) -> List[str]:
"""Discover all legacy (non-canonical) collections."""
collections = client.get_collections().collections
legacy = []
for col in collections:
if not col.name.startswith("cm_"):
legacy.append(col.name)
return sorted(legacy)
def main():
parser = argparse.ArgumentParser(
description="Migrate Qdrant collections to canonical schema"
)
parser.add_argument(
"--host",
default=os.getenv("QDRANT_HOST", "localhost"),
help="Qdrant host"
)
parser.add_argument(
"--port",
type=int,
default=int(os.getenv("QDRANT_PORT", "6333")),
help="Qdrant port"
)
parser.add_argument(
"--collections",
help="Comma-separated list of collections to migrate"
)
parser.add_argument(
"--all",
action="store_true",
help="Migrate all legacy collections"
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Show what would be migrated without writing"
)
parser.add_argument(
"--tenant-id",
default=DEFAULT_CONFIG["tenant_id"],
help="Tenant ID for payloads"
)
parser.add_argument(
"--team-id",
default=DEFAULT_CONFIG["team_id"],
help="Team ID for payloads"
)
parser.add_argument(
"--dim",
type=int,
default=DEFAULT_CONFIG["text_dim"],
help="Vector dimension"
)
parser.add_argument(
"--skip-dim-check",
action="store_true",
help="Skip dimension/metric validation (DANGEROUS - may corrupt data)"
)
parser.add_argument(
"--continue-on-error",
action="store_true",
help="Continue migrating other collections if one fails"
)
args = parser.parse_args()
# Configuration
config = DEFAULT_CONFIG.copy()
config["tenant_id"] = args.tenant_id
config["team_id"] = args.team_id
config["text_dim"] = args.dim
# Connect to Qdrant
logger.info(f"Connecting to Qdrant at {args.host}:{args.port}")
client = QdrantClient(host=args.host, port=args.port)
# Determine collections to migrate
if args.collections:
collections = [c.strip() for c in args.collections.split(",")]
elif args.all:
collections = discover_legacy_collections(client)
logger.info(f"Discovered {len(collections)} legacy collections")
else:
# List available collections
legacy = discover_legacy_collections(client)
print("\nLegacy collections available for migration:")
for col in legacy:
info = parse_collection_name(col)
info_str = f"→ agent={info['agent_id']}, scope={info['scope']}" if info else "→ (unknown pattern)"
print(f" - {col} {info_str}")
print("\nUse --collections <names> or --all to migrate")
return
# Target collection
target_collection = get_canonical_collection_name("text", config["text_dim"])
# Ensure target collection exists
if not args.dry_run:
logger.info(f"Ensuring target collection: {target_collection}")
ensure_collection(
client,
target_collection,
config["text_dim"],
config["text_metric"],
)
# Migrate
stats = MigrationStats()
logger.info(f"Starting migration ({'DRY RUN' if args.dry_run else 'LIVE'})")
logger.info(f"Target collection: {target_collection}")
failed_collections = []
for collection in collections:
try:
migrate_collection(
client=client,
source_collection=collection,
target_collection=target_collection,
config=config,
stats=stats,
dry_run=args.dry_run,
skip_dim_check=args.skip_dim_check,
)
except DimMismatchError as e:
logger.error(f"Failed to migrate {collection}: {e}")
stats.add_error(collection, "", str(e))
failed_collections.append(collection)
if not args.continue_on_error:
logger.error("Stopping migration. Use --continue-on-error to skip failed collections.")
break
except Exception as e:
logger.error(f"Failed to migrate {collection}: {e}")
stats.add_error(collection, "", str(e))
failed_collections.append(collection)
if not args.continue_on_error:
break
# Summary
print("\n" + "=" * 50)
print("MIGRATION SUMMARY")
print("=" * 50)
summary = stats.summary()
print(f"Collections processed: {summary['collections_processed']}/{len(collections)}")
print(f"Points read: {summary['points_read']}")
print(f"Points migrated: {summary['points_migrated']}")
print(f"Points skipped: {summary['points_skipped']}")
print(f"Errors: {summary['errors_count']}")
if failed_collections:
print(f"\nFailed collections ({len(failed_collections)}):")
for col in failed_collections:
print(f" - {col}")
if summary['errors']:
print("\nFirst 10 errors:")
for err in summary['errors']:
print(f" - {err['collection']}:{err['point_id']} - {err['error'][:100]}")
if args.dry_run:
print("\n[DRY RUN] No changes were made")
# Exit with error code if there were failures
if failed_collections and not args.continue_on_error:
sys.exit(1)
if __name__ == "__main__":
main()

scripts/qdrant_parity_check.py (new executable file, 360 lines)

@@ -0,0 +1,360 @@
#!/usr/bin/env python3
"""
Qdrant Parity Check: Compare Legacy vs Canonical Collections
Verifies that migration preserved data correctly by comparing:
1. Point counts
2. Sample search results (topK similarity)
3. Payload field presence
Usage:
python qdrant_parity_check.py --agents helion,nutra,druid
python qdrant_parity_check.py --all
"""
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
try:
from qdrant_client import QdrantClient
except ImportError:
print("Error: qdrant-client not installed. Run: pip install qdrant-client")
sys.exit(1)
# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from services.memory.qdrant.collections import get_canonical_collection_name
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)
# Agent -> legacy collection patterns
AGENT_LEGACY_COLLECTIONS = {
"helion": ["helion_docs", "helion_messages"],
"nutra": ["nutra_docs", "nutra_messages", "nutra_food_knowledge"],
"druid": ["druid_docs", "druid_messages", "druid_legal_kb"],
"greenfood": ["greenfood_docs", "greenfood_messages"],
"agromatrix": ["agromatrix_docs", "agromatrix_messages"],
"daarwizz": ["daarwizz_docs", "daarwizz_messages"],
}
class ParityStats:
"""Track parity check statistics."""
def __init__(self):
self.checks_passed = 0
self.checks_failed = 0
self.warnings = []
self.errors = []
def add_warning(self, msg: str):
self.warnings.append(msg)
logger.warning(msg)
def add_error(self, msg: str):
self.errors.append(msg)
self.checks_failed += 1
logger.error(msg)
def add_pass(self, msg: str):
self.checks_passed += 1
logger.info(f"{msg}")
def summary(self) -> Dict[str, Any]:
return {
"passed": self.checks_passed,
"failed": self.checks_failed,
"warnings": len(self.warnings),
"errors": self.errors[:10],
}
def get_collection_count(client: QdrantClient, collection_name: str) -> Optional[int]:
"""Get point count for a collection."""
try:
info = client.get_collection(collection_name)
return info.points_count
except Exception:
return None
def get_sample_vectors(
client: QdrantClient,
collection_name: str,
limit: int = 5
) -> List[Tuple[str, List[float]]]:
"""Get sample vectors from a collection."""
try:
points, _ = client.scroll(
collection_name=collection_name,
limit=limit,
with_vectors=True,
with_payload=False,
)
return [(str(p.id), p.vector) for p in points]
except Exception as e:
logger.warning(f"Could not get samples from {collection_name}: {e}")
return []
def search_in_collection(
client: QdrantClient,
collection_name: str,
query_vector: List[float],
limit: int = 10,
) -> List[Dict[str, Any]]:
"""Search in a collection and return results."""
try:
results = client.search(
collection_name=collection_name,
query_vector=query_vector,
limit=limit,
with_payload=True,
)
return [
{
"id": str(r.id),
"score": r.score,
"payload_keys": list(r.payload.keys()) if r.payload else [],
}
for r in results
]
except Exception as e:
logger.warning(f"Search failed in {collection_name}: {e}")
return []
def check_point_counts(
client: QdrantClient,
agent: str,
canonical_collection: str,
stats: ParityStats,
) -> None:
"""Check that point counts match between legacy and canonical."""
legacy_collections = AGENT_LEGACY_COLLECTIONS.get(agent, [])
if not legacy_collections:
stats.add_warning(f"No known legacy collections for agent: {agent}")
return
# Count legacy points
legacy_total = 0
for legacy_col in legacy_collections:
count = get_collection_count(client, legacy_col)
if count is not None:
legacy_total += count
logger.info(f" Legacy {legacy_col}: {count} points")
else:
stats.add_warning(f" Legacy collection not found: {legacy_col}")
# Search canonical for this agent's points
# We can't easily count without scrolling through all, so we'll do a sample check
logger.info(f" Legacy total: {legacy_total} points")
if legacy_total > 0:
stats.add_pass(f"{agent}: {legacy_total} points in legacy collections")
def check_search_parity(
client: QdrantClient,
agent: str,
canonical_collection: str,
stats: ParityStats,
num_samples: int = 3,
topk: int = 5,
) -> None:
"""Check that search results are similar between legacy and canonical."""
legacy_collections = AGENT_LEGACY_COLLECTIONS.get(agent, [])
for legacy_col in legacy_collections:
# Get sample vectors from legacy
samples = get_sample_vectors(client, legacy_col, limit=num_samples)
if not samples:
continue
logger.info(f" Checking {legacy_col} with {len(samples)} sample queries")
for point_id, query_vector in samples:
# Search in legacy
legacy_results = search_in_collection(
client, legacy_col, query_vector, limit=topk
)
# Search in canonical (would need agent filter in production)
canonical_results = search_in_collection(
client, canonical_collection, query_vector, limit=topk
)
# Compare
if not legacy_results:
stats.add_warning(f" No results from legacy for point {point_id}")
continue
if not canonical_results:
stats.add_error(f" No results from canonical for point {point_id}")
continue
# Check if top result score is similar (within 0.1)
legacy_top_score = legacy_results[0]["score"]
canonical_top_score = canonical_results[0]["score"]
score_diff = abs(legacy_top_score - canonical_top_score)
if score_diff > 0.1:
stats.add_warning(
f" Score difference for {point_id}: "
f"legacy={legacy_top_score:.4f}, canonical={canonical_top_score:.4f}"
)
else:
stats.add_pass(
f"{legacy_col} point {point_id}: score diff {score_diff:.4f}"
)
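# Hedged note: a top-1 score within 0.1 of the legacy result is treated as
# parity; larger gaps are only warnings, not hard failures. One plausible cause
# of gaps is that the canonical collection mixes points from several agents and
# no agent filter is applied in this check (see the comment above).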
def check_payload_schema(
client: QdrantClient,
canonical_collection: str,
stats: ParityStats,
) -> None:
"""Check that canonical payloads have required fields."""
required_fields = [
"schema_version", "tenant_id", "scope", "visibility",
"indexed", "source_id", "chunk", "fingerprint", "created_at"
]
# Sample points from canonical
samples = get_sample_vectors(client, canonical_collection, limit=10)
if not samples:
stats.add_warning("Could not sample canonical collection for schema check")
return
# Get payloads
points, _ = client.scroll(
collection_name=canonical_collection,
limit=10,
with_payload=True,
with_vectors=False,
)
for point in points:
payload = point.payload or {}
missing = [f for f in required_fields if f not in payload]
if missing:
stats.add_error(
f"Point {point.id} missing required fields: {missing}"
)
else:
# Check schema version
if payload.get("schema_version") != "cm_payload_v1":
stats.add_error(
f"Point {point.id} has invalid schema_version: "
f"{payload.get('schema_version')}"
)
else:
stats.add_pass(f"Point {point.id} has valid schema")
def main():
parser = argparse.ArgumentParser(
description="Check parity between legacy and canonical Qdrant collections"
)
parser.add_argument(
"--host",
default=os.getenv("QDRANT_HOST", "localhost"),
help="Qdrant host"
)
parser.add_argument(
"--port",
type=int,
default=int(os.getenv("QDRANT_PORT", "6333")),
help="Qdrant port"
)
parser.add_argument(
"--agents",
help="Comma-separated list of agents to check"
)
parser.add_argument(
"--all",
action="store_true",
help="Check all known agents"
)
parser.add_argument(
"--dim",
type=int,
default=1024,
help="Vector dimension for canonical collection"
)
args = parser.parse_args()
# Connect to Qdrant
logger.info(f"Connecting to Qdrant at {args.host}:{args.port}")
client = QdrantClient(host=args.host, port=args.port)
# Determine agents to check
if args.agents:
agents = [a.strip().lower() for a in args.agents.split(",")]
elif args.all:
agents = list(AGENT_LEGACY_COLLECTIONS.keys())
else:
print("Available agents:", ", ".join(AGENT_LEGACY_COLLECTIONS.keys()))
print("\nUse --agents <names> or --all to run parity check")
return
canonical_collection = get_canonical_collection_name("text", args.dim)
logger.info(f"Canonical collection: {canonical_collection}")
# Check if canonical collection exists
canonical_count = get_collection_count(client, canonical_collection)
if canonical_count is None:
logger.error(f"Canonical collection {canonical_collection} not found!")
sys.exit(1)
logger.info(f"Canonical collection has {canonical_count} points")
# Run checks
stats = ParityStats()
for agent in agents:
logger.info(f"\n=== Checking agent: {agent} ===")
check_point_counts(client, agent, canonical_collection, stats)
check_search_parity(client, agent, canonical_collection, stats)
# Schema check (once)
logger.info("\n=== Checking payload schema ===")
check_payload_schema(client, canonical_collection, stats)
# Summary
print("\n" + "=" * 50)
print("PARITY CHECK SUMMARY")
print("=" * 50)
summary = stats.summary()
print(f"Checks passed: {summary['passed']}")
print(f"Checks failed: {summary['failed']}")
print(f"Warnings: {summary['warnings']}")
if summary['errors']:
print("\nErrors:")
for err in summary['errors']:
print(f" - {err}")
if summary['failed'] > 0:
sys.exit(1)
if __name__ == "__main__":
main()


@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""
Qdrant Security Smoke Test
Verifies security invariants for canonical collection filters.
Usage:
python qdrant_smoke_test.py --host dagi-qdrant-node1
"""
import argparse
import os
import sys
from pathlib import Path
# Add parent to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from services.memory.qdrant.filters import (
AccessContext,
FilterSecurityError,
build_qdrant_filter,
build_multi_agent_filter,
build_agent_only_filter,
)
def test_multi_agent_unauthorized_raises():
"""Test: non-admin requesting unauthorized agent_ids → error"""
print("\n[TEST 1] Multi-agent unauthorized access...")
ctx = AccessContext(
tenant_id="t_daarion",
team_id="team_core",
allowed_agent_ids=["agt_helion", "agt_nutra"], # Only these allowed
)
try:
# Try to access agt_druid which is NOT in allowed list
build_multi_agent_filter(ctx, agent_ids=["agt_helion", "agt_druid"])
print(" ❌ FAIL: Should have raised FilterSecurityError")
return False
except FilterSecurityError as e:
if "agt_druid" in str(e) and "Unauthorized" in str(e):
print(f" ✅ PASS: Correctly raised error: {e}")
return True
else:
print(f" ❌ FAIL: Wrong error message: {e}")
return False
def test_multi_agent_requires_allowlist():
"""Test: non-admin without allowed_agent_ids → error"""
print("\n[TEST 2] Multi-agent requires allowlist...")
ctx = AccessContext(
tenant_id="t_daarion",
team_id="team_core",
# No allowed_agent_ids!
)
try:
build_multi_agent_filter(ctx, agent_ids=["agt_helion"])
print(" ❌ FAIL: Should have raised FilterSecurityError")
return False
except FilterSecurityError as e:
if "allowed_agent_ids" in str(e):
print(f" ✅ PASS: Correctly raised error: {e}")
return True
else:
print(f" ❌ FAIL: Wrong error message: {e}")
return False
def test_admin_default_no_private():
"""Test: admin default does NOT include private"""
print("\n[TEST 3] Admin default excludes private...")
ctx = AccessContext(
tenant_id="t_daarion",
is_admin=True,
# No visibility specified, no include_private
)
result = build_qdrant_filter(ctx)
# Check should conditions
if "should" not in result:
print(" ❌ FAIL: No should in result")
return False
should = result["should"]
# Admin should have visibility filter with public+confidential only
visibility_cond = should[0].get("must", [{}])[0]
if visibility_cond.get("key") == "visibility":
vis_values = visibility_cond.get("match", {}).get("any", [])
if "private" in vis_values:
print(f" ❌ FAIL: Admin default includes private: {vis_values}")
return False
elif "public" in vis_values and "confidential" in vis_values:
print(f" ✅ PASS: Admin default is public+confidential: {vis_values}")
return True
print(f" ❌ FAIL: Unexpected filter structure: {should}")
return False
def test_admin_can_request_private():
"""Test: admin with include_private=True gets private"""
print("\n[TEST 4] Admin can explicitly request private...")
ctx = AccessContext(
tenant_id="t_daarion",
is_admin=True,
)
result = build_qdrant_filter(ctx, include_private=True)
should = result.get("should", [])
    visibility_cond = (should[0].get("must") or [{}])[0] if should else {}
if visibility_cond.get("key") == "visibility":
vis_values = visibility_cond.get("match", {}).get("any", [])
if "private" in vis_values:
print(f" ✅ PASS: Admin with include_private gets private: {vis_values}")
return True
print(f" ❌ FAIL: Admin with include_private should see private: {result}")
return False
def test_owner_gets_private():
"""Test: owner with include_private=True gets own private"""
print("\n[TEST 5] Owner can access own private...")
ctx = AccessContext(
tenant_id="t_daarion",
team_id="team_core",
agent_id="agt_helion",
)
result = build_agent_only_filter(ctx, agent_id="agt_helion")
# Check that filter includes private for owner
should = result.get("should", [])
# Find condition that has visibility with private
has_private_for_owner = False
for cond in should:
must = cond.get("must", [])
has_owner_check = any(
c.get("key") == "owner_id" and c.get("match", {}).get("value") == "agt_helion"
for c in must
)
has_private = any(
c.get("key") == "visibility" and "private" in str(c.get("match", {}))
for c in must
)
if has_owner_check and has_private:
has_private_for_owner = True
break
if has_private_for_owner:
print(" ✅ PASS: Owner can access own private content")
return True
else:
print(f" ❌ FAIL: Owner should be able to access private: {should}")
return False
def test_tenant_always_required():
"""Test: tenant_id is always required"""
print("\n[TEST 6] Tenant ID always required...")
ctx = AccessContext(
tenant_id="", # Empty!
team_id="team_core",
)
try:
build_qdrant_filter(ctx)
print(" ❌ FAIL: Should have raised FilterSecurityError for empty tenant_id")
return False
except FilterSecurityError as e:
if "tenant_id" in str(e):
print(f" ✅ PASS: Correctly raised error: {e}")
return True
else:
print(f" ❌ FAIL: Wrong error: {e}")
return False
def test_qdrant_filter_format(host: str, port: int):
"""Test: generated filters work with actual Qdrant"""
print(f"\n[TEST 7] Qdrant filter format smoke test ({host}:{port})...")
try:
from qdrant_client import QdrantClient
except ImportError:
print(" ⚠️ SKIP: qdrant-client not installed")
return None
try:
client = QdrantClient(host=host, port=port, timeout=5)
# Get list of collections
collections = client.get_collections().collections
if not collections:
print(" ⚠️ SKIP: No collections in Qdrant")
return None
# Use first collection for smoke test
collection_name = collections[0].name
print(f" Using collection: {collection_name}")
# Build a filter
ctx = AccessContext(
tenant_id="t_daarion",
team_id="team_core",
allowed_agent_ids=["agt_helion", "agt_nutra"],
)
filter_dict = build_multi_agent_filter(
ctx,
agent_ids=["agt_helion", "agt_nutra"],
scope="docs"
)
        # Exercise the builder first: build_multi_agent_filter above raises if
        # the access context is invalid. The search below then sends a minimal
        # hand-built filter, so we only verify Qdrant accepts the field format.
        # Create a dummy query vector (assumes a single unnamed vector config)
        info = client.get_collection(collection_name)
        dim = info.config.params.vectors.size
        dummy_vector = [0.0] * dim
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        # Manual, minimal filter for the smoke search
results = client.search(
collection_name=collection_name,
query_vector=dummy_vector,
limit=1,
query_filter=Filter(
must=[
FieldCondition(key="tenant_id", match=MatchValue(value="t_daarion")),
]
)
)
print(f" ✅ PASS: Qdrant accepts filter format (returned {len(results)} results)")
return True
except Exception as e:
print(f" ❌ FAIL: Qdrant error: {e}")
return False
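# Note: one way to send the builder output itself, rather than the manual
# filter above, would be to validate the dict into the client model (a
# sketch; assumes a qdrant-client build with pydantic v2 models):
#
#   from qdrant_client.models import Filter
#   qf = Filter.model_validate(filter_dict)
#   client.search(collection_name=collection_name, query_vector=dummy_vector,
#                 query_filter=qf, limit=1)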
def main():
parser = argparse.ArgumentParser(description="Qdrant Security Smoke Test")
parser.add_argument("--host", default=os.getenv("QDRANT_HOST", "localhost"))
parser.add_argument("--port", type=int, default=int(os.getenv("QDRANT_PORT", "6333")))
args = parser.parse_args()
print("=" * 60)
print("QDRANT SECURITY SMOKE TEST")
print("=" * 60)
results = []
# Unit tests (no Qdrant needed)
results.append(("Multi-agent unauthorized", test_multi_agent_unauthorized_raises()))
results.append(("Multi-agent requires allowlist", test_multi_agent_requires_allowlist()))
results.append(("Admin default no private", test_admin_default_no_private()))
results.append(("Admin can request private", test_admin_can_request_private()))
results.append(("Owner gets private", test_owner_gets_private()))
results.append(("Tenant always required", test_tenant_always_required()))
# Integration test (needs Qdrant)
qdrant_result = test_qdrant_filter_format(args.host, args.port)
if qdrant_result is not None:
results.append(("Qdrant filter format", qdrant_result))
# Summary
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
passed = sum(1 for _, r in results if r is True)
failed = sum(1 for _, r in results if r is False)
skipped = sum(1 for _, r in results if r is None)
for name, result in results:
status = "✅ PASS" if result is True else "❌ FAIL" if result is False else "⚠️ SKIP"
print(f" {status}: {name}")
print(f"\nTotal: {passed} passed, {failed} failed, {skipped} skipped")
if failed > 0:
print("\n❌ SMOKE TEST FAILED")
sys.exit(1)
else:
print("\n✅ SMOKE TEST PASSED - Ready for cutover")
sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
Qdrant Vector Collections Setup for Helion Memory v3.0
Collections:
1. helion_memory_items - Long-term memory facts (preferences, decisions, lessons)
2. helion_artifacts - Documents, specs, whitepaper embeddings
3. helion_messages - Recent message embeddings for context retrieval
Run: python setup_qdrant_collections.py [--host HOST] [--port PORT]
"""
import argparse
import sys
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
# Cohere embed-multilingual-v3.0 produces 1024-dimensional vectors
EMBEDDING_DIMENSIONS = 1024
def setup_collections(host: str = "localhost", port: int = 6333):
"""Create and configure Qdrant collections for Helion Memory"""
print(f"🔌 Connecting to Qdrant at {host}:{port}...")
client = QdrantClient(host=host, port=port)
# Check connection
try:
collections = client.get_collections()
print(f"✅ Connected. Existing collections: {[c.name for c in collections.collections]}")
except Exception as e:
print(f"❌ Failed to connect: {e}")
sys.exit(1)
# =========================================================================
# Collection 1: helion_memory_items
# =========================================================================
collection_name = "helion_memory_items"
print(f"\n📦 Setting up collection: {collection_name}")
if not client.collection_exists(collection_name):
client.create_collection(
collection_name=collection_name,
vectors_config=qmodels.VectorParams(
size=EMBEDDING_DIMENSIONS,
distance=qmodels.Distance.COSINE
),
# Optimized for filtering by user and type
optimizers_config=qmodels.OptimizersConfigDiff(
indexing_threshold=10000
),
# On-disk storage for large collections
on_disk_payload=True
)
print(f" ✅ Created collection: {collection_name}")
else:
print(f" Collection already exists: {collection_name}")
# Create payload indexes for filtering
print(f" 📇 Creating payload indexes...")
indexes = [
("platform_user_id", qmodels.PayloadSchemaType.KEYWORD),
("type", qmodels.PayloadSchemaType.KEYWORD),
("category", qmodels.PayloadSchemaType.KEYWORD),
("visibility", qmodels.PayloadSchemaType.KEYWORD),
("scope_ref", qmodels.PayloadSchemaType.KEYWORD),
("confidence", qmodels.PayloadSchemaType.FLOAT),
("created_at", qmodels.PayloadSchemaType.DATETIME),
("expires_at", qmodels.PayloadSchemaType.DATETIME),
("archived", qmodels.PayloadSchemaType.BOOL),
]
for field_name, field_type in indexes:
try:
client.create_payload_index(
collection_name=collection_name,
field_name=field_name,
field_schema=field_type,
wait=False
)
print(f" ✅ Index: {field_name} ({field_type.value})")
except Exception as e:
if "already exists" in str(e).lower():
print(f" Index exists: {field_name}")
else:
print(f" ⚠️ Index {field_name}: {e}")
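    # Example upsert for this collection (illustrative only; the payload
    # values are assumptions shaped to match the indexed fields above):
    #
    #   client.upsert(
    #       collection_name="helion_memory_items",
    #       points=[qmodels.PointStruct(
    #           id=1,
    #           vector=[0.0] * EMBEDDING_DIMENSIONS,
    #           payload={
    #               "platform_user_id": "tg:12345",
    #               "type": "preference",
    #               "visibility": "private",
    #               "confidence": 0.9,
    #               "created_at": "2026-01-28T00:00:00Z",
    #               "archived": False,
    #           },
    #       )],
    #   )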
# =========================================================================
# Collection 2: helion_artifacts
# =========================================================================
collection_name = "helion_artifacts"
print(f"\n📦 Setting up collection: {collection_name}")
if not client.collection_exists(collection_name):
client.create_collection(
collection_name=collection_name,
vectors_config=qmodels.VectorParams(
size=EMBEDDING_DIMENSIONS,
distance=qmodels.Distance.COSINE
),
on_disk_payload=True
)
print(f" ✅ Created collection: {collection_name}")
else:
print(f" Collection already exists: {collection_name}")
# Artifact indexes
print(f" 📇 Creating payload indexes...")
artifact_indexes = [
("artifact_id", qmodels.PayloadSchemaType.KEYWORD),
("project_id", qmodels.PayloadSchemaType.KEYWORD),
("source", qmodels.PayloadSchemaType.KEYWORD),
("source_type", qmodels.PayloadSchemaType.KEYWORD), # whitepaper, spec, landing, faq
("language", qmodels.PayloadSchemaType.KEYWORD),
("version", qmodels.PayloadSchemaType.KEYWORD),
("chunk_index", qmodels.PayloadSchemaType.INTEGER),
("created_at", qmodels.PayloadSchemaType.DATETIME),
]
for field_name, field_type in artifact_indexes:
try:
client.create_payload_index(
collection_name=collection_name,
field_name=field_name,
field_schema=field_type,
wait=False
)
print(f" ✅ Index: {field_name} ({field_type.value})")
except Exception as e:
if "already exists" in str(e).lower():
print(f" Index exists: {field_name}")
else:
print(f" ⚠️ Index {field_name}: {e}")
# =========================================================================
# Collection 3: helion_messages (for recent context retrieval)
# =========================================================================
collection_name = "helion_messages"
print(f"\n📦 Setting up collection: {collection_name}")
if not client.collection_exists(collection_name):
client.create_collection(
collection_name=collection_name,
vectors_config=qmodels.VectorParams(
size=EMBEDDING_DIMENSIONS,
distance=qmodels.Distance.COSINE
),
# Faster retrieval for recent messages
optimizers_config=qmodels.OptimizersConfigDiff(
indexing_threshold=5000
),
on_disk_payload=True
)
print(f" ✅ Created collection: {collection_name}")
else:
print(f" Collection already exists: {collection_name}")
# Message indexes
print(f" 📇 Creating payload indexes...")
message_indexes = [
("conversation_id", qmodels.PayloadSchemaType.KEYWORD),
("platform_user_id", qmodels.PayloadSchemaType.KEYWORD),
("channel", qmodels.PayloadSchemaType.KEYWORD),
("chat_id", qmodels.PayloadSchemaType.KEYWORD),
("role", qmodels.PayloadSchemaType.KEYWORD), # user, assistant, system
("timestamp", qmodels.PayloadSchemaType.DATETIME),
]
for field_name, field_type in message_indexes:
try:
client.create_payload_index(
collection_name=collection_name,
field_name=field_name,
field_schema=field_type,
wait=False
)
print(f" ✅ Index: {field_name} ({field_type.value})")
except Exception as e:
if "already exists" in str(e).lower():
print(f" Index exists: {field_name}")
else:
print(f" ⚠️ Index {field_name}: {e}")
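    # Typical retrieval pattern this collection is indexed for (a sketch;
    # the filter fields mirror the indexes created above):
    #
    #   client.search(
    #       collection_name="helion_messages",
    #       query_vector=query_embedding,  # 1024-dim Cohere embedding
    #       query_filter=qmodels.Filter(must=[
    #           qmodels.FieldCondition(key="conversation_id",
    #                                  match=qmodels.MatchValue(value="conv_1")),
    #       ]),
    #       limit=10,
    #   )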
# =========================================================================
# Summary
# =========================================================================
print("\n" + "=" * 60)
print("📊 Qdrant Collections Summary")
print("=" * 60)
for coll in client.get_collections().collections:
info = client.get_collection(coll.name)
print(f"\n{coll.name}:")
print(f" Points: {info.points_count}")
print(f" Vectors: {info.vectors_count}")
print(f" Status: {info.status}")
print("\n✅ Qdrant setup complete!")
return True
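# Quick manual verification (assumes Qdrant's default REST API is exposed):
#   curl http://<host>:6333/collections
#   curl http://<host>:6333/collections/helion_memory_items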
def main():
parser = argparse.ArgumentParser(description="Setup Qdrant collections for Helion Memory")
parser.add_argument("--host", default="localhost", help="Qdrant host")
parser.add_argument("--port", type=int, default=6333, help="Qdrant port")
args = parser.parse_args()
setup_collections(args.host, args.port)
if __name__ == "__main__":
main()