microdao-daarion/tests/rag_eval.py

#!/usr/bin/env python3
"""
RAG Evaluation Script
Tests RAG quality with fixed questions and saves results
"""

import json
import csv
import time
import sys
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime

import httpx


# Configuration
RAG_URL = "http://localhost:9500"
ROUTER_URL = "http://localhost:9102"
DAO_ID = "daarion"

# Test questions
TEST_QUESTIONS = [
    {
        "id": "q1",
        "question": "Яка роль стейкінгу в microDAO?",
        "expected_doc_ids": ["microdao-tokenomics"],
        "category": "tokenomics"
    },
    {
        "id": "q2",
        "question": "Які основні фази roadmap розгортання?",
        "expected_doc_ids": ["roadmap", "deployment"],
        "category": "roadmap"
    },
    {
        "id": "q3",
        "question": "Поясни архітектуру DAARION.city",
        "expected_doc_ids": ["architecture", "whitepaper"],
        "category": "architecture"
    },
    {
        "id": "q4",
        "question": "Як працює система ролей та RBAC?",
        "expected_doc_ids": ["rbac", "roles"],
        "category": "rbac"
    },
    {
        "id": "q5",
        "question": "Що таке μGOV токен і навіщо він потрібен?",
        "expected_doc_ids": ["microdao-tokenomics", "tokenomics"],
        "category": "tokenomics"
    }
]


async def test_rag_query(question: Dict[str, Any], dao_id: str) -> Dict[str, Any]:
    """Test single RAG query"""
    async with httpx.AsyncClient(timeout=60.0) as client:
        start_time = time.time()

        response = await client.post(
            f"{RAG_URL}/query",
            json={
                "dao_id": dao_id,
                "question": question["question"],
                "top_k": 5
            }
        )

        elapsed = time.time() - start_time
        response.raise_for_status()
        data = response.json()

        # Extract metrics
        metrics = data.get("metrics", {})
        citations = data.get("citations", [])
        answer = data.get("answer", "")

        # Check if expected doc_ids are found
        found_doc_ids = [c.get("doc_id", "") for c in citations]
        expected_found = any(
            expected_id in found_doc_id
            for expected_id in question["expected_doc_ids"]
            for found_doc_id in found_doc_ids
        )

        return {
            "question_id": question["id"],
            "question": question["question"],
            "category": question["category"],
            "answer": answer,
            "answer_length": len(answer),
            "citations_count": len(citations),
            "citations": citations,
            "doc_ids_found": found_doc_ids,
            "expected_doc_found": expected_found,
            "query_time_seconds": elapsed,
            "metrics": metrics,
            "timestamp": datetime.utcnow().isoformat()
        }


async def test_router_query(question: Dict[str, Any], dao_id: str, user_id: str = "test-user") -> Dict[str, Any]:
    """Test query via Router (Memory + RAG)"""
    async with httpx.AsyncClient(timeout=60.0) as client:
        start_time = time.time()

        response = await client.post(
            f"{ROUTER_URL}/route",
            json={
                "mode": "rag_query",
                "dao_id": dao_id,
                "user_id": user_id,
                "payload": {
                    "question": question["question"]
                }
            }
        )

        elapsed = time.time() - start_time
        response.raise_for_status()
        data = response.json()

        # Extract data
        answer = data.get("data", {}).get("text", "")
        citations = data.get("data", {}).get("citations", []) or data.get("metadata", {}).get("citations", [])
        metadata = data.get("metadata", {})

        return {
            "question_id": question["id"],
            "question": question["question"],
            "category": question["category"],
            "answer": answer,
            "answer_length": len(answer),
            "citations_count": len(citations),
            "citations": citations,
            "memory_used": metadata.get("memory_used", False),
            "rag_used": metadata.get("rag_used", False),
            "query_time_seconds": elapsed,
            "metadata": metadata,
            "timestamp": datetime.utcnow().isoformat()
        }


async def run_evaluation(output_dir: Path = Path("tests/rag_eval_results")):
    """Run full evaluation"""
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")

    # Test RAG Service directly
    print("Testing RAG Service directly...")
    rag_results = []
    for question in TEST_QUESTIONS:
        print(f"  Testing: {question['question'][:50]}...")
        try:
            result = await test_rag_query(question, DAO_ID)
            rag_results.append(result)
            print(f"    ✓ Found {result['citations_count']} citations, expected doc: {result['expected_doc_found']}")
        except Exception as e:
            print(f"    ✗ Error: {e}")
            rag_results.append({
                "question_id": question["id"],
                "error": str(e)
            })

    # Test Router (Memory + RAG)
    print("\nTesting Router (Memory + RAG)...")
    router_results = []
    for question in TEST_QUESTIONS:
        print(f"  Testing: {question['question'][:50]}...")
        try:
            result = await test_router_query(question, DAO_ID)
            router_results.append(result)
            print(f"    ✓ Answer length: {result['answer_length']}, citations: {result['citations_count']}")
        except Exception as e:
            print(f"    ✗ Error: {e}")
            router_results.append({
                "question_id": question["id"],
                "error": str(e)
            })

    # Save results
    results_file = output_dir / f"rag_eval_{timestamp}.json"
    with open(results_file, "w", encoding="utf-8") as f:
        json.dump({
            "rag_service_results": rag_results,
            "router_results": router_results,
            "timestamp": timestamp,
            "dao_id": DAO_ID
        }, f, indent=2, ensure_ascii=False)

    # Save CSV summary
    csv_file = output_dir / f"rag_eval_{timestamp}.csv"
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([
            "Question ID", "Question", "Category",
            "RAG Citations", "RAG Expected Found", "RAG Time (s)",
            "Router Citations", "Router Memory Used", "Router Time (s)",
            "Answer Length"
        ])

        for rag_res, router_res in zip(rag_results, router_results):
            writer.writerow([
                rag_res.get("question_id", ""),
                rag_res.get("question", ""),
                rag_res.get("category", ""),
                rag_res.get("citations_count", 0),
                rag_res.get("expected_doc_found", False),
                rag_res.get("query_time_seconds", 0),
                router_res.get("citations_count", 0),
                router_res.get("memory_used", False),
                router_res.get("query_time_seconds", 0),
                router_res.get("answer_length", 0)
            ])

    print(f"\n✓ Results saved:")
    print(f"  JSON: {results_file}")
    print(f"  CSV: {csv_file}")

    # Print summary
    print("\n=== Summary ===")
    rag_avg_time = sum(r.get("query_time_seconds", 0) for r in rag_results) / len(rag_results)
    router_avg_time = sum(r.get("query_time_seconds", 0) for r in router_results) / len(router_results)

    print(f"RAG Service: avg time={rag_avg_time:.2f}s")
    print(f"Router: avg time={router_avg_time:.2f}s")
    print(f"Expected docs found: {sum(1 for r in rag_results if r.get('expected_doc_found', False))}/{len(rag_results)}")


if __name__ == "__main__":
    import asyncio
    asyncio.run(run_evaluation())