feat: implement RAG Service MVP with PARSER + Memory integration
RAG Service Implementation:
- Create rag-service/ with full structure (config, document_store, embedding, pipelines)
- Document Store: PostgreSQL + pgvector via Haystack
- Embedding: BAAI/bge-m3 (multilingual, 1024 dim)
- Ingest Pipeline: Convert ParsedDocument to Haystack Documents, embed, index
- Query Pipeline: Retrieve documents, generate answers via DAGI Router
- FastAPI endpoints: /ingest, /query, /health

Tests:
- Unit tests for ingest and query pipelines
- E2E test with example parsed JSON
- Test fixtures with real PARSER output example

Router Integration:
- Add mode='rag_query' routing rule in router-config.yml
- Priority 7, uses local_qwen3_8b for RAG queries

Docker:
- Add rag-service to docker-compose.yml
- Configure dependencies (router, city-db)
- Add model cache volume

Documentation:
- Complete README with API examples
- Integration guides for PARSER and Router
This commit is contained in:
0
services/rag-service/tests/__init__.py
Normal file
0
services/rag-service/tests/__init__.py
Normal file
56
services/rag-service/tests/fixtures/parsed_json_example.json
vendored
Normal file
56
services/rag-service/tests/fixtures/parsed_json_example.json
vendored
Normal file
@@ -0,0 +1,56 @@
{
  "doc_id": "microdao-tokenomics-2025-11",
  "doc_type": "pdf",
  "pages": [
    {
      "page_num": 1,
      "blocks": [
        {
          "type": "heading",
          "text": "Токеноміка MicroDAO",
          "bbox": {"x": 0, "y": 0, "width": 800, "height": 50},
          "reading_order": 1
        },
        {
          "type": "paragraph",
          "text": "MicroDAO використовує токен μGOV як ключ доступу до приватних спільнот. Стейкінг μGOV дозволяє отримувати дохід та участь у управлінні.",
          "bbox": {"x": 0, "y": 60, "width": 800, "height": 100},
          "reading_order": 2
        },
        {
          "type": "paragraph",
          "text": "Стейкінг є основним механізмом отримання доходу в MicroDAO. Користувачі можуть стейкати токени та отримувати винагороди за участь у спільноті.",
          "bbox": {"x": 0, "y": 170, "width": 800, "height": 100},
          "reading_order": 3
        }
      ],
      "width": 800,
      "height": 600
    },
    {
      "page_num": 2,
      "blocks": [
        {
          "type": "heading",
          "text": "Роль стейкінгу",
          "bbox": {"x": 0, "y": 0, "width": 800, "height": 50},
          "reading_order": 1
        },
        {
          "type": "paragraph",
          "text": "Стейкінг μGOV токенів дає користувачам право голосу та доступ до приватних каналів спільноти. Мінімальна сума стейкінгу визначається кожною спільнотою окремо.",
          "bbox": {"x": 0, "y": 60, "width": 800, "height": 100},
          "reading_order": 2
        }
      ],
      "width": 800,
      "height": 600
    }
  ],
  "metadata": {
    "dao_id": "daarion",
    "title": "Токеноміка MicroDAO",
    "created_at": "2025-01-15T10:00:00Z"
  }
}
67
services/rag-service/tests/test_e2e.py
Normal file
67
services/rag-service/tests/test_e2e.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""
|
||||
E2E tests for RAG Service
|
||||
Tests full ingest → query pipeline
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import json
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from app.main import app
|
||||
|
||||
client = TestClient(app)
|
||||
|
||||
# Load example parsed JSON
|
||||
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
||||
EXAMPLE_JSON = json.loads((FIXTURES_DIR / "parsed_json_example.json").read_text())
|
||||
|
||||
|
||||
class TestE2E:
|
||||
"""End-to-end tests"""
|
||||
|
||||
def test_health(self):
|
||||
"""Test health endpoint"""
|
||||
response = client.get("/health")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["status"] == "healthy"
|
||||
assert data["service"] == "rag-service"
|
||||
|
||||
@pytest.mark.skip(reason="Requires database connection")
|
||||
def test_ingest_then_query(self):
|
||||
"""Test full pipeline: ingest → query"""
|
||||
# Step 1: Ingest document
|
||||
ingest_request = {
|
||||
"dao_id": "daarion",
|
||||
"doc_id": "microdao-tokenomics-2025-11",
|
||||
"parsed_json": EXAMPLE_JSON
|
||||
}
|
||||
|
||||
ingest_response = client.post("/ingest", json=ingest_request)
|
||||
assert ingest_response.status_code == 200
|
||||
ingest_data = ingest_response.json()
|
||||
assert ingest_data["status"] == "success"
|
||||
assert ingest_data["doc_count"] > 0
|
||||
|
||||
# Step 2: Query
|
||||
query_request = {
|
||||
"dao_id": "daarion",
|
||||
"question": "Поясни токеноміку microDAO і роль стейкінгу"
|
||||
}
|
||||
|
||||
query_response = client.post("/query", json=query_request)
|
||||
assert query_response.status_code == 200
|
||||
query_data = query_response.json()
|
||||
|
||||
assert "answer" in query_data
|
||||
assert len(query_data["answer"]) > 0
|
||||
assert "citations" in query_data
|
||||
assert len(query_data["citations"]) > 0
|
||||
|
||||
# Check citation structure
|
||||
citation = query_data["citations"][0]
|
||||
assert "doc_id" in citation
|
||||
assert "page" in citation
|
||||
assert "excerpt" in citation
|
||||
|
||||
82
services/rag-service/tests/test_ingest.py
Normal file
82
services/rag-service/tests/test_ingest.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""
|
||||
Tests for ingest pipeline
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from app.ingest_pipeline import ingest_parsed_document, _parsed_json_to_documents
|
||||
|
||||
|
||||
class TestIngestPipeline:
|
||||
"""Tests for document ingestion"""
|
||||
|
||||
def test_parsed_json_to_documents(self):
|
||||
"""Test conversion of parsed JSON to Haystack Documents"""
|
||||
parsed_json = {
|
||||
"doc_id": "test-doc",
|
||||
"doc_type": "pdf",
|
||||
"pages": [
|
||||
{
|
||||
"page_num": 1,
|
||||
"blocks": [
|
||||
{
|
||||
"type": "heading",
|
||||
"text": "Test Document",
|
||||
"bbox": {"x": 0, "y": 0, "width": 800, "height": 50},
|
||||
"reading_order": 1
|
||||
},
|
||||
{
|
||||
"type": "paragraph",
|
||||
"text": "This is test content.",
|
||||
"bbox": {"x": 0, "y": 60, "width": 800, "height": 100},
|
||||
"reading_order": 2
|
||||
}
|
||||
],
|
||||
"width": 800,
|
||||
"height": 600
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"dao_id": "test-dao",
|
||||
"title": "Test Document"
|
||||
}
|
||||
}
|
||||
|
||||
documents = _parsed_json_to_documents(
|
||||
parsed_json=parsed_json,
|
||||
dao_id="test-dao",
|
||||
doc_id="test-doc"
|
||||
)
|
||||
|
||||
assert len(documents) == 2
|
||||
assert documents[0].content == "Test Document"
|
||||
assert documents[0].meta["dao_id"] == "test-dao"
|
||||
assert documents[0].meta["doc_id"] == "test-doc"
|
||||
assert documents[0].meta["page"] == 1
|
||||
assert documents[0].meta["block_type"] == "heading"
|
||||
|
||||
def test_parsed_json_to_documents_empty_blocks(self):
|
||||
"""Test that empty blocks are skipped"""
|
||||
parsed_json = {
|
||||
"doc_id": "test-doc",
|
||||
"pages": [
|
||||
{
|
||||
"page_num": 1,
|
||||
"blocks": [
|
||||
{"type": "paragraph", "text": ""},
|
||||
{"type": "paragraph", "text": " "},
|
||||
{"type": "paragraph", "text": "Valid content"}
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {}
|
||||
}
|
||||
|
||||
documents = _parsed_json_to_documents(
|
||||
parsed_json=parsed_json,
|
||||
dao_id="test-dao",
|
||||
doc_id="test-doc"
|
||||
)
|
||||
|
||||
assert len(documents) == 1
|
||||
assert documents[0].content == "Valid content"
|
||||
|
||||
50
services/rag-service/tests/test_query.py
Normal file
50
services/rag-service/tests/test_query.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""
|
||||
Tests for query pipeline
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, patch, MagicMock
|
||||
from app.query_pipeline import answer_query, _build_citations
|
||||
|
||||
|
||||
class TestQueryPipeline:
|
||||
"""Tests for RAG query pipeline"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_answer_query_no_documents(self):
|
||||
"""Test query when no documents found"""
|
||||
with patch("app.query_pipeline._retrieve_documents", return_value=[]):
|
||||
result = await answer_query(
|
||||
dao_id="test-dao",
|
||||
question="Test question"
|
||||
)
|
||||
|
||||
assert "answer" in result
|
||||
assert "На жаль, я не знайшов" in result["answer"]
|
||||
assert result["citations"] == []
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_build_citations(self):
|
||||
"""Test citation building"""
|
||||
from haystack.schema import Document
|
||||
|
||||
documents = [
|
||||
Document(
|
||||
content="Test content 1",
|
||||
meta={"doc_id": "doc1", "page": 1, "section": "Section 1"}
|
||||
),
|
||||
Document(
|
||||
content="Test content 2",
|
||||
meta={"doc_id": "doc2", "page": 2}
|
||||
)
|
||||
]
|
||||
|
||||
citations = _build_citations(documents)
|
||||
|
||||
assert len(citations) == 2
|
||||
assert citations[0]["doc_id"] == "doc1"
|
||||
assert citations[0]["page"] == 1
|
||||
assert citations[0]["section"] == "Section 1"
|
||||
assert citations[1]["doc_id"] == "doc2"
|
||||
assert citations[1]["page"] == 2
|
||||
|
||||
Reference in New Issue
Block a user