feat: implement RAG Service MVP with PARSER + Memory integration
RAG Service Implementation: - Create rag-service/ with full structure (config, document_store, embedding, pipelines) - Document Store: PostgreSQL + pgvector via Haystack - Embedding: BAAI/bge-m3 (multilingual, 1024 dim) - Ingest Pipeline: Convert ParsedDocument to Haystack Documents, embed, index - Query Pipeline: Retrieve documents, generate answers via DAGI Router - FastAPI endpoints: /ingest, /query, /health Tests: - Unit tests for ingest and query pipelines - E2E test with example parsed JSON - Test fixtures with real PARSER output example Router Integration: - Add mode='rag_query' routing rule in router-config.yml - Priority 7, uses local_qwen3_8b for RAG queries Docker: - Add rag-service to docker-compose.yml - Configure dependencies (router, city-db) - Add model cache volume Documentation: - Complete README with API examples - Integration guides for PARSER and Router
This commit is contained in:
82
services/rag-service/tests/test_ingest.py
Normal file
82
services/rag-service/tests/test_ingest.py
Normal file
@@ -0,0 +1,82 @@
"""
Tests for ingest pipeline
"""

import pytest

from app.ingest_pipeline import ingest_parsed_document, _parsed_json_to_documents


class TestIngestPipeline:
    """Unit tests for the parsed-JSON -> Haystack Document conversion step."""

    def test_parsed_json_to_documents(self):
        """A two-block page yields two Documents carrying per-block metadata."""
        # Blocks are named up front so the page structure below stays readable.
        heading_block = {
            "type": "heading",
            "text": "Test Document",
            "bbox": {"x": 0, "y": 0, "width": 800, "height": 50},
            "reading_order": 1,
        }
        paragraph_block = {
            "type": "paragraph",
            "text": "This is test content.",
            "bbox": {"x": 0, "y": 60, "width": 800, "height": 100},
            "reading_order": 2,
        }
        payload = {
            "doc_id": "test-doc",
            "doc_type": "pdf",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [heading_block, paragraph_block],
                    "width": 800,
                    "height": 600,
                }
            ],
            "metadata": {
                "dao_id": "test-dao",
                "title": "Test Document",
            },
        }

        documents = _parsed_json_to_documents(
            parsed_json=payload,
            dao_id="test-dao",
            doc_id="test-doc",
        )

        # One Document per non-empty block, in reading order.
        assert len(documents) == 2
        first = documents[0]
        assert first.content == "Test Document"
        assert first.meta["dao_id"] == "test-dao"
        assert first.meta["doc_id"] == "test-doc"
        assert first.meta["page"] == 1
        assert first.meta["block_type"] == "heading"

    def test_parsed_json_to_documents_empty_blocks(self):
        """Blocks whose text is empty or whitespace-only are skipped."""
        payload = {
            "doc_id": "test-doc",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [
                        {"type": "paragraph", "text": ""},
                        {"type": "paragraph", "text": " "},
                        {"type": "paragraph", "text": "Valid content"},
                    ],
                }
            ],
            "metadata": {},
        }

        documents = _parsed_json_to_documents(
            parsed_json=payload,
            dao_id="test-dao",
            doc_id="test-doc",
        )

        # Only the block with real text survives the conversion.
        assert len(documents) == 1
        assert documents[0].content == "Valid content"
Reference in New Issue
Block a user