Files
microdao-daarion/services/rag-service/tests/test_ingest.py
Apple 9b86f9a694 feat: implement RAG Service MVP with PARSER + Memory integration
RAG Service Implementation:
- Create rag-service/ with full structure (config, document_store, embedding, pipelines)
- Document Store: PostgreSQL + pgvector via Haystack
- Embedding: BAAI/bge-m3 (multilingual, 1024 dim)
- Ingest Pipeline: Convert ParsedDocument to Haystack Documents, embed, index
- Query Pipeline: Retrieve documents, generate answers via DAGI Router
- FastAPI endpoints: /ingest, /query, /health

Tests:
- Unit tests for ingest and query pipelines
- E2E test with example parsed JSON
- Test fixtures with real PARSER output example

Router Integration:
- Add mode='rag_query' routing rule in router-config.yml
- Priority 7, uses local_qwen3_8b for RAG queries

Docker:
- Add rag-service to docker-compose.yml
- Configure dependencies (router, city-db)
- Add model cache volume

Documentation:
- Complete README with API examples
- Integration guides for PARSER and Router
2025-11-16 04:41:53 -08:00

83 lines
2.6 KiB
Python

"""
Tests for ingest pipeline
"""
import pytest
from app.ingest_pipeline import ingest_parsed_document, _parsed_json_to_documents
class TestIngestPipeline:
    """Tests for document ingestion.

    Both tests exercise ``_parsed_json_to_documents`` directly with small,
    hand-written parsed-document payloads, so they need no document store
    or embedding model.
    """

    def test_parsed_json_to_documents(self):
        """Convert a one-page parsed JSON payload into Documents.

        The payload has two non-empty blocks (a heading and a paragraph);
        the test expects one document per block and checks the content and
        metadata of the first one.
        """
        parsed_json = {
            "doc_id": "test-doc",
            "doc_type": "pdf",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [
                        {
                            "type": "heading",
                            "text": "Test Document",
                            "bbox": {"x": 0, "y": 0, "width": 800, "height": 50},
                            "reading_order": 1
                        },
                        {
                            "type": "paragraph",
                            "text": "This is test content.",
                            "bbox": {"x": 0, "y": 60, "width": 800, "height": 100},
                            "reading_order": 2
                        }
                    ],
                    "width": 800,
                    "height": 600
                }
            ],
            "metadata": {
                "dao_id": "test-dao",
                "title": "Test Document"
            }
        }

        documents = _parsed_json_to_documents(
            parsed_json=parsed_json,
            dao_id="test-dao",
            doc_id="test-doc"
        )

        # One document per non-empty block.
        assert len(documents) == 2

        # The first document corresponds to the heading block on page 1.
        assert documents[0].content == "Test Document"
        assert documents[0].meta["dao_id"] == "test-dao"
        assert documents[0].meta["doc_id"] == "test-doc"
        assert documents[0].meta["page"] == 1
        assert documents[0].meta["block_type"] == "heading"

    def test_parsed_json_to_documents_empty_blocks(self):
        """Check that blocks with empty or whitespace-only text are skipped.

        Of the three paragraph blocks supplied, only the one carrying real
        text is expected to produce a document.
        """
        parsed_json = {
            "doc_id": "test-doc",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [
                        {"type": "paragraph", "text": ""},
                        {"type": "paragraph", "text": " "},
                        {"type": "paragraph", "text": "Valid content"}
                    ]
                }
            ],
            "metadata": {}
        }

        documents = _parsed_json_to_documents(
            parsed_json=parsed_json,
            dao_id="test-dao",
            doc_id="test-doc"
        )

        assert len(documents) == 1
        assert documents[0].content == "Valid content"