microdao-daarion/services/rag-service/tests/test_ingest.py

"""
Tests for ingest pipeline
"""

import pytest
from app.ingest_pipeline import ingest_parsed_document, _parsed_json_to_documents


class TestIngestPipeline:
    """Tests for document ingestion"""

    def test_parsed_json_to_documents(self):
        """Test conversion of parsed JSON to Haystack Documents"""
        parsed_json = {
            "doc_id": "test-doc",
            "doc_type": "pdf",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [
                        {
                            "type": "heading",
                            "text": "Test Document",
                            "bbox": {"x": 0, "y": 0, "width": 800, "height": 50},
                            "reading_order": 1
                        },
                        {
                            "type": "paragraph",
                            "text": "This is test content.",
                            "bbox": {"x": 0, "y": 60, "width": 800, "height": 100},
                            "reading_order": 2
                        }
                    ],
                    "width": 800,
                    "height": 600
                }
            ],
            "metadata": {
                "dao_id": "test-dao",
                "title": "Test Document"
            }
        }

        documents = _parsed_json_to_documents(
            parsed_json=parsed_json,
            dao_id="test-dao",
            doc_id="test-doc"
        )

        assert len(documents) == 2
        assert documents[0].content == "Test Document"
        assert documents[0].meta["dao_id"] == "test-dao"
        assert documents[0].meta["doc_id"] == "test-doc"
        assert documents[0].meta["page"] == 1
        assert documents[0].meta["block_type"] == "heading"

    def test_parsed_json_to_documents_empty_blocks(self):
        """Test that empty blocks are skipped"""
        parsed_json = {
            "doc_id": "test-doc",
            "pages": [
                {
                    "page_num": 1,
                    "blocks": [
                        {"type": "paragraph", "text": ""},
                        {"type": "paragraph", "text": "   "},
                        {"type": "paragraph", "text": "Valid content"}
                    ]
                }
            ],
            "metadata": {}
        }

        documents = _parsed_json_to_documents(
            parsed_json=parsed_json,
            dao_id="test-dao",
            doc_id="test-doc"
        )

        assert len(documents) == 1
        assert documents[0].content == "Valid content"