feat: add tests and integrate dots.ocr model

G.2.5 - Tests:
- Add pytest test suite with fixtures
- test_preprocessing.py - PDF/image loading, normalization, validation
- test_postprocessing.py - chunks, QA pairs, markdown generation
- test_inference.py - dummy parser and inference functions
- test_api.py - API endpoint tests
- Add pytest.ini configuration

G.1.3 - dots.ocr Integration:
- Update model_loader.py with real model loading code
- Support for AutoModelForVision2Seq and AutoProcessor
- Device handling (CUDA/CPU/MPS) with fallback
- Error handling with dummy fallback option
- Update inference.py with real model inference
- Process images through model
- Generate and decode outputs
- Parse model output to blocks
- Add model_output_parser.py
- Parse JSON or plain text model output
- Convert to structured blocks
- Layout detection support (placeholder)

Dependencies:
- Add pytest, pytest-asyncio, httpx for testing
2025-11-15 13:25:01 -08:00
parent 62cb1d2108
commit 2a353040f6
11 changed files with 848 additions and 47 deletions
--- a/services/parser-service/tests/test_preprocessing.py
+++ b/services/parser-service/tests/test_preprocessing.py
@@ -0,0 +1,123 @@
+"""
+Tests for preprocessing functions
+"""
+
+import pytest
+from PIL import Image
+import io
+
+from app.runtime.preprocessing import (
+    convert_pdf_to_images,
+    load_image,
+    normalize_image,
+    prepare_images_for_model,
+    detect_file_type,
+    validate_file_size
+)
+from app.core.config import settings
+
+
+class TestImageLoading:
+    """Checks for ``load_image``: decoding valid bytes and rejecting garbage."""
+
+    def test_load_image_png(self, sample_image_bytes):
+        """Bytes from the ``sample_image_bytes`` fixture decode to an 800x600 PIL image."""
+        loaded = load_image(sample_image_bytes)
+        assert isinstance(loaded, Image.Image)
+        assert (loaded.width, loaded.height) == (800, 600)
+
+    def test_load_image_invalid(self):
+        """Non-image bytes must raise ``ValueError`` matching "Image loading failed"."""
+        with pytest.raises(ValueError, match="Image loading failed"):
+            # Plain text bytes, not a decodable image.
+            load_image(b"not an image")
+
+
+class TestPDFConversion:
+    """Checks for ``convert_pdf_to_images`` using the ``sample_pdf_bytes`` fixture."""
+
+    def test_convert_pdf_to_images(self, sample_pdf_bytes):
+        """Converting with ``dpi=150`` and ``max_pages=1`` yields at least one PIL image."""
+        pages = convert_pdf_to_images(sample_pdf_bytes, dpi=150, max_pages=1)
+        assert len(pages) > 0
+        assert all(isinstance(page, Image.Image) for page in pages)
+
+    def test_convert_pdf_max_pages(self, sample_pdf_bytes):
+        """With ``max_pages=1`` no more than one page image is returned."""
+        pages = convert_pdf_to_images(sample_pdf_bytes, max_pages=1)
+        assert len(pages) <= 1
+
+
+class TestImageNormalization:
+    """Checks for ``normalize_image``: colour mode and size limiting."""
+
+    def test_normalize_image_rgb(self, sample_image_bytes):
+        """A loaded sample image comes back in RGB mode."""
+        image = load_image(sample_image_bytes)
+        normalized = normalize_image(image)
+        assert normalized.mode == 'RGB'
+
+    def test_normalize_image_resize(self):
+        """An image larger than ``max_size`` is shrunk so that both sides fit."""
+        # Only the width (3000) exceeds 2048; an "or" check would pass even without resizing.
+        large_img = Image.new('RGB', (3000, 2000), color='white')
+        normalized = normalize_image(large_img, max_size=2048)
+        assert normalized.width <= 2048 and normalized.height <= 2048
+
+    def test_normalize_image_small(self):
+        """An image already within ``max_size`` keeps its original dimensions."""
+        small_img = Image.new('RGB', (500, 400), color='white')
+        normalized = normalize_image(small_img, max_size=2048)
+        assert normalized.size == small_img.size
+
+
+class TestFileTypeDetection:
+    """Checks for ``detect_file_type`` with and without a filename argument."""
+
+    def test_detect_pdf(self, sample_pdf_bytes):
+        """PDF bytes are reported as "pdf", with or without a ``.pdf`` filename."""
+        for call_args in ((sample_pdf_bytes,), (sample_pdf_bytes, "test.pdf")):
+            assert detect_file_type(*call_args) == "pdf"
+
+    def test_detect_image(self, sample_image_bytes):
+        """Image bytes are reported as "image", with or without a ``.png`` filename."""
+        for call_args in ((sample_image_bytes,), (sample_image_bytes, "test.png")):
+            assert detect_file_type(*call_args) == "image"
+
+    def test_detect_unsupported(self):
+        """Unrecognised bytes with an unknown extension raise ``ValueError``."""
+        with pytest.raises(ValueError, match="Unsupported file type"):
+            detect_file_type(b"random bytes", "test.xyz")
+
+
+class TestFileSizeValidation:
+    """Checks for ``validate_file_size``; assumes the limit lies between 10 MB and 100 MB (confirm)."""
+
+    def test_validate_file_size_ok(self):
+        """A 10 MB payload passes validation without raising."""
+        payload_size = 10 * 1024 * 1024  # 10 MB
+        validate_file_size(b"x" * payload_size)  # Must return without raising
+
+    def test_validate_file_size_too_large(self):
+        """A 100 MB payload is rejected with ``ValueError`` matching "exceeds maximum"."""
+        payload_size = 100 * 1024 * 1024  # 100 MB
+        with pytest.raises(ValueError, match="exceeds maximum"):
+            validate_file_size(b"x" * payload_size)
+
+
+class TestPrepareImages:
+    """Checks for ``prepare_images_for_model`` on one-image and empty inputs."""
+
+    def test_prepare_images_for_model(self, sample_image_bytes):
+        """A single loaded image yields exactly one RGB PIL image."""
+        prepared = prepare_images_for_model([load_image(sample_image_bytes)])
+        assert len(prepared) == 1
+        only = prepared[0]
+        assert isinstance(only, Image.Image)
+        assert only.mode == 'RGB'
+
+    def test_prepare_images_empty(self):
+        """An empty input list produces an empty result."""
+        result = prepare_images_for_model([])
+        assert len(result) == 0
+