feat: add tests and integrate dots.ocr model

G.2.5 - Tests:
- Add pytest test suite with fixtures
  - test_preprocessing.py - PDF/image loading, normalization, validation
  - test_postprocessing.py - chunks, QA pairs, Markdown generation
  - test_inference.py - dummy parser and inference functions
  - test_api.py - API endpoint tests
- Add pytest.ini configuration

G.1.3 - dots.ocr Integration:
- Update model_loader.py with real model loading code
  - Support for AutoModelForVision2Seq and AutoProcessor
  - Device handling (CUDA/CPU/MPS) with fallback
  - Error handling with dummy fallback option
- Update inference.py with real model inference
  - Process images through model
  - Generate and decode outputs
  - Parse model output to blocks
- Add model_output_parser.py
  - Parse JSON or plain text model output
  - Convert to structured blocks
  - Layout detection support (placeholder)

Dependencies:
- Add pytest, pytest-asyncio, httpx for testing
2025-11-15 13:25:01 -08:00
parent 62cb1d2108
commit 2a353040f6
11 changed files with 848 additions and 47 deletions
--- a/services/parser-service/tests/test_postprocessing.py
+++ b/services/parser-service/tests/test_postprocessing.py
@@ -0,0 +1,193 @@
+"""
+Tests for postprocessing functions
+"""
+
+import pytest
+
+from app.runtime.postprocessing import (
+    normalize_text,
+    build_parsed_document,
+    build_chunks,
+    build_qa_pairs,
+    build_markdown
+)
+from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
+
+
+class TestTextNormalization:
+    """Checks for the whitespace handling of normalize_text."""
+
+    def test_normalize_text_whitespace(self):
+        """Padded, multiply-spaced input comes back single-spaced and trimmed."""
+        result = normalize_text("  hello   world  ")
+        assert result == "hello world"
+
+    def test_normalize_text_newlines(self):
+        """Consecutive newlines between words come back as one space."""
+        result = normalize_text("hello\n\nworld")
+        assert result == "hello world"
+
+    def test_normalize_text_empty(self):
+        """Empty and whitespace-only inputs both come back as ""."""
+        for blank in ("", "   "):
+            assert normalize_text(blank) == ""
+
+
+class TestBuildParsedDocument:
+    """Checks for assembling a ParsedDocument from raw page data."""
+
+    def test_build_parsed_document(self):
+        """A single page holding two padded-text blocks is converted."""
+        heading_block = {
+            "type": "heading",
+            "text": "  Title  ",
+            "bbox": {"x": 0, "y": 0, "width": 100, "height": 20},
+            "reading_order": 1,
+        }
+        paragraph_block = {
+            "type": "paragraph",
+            "text": "  Content  ",
+            "bbox": {"x": 0, "y": 30, "width": 100, "height": 50},
+            "reading_order": 2,
+        }
+        single_page = {
+            "blocks": [heading_block, paragraph_block],
+            "width": 800,
+            "height": 1200,
+        }
+
+        doc = build_parsed_document([single_page], "test-doc", "pdf")
+
+        # Document-level identity is passed through unchanged.
+        assert doc.doc_id == "test-doc"
+        assert doc.doc_type == "pdf"
+
+        # One input page with two blocks gives one output page with two blocks.
+        assert len(doc.pages) == 1
+        assert len(doc.pages[0].blocks) == 2
+
+        # The heading text is expected without its surrounding spaces.
+        first_block = doc.pages[0].blocks[0]
+        assert first_block.text == "Title"
+        assert first_block.type == "heading"
+
+
+class TestBuildChunks:
+    """Checks for turning a ParsedDocument into chunks."""
+
+    def test_build_chunks(self):
+        """Chunks of a one-page document carry page 1, the dao id and the doc id."""
+        # Assemble the document bottom-up: blocks, then the page, then the doc.
+        heading = ParsedBlock(
+            type="heading",
+            text="Section 1",
+            bbox=BBox(x=0, y=0, width=100, height=20),
+            reading_order=1,
+            page_num=1,
+        )
+        paragraph = ParsedBlock(
+            type="paragraph",
+            text="Content of section 1",
+            bbox=BBox(x=0, y=30, width=100, height=50),
+            reading_order=2,
+            page_num=1,
+        )
+        page = ParsedPage(
+            page_num=1,
+            blocks=[heading, paragraph],
+            width=800,
+            height=1200,
+        )
+        doc = ParsedDocument(doc_id="test-doc", doc_type="pdf", pages=[page])
+
+        chunks = build_chunks(doc, dao_id="test-dao")
+
+        assert len(chunks) > 0
+        for chunk in chunks:
+            assert chunk.page == 1
+        for chunk in chunks:
+            assert chunk.metadata.get("dao_id") == "test-dao"
+        for chunk in chunks:
+            assert chunk.metadata.get("doc_id") == "test-doc"
+
+
+class TestBuildQAPairs:
+    """Checks for deriving Q&A pairs from a ParsedDocument."""
+
+    def test_build_qa_pairs(self):
+        """A question heading followed by a paragraph yields string Q&A pairs from page 1."""
+        # Assemble the document bottom-up: blocks, then the page, then the doc.
+        question_heading = ParsedBlock(
+            type="heading",
+            text="What is X?",
+            bbox=BBox(x=0, y=0, width=100, height=20),
+            reading_order=1,
+            page_num=1,
+        )
+        answer_paragraph = ParsedBlock(
+            type="paragraph",
+            text="X is a test",
+            bbox=BBox(x=0, y=30, width=100, height=50),
+            reading_order=2,
+            page_num=1,
+        )
+        page = ParsedPage(
+            page_num=1,
+            blocks=[question_heading, answer_paragraph],
+            width=800,
+            height=1200,
+        )
+        doc = ParsedDocument(doc_id="test-doc", doc_type="pdf", pages=[page])
+
+        qa_pairs = build_qa_pairs(doc, max_pairs=5)
+
+        assert len(qa_pairs) > 0
+        for qa in qa_pairs:
+            assert isinstance(qa.question, str)
+        for qa in qa_pairs:
+            assert isinstance(qa.answer, str)
+        for qa in qa_pairs:
+            assert qa.source_page == 1
+
+
+class TestBuildMarkdown:
+    """Tests for building Markdown"""
+
+    def test_build_markdown(self):
+        """Test building Markdown from a one-page ParsedDocument.
+
+        The document holds a heading ("Title") and a paragraph ("Content").
+        The rendered output must be a string containing both texts and a
+        heading marker of at least three '#' characters.
+        """
+        doc = ParsedDocument(
+            doc_id="test-doc",
+            doc_type="pdf",
+            pages=[
+                ParsedPage(
+                    page_num=1,
+                    blocks=[
+                        ParsedBlock(
+                            type="heading",
+                            text="Title",
+                            bbox=BBox(x=0, y=0, width=100, height=20),
+                            reading_order=1,
+                            page_num=1
+                        ),
+                        ParsedBlock(
+                            type="paragraph",
+                            text="Content",
+                            bbox=BBox(x=0, y=30, width=100, height=50),
+                            reading_order=2,
+                            page_num=1
+                        )
+                    ],
+                    width=800,
+                    height=1200
+                )
+            ]
+        )
+
+        markdown = build_markdown(doc)
+
+        assert isinstance(markdown, str)
+        assert "Title" in markdown
+        assert "Content" in markdown
+        # Heading markers. A single substring check suffices: any "####"
+        # run already contains "###", so a separate check for four hashes
+        # could never change the outcome.
+        assert "###" in markdown
+