feat: add tests and integrate dots.ocr model

G.2.5 - Tests:
- Add pytest test suite with fixtures
  - test_preprocessing.py - PDF/image loading, normalization, validation
  - test_postprocessing.py - chunks, QA pairs, markdown generation
  - test_inference.py - dummy parser and inference functions
  - test_api.py - API endpoint tests
- Add pytest.ini configuration

G.1.3 - dots.ocr Integration:
- Update model_loader.py with real model loading code
  - Support for AutoModelForVision2Seq and AutoProcessor
  - Device handling (CUDA/CPU/MPS) with fallback
  - Error handling with dummy fallback option
- Update inference.py with real model inference
  - Process images through model
  - Generate and decode outputs
  - Parse model output to blocks
- Add model_output_parser.py
  - Parse JSON or plain text model output
  - Convert to structured blocks
  - Layout detection support (placeholder)

Dependencies:
- Add pytest, pytest-asyncio, httpx for testing
2025-11-15 13:25:01 -08:00
parent 62cb1d2108
commit 2a353040f6
11 changed files with 848 additions and 47 deletions
--- a/services/parser-service/app/runtime/inference.py
+++ b/services/parser-service/app/runtime/inference.py
@@ -6,6 +6,7 @@ import logging
 from typing import Literal, Optional, List
 from pathlib import Path

+import torch
 from PIL import Image

 from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
@@ -14,6 +15,7 @@ from app.runtime.preprocessing import (
    convert_pdf_to_images, load_image, prepare_images_for_model
 )
 from app.runtime.postprocessing import build_parsed_document
+from app.runtime.model_output_parser import parse_model_output_to_blocks
 from app.core.config import settings

 logger = logging.getLogger(__name__)
@@ -63,36 +65,46 @@ def parse_document_from_images(
    
    for idx, image in enumerate(prepared_images, start=1):
        try:
-            # TODO: Implement actual inference with dots.ocr
-            # Example:
-            # inputs = model["processor"](images=image, return_tensors="pt")
-            # outputs = model["model"].generate(**inputs)
-            # text = model["processor"].decode(outputs[0], skip_special_tokens=True)
-            # 
-            # # Parse model output into blocks
-            # blocks = parse_model_output_to_blocks(text, image.size)
-            # 
-            # pages_data.append({
-            #     "blocks": blocks,
-            #     "width": image.width,
-            #     "height": image.height
-            # })
+            # Prepare inputs for model
+            inputs = model["processor"](images=image, return_tensors="pt")
+            
+            # Move inputs to device
+            device = model["device"]
+            if device != "cpu":
+                inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v 
+                         for k, v in inputs.items()}
+            
+            # Generate output
+            with torch.no_grad():
+                outputs = model["model"].generate(
+                    **inputs,
+                    max_new_tokens=2048,  # Adjust based on model capabilities
+                    do_sample=False  # Deterministic output
+                )
+            
+            # Decode output
+            generated_text = model["processor"].decode(
+                outputs[0], 
+                skip_special_tokens=True
+            )
+            
+            logger.debug(f"Model output for page {idx}: {generated_text[:100]}...")
+            
+            # Parse model output into blocks
+            blocks = parse_model_output_to_blocks(
+                generated_text,
+                image.size,
+                page_num=idx
+            )
            
-            # For now, use dummy for each page
-            logger.debug(f"Processing page {idx} with model (placeholder)")
            pages_data.append({
-                "blocks": [
-                    {
-                        "type": "paragraph",
-                        "text": f"Page {idx} content (model output placeholder)",
-                        "bbox": {"x": 0, "y": 0, "width": image.width, "height": image.height},
-                        "reading_order": 1
-                    }
-                ],
+                "blocks": blocks,
                "width": image.width,
                "height": image.height
            })
            
+            logger.info(f"Processed page {idx}/{len(prepared_images)}")
+            
        except Exception as e:
            logger.error(f"Error processing page {idx}: {e}", exc_info=True)
            # Continue with other pages