feat: implement PDF/image preprocessing, post-processing, and dots.ocr integration prep

G.2.3 - PDF/Image Support: - Add preprocessing.py with PDF→images conversion (pdf2image) - Add image loading and normalization - Add file type detection and validation - Support for PDF, PNG, JPEG, WebP, TIFF G.2.4 - Pre/Post-processing: - Add postprocessing.py with structured output builders - build_chunks() - semantic chunks for RAG - build_qa_pairs() - Q&A extraction - build_markdown() - Markdown conversion - Text normalization and chunking logic G.1.3 - dots.ocr Integration Prep: - Update model_loader.py with proper error handling - Add USE_DUMMY_PARSER and ALLOW_DUMMY_FALLBACK flags - Update inference.py to work with images list - Add parse_document_from_images() function - Ready for actual model integration Configuration: - Add PDF_DPI, IMAGE_MAX_SIZE, PAGE_RANGE settings - Add parser mode flags (USE_DUMMY_PARSER, ALLOW_DUMMY_FALLBACK) API Updates: - Update endpoints to use new preprocessing pipeline - Integrate post-processing for all output modes - Remove temp file handling (work directly with bytes)
2025-11-15 13:19:07 -08:00
parent 0f6cfe046f
commit 4befecc425
6 changed files with 762 additions and 122 deletions
--- a/services/parser-service/app/runtime/preprocessing.py
+++ b/services/parser-service/app/runtime/preprocessing.py
@@ -0,0 +1,198 @@
+"""
+Preprocessing functions for PDF and images
+"""
+
+import logging
+from typing import List, Optional
+from io import BytesIO
+from pathlib import Path
+
+from PIL import Image
+import pdf2image
+
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+
+def convert_pdf_to_images(
+    pdf_bytes: bytes,
+    dpi: Optional[int] = None,
+    max_pages: Optional[int] = None
+) -> List[Image.Image]:
+    """
+    Convert PDF bytes to list of PIL Images
+
+    Args:
+        pdf_bytes: PDF file content as bytes
+        dpi: DPI for conversion; a falsy value falls back to
+            settings.PDF_DPI, or 200 when that setting is not defined
+        max_pages: Last page to render, counting from page 1; a falsy
+            value falls back to settings.PARSER_MAX_PAGES
+
+    Returns:
+        List of PIL Images (one per rendered page)
+
+    Raises:
+        ValueError: If the conversion fails for any reason. The underlying
+            exception is chained as ``__cause__``.
+    """
+    dpi = dpi or getattr(settings, 'PDF_DPI', 200)
+    max_pages = max_pages or settings.PARSER_MAX_PAGES
+
+    # Keep the try body down to the single call that is expected to fail.
+    try:
+        images = pdf2image.convert_from_bytes(
+            pdf_bytes,
+            dpi=dpi,
+            first_page=1,
+            last_page=max_pages
+        )
+    except Exception as e:
+        logger.error(f"Failed to convert PDF to images: {e}", exc_info=True)
+        # Chain the cause so callers can still inspect the original error.
+        raise ValueError(f"PDF conversion failed: {str(e)}") from e
+
+    logger.info(f"Converted PDF to {len(images)} images (DPI: {dpi}, max_pages: {max_pages})")
+    return images
+
+
+def load_image(image_bytes: bytes) -> Image.Image:
+    """
+    Load and fully decode an image from bytes
+
+    Args:
+        image_bytes: Image file content as bytes
+
+    Returns:
+        PIL Image whose pixel data has already been decoded
+
+    Raises:
+        ValueError: If the bytes cannot be opened or decoded as an image.
+            The underlying exception is chained as ``__cause__``.
+    """
+    try:
+        image = Image.open(BytesIO(image_bytes))
+        # Image.open() only reads the header; force the decode here so that
+        # truncated or corrupt data is reported by this function instead of
+        # surfacing later, outside the ValueError contract.
+        image.load()
+    except Exception as e:
+        logger.error(f"Failed to load image: {e}", exc_info=True)
+        raise ValueError(f"Image loading failed: {str(e)}") from e
+
+    logger.info(f"Loaded image: {image.format}, size: {image.size}")
+    return image
+
+
+def normalize_image(
+    image: Image.Image,
+    max_size: Optional[int] = None
+) -> Image.Image:
+    """
+    Normalize image for model input
+
+    - Convert to RGB
+    - Downscale so the longest side is at most max_size
+      (preserving aspect ratio); smaller images are not upscaled
+
+    Args:
+        image: PIL Image
+        max_size: Maximum size for longest side; a falsy value falls back
+            to settings.IMAGE_MAX_SIZE, or 2048 when that setting is not
+            defined
+
+    Returns:
+        Normalized PIL Image. The input object itself is returned when it
+        is already RGB and within max_size.
+    """
+    max_size = max_size or getattr(settings, 'IMAGE_MAX_SIZE', 2048)
+
+    # Convert to RGB if needed
+    if image.mode != 'RGB':
+        image = image.convert('RGB')
+
+    # Resize if needed (preserve aspect ratio)
+    width, height = image.size
+    if width > max_size or height > max_size:
+        # The longer side is pinned to max_size. The shorter side is scaled
+        # and clamped to 1px, because int() truncation yields 0 for extreme
+        # aspect ratios (e.g. 10000x1) and resize() rejects a zero dimension.
+        if width > height:
+            new_width = max_size
+            new_height = max(1, int(height * (max_size / width)))
+        else:
+            new_height = max_size
+            new_width = max(1, int(width * (max_size / height)))
+
+        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+        logger.info(f"Resized image from {width}x{height} to {new_width}x{new_height}")
+
+    return image
+
+
+def prepare_images_for_model(
+    images: List[Image.Image],
+    max_size: Optional[int] = None
+) -> List[Image.Image]:
+    """
+    Run normalize_image over every image, best-effort
+
+    An image whose normalization raises is logged as a warning and left
+    out of the result, so the output may be shorter than the input.
+
+    Args:
+        images: List of PIL Images
+        max_size: Maximum size for longest side (passed to normalize_image)
+
+    Returns:
+        List of normalized PIL Images, in input order
+    """
+    prepared: List[Image.Image] = []
+
+    # Count from 1 so the warning reports a human-friendly position.
+    for position, page in enumerate(images, start=1):
+        try:
+            result = normalize_image(page, max_size)
+        except Exception as e:
+            # Skip problematic images
+            logger.warning(f"Failed to normalize image {position}: {e}")
+        else:
+            prepared.append(result)
+
+    logger.info(f"Prepared {len(prepared)} images for model")
+    return prepared
+
+
+def detect_file_type(content: bytes, filename: Optional[str] = None) -> str:
+    """
+    Detect file type from content and/or filename
+
+    Checks run in this order and the first match wins:
+    1. PDF magic bytes at the start of the content
+    2. Filename extension (trusted as-is; the content is not verified)
+    3. Whether Pillow can open the content as an image
+
+    Args:
+        content: File content as bytes
+        filename: Optional filename (for extension detection)
+
+    Returns:
+        File type: "pdf" or "image"
+
+    Raises:
+        ValueError: If none of the checks recognizes the file
+    """
+    # Check magic bytes
+    if content.startswith(b'%PDF'):
+        return "pdf"
+
+    # Check by extension if available
+    if filename:
+        ext = Path(filename).suffix.lower()
+        if ext == '.pdf':
+            return "pdf"
+        if ext in ('.png', '.jpg', '.jpeg', '.webp', '.tiff', '.tif'):
+            return "image"
+
+    # Try to open as image. The context manager releases the probe image,
+    # and catching Exception (not a bare except) lets KeyboardInterrupt
+    # and SystemExit propagate.
+    try:
+        with Image.open(BytesIO(content)):
+            return "image"
+    except Exception as e:
+        logger.debug(f"Content not recognized as an image: {e}")
+
+    raise ValueError("Unsupported file type. Expected PDF or image (PNG/JPEG/WebP/TIFF)")
+
+
+def validate_file_size(content: bytes) -> None:
+    """
+    Reject content larger than settings.MAX_FILE_SIZE_MB
+
+    Args:
+        content: File content as bytes
+
+    Raises:
+        ValueError if file is too large
+    """
+    actual_bytes = len(content)
+    limit_bytes = settings.MAX_FILE_SIZE_MB * 1024 * 1024
+
+    # Guard clause: content at or under the limit is accepted.
+    if actual_bytes <= limit_bytes:
+        return
+
+    actual_mb = actual_bytes / 1024 / 1024
+    raise ValueError(
+        f"File size ({actual_mb:.2f} MB) exceeds maximum "
+        f"({settings.MAX_FILE_SIZE_MB} MB)"
+    )
+