G.2.3 - PDF/Image Support: - Add preprocessing.py with PDF→images conversion (pdf2image) - Add image loading and normalization - Add file type detection and validation - Support for PDF, PNG, JPEG, WebP, TIFF G.2.4 - Pre/Post-processing: - Add postprocessing.py with structured output builders - build_chunks() - semantic chunks for RAG - build_qa_pairs() - Q&A extraction - build_markdown() - Markdown conversion - Text normalization and chunking logic G.1.3 - dots.ocr Integration Prep: - Update model_loader.py with proper error handling - Add USE_DUMMY_PARSER and ALLOW_DUMMY_FALLBACK flags - Update inference.py to work with images list - Add parse_document_from_images() function - Ready for actual model integration Configuration: - Add PDF_DPI, IMAGE_MAX_SIZE, PAGE_RANGE settings - Add parser mode flags (USE_DUMMY_PARSER, ALLOW_DUMMY_FALLBACK) API Updates: - Update endpoints to use new preprocessing pipeline - Integrate post-processing for all output modes - Remove temp file handling (work directly with bytes)
199 lines
5.1 KiB
Python
199 lines
5.1 KiB
Python
"""
|
|
Preprocessing functions for PDF and images
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Optional
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
|
|
from PIL import Image
|
|
import pdf2image
|
|
|
|
from app.core.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def convert_pdf_to_images(
    pdf_bytes: bytes,
    dpi: Optional[int] = None,
    max_pages: Optional[int] = None
) -> List[Image.Image]:
    """
    Convert PDF bytes to a list of PIL Images

    Pages are rendered with pdf2image, starting at page 1 and stopping at
    ``max_pages``.

    Args:
        pdf_bytes: PDF file content as bytes
        dpi: DPI for conversion. A falsy value (None or 0) falls back to
            settings.PDF_DPI, or to 200 when that setting is not defined.
        max_pages: Last page to render. A falsy value (None or 0) falls
            back to settings.PARSER_MAX_PAGES.

    Returns:
        List of PIL Images (one per rendered page)

    Raises:
        ValueError: If the conversion fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    dpi = dpi or getattr(settings, 'PDF_DPI', 200)
    max_pages = max_pages or settings.PARSER_MAX_PAGES

    # Keep the try body limited to the call that can actually fail.
    try:
        images = pdf2image.convert_from_bytes(
            pdf_bytes,
            dpi=dpi,
            first_page=1,
            last_page=max_pages
        )
    except Exception as e:
        logger.error(f"Failed to convert PDF to images: {e}", exc_info=True)
        # Callers get a uniform ValueError; chain the cause so the real
        # failure stays visible in tracebacks.
        raise ValueError(f"PDF conversion failed: {str(e)}") from e

    logger.info(f"Converted PDF to {len(images)} images (DPI: {dpi}, max_pages: {max_pages})")

    return images
|
|
|
|
|
|
def load_image(image_bytes: bytes) -> Image.Image:
    """
    Load image from bytes

    Args:
        image_bytes: Image file content as bytes

    Returns:
        PIL Image opened from an in-memory buffer

    Raises:
        ValueError: If the bytes cannot be opened as an image. The original
            exception is attached as ``__cause__``.
    """
    try:
        # NOTE: Pillow documents Image.open as lazy (it identifies the file
        # but defers decoding pixel data), so corrupt pixel data may only
        # surface when the image is first processed, not here.
        image = Image.open(BytesIO(image_bytes))
        logger.info(f"Loaded image: {image.format}, size: {image.size}")
        return image

    except Exception as e:
        logger.error(f"Failed to load image: {e}", exc_info=True)
        # Chain the cause so the underlying error is not lost.
        raise ValueError(f"Image loading failed: {str(e)}") from e
|
|
|
|
|
|
def normalize_image(
    image: Image.Image,
    max_size: Optional[int] = None
) -> Image.Image:
    """
    Normalize image for model input

    - Convert to RGB when the image is in any other mode
    - Downscale so the longest side is at most max_size (preserving
      aspect ratio); images already within the limit are not resized

    Args:
        image: PIL Image
        max_size: Maximum size for longest side. A falsy value (None or 0)
            falls back to settings.IMAGE_MAX_SIZE, or to 2048 when that
            setting is not defined.

    Returns:
        Normalized PIL Image (the input object itself when no conversion
        or resize was needed)
    """
    max_size = max_size or getattr(settings, 'IMAGE_MAX_SIZE', 2048)

    # Convert to RGB if needed
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Resize if needed (preserve aspect ratio)
    width, height = image.size
    if width > max_size or height > max_size:
        # The longer side is pinned to max_size exactly; the shorter side is
        # scaled and truncated. Clamp it to 1 px so very elongated images
        # (e.g. 10000x1) do not produce a zero dimension, which resize()
        # rejects.
        if width > height:
            new_width = max_size
            new_height = max(1, int(height * (max_size / width)))
        else:
            new_height = max_size
            new_width = max(1, int(width * (max_size / height)))

        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
        logger.info(f"Resized image from {width}x{height} to {new_width}x{new_height}")

    return image
|
|
|
|
|
|
def prepare_images_for_model(
    images: List[Image.Image],
    max_size: Optional[int] = None
) -> List[Image.Image]:
    """
    Prepare list of images for model inference

    Each image is passed through normalize_image(). An image whose
    normalization raises is logged with a warning and left out of the
    result, so the returned list may be shorter than the input.

    Args:
        images: List of PIL Images
        max_size: Maximum size for longest side (forwarded to
            normalize_image)

    Returns:
        List of normalized PIL Images, in input order
    """
    ready: List[Image.Image] = []

    # Count from 1 so the warning reports a human-friendly position.
    for position, page in enumerate(images, start=1):
        try:
            processed = normalize_image(page, max_size)
        except Exception as err:
            # Best effort: drop the problematic image and keep going.
            logger.warning(f"Failed to normalize image {position}: {err}")
        else:
            ready.append(processed)

    logger.info(f"Prepared {len(ready)} images for model")
    return ready
|
|
|
|
|
|
def detect_file_type(content: bytes, filename: Optional[str] = None) -> str:
    """
    Detect file type from content and/or filename

    Checks are applied in this order:
    1. Content starting with the ``%PDF`` magic bytes -> "pdf"
    2. Filename extension (case-insensitive): ``.pdf`` -> "pdf";
       ``.png/.jpg/.jpeg/.webp/.tiff/.tif`` -> "image"
    3. Content that Pillow can open -> "image"

    Args:
        content: File content as bytes
        filename: Optional filename (for extension detection)

    Returns:
        File type: "pdf" or "image"

    Raises:
        ValueError: If none of the checks recognizes the file
    """
    # Check magic bytes
    if content.startswith(b'%PDF'):
        return "pdf"

    # Check by extension if available
    if filename:
        ext = Path(filename).suffix.lower()
        if ext == '.pdf':
            return "pdf"
        if ext in ('.png', '.jpg', '.jpeg', '.webp', '.tiff', '.tif'):
            return "image"

    # Try to open as image. Catch Exception rather than using a bare except
    # so KeyboardInterrupt/SystemExit are not swallowed, and close the probe
    # image once the format has been identified.
    try:
        with Image.open(BytesIO(content)):
            pass
    except Exception as e:
        raise ValueError(
            "Unsupported file type. Expected PDF or image (PNG/JPEG/WebP/TIFF)"
        ) from e

    return "image"
|
|
|
|
|
|
def validate_file_size(content: bytes) -> None:
    """
    Validate file size against MAX_FILE_SIZE_MB

    The limit is read from settings.MAX_FILE_SIZE_MB and interpreted as
    mebibytes (multiplied by 1024 * 1024). Content exactly at the limit is
    accepted.

    Args:
        content: File content as bytes

    Raises:
        ValueError if file is too large
    """
    limit_mb = settings.MAX_FILE_SIZE_MB
    limit_bytes = limit_mb * 1024 * 1024
    actual_bytes = len(content)

    if actual_bytes > limit_bytes:
        # Report both figures in MB so the message matches the setting.
        raise ValueError(
            f"File size ({actual_bytes / 1024 / 1024:.2f} MB) exceeds maximum "
            f"({limit_mb} MB)"
        )
|
|
|