feat: implement PDF/image preprocessing, post-processing, and dots.ocr integration prep
G.2.3 - PDF/Image Support:
- Add preprocessing.py with PDF→images conversion (pdf2image)
- Add image loading and normalization
- Add file type detection and validation
- Support for PDF, PNG, JPEG, WebP, TIFF

G.2.4 - Pre/Post-processing:
- Add postprocessing.py with structured output builders
  - build_chunks() - semantic chunks for RAG
  - build_qa_pairs() - Q&A extraction
  - build_markdown() - Markdown conversion
- Text normalization and chunking logic

G.1.3 - dots.ocr Integration Prep:
- Update model_loader.py with proper error handling
- Add USE_DUMMY_PARSER and ALLOW_DUMMY_FALLBACK flags
- Update inference.py to work with images list
- Add parse_document_from_images() function
- Ready for actual model integration

Configuration:
- Add PDF_DPI, IMAGE_MAX_SIZE, PAGE_RANGE settings
- Add parser mode flags (USE_DUMMY_PARSER, ALLOW_DUMMY_FALLBACK)

API Updates:
- Update endpoints to use new preprocessing pipeline
- Integrate post-processing for all output modes
- Remove temp file handling (work directly with bytes)
This commit is contained in:
198
services/parser-service/app/runtime/preprocessing.py
Normal file
198
services/parser-service/app/runtime/preprocessing.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
Preprocessing functions for PDF and images
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
import pdf2image
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def convert_pdf_to_images(
    pdf_bytes: bytes,
    dpi: Optional[int] = None,
    max_pages: Optional[int] = None
) -> List[Image.Image]:
    """
    Convert PDF bytes to a list of PIL Images.

    Rendering starts at page 1 and stops at page ``max_pages``.

    Args:
        pdf_bytes: PDF file content as bytes
        dpi: DPI for conversion; a falsy value falls back to
            ``settings.PDF_DPI`` (200 when that setting is absent)
        max_pages: Last page to render; a falsy value falls back to
            ``settings.PARSER_MAX_PAGES``

    Returns:
        List of PIL Images (one per rendered page)

    Raises:
        ValueError: If ``pdf_bytes`` is empty or the conversion fails. For a
            conversion failure the underlying exception is attached as the
            cause.
    """
    # Resolve defaults first so configuration problems surface regardless of input.
    dpi = dpi or getattr(settings, 'PDF_DPI', 200)
    max_pages = max_pages or settings.PARSER_MAX_PAGES

    # Reject empty input up front instead of invoking the converter on nothing.
    if not pdf_bytes:
        raise ValueError("PDF conversion failed: empty input")

    # Keep the try body down to the single call that is expected to fail.
    try:
        images = pdf2image.convert_from_bytes(
            pdf_bytes,
            dpi=dpi,
            first_page=1,
            last_page=max_pages
        )
    except Exception as e:
        logger.error(f"Failed to convert PDF to images: {e}", exc_info=True)
        # Chain explicitly so callers can still inspect the original error.
        raise ValueError(f"PDF conversion failed: {str(e)}") from e

    logger.info(f"Converted PDF to {len(images)} images (DPI: {dpi}, max_pages: {max_pages})")
    return images
|
||||
|
||||
|
||||
def load_image(image_bytes: bytes) -> Image.Image:
    """
    Load and decode an image from bytes.

    Args:
        image_bytes: Image file content as bytes

    Returns:
        PIL Image with its pixel data already decoded

    Raises:
        ValueError: If the bytes cannot be identified or decoded as an image.
            The underlying exception is attached as the cause.
    """
    try:
        image = Image.open(BytesIO(image_bytes))
        # Image.open only identifies the file; force the decode here so that
        # truncated or corrupt data is reported by this function rather than
        # by whichever later step first touches the pixels.
        image.load()
    except Exception as e:
        logger.error(f"Failed to load image: {e}", exc_info=True)
        raise ValueError(f"Image loading failed: {str(e)}") from e

    logger.info(f"Loaded image: {image.format}, size: {image.size}")
    return image
|
||||
|
||||
|
||||
def normalize_image(
    image: Image.Image,
    max_size: Optional[int] = None
) -> Image.Image:
    """
    Normalize image for model input

    - Convert to RGB
    - Downscale so the longest side is at most max_size (preserving aspect
      ratio); images already within the limit are not resized

    Args:
        image: PIL Image
        max_size: Maximum size for longest side; a falsy value falls back to
            ``settings.IMAGE_MAX_SIZE`` (2048 when that setting is absent)

    Returns:
        Normalized PIL Image
    """
    max_size = max_size or getattr(settings, 'IMAGE_MAX_SIZE', 2048)

    # Convert to RGB if needed
    if image.mode != 'RGB':
        image = image.convert('RGB')

    width, height = image.size
    if width <= max_size and height <= max_size:
        return image

    # Pin the longer side to max_size and scale the other proportionally.
    # The scaled side is clamped to at least 1 pixel: for extreme aspect
    # ratios the truncation would otherwise yield 0, which resize() rejects.
    if width > height:
        new_width = max_size
        new_height = max(1, int(height * (max_size / width)))
    else:
        new_height = max_size
        new_width = max(1, int(width * (max_size / height)))

    image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    logger.info(f"Resized image from {width}x{height} to {new_width}x{new_height}")

    return image
|
||||
|
||||
|
||||
def prepare_images_for_model(
    images: List[Image.Image],
    max_size: Optional[int] = None
) -> List[Image.Image]:
    """
    Run every image through normalize_image() ahead of model inference.

    An image whose normalization raises is logged as a warning and left out
    of the result, so the returned list may be shorter than the input.

    Args:
        images: List of PIL Images
        max_size: Maximum size for longest side, forwarded to normalize_image()

    Returns:
        List of normalized PIL Images, in input order
    """
    prepared = []

    # Count from 1 so the warning reports a human-friendly position.
    for position, page in enumerate(images, start=1):
        try:
            prepared.append(normalize_image(page, max_size))
        except Exception as err:
            # Best effort: drop the problematic image and keep going.
            logger.warning(f"Failed to normalize image {position}: {err}")

    logger.info(f"Prepared {len(prepared)} images for model")
    return prepared
|
||||
|
||||
|
||||
def detect_file_type(content: bytes, filename: Optional[str] = None) -> str:
    """
    Detect file type from content and/or filename

    Checks are applied in order: the PDF magic bytes, then the filename
    extension (if a filename is given), then an attempt to open the content
    as an image.

    Args:
        content: File content as bytes
        filename: Optional filename (for extension detection)

    Returns:
        File type: "pdf" or "image"

    Raises:
        ValueError: If none of the checks recognizes the file
    """
    # Check magic bytes
    if content.startswith(b'%PDF'):
        return "pdf"

    # Check by extension if available (the content is not inspected here)
    if filename:
        ext = Path(filename).suffix.lower()
        if ext == '.pdf':
            return "pdf"
        if ext in ('.png', '.jpg', '.jpeg', '.webp', '.tiff', '.tif'):
            return "image"

    # Last resort: let the imaging library try to identify the content.
    # Catch Exception rather than using a bare except so KeyboardInterrupt
    # and SystemExit are not swallowed; close the probe image when done.
    try:
        with Image.open(BytesIO(content)):
            pass
    except Exception as e:
        raise ValueError(
            "Unsupported file type. Expected PDF or image (PNG/JPEG/WebP/TIFF)"
        ) from e

    return "image"
|
||||
|
||||
|
||||
def validate_file_size(content: bytes) -> None:
    """
    Check that the payload does not exceed settings.MAX_FILE_SIZE_MB.

    Args:
        content: File content as bytes

    Raises:
        ValueError: If the content is larger than the configured limit
    """
    limit_mb = settings.MAX_FILE_SIZE_MB
    actual_bytes = len(content)
    too_large = actual_bytes > limit_mb * 1024 * 1024

    if not too_large:
        return

    raise ValueError(
        f"File size ({actual_bytes / 1024 / 1024:.2f} MB) exceeds maximum "
        f"({limit_mb} MB)"
    )
|
||||
|
||||
Reference in New Issue
Block a user