diff --git a/services/parser-service/app/api/endpoints.py b/services/parser-service/app/api/endpoints.py index 7183ba38..3a99a00a 100644 --- a/services/parser-service/app/api/endpoints.py +++ b/services/parser-service/app/api/endpoints.py @@ -14,7 +14,13 @@ from app.schemas import ( ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse ) from app.core.config import settings -from app.runtime.inference import parse_document, dummy_parse_document +from app.runtime.inference import parse_document_from_images +from app.runtime.preprocessing import ( + convert_pdf_to_images, load_image, detect_file_type, validate_file_size +) +from app.runtime.postprocessing import ( + build_chunks, build_qa_pairs, build_markdown +) logger = logging.getLogger(__name__) @@ -50,31 +56,29 @@ async def parse_document_endpoint( detail="Either 'file' or 'doc_url' must be provided" ) - # Determine document type + # Process file if file: - doc_type = "image" # Will be determined from file extension - file_ext = Path(file.filename or "").suffix.lower() - if file_ext == ".pdf": - doc_type = "pdf" - # Read file content content = await file.read() - # Check file size - max_size = settings.MAX_FILE_SIZE_MB * 1024 * 1024 - if len(content) > max_size: - raise HTTPException( - status_code=413, - detail=f"File size exceeds maximum {settings.MAX_FILE_SIZE_MB}MB" - ) + # Validate file size + try: + validate_file_size(content) + except ValueError as e: + raise HTTPException(status_code=413, detail=str(e)) - # Save to temp file - temp_dir = Path(settings.TEMP_DIR) - temp_dir.mkdir(exist_ok=True, parents=True) - temp_file = temp_dir / f"{uuid.uuid4()}{file_ext}" - temp_file.write_bytes(content) + # Detect file type + try: + doc_type = detect_file_type(content, file.filename) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) - input_path = str(temp_file) + # Convert to images + if doc_type == "pdf": + images = convert_pdf_to_images(content) + else: + 
image = load_image(content) + images = [image] else: # TODO: Download from doc_url @@ -83,51 +87,31 @@ async def parse_document_endpoint( detail="doc_url download not yet implemented" ) - # Parse document - logger.info(f"Parsing document: {input_path}, mode: {output_mode}") + # Parse document from images + logger.info(f"Parsing document: {len(images)} page(s), mode: {output_mode}") - # TODO: Replace with real parse_document when model is integrated - parsed_doc = dummy_parse_document( - input_path=input_path, + parsed_doc = parse_document_from_images( + images=images, output_mode=output_mode, doc_id=doc_id or str(uuid.uuid4()), doc_type=doc_type ) # Build response based on output_mode - response_data = {"metadata": {}} + response_data = {"metadata": { + "doc_id": parsed_doc.doc_id, + "doc_type": parsed_doc.doc_type, + "page_count": len(parsed_doc.pages) + }} if output_mode == "raw_json": response_data["document"] = parsed_doc elif output_mode == "markdown": - # TODO: Convert to markdown - response_data["markdown"] = "# Document\n\n" + "\n\n".join( - block.text for page in parsed_doc.pages for block in page.blocks - ) + response_data["markdown"] = build_markdown(parsed_doc) elif output_mode == "qa_pairs": - # TODO: Extract QA pairs - response_data["qa_pairs"] = [] + response_data["qa_pairs"] = build_qa_pairs(parsed_doc) elif output_mode == "chunks": - # Convert blocks to chunks - chunks = [] - for page in parsed_doc.pages: - for block in page.blocks: - chunks.append(ParsedChunk( - text=block.text, - page=page.page_num, - bbox=block.bbox, - section=block.type, - metadata={ - "dao_id": dao_id, - "doc_id": parsed_doc.doc_id, - "block_type": block.type - } - )) - response_data["chunks"] = chunks - - # Cleanup temp file - if file and temp_file.exists(): - temp_file.unlink() + response_data["chunks"] = build_chunks(parsed_doc, dao_id=dao_id) return ParseResponse(**response_data) diff --git a/services/parser-service/app/core/config.py 
b/services/parser-service/app/core/config.py index f7b0cc91..08c3882b 100644 --- a/services/parser-service/app/core/config.py +++ b/services/parser-service/app/core/config.py @@ -3,7 +3,7 @@ Configuration for PARSER Service """ import os -from typing import Literal +from typing import Literal, Optional from pydantic_settings import BaseSettings @@ -25,6 +25,17 @@ class Settings(BaseSettings): MAX_FILE_SIZE_MB: int = int(os.getenv("MAX_FILE_SIZE_MB", "50")) TEMP_DIR: str = os.getenv("TEMP_DIR", "/tmp/parser") + # PDF processing + PDF_DPI: int = int(os.getenv("PDF_DPI", "200")) + PAGE_RANGE: Optional[str] = os.getenv("PAGE_RANGE", None) # e.g., "1-20" for pages 1-20 + + # Image processing + IMAGE_MAX_SIZE: int = int(os.getenv("IMAGE_MAX_SIZE", "2048")) # Max size for longest side + + # Parser mode + USE_DUMMY_PARSER: bool = os.getenv("USE_DUMMY_PARSER", "false").lower() == "true" + ALLOW_DUMMY_FALLBACK: bool = os.getenv("ALLOW_DUMMY_FALLBACK", "true").lower() == "true" + # Runtime RUNTIME_TYPE: Literal["local", "remote"] = os.getenv("RUNTIME_TYPE", "local") RUNTIME_URL: str = os.getenv("RUNTIME_URL", "http://parser-runtime:11435") diff --git a/services/parser-service/app/runtime/inference.py b/services/parser-service/app/runtime/inference.py index c0403395..30c9fbdb 100644 --- a/services/parser-service/app/runtime/inference.py +++ b/services/parser-service/app/runtime/inference.py @@ -3,16 +3,110 @@ Inference functions for document parsing """ import logging -from typing import Literal, Optional +from typing import Literal, Optional, List from pathlib import Path +from PIL import Image + from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox from app.runtime.model_loader import get_model +from app.runtime.preprocessing import ( + convert_pdf_to_images, load_image, prepare_images_for_model +) +from app.runtime.postprocessing import build_parsed_document from app.core.config import settings logger = logging.getLogger(__name__) +def 
def parse_document_from_images(
    images: List[Image.Image],
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse document from list of images using dots.ocr model

    Args:
        images: List of PIL Images (pages)
        output_mode: Output format mode
        doc_id: Document ID
        doc_type: Document type (pdf or image)

    Returns:
        ParsedDocument with structured content

    Raises:
        RuntimeError: If the model is unavailable and dummy fallback is
            disabled, or if every page fails during inference.
        ValueError: If no valid images remain after preprocessing.
    """
    # Explicit dummy mode takes precedence over real inference
    if settings.USE_DUMMY_PARSER:
        logger.info("Using dummy parser (USE_DUMMY_PARSER=true)")
        return dummy_parse_document_from_images(images, doc_id, doc_type)

    model = get_model()

    if model is None:
        if settings.ALLOW_DUMMY_FALLBACK:
            logger.warning("Model not loaded, falling back to dummy parser")
            return dummy_parse_document_from_images(images, doc_id, doc_type)
        raise RuntimeError("Model not loaded and dummy fallback is disabled")

    # Normalize images (RGB, bounded size) before inference
    prepared_images = prepare_images_for_model(images)

    if not prepared_images:
        raise ValueError("No valid images to process")

    pages_data = []

    for idx, image in enumerate(prepared_images, start=1):
        try:
            # TODO: Implement actual inference with dots.ocr, e.g.:
            #   inputs = model["processor"](images=image, return_tensors="pt")
            #   outputs = model["model"].generate(**inputs)
            #   text = model["processor"].decode(outputs[0], skip_special_tokens=True)
            #   blocks = parse_model_output_to_blocks(text, image.size)

            # For now, emit a single placeholder block per page
            logger.debug(f"Processing page {idx} with model (placeholder)")
            pages_data.append({
                "blocks": [
                    {
                        "type": "paragraph",
                        "text": f"Page {idx} content (model output placeholder)",
                        "bbox": {"x": 0, "y": 0, "width": image.width, "height": image.height},
                        "reading_order": 1
                    }
                ],
                "width": image.width,
                "height": image.height
            })

        except Exception as e:
            # Per-page failure: log and keep processing the remaining pages
            logger.error(f"Error processing page {idx}: {e}", exc_info=True)
            continue

    # Fix: previously an all-pages failure yielded an empty (but apparently
    # successful) document; surface it as an explicit error instead.
    if not pages_data:
        raise RuntimeError("All pages failed during model inference")

    # Build ParsedDocument from model output
    return build_parsed_document(
        pages_data=pages_data,
        doc_id=doc_id or "parsed-doc",
        doc_type=doc_type,
        metadata={"model": settings.PARSER_MODEL_NAME}
    )
def parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse document from file path

    Thin wrapper: reads the file from disk, rasterizes it into page images
    and delegates the actual work to parse_document_from_images.

    Args:
        input_path: Path to document file (PDF or image)
        output_mode: Output format mode
        doc_id: Document ID
        doc_type: Document type (pdf or image)

    Returns:
        ParsedDocument with structured content
    """
    raw_bytes = Path(input_path).read_bytes()

    # PDFs expand to one image per page; plain images become a 1-page list
    if doc_type == "pdf":
        page_images = convert_pdf_to_images(raw_bytes)
    else:
        page_images = [load_image(raw_bytes)]

    return parse_document_from_images(page_images, output_mode, doc_id, doc_type)
def dummy_parse_document_from_images(
    images: List[Image.Image],
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Dummy parser for testing (returns mock data from images)

    This will be replaced with actual dots.ocr inference
    """
    logger.info(f"Dummy parsing: {len(images)} image(s)")

    pages = []
    for page_num, img in enumerate(images, start=1):
        # One mock heading plus one mock paragraph per page
        heading_block = ParsedBlock(
            type="heading",
            text=f"Page {page_num} Title",
            bbox=BBox(x=0, y=0, width=img.width, height=50),
            reading_order=1,
            page_num=page_num
        )
        body_block = ParsedBlock(
            type="paragraph",
            text=f"This is a dummy parsed document (page {page_num}). "
                 f"Image size: {img.width}x{img.height}. "
                 f"Replace this with actual dots.ocr inference.",
            bbox=BBox(x=0, y=60, width=img.width, height=100),
            reading_order=2,
            page_num=page_num
        )
        pages.append(
            ParsedPage(
                page_num=page_num,
                blocks=[heading_block, body_block],
                width=img.width,
                height=img.height
            )
        )

    return ParsedDocument(
        doc_id=doc_id or "dummy-doc-1",
        doc_type=doc_type,
        pages=pages,
        metadata={"parser": "dummy", "page_count": len(images)}
    )
" + f"Replace this with actual dots.ocr inference.", + bbox=BBox(x=0, y=60, width=image.width, height=100), + reading_order=2, + page_num=idx + ) + ], + width=image.width, + height=image.height + ) + pages.append(mock_page) + + return ParsedDocument( + doc_id=doc_id or "dummy-doc-1", + doc_type=doc_type, + pages=pages, + metadata={ + "parser": "dummy", + "page_count": len(images) + } + ) def dummy_parse_document( @@ -73,40 +202,18 @@ def dummy_parse_document( """ Dummy parser for testing (returns mock data) - This will be replaced with actual dots.ocr inference + This function loads the file and delegates to dummy_parse_document_from_images """ - logger.info(f"Dummy parsing: {input_path}") + # Load file content + with open(input_path, 'rb') as f: + content = f.read() - # Mock data - mock_page = ParsedPage( - page_num=1, - blocks=[ - ParsedBlock( - type="heading", - text="Document Title", - bbox=BBox(x=0, y=0, width=800, height=50), - reading_order=1, - page_num=1 - ), - ParsedBlock( - type="paragraph", - text="This is a dummy parsed document. 
def load_model() -> Optional[object]:
    """
    Load dots.ocr model (cached in module-level ``_model``).

    Returns:
        Loaded model instance, or None when loading is skipped (dummy mode)
        or fails while ALLOW_DUMMY_FALLBACK is enabled.

    Raises:
        ImportError / Exception: re-raised only when ALLOW_DUMMY_FALLBACK
        is disabled; otherwise failures degrade to the dummy parser.
    """
    global _model

    # Return the cached instance if a previous call succeeded.
    # NOTE(review): a failed load caches None, so every later call retries —
    # presumably intentional (allows recovery), but confirm this is desired.
    if _model is not None:
        return _model

    # Dummy mode: never touch model weights, callers fall back to the dummy parser
    if settings.USE_DUMMY_PARSER:
        logger.info("Dummy parser mode enabled, skipping model loading")
        return None

    logger.info(f"Loading model: {settings.PARSER_MODEL_NAME}")
    logger.info(f"Device: {settings.PARSER_DEVICE}")

    try:
        # TODO: Implement actual model loading.
        # Example for dots.ocr (adjust based on actual model structure):
        # from transformers import AutoModelForVision2Seq, AutoProcessor
        #
        # processor = AutoProcessor.from_pretrained(settings.PARSER_MODEL_NAME)
        # model = AutoModelForVision2Seq.from_pretrained(
        #     settings.PARSER_MODEL_NAME,
        #     device_map=settings.PARSER_DEVICE if settings.PARSER_DEVICE != "cpu" else None,
        #     torch_dtype=torch.float16 if settings.PARSER_DEVICE != "cpu" else torch.float32
        # )
        #
        # if settings.PARSER_DEVICE == "cpu":
        #     model = model.to("cpu")
        #
        # _model = {
        #     "model": model,
        #     "processor": processor,
        #     "device": settings.PARSER_DEVICE
        # }
        #
        # logger.info("Model loaded successfully")

        # For now, return None (will use dummy parser)
        logger.warning("Model loading not yet implemented, will use dummy parser")
        _model = None

    except ImportError as e:
        # Missing optional ML dependencies: fatal only when fallback is disabled
        logger.error(f"Required packages not installed: {e}")
        if not settings.ALLOW_DUMMY_FALLBACK:
            raise
        _model = None
    except Exception as e:
        # Any other load failure: same fallback policy as above
        logger.error(f"Failed to load model: {e}", exc_info=True)
        if not settings.ALLOW_DUMMY_FALLBACK:
            raise
        _model = None

    return _model
def normalize_text(text: str) -> str:
    """
    Normalize text: strip control characters and collapse whitespace.

    Args:
        text: Raw text (may be empty or None-ish falsy)

    Returns:
        Cleaned single-spaced text with no leading/trailing whitespace
    """
    if not text:
        return ""

    # Drop non-printable control characters first, then squash any run of
    # whitespace (including newlines) down to a single space.
    without_controls = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    collapsed = re.sub(r'\s+', ' ', without_controls)
    return collapsed.strip()
def build_parsed_document(
    pages_data: List[Dict[str, Any]],
    doc_id: str,
    doc_type: str,
    metadata: Optional[Dict[str, Any]] = None
) -> ParsedDocument:
    """
    Build ParsedDocument from model output

    Args:
        pages_data: List of page data from model.
            Each page should have: blocks, width, height
        doc_id: Document ID
        doc_type: Document type ("pdf" or "image")
        metadata: Additional metadata (fix: annotated Optional — the default
            is None, the previous ``Dict[str, Any] = None`` annotation lied)

    Returns:
        ParsedDocument
    """
    pages = []

    # Page numbers are assigned by position (1-based), not taken from the input
    for page_idx, page_data in enumerate(pages_data, start=1):
        blocks = []

        for block_data in page_data.get('blocks', []):
            # Normalize text; blocks that normalize to empty are dropped
            text = normalize_text(block_data.get('text', ''))

            if not text:
                continue

            # Missing bbox fields default to 0 rather than failing
            bbox_data = block_data.get('bbox', {})
            bbox = BBox(
                x=bbox_data.get('x', 0),
                y=bbox_data.get('y', 0),
                width=bbox_data.get('width', 0),
                height=bbox_data.get('height', 0)
            )

            block = ParsedBlock(
                type=block_data.get('type', 'paragraph'),
                text=text,
                bbox=bbox,
                # Fallback reading order: position among kept blocks
                reading_order=block_data.get('reading_order', len(blocks) + 1),
                page_num=page_idx,
                metadata=block_data.get('metadata', {})
            )

            blocks.append(block)

        page = ParsedPage(
            page_num=page_idx,
            blocks=blocks,
            width=page_data.get('width', 0),
            height=page_data.get('height', 0)
        )
        pages.append(page)

    return ParsedDocument(
        doc_id=doc_id,
        doc_type=doc_type,
        pages=pages,
        metadata=metadata or {}
    )
def build_chunks(
    parsed_doc: ParsedDocument,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    dao_id: Optional[str] = None
) -> List[ParsedChunk]:
    """
    Build semantic chunks from ParsedDocument

    Sections are delimited by heading blocks; each section becomes one chunk,
    then oversized chunks are word-split with overlap.

    Args:
        parsed_doc: Parsed document
        chunk_size: Target chunk size in characters
        chunk_overlap: Overlap between split chunks, counted in WORDS
            (NOTE(review): unit differs from chunk_size's characters — confirm)
        dao_id: Optional DAO ID for metadata

    Returns:
        List of ParsedChunk
    """
    chunks = []

    for page in parsed_doc.pages:
        # Group blocks into heading-delimited sections
        current_section = None
        current_text_parts = []
        # Fix: remember the FIRST block's bbox of the running section.
        # Previously the flush on a new heading used the *incoming* heading's
        # bbox, and the end-of-page flush had no bbox at all.
        section_bbox = None

        for block in page.blocks:
            if block.type == 'heading':
                # Flush the previous section before starting a new one
                if current_text_parts:
                    text = ' '.join(current_text_parts)
                    if text:
                        chunks.append(ParsedChunk(
                            text=text,
                            page=page.page_num,
                            bbox=section_bbox,  # first block's bbox
                            section=current_section or "main",
                            metadata={
                                "dao_id": dao_id,
                                "doc_id": parsed_doc.doc_id,
                                "chunk_type": "section"
                            }
                        ))

                current_section = normalize_text(block.text)
                current_text_parts = []
                section_bbox = None

            # Heading text is included in its own section's body
            if block.text:
                if not current_text_parts:
                    section_bbox = block.bbox
                current_text_parts.append(block.text)

        # Flush the last section of the page
        if current_text_parts:
            text = ' '.join(current_text_parts)
            if text:
                chunks.append(ParsedChunk(
                    text=text,
                    page=page.page_num,
                    bbox=section_bbox,
                    section=current_section or "main",
                    metadata={
                        "dao_id": dao_id,
                        "doc_id": parsed_doc.doc_id,
                        "chunk_type": "section"
                    }
                ))

    # Split chunks that exceed chunk_size, carrying word-level overlap forward
    final_chunks = []
    for chunk in chunks:
        if len(chunk.text) <= chunk_size:
            final_chunks.append(chunk)
        else:
            words = chunk.text.split()
            current_chunk_words = []
            current_length = 0

            for word in words:
                word_length = len(word) + 1  # +1 for space
                if current_length + word_length > chunk_size and current_chunk_words:
                    # Emit the filled chunk
                    chunk_text = ' '.join(current_chunk_words)
                    final_chunks.append(ParsedChunk(
                        text=chunk_text,
                        page=chunk.page,
                        bbox=chunk.bbox,
                        section=chunk.section,
                        metadata=chunk.metadata
                    ))

                    # Seed the next chunk with trailing overlap words
                    overlap_words = current_chunk_words[-chunk_overlap:] if chunk_overlap > 0 else []
                    current_chunk_words = overlap_words + [word]
                    current_length = sum(len(w) + 1 for w in current_chunk_words)
                else:
                    current_chunk_words.append(word)
                    current_length += word_length

            # Emit the trailing partial chunk
            if current_chunk_words:
                chunk_text = ' '.join(current_chunk_words)
                final_chunks.append(ParsedChunk(
                    text=chunk_text,
                    page=chunk.page,
                    bbox=chunk.bbox,
                    section=chunk.section,
                    metadata=chunk.metadata
                ))

    logger.info(f"Created {len(final_chunks)} chunks from document")
    return final_chunks
def build_qa_pairs(
    parsed_doc: ParsedDocument,
    max_pairs: int = 10
) -> List[QAPair]:
    """
    Build Q&A pairs from ParsedDocument

    This is a simple implementation. For production, consider using LLM
    to generate better Q&A pairs.

    Heuristic: each heading becomes a question; the block immediately
    following it becomes the answer.

    Args:
        parsed_doc: Parsed document
        max_pairs: Maximum number of Q&A pairs to generate

    Returns:
        List of QAPair
    """
    qa_pairs = []

    for page in parsed_doc.pages:
        for position, block in enumerate(page.blocks):
            if len(qa_pairs) >= max_pairs:
                # Cap reached — stop scanning entirely (replaces double-break)
                logger.info(f"Generated {len(qa_pairs)} Q&A pairs")
                return qa_pairs

            is_heading_with_follower = (
                block.type == 'heading' and position + 1 < len(page.blocks)
            )
            if not is_heading_with_follower:
                continue

            answer = normalize_text(page.blocks[position + 1].text)
            if not answer:
                continue

            qa_pairs.append(QAPair(
                question=f"What is {normalize_text(block.text)}?",
                answer=answer,
                source_page=page.page_num,
                source_bbox=block.bbox,
                confidence=0.7  # Placeholder
            ))

    logger.info(f"Generated {len(qa_pairs)} Q&A pairs")
    return qa_pairs
def convert_pdf_to_images(
    pdf_bytes: bytes,
    dpi: Optional[int] = None,
    max_pages: Optional[int] = None
) -> List[Image.Image]:
    """
    Convert PDF bytes to list of PIL Images

    Args:
        pdf_bytes: PDF file content as bytes
        dpi: DPI for conversion (default from settings; note a value of 0
            also falls back to the setting because of the ``or``)
        max_pages: Maximum number of pages to process (default from settings)

    Returns:
        List of PIL Images (one per page)

    Raises:
        ValueError: If the PDF cannot be converted (original exception is
            attached as ``__cause__``)
    """
    dpi = dpi or getattr(settings, 'PDF_DPI', 200)
    max_pages = max_pages or settings.PARSER_MAX_PAGES

    try:
        # pdf2image shells out to poppler; last_page caps the work done
        images = pdf2image.convert_from_bytes(
            pdf_bytes,
            dpi=dpi,
            first_page=1,
            last_page=max_pages
        )

        logger.info(f"Converted PDF to {len(images)} images (DPI: {dpi}, max_pages: {max_pages})")

        return images

    except Exception as e:
        logger.error(f"Failed to convert PDF to images: {e}", exc_info=True)
        # Fix: chain the original exception so the root cause survives
        raise ValueError(f"PDF conversion failed: {str(e)}") from e
def normalize_image(
    image: Image.Image,
    max_size: Optional[int] = None
) -> Image.Image:
    """
    Normalize image for model input

    - Convert to RGB
    - Resize so the longest side is at most max_size (aspect ratio kept)

    Args:
        image: PIL Image
        max_size: Maximum size for longest side (default from settings)

    Returns:
        Normalized PIL Image
    """
    limit = max_size or getattr(settings, 'IMAGE_MAX_SIZE', 2048)

    # Model expects 3-channel RGB input
    if image.mode != 'RGB':
        image = image.convert('RGB')

    width, height = image.size

    # Guard clause: already within bounds, nothing to resize
    if width <= limit and height <= limit:
        return image

    # Scale the longest side down to `limit`, preserving aspect ratio
    if width > height:
        new_width = limit
        new_height = int(height * (limit / width))
    else:
        new_height = limit
        new_width = int(width * (limit / height))

    image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    logger.info(f"Resized image from {width}x{height} to {new_width}x{new_height}")

    return image
def detect_file_type(content: bytes, filename: Optional[str] = None) -> str:
    """
    Detect file type from content and/or filename

    Detection order: PDF magic bytes, then filename extension, then a
    best-effort attempt to open the bytes as an image.

    Args:
        content: File content as bytes
        filename: Optional filename (for extension detection)

    Returns:
        File type: "pdf" or "image"

    Raises:
        ValueError: If the content is neither a PDF nor a readable image
    """
    # Check magic bytes
    if content.startswith(b'%PDF'):
        return "pdf"

    # Check by extension if available
    if filename:
        ext = Path(filename).suffix.lower()
        if ext == '.pdf':
            return "pdf"
        elif ext in ['.png', '.jpg', '.jpeg', '.webp', '.tiff', '.tif']:
            return "image"

    # Last resort: try to open as image.
    # Fix: was a bare `except:` which also swallowed SystemExit /
    # KeyboardInterrupt; narrowed to Exception.
    try:
        Image.open(BytesIO(content))
        return "image"
    except Exception:
        pass

    raise ValueError("Unsupported file type. Expected PDF or image (PNG/JPEG/WebP)")