feat: implement PDF/image preprocessing, post-processing, and dots.ocr integration prep

G.2.3 - PDF/Image Support:
- Add preprocessing.py with PDF→images conversion (pdf2image)
- Add image loading and normalization
- Add file type detection and validation
- Support for PDF, PNG, JPEG, WebP, TIFF

G.2.4 - Pre/Post-processing:
- Add postprocessing.py with structured output builders
- build_chunks() - semantic chunks for RAG
- build_qa_pairs() - Q&A extraction
- build_markdown() - Markdown conversion
- Text normalization and chunking logic

G.1.3 - dots.ocr Integration Prep:
- Update model_loader.py with proper error handling
- Add USE_DUMMY_PARSER and ALLOW_DUMMY_FALLBACK flags
- Update inference.py to work with images list
- Add parse_document_from_images() function
- Ready for actual model integration

Configuration:
- Add PDF_DPI, IMAGE_MAX_SIZE, PAGE_RANGE settings
- Add parser mode flags (USE_DUMMY_PARSER, ALLOW_DUMMY_FALLBACK)

API Updates:
- Update endpoints to use new preprocessing pipeline
- Integrate post-processing for all output modes
- Remove temp file handling (work directly with bytes)
Commit metadata:
- Author: Apple
- Date: 2025-11-15 13:19:07 -08:00
- Parent: 0f6cfe046f
- Commit: 4befecc425
- 6 changed files with 762 additions and 122 deletions

View File

@@ -14,7 +14,13 @@ from app.schemas import (
ParseRequest, ParseResponse, ParsedDocument, ParsedChunk, QAPair, ChunksResponse
)
from app.core.config import settings
from app.runtime.inference import parse_document, dummy_parse_document
from app.runtime.inference import parse_document_from_images
from app.runtime.preprocessing import (
convert_pdf_to_images, load_image, detect_file_type, validate_file_size
)
from app.runtime.postprocessing import (
build_chunks, build_qa_pairs, build_markdown
)
logger = logging.getLogger(__name__)
@@ -50,31 +56,29 @@ async def parse_document_endpoint(
detail="Either 'file' or 'doc_url' must be provided"
)
# Determine document type
# Process file
if file:
doc_type = "image" # Will be determined from file extension
file_ext = Path(file.filename or "").suffix.lower()
if file_ext == ".pdf":
doc_type = "pdf"
# Read file content
content = await file.read()
# Check file size
max_size = settings.MAX_FILE_SIZE_MB * 1024 * 1024
if len(content) > max_size:
raise HTTPException(
status_code=413,
detail=f"File size exceeds maximum {settings.MAX_FILE_SIZE_MB}MB"
)
# Validate file size
try:
validate_file_size(content)
except ValueError as e:
raise HTTPException(status_code=413, detail=str(e))
# Save to temp file
temp_dir = Path(settings.TEMP_DIR)
temp_dir.mkdir(exist_ok=True, parents=True)
temp_file = temp_dir / f"{uuid.uuid4()}{file_ext}"
temp_file.write_bytes(content)
# Detect file type
try:
doc_type = detect_file_type(content, file.filename)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
input_path = str(temp_file)
# Convert to images
if doc_type == "pdf":
images = convert_pdf_to_images(content)
else:
image = load_image(content)
images = [image]
else:
# TODO: Download from doc_url
@@ -83,51 +87,31 @@ async def parse_document_endpoint(
detail="doc_url download not yet implemented"
)
# Parse document
logger.info(f"Parsing document: {input_path}, mode: {output_mode}")
# Parse document from images
logger.info(f"Parsing document: {len(images)} page(s), mode: {output_mode}")
# TODO: Replace with real parse_document when model is integrated
parsed_doc = dummy_parse_document(
input_path=input_path,
parsed_doc = parse_document_from_images(
images=images,
output_mode=output_mode,
doc_id=doc_id or str(uuid.uuid4()),
doc_type=doc_type
)
# Build response based on output_mode
response_data = {"metadata": {}}
response_data = {"metadata": {
"doc_id": parsed_doc.doc_id,
"doc_type": parsed_doc.doc_type,
"page_count": len(parsed_doc.pages)
}}
if output_mode == "raw_json":
response_data["document"] = parsed_doc
elif output_mode == "markdown":
# TODO: Convert to markdown
response_data["markdown"] = "# Document\n\n" + "\n\n".join(
block.text for page in parsed_doc.pages for block in page.blocks
)
response_data["markdown"] = build_markdown(parsed_doc)
elif output_mode == "qa_pairs":
# TODO: Extract QA pairs
response_data["qa_pairs"] = []
response_data["qa_pairs"] = build_qa_pairs(parsed_doc)
elif output_mode == "chunks":
# Convert blocks to chunks
chunks = []
for page in parsed_doc.pages:
for block in page.blocks:
chunks.append(ParsedChunk(
text=block.text,
page=page.page_num,
bbox=block.bbox,
section=block.type,
metadata={
"dao_id": dao_id,
"doc_id": parsed_doc.doc_id,
"block_type": block.type
}
))
response_data["chunks"] = chunks
# Cleanup temp file
if file and temp_file.exists():
temp_file.unlink()
response_data["chunks"] = build_chunks(parsed_doc, dao_id=dao_id)
return ParseResponse(**response_data)