"""
IBM Docling Service - Document conversion with table/formula extraction

Converts PDF, DOCX, PPTX, images to Markdown/JSON while preserving:
- Tables (with structure)
- Formulas (LaTeX)
- Code blocks
- Images
- Document structure
"""

import logging
import os
import base64
import tempfile
from typing import Optional, List, Dict, Any, Tuple
from pathlib import Path
from io import BytesIO
from urllib.parse import urlparse

from fastapi import FastAPI, HTTPException, File, UploadFile, Form
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Docling Document Conversion Service",
    description="Convert documents to structured formats using IBM Docling"
)

# Configuration
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
DOCLING_MODEL = os.getenv("DOCLING_MODEL", "ds4sd/docling-models")

# File suffixes kept as-is when a document is downloaded from a URL; any
# other (or missing) suffix falls back to ".pdf".
_KNOWN_URL_SUFFIXES = frozenset({
    ".pdf", ".docx", ".pptx", ".html", ".htm",
    ".png", ".jpg", ".jpeg", ".tiff", ".tif",
})

# Global converter instance; stays None until load_docling() succeeds.
converter = None


def load_docling():
    """Create the global Docling converter if it has not been created yet.

    Failures are logged rather than raised, so the service still starts; in
    that case ``converter`` stays ``None`` and the conversion endpoints answer
    with HTTP 503.
    """
    global converter

    if converter is not None:
        return

    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.datamodel.base_models import InputFormat

        logger.info("Loading Docling on %s...", DEVICE)

        # Configure pipeline options
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True

        # Initialize converter. The options must be handed over through
        # format_options, otherwise the settings above are never applied.
        converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.DOCX,
                InputFormat.PPTX,
                InputFormat.IMAGE,
                InputFormat.HTML,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            },
        )

        logger.info("Docling loaded successfully")

    except ImportError as e:
        logger.error("Failed to import Docling: %s", e)
        logger.warning("Service will run in degraded mode")
    except Exception as e:
        logger.error("Failed to load Docling: %s", e, exc_info=True)
        logger.warning("Service will run in degraded mode")


def _write_temp_file(content: bytes, suffix: str) -> str:
    """Write *content* to a new temporary file and return its path.

    The file is created with ``delete=False``; the caller must remove it.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
        handle.write(content)
        return handle.name


def _remove_temp_file(path: Optional[str]) -> None:
    """Delete *path* if it is set; a failed delete is logged, not raised."""
    if not path:
        return
    try:
        os.unlink(path)
    except OSError as exc:
        logger.warning("Could not remove temp file %s: %s", path, exc)


def _decode_base64(payload: str) -> bytes:
    """Decode a base64 payload, answering HTTP 400 when it is malformed."""
    try:
        return base64.b64decode(payload)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        raise HTTPException(
            status_code=400,
            detail=f"Invalid base64 document: {exc}"
        ) from exc


def _suffix_from_url(url: str) -> str:
    """Return the file suffix to use for a document downloaded from *url*.

    Only the URL path is inspected, so query strings and fragments do not hide
    the extension. Unknown or missing extensions default to ``.pdf``.
    """
    suffix = Path(urlparse(url).path).suffix.lower()
    return suffix if suffix in _KNOWN_URL_SUFFIXES else ".pdf"


async def _download(url: str) -> bytes:
    """Fetch *url* and return the response body.

    Raises:
        HTTPException: 400 when the request fails or the status is not 200.
    """
    import httpx

    # NOTE(security): the URL is caller-supplied and fetched as-is, so this
    # endpoint can be pointed at internal addresses (SSRF). Restrict the
    # allowed hosts/schemes before exposing the service to untrusted clients.
    try:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url)
    except httpx.HTTPError as exc:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to download document: {exc}"
        ) from exc

    if response.status_code != 200:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to download document: {response.status_code}"
        )
    return response.content


async def _save_upload(file: UploadFile) -> str:
    """Store an uploaded file in a temp file (keeping its suffix); return the path."""
    # filename can be missing on an upload; Path(None) would raise TypeError.
    suffix = Path(file.filename or "").suffix
    return _write_temp_file(await file.read(), suffix)


def _table_shape(data: Any) -> Tuple[int, int]:
    """Return ``(rows, columns)`` for a table's data object.

    Uses ``num_rows``/``num_cols`` when the object provides them (presumably
    the case for Docling's table data model -- TODO confirm against the
    installed version); otherwise treats *data* as a sequence of rows.
    Anything that cannot be measured yields ``(0, 0)``.
    """
    if data is None:
        return 0, 0

    rows = getattr(data, "num_rows", None)
    cols = getattr(data, "num_cols", None)
    if rows is not None and cols is not None:
        return int(rows), int(cols)

    try:
        rows = len(data)
        cols = len(data[0]) if rows else 0
    except (TypeError, KeyError, IndexError):
        return 0, 0
    return rows, cols


def _attr_value(obj: Any, name: str) -> Any:
    """Return attribute *name* of *obj* (``None`` if absent), calling it if callable.

    This keeps bound methods out of JSON responses when a value such as
    ``num_pages`` is exposed as a method rather than a plain attribute.
    """
    value = getattr(obj, name, None)
    return value() if callable(value) else value


@app.on_event("startup")
async def startup():
    """Load the Docling converter when the application starts."""
    load_docling()


@app.get("/health")
async def health():
    """Health check endpoint"""
    if converter is None:
        return {
            "status": "loading",
            "service": "docling-service",
            "device": DEVICE
        }

    return {
        "status": "healthy",
        "service": "docling-service",
        "device": DEVICE,
        "cuda_available": torch.cuda.is_available(),
        "features": ["pdf", "docx", "pptx", "images", "tables", "formulas"]
    }


class ConvertRequest(BaseModel):
    """Request model for document conversion"""
    doc_url: Optional[str] = None
    doc_base64: Optional[str] = None
    output_format: str = "markdown"  # markdown, json, text
    extract_tables: bool = True
    extract_images: bool = False
    ocr_enabled: bool = True


@app.post("/convert")
async def convert_document(
    file: Optional[UploadFile] = File(None),
    doc_url: Optional[str] = Form(None),
    doc_base64: Optional[str] = Form(None),
    output_format: str = Form("markdown"),
    extract_tables: bool = Form(True),
    extract_images: bool = Form(False),
    ocr_enabled: bool = Form(True)
):
    """
    Convert a document to structured format.

    The document is taken from the first source provided, in this order:
    ``file``, ``doc_base64``, ``doc_url``.

    Supports:
    - PDF, DOCX, PPTX, HTML, images
    - Table extraction with structure
    - Formula extraction (LaTeX)
    - OCR for scanned documents

    Output formats:
    - markdown: Structured markdown with tables
    - json: Full document structure as JSON
    - text: Plain text extraction (also used for any unrecognised value)

    Raises:
        HTTPException: 503 if Docling is not loaded, 400 for a missing or
            unreadable input, 500 if the conversion itself fails.
    """
    if converter is None:
        raise HTTPException(
            status_code=503,
            detail="Docling not loaded. Check logs for details."
        )

    doc_path: Optional[str] = None
    try:
        # Get document data
        if file:
            doc_path = await _save_upload(file)
        elif doc_base64:
            doc_path = _write_temp_file(_decode_base64(doc_base64), ".pdf")
        elif doc_url:
            content = await _download(doc_url)
            doc_path = _write_temp_file(content, _suffix_from_url(doc_url))
        else:
            raise HTTPException(
                status_code=400,
                detail="No document provided. Use file, doc_url, or doc_base64"
            )

        # Convert document
        logger.info("Converting document: %s", doc_path)
        result = converter.convert(doc_path)
        document = result.document

        # Format output
        if output_format == "markdown":
            output = document.export_to_markdown()
        elif output_format == "json":
            output = document.export_to_dict()
        else:
            output = document.export_to_text()

        # Extract tables if requested
        tables = None
        if extract_tables:
            tables = [
                {
                    "id": getattr(table, "id", None),
                    "content": (
                        table.export_to_markdown()
                        if hasattr(table, "export_to_markdown")
                        else str(table)
                    ),
                    "rows": _table_shape(getattr(table, "data", None))[0],
                }
                for table in document.tables
            ]

        return {
            "success": True,
            "output_format": output_format,
            "result": output,
            "tables": tables,
            "pages": _attr_value(document, "num_pages"),
            "metadata": {
                "title": getattr(document, "title", None),
                "author": getattr(document, "author", None)
            }
        }

    except HTTPException:
        # Keep deliberate 4xx responses instead of rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Document conversion failed: %s", e, exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Document conversion failed: {str(e)}"
        ) from e
    finally:
        # Single cleanup point for both the success and the error path.
        _remove_temp_file(doc_path)


@app.post("/extract-tables")
async def extract_tables(
    file: Optional[UploadFile] = File(None),
    doc_base64: Optional[str] = Form(None)
):
    """
    Extract tables from a document.

    Returns tables as:
    - Markdown format
    - Structured data (rows/columns)
    - HTML format

    Raises:
        HTTPException: 503 if Docling is not loaded, 400 for a missing or
            unreadable input, 500 if the extraction itself fails.
    """
    if converter is None:
        raise HTTPException(
            status_code=503,
            detail="Docling not loaded. Check logs for details."
        )

    doc_path: Optional[str] = None
    try:
        # Get document
        if file:
            doc_path = await _save_upload(file)
        elif doc_base64:
            doc_path = _write_temp_file(_decode_base64(doc_base64), ".pdf")
        else:
            raise HTTPException(
                status_code=400,
                detail="No document provided"
            )

        # Convert and extract tables
        result = converter.convert(doc_path)

        tables = []
        for idx, table in enumerate(result.document.tables):
            table_data = {
                "index": idx,
                "markdown": table.export_to_markdown() if hasattr(table, "export_to_markdown") else None,
                "html": table.export_to_html() if hasattr(table, "export_to_html") else None,
            }

            # Try to get structured data
            if hasattr(table, "data"):
                rows, columns = _table_shape(table.data)
                table_data["data"] = table.data
                table_data["rows"] = rows
                table_data["columns"] = columns

            tables.append(table_data)

        return {
            "success": True,
            "tables_count": len(tables),
            "tables": tables
        }

    except HTTPException:
        # Keep deliberate 4xx responses instead of rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Table extraction failed: %s", e, exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Table extraction failed: {str(e)}"
        ) from e
    finally:
        # Single cleanup point for both the success and the error path.
        _remove_temp_file(doc_path)


@app.get("/models")
async def list_models():
    """List available models and features"""
    return {
        "service": "docling-service",
        "models": [
            {
                "name": "ds4sd/docling-models",
                "description": "IBM Docling - Document conversion with tables and formulas",
                "features": ["pdf", "docx", "pptx", "html", "images"],
                "capabilities": ["ocr", "tables", "formulas", "structure"]
            }
        ],
        "supported_formats": {
            "input": ["pdf", "docx", "pptx", "html", "png", "jpg", "tiff"],
            "output": ["markdown", "json", "text"]
        },
        "device": DEVICE,
        "cuda_available": torch.cuda.is_available()
    }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8003)