# Changelog:
# - TTS: xtts-v2 integration with voice cloning support
# - Document: docling integration for PDF/DOCX/PPTX processing
# - Memory Service: added /facts/upsert, /facts/{key}, /facts endpoints
# - Added required dependencies (TTS, docling)
"""
|
|
IBM Docling Service - Document conversion with table/formula extraction
|
|
|
|
Converts PDF, DOCX, PPTX, images to Markdown/JSON while preserving:
|
|
- Tables (with structure)
|
|
- Formulas (LaTeX)
|
|
- Code blocks
|
|
- Images
|
|
- Document structure
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import base64
|
|
import tempfile
|
|
from typing import Optional, List, Dict, Any
|
|
from pathlib import Path
|
|
from io import BytesIO
|
|
|
|
from fastapi import FastAPI, HTTPException, File, UploadFile, Form
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel
|
|
import torch
|
|
|
|
# Module-level setup: logging, the FastAPI app, and runtime configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Docling Document Conversion Service",
    description="Convert documents to structured formats using IBM Docling"
)

# Configuration
# Prefer CUDA when available; override with the DEVICE environment variable.
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): DOCLING_MODEL is never read elsewhere in this file — the
# converter loads its own default models. Confirm whether this is intended.
DOCLING_MODEL = os.getenv("DOCLING_MODEL", "ds4sd/docling-models")

# Global converter instance
# Lazily populated by load_docling(); None means "not loaded / degraded mode",
# and endpoints respond 503 until it is set.
converter = None
|
|
|
|
|
|
def load_docling():
    """Initialize the global Docling converter (idempotent).

    On import or initialization failure the service stays up in degraded
    mode: ``converter`` remains ``None`` and endpoints return 503.
    """
    global converter

    if converter is not None:
        return  # already loaded

    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.datamodel.base_models import InputFormat

        logger.info(f"Loading Docling on {DEVICE}...")

        # Configure the PDF pipeline: OCR for scanned pages and the
        # table-structure model for structured table extraction.
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True

        # Initialize converter
        converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.DOCX,
                InputFormat.PPTX,
                InputFormat.IMAGE,
                InputFormat.HTML,
            ],
            # BUG FIX: pipeline_options was previously constructed but never
            # passed to the converter, so the OCR/table-structure settings
            # above had no effect. Wire them in via format_options.
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            },
        )

        logger.info("Docling loaded successfully")

    except ImportError as e:
        logger.error(f"Failed to import Docling: {e}")
        logger.warning("Service will run in degraded mode")
    except Exception as e:
        logger.error(f"Failed to load Docling: {e}", exc_info=True)
        logger.warning("Service will run in degraded mode")
|
|
|
|
|
|
@app.on_event("startup")
|
|
async def startup():
|
|
load_docling()
|
|
|
|
|
|
@app.get("/health")
|
|
async def health():
|
|
"""Health check endpoint"""
|
|
if converter is None:
|
|
return {
|
|
"status": "loading",
|
|
"service": "docling-service",
|
|
"device": DEVICE
|
|
}
|
|
|
|
return {
|
|
"status": "healthy",
|
|
"service": "docling-service",
|
|
"device": DEVICE,
|
|
"cuda_available": torch.cuda.is_available(),
|
|
"features": ["pdf", "docx", "pptx", "images", "tables", "formulas"]
|
|
}
|
|
|
|
|
|
class ConvertRequest(BaseModel):
    """Request model for document conversion.

    NOTE(review): this model appears unused — the /convert endpoint reads
    multipart Form fields directly rather than a JSON body. Confirm whether
    it is kept intentionally for future JSON-body clients.
    """
    doc_url: Optional[str] = None       # fetch the document from this URL
    doc_base64: Optional[str] = None    # inline base64-encoded document bytes
    output_format: str = "markdown"     # markdown, json, text
    extract_tables: bool = True         # include extracted tables in the response
    extract_images: bool = False        # image-extraction flag
    ocr_enabled: bool = True            # OCR toggle
|
|
|
|
|
|
@app.post("/convert")
|
|
async def convert_document(
|
|
file: Optional[UploadFile] = File(None),
|
|
doc_url: Optional[str] = Form(None),
|
|
doc_base64: Optional[str] = Form(None),
|
|
output_format: str = Form("markdown"),
|
|
extract_tables: bool = Form(True),
|
|
extract_images: bool = Form(False),
|
|
ocr_enabled: bool = Form(True)
|
|
):
|
|
"""
|
|
Convert a document to structured format.
|
|
|
|
Supports:
|
|
- PDF, DOCX, PPTX, HTML, images
|
|
- Table extraction with structure
|
|
- Formula extraction (LaTeX)
|
|
- OCR for scanned documents
|
|
|
|
Output formats:
|
|
- markdown: Structured markdown with tables
|
|
- json: Full document structure as JSON
|
|
- text: Plain text extraction
|
|
"""
|
|
if converter is None:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Docling not loaded. Check logs for details."
|
|
)
|
|
|
|
try:
|
|
# Get document data
|
|
doc_path = None
|
|
temp_file = None
|
|
|
|
if file:
|
|
# Save uploaded file to temp
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix)
|
|
content = await file.read()
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
|
|
elif doc_base64:
|
|
# Decode base64 and save to temp
|
|
content = base64.b64decode(doc_base64)
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
|
|
elif doc_url:
|
|
# Download from URL
|
|
import httpx
|
|
async with httpx.AsyncClient(timeout=60.0) as client:
|
|
response = await client.get(doc_url)
|
|
if response.status_code != 200:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Failed to download document: {response.status_code}"
|
|
)
|
|
content = response.content
|
|
|
|
# Determine extension from URL or content-type
|
|
ext = ".pdf"
|
|
if doc_url.endswith(".docx"):
|
|
ext = ".docx"
|
|
elif doc_url.endswith(".pptx"):
|
|
ext = ".pptx"
|
|
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
else:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="No document provided. Use file, doc_url, or doc_base64"
|
|
)
|
|
|
|
# Convert document
|
|
logger.info(f"Converting document: {doc_path}")
|
|
result = converter.convert(doc_path)
|
|
|
|
# Format output
|
|
if output_format == "markdown":
|
|
output = result.document.export_to_markdown()
|
|
elif output_format == "json":
|
|
output = result.document.export_to_dict()
|
|
else:
|
|
output = result.document.export_to_text()
|
|
|
|
# Extract tables if requested
|
|
tables = []
|
|
if extract_tables:
|
|
for table in result.document.tables:
|
|
tables.append({
|
|
"id": table.id if hasattr(table, 'id') else None,
|
|
"content": table.export_to_markdown() if hasattr(table, 'export_to_markdown') else str(table),
|
|
"rows": len(table.data) if hasattr(table, 'data') else 0
|
|
})
|
|
|
|
# Cleanup temp file
|
|
if temp_file:
|
|
os.unlink(temp_file.name)
|
|
|
|
return {
|
|
"success": True,
|
|
"output_format": output_format,
|
|
"result": output,
|
|
"tables": tables if extract_tables else None,
|
|
"pages": result.document.num_pages if hasattr(result.document, 'num_pages') else None,
|
|
"metadata": {
|
|
"title": result.document.title if hasattr(result.document, 'title') else None,
|
|
"author": result.document.author if hasattr(result.document, 'author') else None
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Document conversion failed: {e}", exc_info=True)
|
|
|
|
# Cleanup on error
|
|
if temp_file and os.path.exists(temp_file.name):
|
|
os.unlink(temp_file.name)
|
|
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail=f"Document conversion failed: {str(e)}"
|
|
)
|
|
|
|
|
|
@app.post("/extract-tables")
|
|
async def extract_tables(
|
|
file: Optional[UploadFile] = File(None),
|
|
doc_base64: Optional[str] = Form(None)
|
|
):
|
|
"""
|
|
Extract tables from a document.
|
|
|
|
Returns tables as:
|
|
- Markdown format
|
|
- Structured data (rows/columns)
|
|
- HTML format
|
|
"""
|
|
if converter is None:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Docling not loaded. Check logs for details."
|
|
)
|
|
|
|
try:
|
|
# Get document
|
|
temp_file = None
|
|
|
|
if file:
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix)
|
|
content = await file.read()
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
elif doc_base64:
|
|
content = base64.b64decode(doc_base64)
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
else:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="No document provided"
|
|
)
|
|
|
|
# Convert and extract tables
|
|
result = converter.convert(doc_path)
|
|
|
|
tables = []
|
|
for idx, table in enumerate(result.document.tables):
|
|
table_data = {
|
|
"index": idx,
|
|
"markdown": table.export_to_markdown() if hasattr(table, 'export_to_markdown') else None,
|
|
"html": table.export_to_html() if hasattr(table, 'export_to_html') else None,
|
|
}
|
|
|
|
# Try to get structured data
|
|
if hasattr(table, 'data'):
|
|
table_data["data"] = table.data
|
|
table_data["rows"] = len(table.data)
|
|
table_data["columns"] = len(table.data[0]) if table.data else 0
|
|
|
|
tables.append(table_data)
|
|
|
|
# Cleanup
|
|
if temp_file:
|
|
os.unlink(temp_file.name)
|
|
|
|
return {
|
|
"success": True,
|
|
"tables_count": len(tables),
|
|
"tables": tables
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Table extraction failed: {e}", exc_info=True)
|
|
if temp_file and os.path.exists(temp_file.name):
|
|
os.unlink(temp_file.name)
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail=f"Table extraction failed: {str(e)}"
|
|
)
|
|
|
|
|
|
@app.get("/models")
|
|
async def list_models():
|
|
"""List available models and features"""
|
|
return {
|
|
"service": "docling-service",
|
|
"models": [
|
|
{
|
|
"name": "ds4sd/docling-models",
|
|
"description": "IBM Docling - Document conversion with tables and formulas",
|
|
"features": ["pdf", "docx", "pptx", "html", "images"],
|
|
"capabilities": ["ocr", "tables", "formulas", "structure"]
|
|
}
|
|
],
|
|
"supported_formats": {
|
|
"input": ["pdf", "docx", "pptx", "html", "png", "jpg", "tiff"],
|
|
"output": ["markdown", "json", "text"]
|
|
},
|
|
"device": DEVICE,
|
|
"cuda_available": torch.cuda.is_available()
|
|
}
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import uvicorn
|
|
uvicorn.run(app, host="0.0.0.0", port=8003)
|