Files
microdao-daarion/services/docling-service/main.py
Apple 5290287058 feat: implement TTS, Document processing, and Memory Service /facts API
- TTS: xtts-v2 integration with voice cloning support
- Document: docling integration for PDF/DOCX/PPTX processing
- Memory Service: added /facts/upsert, /facts/{key}, /facts endpoints
- Added required dependencies (TTS, docling)
2026-01-17 08:16:37 -08:00

351 lines
11 KiB
Python

"""
IBM Docling Service - Document conversion with table/formula extraction
Converts PDF, DOCX, PPTX, HTML, and images to Markdown/JSON/plain text while preserving:
- Tables (with structure)
- Formulas (LaTeX)
- Code blocks
- Images
- Document structure
"""
import logging
import os
import base64
import tempfile
from typing import Optional, List, Dict, Any
from pathlib import Path
from io import BytesIO
from fastapi import FastAPI, HTTPException, File, UploadFile, Form
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
# Route INFO-and-above records through the root logger for this service.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Docling Document Conversion Service",
    description="Convert documents to structured formats using IBM Docling",
)

# Configuration (each value can be overridden through the environment).
# DEVICE defaults to "cuda" when torch reports CUDA as available, else "cpu".
_default_device = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = os.environ.get("DEVICE", _default_device)
DOCLING_MODEL = os.environ.get("DOCLING_MODEL", "ds4sd/docling-models")

# Global converter instance; stays None until load_docling() succeeds.
converter = None
def load_docling():
    """Create the module-level Docling ``converter`` if it is not set yet.

    The call is a no-op when ``converter`` already holds an instance.
    Docling is imported lazily inside the function so that the service can
    still start when the package is missing; in that case (or on any other
    initialisation error) the failure is logged, ``converter`` stays
    ``None`` and the service keeps running in degraded mode.

    The PDF pipeline is configured with OCR and table-structure recognition
    enabled, and those options are handed to the converter through
    ``format_options`` so they actually take effect.

    Note: ``DEVICE`` is only reported in the log line here; it is not passed
    to Docling by this function.
    """
    global converter
    if converter is not None:
        return

    try:
        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.document_converter import (
            DocumentConverter,
            PdfFormatOption,
        )

        logger.info(f"Loading Docling on {DEVICE}...")

        # Configure pipeline options
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True

        # Initialize converter. The pipeline options must be attached to a
        # format option; building them without passing them on leaves the
        # converter on its defaults.
        converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.DOCX,
                InputFormat.PPTX,
                InputFormat.IMAGE,
                InputFormat.HTML,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                ),
            },
        )
        logger.info("Docling loaded successfully")
    except ImportError as e:
        logger.error(f"Failed to import Docling: {e}")
        logger.warning("Service will run in degraded mode")
    except Exception as e:
        logger.error(f"Failed to load Docling: {e}", exc_info=True)
        logger.warning("Service will run in degraded mode")
@app.on_event("startup")
async def startup():
    """Startup hook registered on the app: load the Docling converter."""
    load_docling()
@app.get("/health")
async def health():
    """Report service health.

    Returns a "loading" payload while the global converter is unset, and a
    "healthy" payload (including CUDA availability and the feature list)
    once it exists.
    """
    report = {
        "status": "loading",
        "service": "docling-service",
        "device": DEVICE,
    }
    if converter is not None:
        # Converter is ready: upgrade the status and add the extra details.
        report["status"] = "healthy"
        report["cuda_available"] = torch.cuda.is_available()
        report["features"] = [
            "pdf", "docx", "pptx", "images", "tables", "formulas",
        ]
    return report
class ConvertRequest(BaseModel):
    """Options describing a single document-conversion request.

    The document may be referenced by URL or supplied inline as base64;
    the remaining fields choose the output representation and toggle the
    extraction features.
    """

    # Document source (both optional).
    doc_url: Optional[str] = None
    doc_base64: Optional[str] = None

    # One of: markdown, json, text
    output_format: str = "markdown"

    # Extraction toggles.
    extract_tables: bool = True
    extract_images: bool = False
    ocr_enabled: bool = True
@app.post("/convert")
async def convert_document(
    file: Optional[UploadFile] = File(None),
    doc_url: Optional[str] = Form(None),
    doc_base64: Optional[str] = Form(None),
    output_format: str = Form("markdown"),
    extract_tables: bool = Form(True),
    extract_images: bool = Form(False),
    ocr_enabled: bool = Form(True),
):
    """
    Convert a document to structured format.

    Supports:
    - PDF, DOCX, PPTX, HTML, images
    - Table extraction with structure
    - Formula extraction (LaTeX)
    - OCR for scanned documents

    Output formats:
    - markdown: Structured markdown with tables
    - json: Full document structure as JSON
    - text: Plain text extraction (also used for any unrecognised value)

    The document source is taken from the first of ``file``, ``doc_base64``
    and ``doc_url`` that is provided. The payload is written to a temporary
    file for Docling and that file is always removed before returning.

    Args:
        file: Uploaded document; its filename suffix is kept on the temp file.
        doc_url: URL to download the document from.
        doc_base64: Base64-encoded document, stored with a ``.pdf`` suffix.
        output_format: ``markdown``, ``json`` or ``text``.
        extract_tables: When true, include a per-table summary in ``tables``.
        extract_images: Accepted for API compatibility; not used by this
            handler.
        ocr_enabled: Accepted for API compatibility; not used by this handler.

    Returns:
        A dict with ``success``, ``output_format``, ``result``, ``tables``
        (``None`` when ``extract_tables`` is false), ``pages`` and
        ``metadata``.

    Raises:
        HTTPException: 503 when Docling is not loaded; 400 when no document
            is supplied, the base64 payload is malformed or the download
            does not return HTTP 200; 500 for any other failure.
    """
    if converter is None:
        raise HTTPException(
            status_code=503,
            detail="Docling not loaded. Check logs for details."
        )

    doc_path = None
    try:
        # Resolve the request into raw bytes plus a temp-file suffix.
        if file:
            content = await file.read()
            # filename may be missing on some uploads; fall back to no suffix.
            suffix = Path(file.filename or "").suffix
        elif doc_base64:
            try:
                content = base64.b64decode(doc_base64)
            except ValueError as exc:  # binascii.Error subclasses ValueError
                raise HTTPException(
                    status_code=400,
                    detail="doc_base64 is not valid base64"
                ) from exc
            suffix = ".pdf"
        elif doc_url:
            import httpx
            from urllib.parse import urlparse

            # NOTE(review): doc_url is caller-supplied and fetched from the
            # server side with no host restriction (SSRF exposure). Restrict
            # the allowed hosts before exposing this endpoint to untrusted
            # callers.
            async with httpx.AsyncClient(timeout=60.0) as client:
                response = await client.get(doc_url)
                if response.status_code != 200:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Failed to download document: {response.status_code}"
                    )
                content = response.content

            # Take the extension from the URL *path* so query strings,
            # fragments and upper-case suffixes do not defeat detection.
            # Unknown or missing suffixes keep the historical ".pdf" default.
            known_suffixes = (
                ".pdf", ".docx", ".pptx", ".html", ".htm",
                ".png", ".jpg", ".jpeg", ".tif", ".tiff",
            )
            url_suffix = Path(urlparse(doc_url).path).suffix.lower()
            suffix = url_suffix if url_suffix in known_suffixes else ".pdf"
        else:
            raise HTTPException(
                status_code=400,
                detail="No document provided. Use file, doc_url, or doc_base64"
            )

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
            # Record the path before writing so the finally block can remove
            # the file even if the write itself fails.
            doc_path = handle.name
            handle.write(content)

        # Convert document
        logger.info(f"Converting document: {doc_path}")
        result = converter.convert(doc_path)
        document = result.document

        # Format output
        if output_format == "markdown":
            output = document.export_to_markdown()
        elif output_format == "json":
            output = document.export_to_dict()
        else:
            output = document.export_to_text()

        # Extract tables if requested
        tables = []
        if extract_tables:
            for table in document.tables:
                data = getattr(table, "data", None)
                if data is None:
                    rows = 0
                else:
                    # NOTE(review): docling-core's TableData appears to expose
                    # its size as ``num_rows`` and not support len(); fall
                    # back to len() for plain row sequences. Confirm against
                    # the installed version.
                    rows = getattr(data, "num_rows", None)
                    if rows is None:
                        rows = len(data)
                tables.append({
                    "id": getattr(table, "id", None),
                    "content": (
                        table.export_to_markdown()
                        if hasattr(table, "export_to_markdown")
                        else str(table)
                    ),
                    "rows": rows,
                })

        # ``num_pages`` may be a method rather than a plain attribute; call
        # it so the response carries a number instead of a bound method.
        pages = getattr(document, "num_pages", None)
        if callable(pages):
            pages = pages()

        return {
            "success": True,
            "output_format": output_format,
            "result": output,
            "tables": tables if extract_tables else None,
            "pages": pages,
            "metadata": {
                "title": getattr(document, "title", None),
                "author": getattr(document, "author", None),
            },
        }
    except HTTPException:
        # Deliberate client-facing errors (400s) must not be rewritten as 500.
        raise
    except Exception as e:
        logger.error(f"Document conversion failed: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Document conversion failed: {str(e)}"
        ) from e
    finally:
        # Single cleanup point for both the success and the error paths.
        if doc_path is not None and os.path.exists(doc_path):
            os.unlink(doc_path)
@app.post("/extract-tables")
async def extract_tables(
    file: Optional[UploadFile] = File(None),
    doc_base64: Optional[str] = Form(None),
):
    """
    Extract tables from a document.

    Returns tables as:
    - Markdown format
    - Structured data (rows/columns)
    - HTML format

    The document comes from ``file`` if given, otherwise from ``doc_base64``
    (stored with a ``.pdf`` suffix). It is written to a temporary file for
    Docling and that file is always removed before returning.

    Args:
        file: Uploaded document; its filename suffix is kept on the temp file.
        doc_base64: Base64-encoded document.

    Returns:
        A dict with ``success``, ``tables_count`` and ``tables``. Each table
        entry has ``index``, ``markdown`` and ``html`` and, when the table
        carries a ``data`` attribute, ``data``, ``rows`` and ``columns``.

    Raises:
        HTTPException: 503 when Docling is not loaded; 400 when no document
            is supplied or the base64 payload is malformed; 500 for any
            other failure.
    """
    if converter is None:
        raise HTTPException(
            status_code=503,
            detail="Docling not loaded. Check logs for details."
        )

    doc_path = None
    try:
        # Resolve the request into raw bytes plus a temp-file suffix.
        if file:
            content = await file.read()
            # filename may be missing on some uploads; fall back to no suffix.
            suffix = Path(file.filename or "").suffix
        elif doc_base64:
            try:
                content = base64.b64decode(doc_base64)
            except ValueError as exc:  # binascii.Error subclasses ValueError
                raise HTTPException(
                    status_code=400,
                    detail="doc_base64 is not valid base64"
                ) from exc
            suffix = ".pdf"
        else:
            raise HTTPException(
                status_code=400,
                detail="No document provided"
            )

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
            # Record the path before writing so the finally block can remove
            # the file even if the write itself fails.
            doc_path = handle.name
            handle.write(content)

        # Convert and extract tables
        result = converter.convert(doc_path)
        tables = []
        for idx, table in enumerate(result.document.tables):
            table_data = {
                "index": idx,
                "markdown": (
                    table.export_to_markdown()
                    if hasattr(table, "export_to_markdown")
                    else None
                ),
                "html": (
                    table.export_to_html()
                    if hasattr(table, "export_to_html")
                    else None
                ),
            }

            # Try to get structured data
            data = getattr(table, "data", None)
            if data is not None:
                # NOTE(review): docling-core's TableData appears to expose
                # its size as ``num_rows``/``num_cols`` and not support
                # len() or indexing; fall back to sequence semantics for
                # plain row lists. Confirm against the installed version.
                rows = getattr(data, "num_rows", None)
                cols = getattr(data, "num_cols", None)
                if rows is None or cols is None:
                    rows = len(data)
                    cols = len(data[0]) if data else 0
                table_data["data"] = data
                table_data["rows"] = rows
                table_data["columns"] = cols
            tables.append(table_data)

        return {
            "success": True,
            "tables_count": len(tables),
            "tables": tables,
        }
    except HTTPException:
        # Deliberate client-facing errors (400s) must not be rewritten as 500.
        raise
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Table extraction failed: {str(e)}"
        ) from e
    finally:
        # Single cleanup point for both the success and the error paths.
        if doc_path is not None and os.path.exists(doc_path):
            os.unlink(doc_path)
@app.get("/models")
async def list_models():
    """Describe the model behind this service and the formats it handles.

    The payload also reports the configured device and whether torch
    currently sees CUDA.
    """
    docling_entry = {
        "name": "ds4sd/docling-models",
        "description": "IBM Docling - Document conversion with tables and formulas",
        "features": ["pdf", "docx", "pptx", "html", "images"],
        "capabilities": ["ocr", "tables", "formulas", "structure"],
    }
    formats = {
        "input": ["pdf", "docx", "pptx", "html", "png", "jpg", "tiff"],
        "output": ["markdown", "json", "text"],
    }
    return {
        "service": "docling-service",
        "models": [docling_entry],
        "supported_formats": formats,
        "device": DEVICE,
        "cuda_available": torch.cuda.is_available(),
    }
if __name__ == "__main__":
    # Direct-run entry point: serve the app with uvicorn on all interfaces,
    # port 8003.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8003)