# Changelog:
# - TTS: xtts-v2 integration with voice cloning support
# - Document: docling integration for PDF/DOCX/PPTX processing
# - Memory Service: added /facts/upsert, /facts/{key}, /facts endpoints
# - Added required dependencies (TTS, docling)
"""
|
|
IBM Docling Service - Document conversion with table/formula extraction
|
|
|
|
Converts PDF, DOCX, PPTX, images to Markdown/JSON while preserving:
|
|
- Tables (with structure)
|
|
- Formulas (LaTeX)
|
|
- Code blocks
|
|
- Images
|
|
- Document structure
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import base64
|
|
import tempfile
|
|
from typing import Optional, List, Dict, Any
|
|
from pathlib import Path
|
|
from io import BytesIO
|
|
|
|
from fastapi import FastAPI, HTTPException, File, UploadFile, Form
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel
|
|
import torch
|
|
|
|
# Module-level setup: logging, the FastAPI app, and runtime configuration.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Docling Document Conversion Service",
    description="Convert documents to structured formats using IBM Docling"
)

# Configuration
# Prefer CUDA when available; override with the DEVICE environment variable.
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): DOCLING_MODEL is never read elsewhere in this file — the
# converter loads its own default models. Confirm whether this is intended.
DOCLING_MODEL = os.getenv("DOCLING_MODEL", "ds4sd/docling-models")

# Global converter instance
# Lazily populated by load_docling(); None means "not loaded / degraded mode",
# and endpoints respond 503 until it is set.
converter = None
|
|
|
|
|
|
def load_docling():
    """Initialize the global Docling converter (idempotent).

    On import or initialization failure the service stays up in degraded
    mode: ``converter`` remains ``None`` and endpoints return 503.
    """
    global converter

    if converter is not None:
        return  # already loaded

    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.pipeline_options import PdfPipelineOptions
        from docling.datamodel.base_models import InputFormat

        logger.info(f"Loading Docling on {DEVICE}...")

        # Configure the PDF pipeline: OCR for scanned pages and the
        # table-structure model for structured table extraction.
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True

        # Initialize converter
        converter = DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.DOCX,
                InputFormat.PPTX,
                InputFormat.IMAGE,
                InputFormat.HTML,
            ],
            # BUG FIX: pipeline_options was previously constructed but never
            # passed to the converter, so the OCR/table-structure settings
            # above had no effect. Wire them in via format_options.
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            },
        )

        logger.info("Docling loaded successfully")

    except ImportError as e:
        logger.error(f"Failed to import Docling: {e}")
        logger.warning("Service will run in degraded mode")
    except Exception as e:
        logger.error(f"Failed to load Docling: {e}", exc_info=True)
        logger.warning("Service will run in degraded mode")
|
|
|
|
|
|
@app.on_event("startup")
|
|
async def startup():
|
|
load_docling()
|
|
|
|
|
|
@app.get("/health")
|
|
async def health():
|
|
"""Health check endpoint"""
|
|
if converter is None:
|
|
return {
|
|
"status": "loading",
|
|
"service": "docling-service",
|
|
"device": DEVICE
|
|
}
|
|
|
|
return {
|
|
"status": "healthy",
|
|
"service": "docling-service",
|
|
"device": DEVICE,
|
|
"cuda_available": torch.cuda.is_available(),
|
|
"features": ["pdf", "docx", "pptx", "images", "tables", "formulas"]
|
|
}
|
|
|
|
|
|
class ConvertRequest(BaseModel):
    """Request model for document conversion.

    NOTE(review): this model appears unused — the /convert endpoint reads
    multipart Form fields directly rather than a JSON body. Confirm whether
    it is kept intentionally for future JSON-body clients.
    """
    doc_url: Optional[str] = None       # fetch the document from this URL
    doc_base64: Optional[str] = None    # inline base64-encoded document bytes
    output_format: str = "markdown"     # markdown, json, text
    extract_tables: bool = True         # include extracted tables in the response
    extract_images: bool = False        # image-extraction flag
    ocr_enabled: bool = True            # OCR toggle
|
|
|
|
|
|
@app.post("/convert")
|
|
async def convert_document(
|
|
file: Optional[UploadFile] = File(None),
|
|
doc_url: Optional[str] = Form(None),
|
|
doc_base64: Optional[str] = Form(None),
|
|
output_format: str = Form("markdown"),
|
|
extract_tables: bool = Form(True),
|
|
extract_images: bool = Form(False),
|
|
ocr_enabled: bool = Form(True)
|
|
):
|
|
"""
|
|
Convert a document to structured format.
|
|
|
|
Supports:
|
|
- PDF, DOCX, PPTX, HTML, images
|
|
- Table extraction with structure
|
|
- Formula extraction (LaTeX)
|
|
- OCR for scanned documents
|
|
|
|
Output formats:
|
|
- markdown: Structured markdown with tables
|
|
- json: Full document structure as JSON
|
|
- text: Plain text extraction
|
|
"""
|
|
if converter is None:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Docling not loaded. Check logs for details."
|
|
)
|
|
|
|
try:
|
|
# Get document data
|
|
doc_path = None
|
|
temp_file = None
|
|
|
|
if file:
|
|
# Save uploaded file to temp
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix)
|
|
content = await file.read()
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
|
|
elif doc_base64:
|
|
# Decode base64 and save to temp
|
|
content = base64.b64decode(doc_base64)
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
|
|
elif doc_url:
|
|
# Download from URL
|
|
import httpx
|
|
async with httpx.AsyncClient(timeout=60.0) as client:
|
|
response = await client.get(doc_url)
|
|
if response.status_code != 200:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Failed to download document: {response.status_code}"
|
|
)
|
|
content = response.content
|
|
|
|
# Determine extension from URL or content-type
|
|
ext = ".pdf"
|
|
if doc_url.endswith(".docx"):
|
|
ext = ".docx"
|
|
elif doc_url.endswith(".pptx"):
|
|
ext = ".pptx"
|
|
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
else:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="No document provided. Use file, doc_url, or doc_base64"
|
|
)
|
|
|
|
# Convert document
|
|
logger.info(f"Converting document: {doc_path}")
|
|
result = converter.convert(doc_path)
|
|
|
|
# Format output
|
|
if output_format == "markdown":
|
|
output = result.document.export_to_markdown()
|
|
elif output_format == "json":
|
|
output = result.document.export_to_dict()
|
|
else:
|
|
output = result.document.export_to_text()
|
|
|
|
# Extract tables if requested
|
|
tables = []
|
|
if extract_tables:
|
|
for table in result.document.tables:
|
|
tables.append({
|
|
"id": table.id if hasattr(table, 'id') else None,
|
|
"content": table.export_to_markdown() if hasattr(table, 'export_to_markdown') else str(table),
|
|
"rows": len(table.data) if hasattr(table, 'data') else 0
|
|
})
|
|
|
|
# Cleanup temp file
|
|
if temp_file:
|
|
os.unlink(temp_file.name)
|
|
|
|
return {
|
|
"success": True,
|
|
"output_format": output_format,
|
|
"result": output,
|
|
"tables": tables if extract_tables else None,
|
|
"pages": result.document.num_pages if hasattr(result.document, 'num_pages') else None,
|
|
"metadata": {
|
|
"title": result.document.title if hasattr(result.document, 'title') else None,
|
|
"author": result.document.author if hasattr(result.document, 'author') else None
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Document conversion failed: {e}", exc_info=True)
|
|
|
|
# Cleanup on error
|
|
if temp_file and os.path.exists(temp_file.name):
|
|
os.unlink(temp_file.name)
|
|
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail=f"Document conversion failed: {str(e)}"
|
|
)
|
|
|
|
|
|
@app.post("/extract-tables")
|
|
async def extract_tables(
|
|
file: Optional[UploadFile] = File(None),
|
|
doc_base64: Optional[str] = Form(None)
|
|
):
|
|
"""
|
|
Extract tables from a document.
|
|
|
|
Returns tables as:
|
|
- Markdown format
|
|
- Structured data (rows/columns)
|
|
- HTML format
|
|
"""
|
|
if converter is None:
|
|
raise HTTPException(
|
|
status_code=503,
|
|
detail="Docling not loaded. Check logs for details."
|
|
)
|
|
|
|
try:
|
|
# Get document
|
|
temp_file = None
|
|
|
|
if file:
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=Path(file.filename).suffix)
|
|
content = await file.read()
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
elif doc_base64:
|
|
content = base64.b64decode(doc_base64)
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
|
temp_file.write(content)
|
|
temp_file.close()
|
|
doc_path = temp_file.name
|
|
else:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="No document provided"
|
|
)
|
|
|
|
# Convert and extract tables
|
|
result = converter.convert(doc_path)
|
|
|
|
tables = []
|
|
for idx, table in enumerate(result.document.tables):
|
|
table_data = {
|
|
"index": idx,
|
|
"markdown": table.export_to_markdown() if hasattr(table, 'export_to_markdown') else None,
|
|
"html": table.export_to_html() if hasattr(table, 'export_to_html') else None,
|
|
}
|
|
|
|
# Try to get structured data
|
|
if hasattr(table, 'data'):
|
|
table_data["data"] = table.data
|
|
table_data["rows"] = len(table.data)
|
|
table_data["columns"] = len(table.data[0]) if table.data else 0
|
|
|
|
tables.append(table_data)
|
|
|
|
# Cleanup
|
|
if temp_file:
|
|
os.unlink(temp_file.name)
|
|
|
|
return {
|
|
"success": True,
|
|
"tables_count": len(tables),
|
|
"tables": tables
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Table extraction failed: {e}", exc_info=True)
|
|
if temp_file and os.path.exists(temp_file.name):
|
|
os.unlink(temp_file.name)
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail=f"Table extraction failed: {str(e)}"
|
|
)
|
|
|
|
|
|
@app.get("/models")
|
|
async def list_models():
|
|
"""List available models and features"""
|
|
return {
|
|
"service": "docling-service",
|
|
"models": [
|
|
{
|
|
"name": "ds4sd/docling-models",
|
|
"description": "IBM Docling - Document conversion with tables and formulas",
|
|
"features": ["pdf", "docx", "pptx", "html", "images"],
|
|
"capabilities": ["ocr", "tables", "formulas", "structure"]
|
|
}
|
|
],
|
|
"supported_formats": {
|
|
"input": ["pdf", "docx", "pptx", "html", "png", "jpg", "tiff"],
|
|
"output": ["markdown", "json", "text"]
|
|
},
|
|
"device": DEVICE,
|
|
"cuda_available": torch.cuda.is_available()
|
|
}
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import uvicorn
|
|
uvicorn.run(app, host="0.0.0.0", port=8003)
|