Files
microdao-daarion/services/parser-service/app/runtime/inference.py
Apple 4befecc425 feat: implement PDF/image preprocessing, post-processing, and dots.ocr integration prep
G.2.3 - PDF/Image Support:
- Add preprocessing.py with PDF→images conversion (pdf2image)
- Add image loading and normalization
- Add file type detection and validation
- Support for PDF, PNG, JPEG, WebP, TIFF

G.2.4 - Pre/Post-processing:
- Add postprocessing.py with structured output builders
- build_chunks() - semantic chunks for RAG
- build_qa_pairs() - Q&A extraction
- build_markdown() - Markdown conversion
- Text normalization and chunking logic

G.1.3 - dots.ocr Integration Prep:
- Update model_loader.py with proper error handling
- Add USE_DUMMY_PARSER and ALLOW_DUMMY_FALLBACK flags
- Update inference.py to work with images list
- Add parse_document_from_images() function
- Ready for actual model integration

Configuration:
- Add PDF_DPI, IMAGE_MAX_SIZE, PAGE_RANGE settings
- Add parser mode flags (USE_DUMMY_PARSER, ALLOW_DUMMY_FALLBACK)

API Updates:
- Update endpoints to use new preprocessing pipeline
- Integrate post-processing for all output modes
- Remove temp file handling (work directly with bytes)
2025-11-15 13:19:07 -08:00

220 lines
6.8 KiB
Python

"""
Inference functions for document parsing
"""
import logging
from typing import Literal, Optional, List
from pathlib import Path
from PIL import Image
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
from app.runtime.model_loader import get_model
from app.runtime.preprocessing import (
convert_pdf_to_images, load_image, prepare_images_for_model
)
from app.runtime.postprocessing import build_parsed_document
from app.core.config import settings
logger = logging.getLogger(__name__)
def parse_document_from_images(
    images: List[Image.Image],
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse document from list of images using dots.ocr model.

    Args:
        images: List of PIL Images (one per page).
        output_mode: Output format mode. NOTE(review): currently unused in
            this function; post-processing into the requested mode appears
            to happen at the API layer -- confirm before removing.
        doc_id: Document ID (defaults to "parsed-doc" when None).
        doc_type: Document type ("pdf" or "image").

    Returns:
        ParsedDocument with structured content.

    Raises:
        RuntimeError: If the model is not loaded and ALLOW_DUMMY_FALLBACK
            is disabled.
        ValueError: If no valid images remain after preparation, or if
            every page fails during model processing.
    """
    # Short-circuit to the dummy parser when explicitly configured.
    if settings.USE_DUMMY_PARSER:
        logger.info("Using dummy parser (USE_DUMMY_PARSER=true)")
        return dummy_parse_document_from_images(images, doc_id, doc_type)

    # Try to get model; fall back to the dummy parser only if allowed.
    model = get_model()
    if model is None:
        if settings.ALLOW_DUMMY_FALLBACK:
            logger.warning("Model not loaded, falling back to dummy parser")
            return dummy_parse_document_from_images(images, doc_id, doc_type)
        raise RuntimeError("Model not loaded and dummy fallback is disabled")

    # Normalize/resize images for the model.
    prepared_images = prepare_images_for_model(images)
    if not prepared_images:
        raise ValueError("No valid images to process")

    # Process each page; a failure on one page is logged and skipped so the
    # remaining pages can still be parsed.
    pages_data = []
    for idx, image in enumerate(prepared_images, start=1):
        try:
            # TODO: Implement actual inference with dots.ocr
            # Example:
            # inputs = model["processor"](images=image, return_tensors="pt")
            # outputs = model["model"].generate(**inputs)
            # text = model["processor"].decode(outputs[0], skip_special_tokens=True)
            #
            # # Parse model output into blocks
            # blocks = parse_model_output_to_blocks(text, image.size)
            #
            # pages_data.append({
            #     "blocks": blocks,
            #     "width": image.width,
            #     "height": image.height
            # })

            # For now, use a placeholder block for each page.
            logger.debug(f"Processing page {idx} with model (placeholder)")
            pages_data.append({
                "blocks": [
                    {
                        "type": "paragraph",
                        "text": f"Page {idx} content (model output placeholder)",
                        "bbox": {"x": 0, "y": 0, "width": image.width, "height": image.height},
                        "reading_order": 1
                    }
                ],
                "width": image.width,
                "height": image.height
            })
        except Exception as e:
            logger.error(f"Error processing page {idx}: {e}", exc_info=True)
            # Continue with other pages
            continue

    # Bug fix: if every page failed above, pages_data is empty -- fail loudly
    # instead of silently returning an empty document.
    if not pages_data:
        raise ValueError("All pages failed during model processing")

    # Build ParsedDocument from model output
    return build_parsed_document(
        pages_data=pages_data,
        doc_id=doc_id or "parsed-doc",
        doc_type=doc_type,
        metadata={"model": settings.PARSER_MODEL_NAME}
    )
def parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Parse document from file path.

    Loads the file at *input_path*, converts it to a list of page images
    according to *doc_type*, and delegates to parse_document_from_images.

    Args:
        input_path: Path to document file (PDF or image).
        output_mode: Output format mode.
        doc_id: Document ID.
        doc_type: Document type (pdf or image).

    Returns:
        ParsedDocument with structured content.
    """
    # Read the raw bytes; preprocessing works directly on bytes, not paths.
    raw_bytes = Path(input_path).read_bytes()

    # PDFs expand to one image per page; anything else is a single image.
    if doc_type == "pdf":
        page_images = convert_pdf_to_images(raw_bytes)
    else:
        page_images = [load_image(raw_bytes)]

    return parse_document_from_images(page_images, output_mode, doc_id, doc_type)
def dummy_parse_document_from_images(
    images: List[Image.Image],
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Dummy parser for testing: builds mock pages from the given images.

    Each page gets a heading block and a paragraph block sized from the
    image dimensions. This will be replaced with actual dots.ocr inference.
    """
    logger.info(f"Dummy parsing: {len(images)} image(s)")

    def _mock_page(page_num: int, img: Image.Image) -> ParsedPage:
        # One heading followed by one paragraph, in reading order.
        heading = ParsedBlock(
            type="heading",
            text=f"Page {page_num} Title",
            bbox=BBox(x=0, y=0, width=img.width, height=50),
            reading_order=1,
            page_num=page_num,
        )
        paragraph = ParsedBlock(
            type="paragraph",
            text=f"This is a dummy parsed document (page {page_num}). "
                 f"Image size: {img.width}x{img.height}. "
                 f"Replace this with actual dots.ocr inference.",
            bbox=BBox(x=0, y=60, width=img.width, height=100),
            reading_order=2,
            page_num=page_num,
        )
        return ParsedPage(
            page_num=page_num,
            blocks=[heading, paragraph],
            width=img.width,
            height=img.height,
        )

    return ParsedDocument(
        doc_id=doc_id or "dummy-doc-1",
        doc_type=doc_type,
        pages=[_mock_page(i, img) for i, img in enumerate(images, start=1)],
        metadata={
            "parser": "dummy",
            "page_count": len(images),
        },
    )
def dummy_parse_document(
    input_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
    doc_id: Optional[str] = None,
    doc_type: Literal["pdf", "image"] = "image"
) -> ParsedDocument:
    """
    Dummy parser for testing (returns mock data).

    Reads the file at *input_path*, converts it to page images, and
    delegates to dummy_parse_document_from_images.
    """
    # Work directly with the file's bytes.
    raw_bytes = Path(input_path).read_bytes()

    # PDF -> one image per page; otherwise treat as a single image.
    page_images = (
        convert_pdf_to_images(raw_bytes)
        if doc_type == "pdf"
        else [load_image(raw_bytes)]
    )

    return dummy_parse_document_from_images(page_images, doc_id, doc_type)