# microdao-daarion/services/parser-service/app/runtime/model_output_parser.py

"""
Parser for dots.ocr model output
Converts model output to structured blocks
"""

import logging
import json
from typing import List, Dict, Any, Optional
from PIL import Image

logger = logging.getLogger(__name__)


# Synthetic line height (in pixels) used to fabricate bounding boxes for
# plain-text output, which carries no real layout information.
_LINE_HEIGHT = 30


def _make_line_block(
    block_type: str,
    text: str,
    width: int,
    reading_order: int
) -> Dict[str, Any]:
    """Build a block with a placeholder full-width, one-line bounding box."""
    return {
        "type": block_type,
        "text": text,
        "bbox": {
            "x": 0,
            # Blocks are stacked vertically by reading order (1-based).
            "y": reading_order * _LINE_HEIGHT,
            "width": width,
            "height": _LINE_HEIGHT
        },
        "reading_order": reading_order
    }


def _make_page_block(text: str, image_size: tuple[int, int]) -> Dict[str, Any]:
    """Build a single paragraph block whose bounding box covers the whole image."""
    return {
        "type": "paragraph",
        "text": text,
        "bbox": {
            "x": 0,
            "y": 0,
            "width": image_size[0],
            "height": image_size[1]
        },
        "reading_order": 1
    }


def _blocks_from_json(model_output: str) -> Optional[List[Dict[str, Any]]]:
    """Return the blocks encoded in JSON model output, or None if not applicable.

    Two shapes are recognised: ``{"blocks": [...]}`` and a bare ``[...]``.
    The payload is accepted only if it is a list whose items are all dicts,
    so that plain text which merely happens to be valid JSON (e.g. ``"[1]"``
    or ``{"blocks": null}``) is not returned as if it were a block list.
    """
    try:
        data = json.loads(model_output)
    except json.JSONDecodeError:
        # Not JSON at all; caller falls back to plain-text parsing.
        return None

    if isinstance(data, dict):
        data = data.get("blocks")

    if isinstance(data, list) and all(isinstance(item, dict) for item in data):
        return data
    return None


def parse_model_output_to_blocks(
    model_output: str,
    image_size: tuple[int, int],
    page_num: int
) -> List[Dict[str, Any]]:
    """
    Parse dots.ocr model output into structured blocks

    The output is first tried as JSON (either ``{"blocks": [...]}`` or a bare
    list of block dicts) and returned as-is when it has that shape. Otherwise
    it is treated as plain text: lines starting with ``#`` become heading
    blocks, consecutive other lines are merged into paragraph blocks, and
    blank lines are skipped. Plain-text blocks get placeholder bounding boxes
    (full image width, fixed line height, stacked by reading order).

    Args:
        model_output: Raw text output from model (may be JSON or plain text)
        image_size: (width, height) of the image
        page_num: Page number (currently unused by the parser)

    Returns:
        List of block dictionaries. If nothing could be extracted, or parsing
        fails unexpectedly, a single paragraph block covering the whole image
        is returned.
    """
    try:
        structured = _blocks_from_json(model_output)
        if structured is not None:
            return structured

        # Parse plain text output
        # This is a simple heuristic - adjust based on actual dots.ocr output format
        width = image_size[0]
        blocks: List[Dict[str, Any]] = []
        current: Optional[Dict[str, Any]] = None
        reading_order = 1

        for raw_line in model_output.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            is_heading = line.startswith('#')

            # Consecutive non-heading lines extend the open paragraph.
            if not is_heading and current is not None and current["type"] == "paragraph":
                current["text"] += " " + line
                continue

            # Any other line closes the open block and starts a new one.
            if current is not None:
                blocks.append(current)

            if is_heading:
                current = _make_line_block(
                    "heading", line.lstrip('#').strip(), width, reading_order
                )
            else:
                current = _make_line_block("paragraph", line, width, reading_order)
            reading_order += 1

        # Save last block
        if current is not None:
            blocks.append(current)

        # If no blocks were created, create a single paragraph with all text
        if not blocks:
            blocks.append(_make_page_block(model_output.strip(), image_size))

        return blocks

    except Exception as e:
        # Best-effort boundary: never let a parsing problem propagate; log it
        # with the traceback and fall back to a single block with raw output.
        logger.exception("Error parsing model output: %s", e)
        return [
            _make_page_block(
                model_output.strip() if model_output else "",
                image_size
            )
        ]


def extract_layout_info(model_output: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Placeholder hook for pulling layout information out of model output

    No layout extraction is implemented yet, so the input is not inspected
    and the result is always None.

    Args:
        model_output: Model output dictionary (currently ignored)

    Returns:
        Always None for now; intended to be a layout info dictionary once
        the actual dots.ocr output format is known
    """
    # To be customized once the real dots.ocr output format is settled.
    layout_info: Optional[Dict[str, Any]] = None
    return layout_info