Files
microdao-daarion/services/parser-service/app/runtime/model_output_parser.py
Apple 2a353040f6 feat: add tests and integrate dots.ocr model
G.2.5 - Tests:
- Add pytest test suite with fixtures
- test_preprocessing.py - PDF/image loading, normalization, validation
- test_postprocessing.py - chunks, QA pairs, markdown generation
- test_inference.py - dummy parser and inference functions
- test_api.py - API endpoint tests
- Add pytest.ini configuration

G.1.3 - dots.ocr Integration:
- Update model_loader.py with real model loading code
  - Support for AutoModelForVision2Seq and AutoProcessor
  - Device handling (CUDA/CPU/MPS) with fallback
  - Error handling with dummy fallback option
- Update inference.py with real model inference
  - Process images through model
  - Generate and decode outputs
  - Parse model output to blocks
- Add model_output_parser.py
  - Parse JSON or plain text model output
  - Convert to structured blocks
  - Layout detection support (placeholder)

Dependencies:
- Add pytest, pytest-asyncio, httpx for testing
2025-11-15 13:25:01 -08:00

151 lines
4.6 KiB
Python

"""
Parser for dots.ocr model output
Converts model output to structured blocks
"""
import logging
import json
from typing import List, Dict, Any, Optional
from PIL import Image
logger = logging.getLogger(__name__)
# Placeholder height given to every block produced by the plain-text heuristic;
# the text output carries no real geometry, so blocks are stacked in fixed rows.
_PLACEHOLDER_LINE_HEIGHT = 30


def _make_block(
    block_type: str,
    text: str,
    y: int,
    width: int,
    height: int,
    reading_order: int,
) -> Dict[str, Any]:
    """Build one block dictionary whose bbox starts at x=0 and spans *width*."""
    return {
        "type": block_type,
        "text": text,
        "bbox": {
            "x": 0,
            "y": y,
            "width": width,
            "height": height,
        },
        "reading_order": reading_order,
    }


def _blocks_from_json(model_output: str) -> Optional[List[Dict[str, Any]]]:
    """Return the block list carried by JSON output, or None if unusable.

    Accepts either ``{"blocks": [...]}`` or a bare JSON array. The payload is
    only accepted when it is a list whose items are all dictionaries, so that
    text which merely happens to be valid JSON (e.g. ``[1, 2, 3]``) is handed
    to the plain-text heuristic instead of being returned as "blocks".
    """
    try:
        data = json.loads(model_output)
    except json.JSONDecodeError:
        # Not JSON at all: the caller treats the output as plain text.
        return None

    candidate = data.get("blocks") if isinstance(data, dict) else data
    if isinstance(candidate, list) and all(isinstance(item, dict) for item in candidate):
        return candidate
    return None


def _blocks_from_text(model_output: str, width: int) -> List[Dict[str, Any]]:
    """Split plain-text output into heading and paragraph blocks.

    Lines starting with ``#`` become headings; consecutive non-heading lines
    are joined with a space into one paragraph. Blank lines are skipped and do
    not end the current paragraph.
    """
    blocks: List[Dict[str, Any]] = []
    current: Optional[Dict[str, Any]] = None
    reading_order = 1

    for raw_line in model_output.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        is_heading = line.startswith('#')
        if not is_heading and current is not None and current["type"] == "paragraph":
            # Continuation of the paragraph that is already open.
            current["text"] += " " + line
            continue

        # A new block starts here, so flush the one that was open.
        if current is not None:
            blocks.append(current)
        current = _make_block(
            "heading" if is_heading else "paragraph",
            line.lstrip('#').strip() if is_heading else line,
            reading_order * _PLACEHOLDER_LINE_HEIGHT,
            width,
            _PLACEHOLDER_LINE_HEIGHT,
            reading_order,
        )
        reading_order += 1

    if current is not None:
        blocks.append(current)
    return blocks


def parse_model_output_to_blocks(
    model_output: str,
    image_size: tuple[int, int],
    page_num: int
) -> List[Dict[str, Any]]:
    """
    Parse dots.ocr model output into structured blocks.

    The output is first tried as JSON: a ``{"blocks": [...]}`` object or a bare
    array is returned unchanged when it is a list of dictionaries. Anything
    else is parsed with a simple line heuristic (``#`` lines are headings,
    other lines are merged into paragraphs) with placeholder bounding boxes.
    If that yields nothing, or parsing fails unexpectedly, a single paragraph
    block covering the whole image is returned.

    Args:
        model_output: Raw text output from model (may be JSON or plain text).
            ``None`` is treated as empty output.
        image_size: (width, height) of the image
        page_num: Page number (not used by the current implementation)

    Returns:
        List of block dictionaries
    """
    if model_output is None:
        # Same result the error fallback would produce, without logging a
        # traceback for what is simply "no output".
        model_output = ""

    try:
        json_blocks = _blocks_from_json(model_output)
        if json_blocks is not None:
            return json_blocks

        # This is a simple heuristic - adjust based on actual dots.ocr output format
        blocks = _blocks_from_text(model_output, image_size[0])

        # If no blocks were created, create a single paragraph with all text
        if not blocks:
            blocks.append(_make_block(
                "paragraph",
                model_output.strip(),
                0,
                image_size[0],
                image_size[1],
                1,
            ))
    except Exception as e:
        # Best-effort boundary: never let a parsing problem escape to the caller.
        logger.error("Error parsing model output: %s", e, exc_info=True)
        # Fallback: create single block with raw output
        blocks = [_make_block(
            "paragraph",
            model_output.strip() if model_output else "",
            0,
            image_size[0],
            image_size[1],
            1,
        )]

    return blocks
def extract_layout_info(model_output: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Placeholder for pulling layout information out of the model output.

    The argument is currently ignored and no layout data is produced; the
    function is meant to be filled in once the actual dots.ocr output format
    is known.

    Args:
        model_output: Model output dictionary (unused for now)

    Returns:
        Always None at present, meaning "no layout info available"
    """
    layout_info: Optional[Dict[str, Any]] = None
    return layout_info