feat: add tests and integrate dots.ocr model

G.2.5 - Tests:
- Add pytest test suite with fixtures
- test_preprocessing.py - PDF/image loading, normalization, validation
- test_postprocessing.py - chunks, QA pairs, markdown generation
- test_inference.py - dummy parser and inference functions
- test_api.py - API endpoint tests
- Add pytest.ini configuration

G.1.3 - dots.ocr Integration:
- Update model_loader.py with real model loading code
  - Support for AutoModelForVision2Seq and AutoProcessor
  - Device handling (CUDA/CPU/MPS) with fallback
  - Error handling with dummy fallback option
- Update inference.py with real model inference
  - Process images through model
  - Generate and decode outputs
  - Parse model output to blocks
- Add model_output_parser.py
  - Parse JSON or plain text model output
  - Convert to structured blocks
  - Layout detection support (placeholder)

Dependencies:
- Add pytest, pytest-asyncio, httpx for testing
This commit is contained in:
Apple
2025-11-15 13:25:01 -08:00
parent 62cb1d2108
commit 2a353040f6
11 changed files with 848 additions and 47 deletions

View File

@@ -0,0 +1,150 @@
"""
Parser for dots.ocr model output
Converts model output to structured blocks
"""
import logging
import json
from typing import List, Dict, Any, Optional
from PIL import Image
logger = logging.getLogger(__name__)
# Height (and vertical step) in pixels of the synthetic bounding boxes that are
# generated for plain-text output. These boxes are placeholders, not real
# layout coordinates.
_PLACEHOLDER_LINE_HEIGHT = 30


def _make_block(
    block_type: str,
    text: str,
    y: int,
    width: int,
    height: int,
    reading_order: int,
) -> Dict[str, Any]:
    """Build one block dictionary whose bounding box starts at x=0."""
    return {
        "type": block_type,
        "text": text,
        "bbox": {
            "x": 0,
            "y": y,
            "width": width,
            "height": height
        },
        "reading_order": reading_order
    }


def _blocks_from_json(model_output: str) -> Optional[List[Dict[str, Any]]]:
    """Return the blocks carried by JSON output, or None if there are none.

    Accepted shapes are ``{"blocks": [...]}`` and a bare ``[...]``. The list
    is only accepted when every item is a dict; any other JSON value (or
    non-JSON text) yields None so the caller can fall back to text parsing.
    """
    try:
        output_data = json.loads(model_output)
    except json.JSONDecodeError:
        # Not JSON, treat as plain text
        return None

    if isinstance(output_data, dict):
        candidate = output_data.get("blocks")
    else:
        candidate = output_data

    if isinstance(candidate, list) and all(
        isinstance(item, dict) for item in candidate
    ):
        return candidate
    return None


def parse_model_output_to_blocks(
    model_output: str,
    image_size: tuple[int, int],
    page_num: int
) -> List[Dict[str, Any]]:
    """
    Parse dots.ocr model output into structured blocks

    JSON output is tried first: a ``{"blocks": [...]}`` object or a bare list
    is returned as-is, provided it is a list of dictionaries. Everything else
    is parsed as plain text with a simple heuristic: lines starting with ``#``
    become heading blocks, consecutive other lines are merged into paragraph
    blocks, and blank lines are skipped. Text blocks get placeholder bounding
    boxes (full image width, fixed line height), not real coordinates.

    Args:
        model_output: Raw text output from model (may be JSON or plain text)
        image_size: (width, height) of the image
        page_num: Page number (currently unused; kept for interface
            compatibility)

    Returns:
        List of block dictionaries. Never empty for plain-text input: if no
        block could be built, a single paragraph covering the whole image is
        returned.
    """
    try:
        # Try to parse as JSON first (if model outputs structured JSON)
        structured_blocks = _blocks_from_json(model_output)
        if structured_blocks is not None:
            return structured_blocks

        # Parse plain text output
        # This is a simple heuristic - adjust based on actual dots.ocr output format
        width = image_size[0]
        blocks: List[Dict[str, Any]] = []
        current_block: Optional[Dict[str, Any]] = None
        reading_order = 1

        for raw_line in model_output.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            # Heuristic: lines starting with # are headings
            is_heading = line.startswith('#')

            if (
                not is_heading
                and current_block is not None
                and current_block["type"] == "paragraph"
            ):
                # Continuation of the paragraph that is currently open
                current_block["text"] += " " + line
                continue

            # A new block starts here, so flush the one that was open
            if current_block is not None:
                blocks.append(current_block)

            current_block = _make_block(
                "heading" if is_heading else "paragraph",
                line.lstrip('#').strip() if is_heading else line,
                reading_order * _PLACEHOLDER_LINE_HEIGHT,
                width,
                _PLACEHOLDER_LINE_HEIGHT,
                reading_order,
            )
            reading_order += 1

        # Save last block
        if current_block is not None:
            blocks.append(current_block)

        # If no blocks were created, create a single paragraph with all text
        if not blocks:
            blocks.append(
                _make_block(
                    "paragraph",
                    model_output.strip(),
                    0,
                    width,
                    image_size[1],
                    1,
                )
            )
        return blocks
    except Exception as e:
        # Deliberate best-effort boundary: parsing must not abort the caller,
        # so any failure is logged and the raw output is returned as one block.
        logger.error("Error parsing model output: %s", e, exc_info=True)
        return [
            _make_block(
                "paragraph",
                model_output.strip() if model_output else "",
                0,
                image_size[0],
                image_size[1],
                1,
            )
        ]
def extract_layout_info(model_output: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Placeholder hook for extracting layout information from model output

    Args:
        model_output: Model output dictionary (not inspected yet)

    Returns:
        Layout info dictionary or None; currently always None
    """
    # Placeholder: to be customized once the actual dots.ocr output format
    # is known. Until then no layout info is reported.
    layout_info: Optional[Dict[str, Any]] = None
    return layout_info