Files
microdao-daarion/services/parser-service/app/runtime/model_output_parser.py
Apple 7251e519d6 feat: enhance model output parser and add integration guide
Model Output Parser:
- Support multiple dots.ocr output formats (JSON, structured text, plain text)
- Normalize all formats to standard ParsedBlock structure
- Handle JSON with blocks/pages arrays
- Parse markdown-like structured text
- Fallback to plain text parsing
- Better error handling and logging

Schemas:
- Document must-have fields for RAG (doc_id, pages, metadata.dao_id)
- ParsedChunk must-have fields (text, metadata.dao_id, metadata.doc_id)
- Add detailed field descriptions for RAG integration

Integration Guide:
- Create INTEGRATION.md with complete integration guide
- Document dots.ocr output formats
- Show ParsedDocument → Haystack Documents conversion
- Provide DAGI Router integration examples
- RAG pipeline integration with filters
- Complete workflow examples
- RBAC integration recommendations
2025-11-16 03:02:42 -08:00

349 lines
10 KiB
Python

"""
Parser for dots.ocr model output
Converts model output to structured blocks
Expected dots.ocr output formats:
1. JSON with structured blocks (preferred)
2. Plain text with layout hints
3. Markdown-like structure
This parser handles all formats and normalizes to ParsedBlock structure.
"""
import logging
import json
import re
from typing import List, Dict, Any, Optional, Tuple
from PIL import Image
logger = logging.getLogger(__name__)
def parse_model_output_to_blocks(
model_output: str,
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""
Parse dots.ocr model output into structured blocks
Handles multiple output formats:
1. JSON with "blocks" array (preferred)
2. JSON with "pages" array
3. Plain text with layout hints
4. Markdown-like structure
Args:
model_output: Raw text output from model
image_size: (width, height) of the image
page_num: Page number
Returns:
List of block dictionaries with normalized structure
"""
blocks = []
try:
# Format 1: Try to parse as JSON (structured output)
parsed_json = _try_parse_json(model_output)
if parsed_json:
blocks = _extract_blocks_from_json(parsed_json, image_size, page_num)
if blocks:
logger.debug(f"Parsed {len(blocks)} blocks from JSON output")
return blocks
# Format 2: Try to parse as structured text (markdown-like)
blocks = _parse_structured_text(model_output, image_size, page_num)
if blocks:
logger.debug(f"Parsed {len(blocks)} blocks from structured text")
return blocks
# Format 3: Fallback - plain text as single paragraph
blocks = _parse_plain_text(model_output, image_size, page_num)
logger.debug(f"Parsed {len(blocks)} blocks from plain text")
except Exception as e:
logger.error(f"Error parsing model output: {e}", exc_info=True)
blocks = _create_fallback_block(model_output, image_size, page_num)
return blocks
def _try_parse_json(text: str) -> Optional[Dict[str, Any]]:
"""Try to parse text as JSON"""
try:
# Try to find JSON in text (might be wrapped in markdown code blocks)
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
if json_match:
return json.loads(json_match.group(1))
# Try direct JSON parse
return json.loads(text)
except (json.JSONDecodeError, ValueError):
return None
def _extract_blocks_from_json(
data: Dict[str, Any],
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Extract blocks from JSON structure"""
blocks = []
# Format: {"blocks": [...]}
if "blocks" in data and isinstance(data["blocks"], list):
for idx, block_data in enumerate(data["blocks"], start=1):
block = _normalize_block(block_data, image_size, idx)
if block:
blocks.append(block)
# Format: {"pages": [{"blocks": [...]}]}
elif "pages" in data and isinstance(data["pages"], list):
for page_data in data["pages"]:
if isinstance(page_data, dict) and "blocks" in page_data:
for idx, block_data in enumerate(page_data["blocks"], start=1):
block = _normalize_block(block_data, image_size, idx)
if block:
blocks.append(block)
# Format: Direct array of blocks
elif isinstance(data, list):
for idx, block_data in enumerate(data, start=1):
block = _normalize_block(block_data, image_size, idx)
if block:
blocks.append(block)
return blocks
def _normalize_block(
block_data: Dict[str, Any],
image_size: tuple[int, int],
reading_order: int
) -> Optional[Dict[str, Any]]:
"""Normalize block data to standard format"""
if not isinstance(block_data, dict):
return None
# Extract text
text = block_data.get("text") or block_data.get("content") or ""
if not text or not text.strip():
return None
# Extract type
block_type = block_data.get("type") or block_data.get("block_type") or "paragraph"
# Normalize type
type_mapping = {
"heading": "heading",
"title": "heading",
"h1": "heading",
"h2": "heading",
"h3": "heading",
"paragraph": "paragraph",
"p": "paragraph",
"text": "paragraph",
"table": "table",
"formula": "formula",
"figure": "figure_caption",
"caption": "figure_caption",
"list": "list",
"li": "list"
}
block_type = type_mapping.get(block_type.lower(), "paragraph")
# Extract bbox
bbox = block_data.get("bbox") or block_data.get("bounding_box") or {}
if isinstance(bbox, list) and len(bbox) >= 4:
# Format: [x, y, width, height]
bbox = {
"x": float(bbox[0]),
"y": float(bbox[1]),
"width": float(bbox[2]),
"height": float(bbox[3])
}
elif isinstance(bbox, dict):
# Ensure all fields are present
bbox = {
"x": float(bbox.get("x", 0)),
"y": float(bbox.get("y", 0)),
"width": float(bbox.get("width", image_size[0])),
"height": float(bbox.get("height", 30))
}
else:
# Default bbox
bbox = {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
}
# Build normalized block
normalized = {
"type": block_type,
"text": text.strip(),
"bbox": bbox,
"reading_order": block_data.get("reading_order") or reading_order
}
# Add table data if present
if block_type == "table" and "table_data" in block_data:
normalized["table_data"] = block_data["table_data"]
# Add metadata if present
if "metadata" in block_data:
normalized["metadata"] = block_data["metadata"]
return normalized
def _parse_structured_text(
text: str,
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Parse structured text (markdown-like) into blocks"""
blocks = []
lines = text.strip().split('\n')
current_block = None
reading_order = 1
for line in lines:
line = line.strip()
if not line:
if current_block:
blocks.append(current_block)
current_block = None
continue
# Detect heading (markdown style)
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
if heading_match:
if current_block:
blocks.append(current_block)
level = len(heading_match.group(1))
heading_text = heading_match.group(2)
current_block = {
"type": "heading",
"text": heading_text,
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
continue
# Detect list item
if re.match(r'^[-*+]\s+', line) or re.match(r'^\d+\.\s+', line):
if current_block and current_block["type"] != "list":
blocks.append(current_block)
list_text = re.sub(r'^[-*+]\s+', '', line)
list_text = re.sub(r'^\d+\.\s+', '', list_text)
current_block = {
"type": "list",
"text": list_text,
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
continue
# Regular paragraph
if current_block and current_block["type"] == "paragraph":
current_block["text"] += " " + line
else:
if current_block:
blocks.append(current_block)
current_block = {
"type": "paragraph",
"text": line,
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
if current_block:
blocks.append(current_block)
return blocks
def _parse_plain_text(
text: str,
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Parse plain text as single paragraph"""
if not text or not text.strip():
return []
return [{
"type": "paragraph",
"text": text.strip(),
"bbox": {
"x": 0,
"y": 0,
"width": image_size[0],
"height": image_size[1]
},
"reading_order": 1
}]
def _create_fallback_block(
text: str,
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Create fallback block when parsing fails"""
return [{
"type": "paragraph",
"text": text.strip() if text else f"Page {page_num} (parsing failed)",
"bbox": {
"x": 0,
"y": 0,
"width": image_size[0],
"height": image_size[1]
},
"reading_order": 1,
"metadata": {"parsing_error": True}
}]
def extract_layout_info(model_output: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Extract layout information from model output (if available)
Args:
model_output: Model output dictionary
Returns:
Layout info dictionary or None
"""
# This function should be customized based on actual dots.ocr output format
# For now, return None (no layout info)
return None