feat: enhance model output parser and add integration guide
Model Output Parser: - Support multiple dots.ocr output formats (JSON, structured text, plain text) - Normalize all formats to standard ParsedBlock structure - Handle JSON with blocks/pages arrays - Parse markdown-like structured text - Fallback to plain text parsing - Better error handling and logging Schemas: - Document must-have fields for RAG (doc_id, pages, metadata.dao_id) - ParsedChunk must-have fields (text, metadata.dao_id, metadata.doc_id) - Add detailed field descriptions for RAG integration Integration Guide: - Create INTEGRATION.md with complete integration guide - Document dots.ocr output formats - Show ParsedDocument → Haystack Documents conversion - Provide DAGI Router integration examples - RAG pipeline integration with filters - Complete workflow examples - RBAC integration recommendations
This commit is contained in:
@@ -1,11 +1,19 @@
|
||||
"""
|
||||
Parser for dots.ocr model output
|
||||
Converts model output to structured blocks
|
||||
|
||||
Expected dots.ocr output formats:
|
||||
1. JSON with structured blocks (preferred)
|
||||
2. Plain text with layout hints
|
||||
3. Markdown-like structure
|
||||
|
||||
This parser handles all formats and normalizes to ParsedBlock structure.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional
|
||||
import re
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from PIL import Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -19,121 +27,311 @@ def parse_model_output_to_blocks(
|
||||
"""
|
||||
Parse dots.ocr model output into structured blocks
|
||||
|
||||
Handles multiple output formats:
|
||||
1. JSON with "blocks" array (preferred)
|
||||
2. JSON with "pages" array
|
||||
3. Plain text with layout hints
|
||||
4. Markdown-like structure
|
||||
|
||||
Args:
|
||||
model_output: Raw text output from model (may be JSON or plain text)
|
||||
model_output: Raw text output from model
|
||||
image_size: (width, height) of the image
|
||||
page_num: Page number
|
||||
|
||||
Returns:
|
||||
List of block dictionaries
|
||||
List of block dictionaries with normalized structure
|
||||
"""
|
||||
blocks = []
|
||||
|
||||
try:
|
||||
# Try to parse as JSON first (if model outputs structured JSON)
|
||||
try:
|
||||
output_data = json.loads(model_output)
|
||||
if isinstance(output_data, dict) and "blocks" in output_data:
|
||||
# Model outputs structured format
|
||||
return output_data["blocks"]
|
||||
elif isinstance(output_data, list):
|
||||
# Model outputs list of blocks
|
||||
return output_data
|
||||
except (json.JSONDecodeError, KeyError):
|
||||
# Not JSON, treat as plain text
|
||||
pass
|
||||
# Format 1: Try to parse as JSON (structured output)
|
||||
parsed_json = _try_parse_json(model_output)
|
||||
if parsed_json:
|
||||
blocks = _extract_blocks_from_json(parsed_json, image_size, page_num)
|
||||
if blocks:
|
||||
logger.debug(f"Parsed {len(blocks)} blocks from JSON output")
|
||||
return blocks
|
||||
|
||||
# Parse plain text output
|
||||
# This is a simple heuristic - adjust based on actual dots.ocr output format
|
||||
lines = model_output.strip().split('\n')
|
||||
# Format 2: Try to parse as structured text (markdown-like)
|
||||
blocks = _parse_structured_text(model_output, image_size, page_num)
|
||||
if blocks:
|
||||
logger.debug(f"Parsed {len(blocks)} blocks from structured text")
|
||||
return blocks
|
||||
|
||||
current_block = None
|
||||
reading_order = 1
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Heuristic: lines starting with # are headings
|
||||
if line.startswith('#'):
|
||||
# Save previous block
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
|
||||
# New heading block
|
||||
current_block = {
|
||||
"type": "heading",
|
||||
"text": line.lstrip('#').strip(),
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": reading_order * 30,
|
||||
"width": image_size[0],
|
||||
"height": 30
|
||||
},
|
||||
"reading_order": reading_order
|
||||
}
|
||||
reading_order += 1
|
||||
else:
|
||||
# Regular paragraph
|
||||
if current_block and current_block["type"] == "paragraph":
|
||||
# Append to existing paragraph
|
||||
current_block["text"] += " " + line
|
||||
else:
|
||||
# Save previous block
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
|
||||
# New paragraph block
|
||||
current_block = {
|
||||
"type": "paragraph",
|
||||
"text": line,
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": reading_order * 30,
|
||||
"width": image_size[0],
|
||||
"height": 30
|
||||
},
|
||||
"reading_order": reading_order
|
||||
}
|
||||
reading_order += 1
|
||||
|
||||
# Save last block
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
|
||||
# If no blocks were created, create a single paragraph with all text
|
||||
if not blocks:
|
||||
blocks.append({
|
||||
"type": "paragraph",
|
||||
"text": model_output.strip(),
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"width": image_size[0],
|
||||
"height": image_size[1]
|
||||
},
|
||||
"reading_order": 1
|
||||
})
|
||||
# Format 3: Fallback - plain text as single paragraph
|
||||
blocks = _parse_plain_text(model_output, image_size, page_num)
|
||||
logger.debug(f"Parsed {len(blocks)} blocks from plain text")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing model output: {e}", exc_info=True)
|
||||
# Fallback: create single block with raw output
|
||||
blocks = [{
|
||||
"type": "paragraph",
|
||||
"text": model_output.strip() if model_output else "",
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"width": image_size[0],
|
||||
"height": image_size[1]
|
||||
},
|
||||
"reading_order": 1
|
||||
}]
|
||||
blocks = _create_fallback_block(model_output, image_size, page_num)
|
||||
|
||||
return blocks
|
||||
|
||||
|
||||
def _try_parse_json(text: str) -> Optional[Dict[str, Any]]:
|
||||
"""Try to parse text as JSON"""
|
||||
try:
|
||||
# Try to find JSON in text (might be wrapped in markdown code blocks)
|
||||
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
|
||||
if json_match:
|
||||
return json.loads(json_match.group(1))
|
||||
|
||||
# Try direct JSON parse
|
||||
return json.loads(text)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _extract_blocks_from_json(
|
||||
data: Dict[str, Any],
|
||||
image_size: tuple[int, int],
|
||||
page_num: int
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Extract blocks from JSON structure"""
|
||||
blocks = []
|
||||
|
||||
# Format: {"blocks": [...]}
|
||||
if "blocks" in data and isinstance(data["blocks"], list):
|
||||
for idx, block_data in enumerate(data["blocks"], start=1):
|
||||
block = _normalize_block(block_data, image_size, idx)
|
||||
if block:
|
||||
blocks.append(block)
|
||||
|
||||
# Format: {"pages": [{"blocks": [...]}]}
|
||||
elif "pages" in data and isinstance(data["pages"], list):
|
||||
for page_data in data["pages"]:
|
||||
if isinstance(page_data, dict) and "blocks" in page_data:
|
||||
for idx, block_data in enumerate(page_data["blocks"], start=1):
|
||||
block = _normalize_block(block_data, image_size, idx)
|
||||
if block:
|
||||
blocks.append(block)
|
||||
|
||||
# Format: Direct array of blocks
|
||||
elif isinstance(data, list):
|
||||
for idx, block_data in enumerate(data, start=1):
|
||||
block = _normalize_block(block_data, image_size, idx)
|
||||
if block:
|
||||
blocks.append(block)
|
||||
|
||||
return blocks
|
||||
|
||||
|
||||
def _normalize_block(
|
||||
block_data: Dict[str, Any],
|
||||
image_size: tuple[int, int],
|
||||
reading_order: int
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Normalize block data to standard format"""
|
||||
if not isinstance(block_data, dict):
|
||||
return None
|
||||
|
||||
# Extract text
|
||||
text = block_data.get("text") or block_data.get("content") or ""
|
||||
if not text or not text.strip():
|
||||
return None
|
||||
|
||||
# Extract type
|
||||
block_type = block_data.get("type") or block_data.get("block_type") or "paragraph"
|
||||
|
||||
# Normalize type
|
||||
type_mapping = {
|
||||
"heading": "heading",
|
||||
"title": "heading",
|
||||
"h1": "heading",
|
||||
"h2": "heading",
|
||||
"h3": "heading",
|
||||
"paragraph": "paragraph",
|
||||
"p": "paragraph",
|
||||
"text": "paragraph",
|
||||
"table": "table",
|
||||
"formula": "formula",
|
||||
"figure": "figure_caption",
|
||||
"caption": "figure_caption",
|
||||
"list": "list",
|
||||
"li": "list"
|
||||
}
|
||||
block_type = type_mapping.get(block_type.lower(), "paragraph")
|
||||
|
||||
# Extract bbox
|
||||
bbox = block_data.get("bbox") or block_data.get("bounding_box") or {}
|
||||
if isinstance(bbox, list) and len(bbox) >= 4:
|
||||
# Format: [x, y, width, height]
|
||||
bbox = {
|
||||
"x": float(bbox[0]),
|
||||
"y": float(bbox[1]),
|
||||
"width": float(bbox[2]),
|
||||
"height": float(bbox[3])
|
||||
}
|
||||
elif isinstance(bbox, dict):
|
||||
# Ensure all fields are present
|
||||
bbox = {
|
||||
"x": float(bbox.get("x", 0)),
|
||||
"y": float(bbox.get("y", 0)),
|
||||
"width": float(bbox.get("width", image_size[0])),
|
||||
"height": float(bbox.get("height", 30))
|
||||
}
|
||||
else:
|
||||
# Default bbox
|
||||
bbox = {
|
||||
"x": 0,
|
||||
"y": reading_order * 30,
|
||||
"width": image_size[0],
|
||||
"height": 30
|
||||
}
|
||||
|
||||
# Build normalized block
|
||||
normalized = {
|
||||
"type": block_type,
|
||||
"text": text.strip(),
|
||||
"bbox": bbox,
|
||||
"reading_order": block_data.get("reading_order") or reading_order
|
||||
}
|
||||
|
||||
# Add table data if present
|
||||
if block_type == "table" and "table_data" in block_data:
|
||||
normalized["table_data"] = block_data["table_data"]
|
||||
|
||||
# Add metadata if present
|
||||
if "metadata" in block_data:
|
||||
normalized["metadata"] = block_data["metadata"]
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def _parse_structured_text(
|
||||
text: str,
|
||||
image_size: tuple[int, int],
|
||||
page_num: int
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Parse structured text (markdown-like) into blocks"""
|
||||
blocks = []
|
||||
lines = text.strip().split('\n')
|
||||
|
||||
current_block = None
|
||||
reading_order = 1
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
current_block = None
|
||||
continue
|
||||
|
||||
# Detect heading (markdown style)
|
||||
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
|
||||
if heading_match:
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
|
||||
level = len(heading_match.group(1))
|
||||
heading_text = heading_match.group(2)
|
||||
|
||||
current_block = {
|
||||
"type": "heading",
|
||||
"text": heading_text,
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": reading_order * 30,
|
||||
"width": image_size[0],
|
||||
"height": 30
|
||||
},
|
||||
"reading_order": reading_order
|
||||
}
|
||||
reading_order += 1
|
||||
continue
|
||||
|
||||
# Detect list item
|
||||
if re.match(r'^[-*+]\s+', line) or re.match(r'^\d+\.\s+', line):
|
||||
if current_block and current_block["type"] != "list":
|
||||
blocks.append(current_block)
|
||||
|
||||
list_text = re.sub(r'^[-*+]\s+', '', line)
|
||||
list_text = re.sub(r'^\d+\.\s+', '', list_text)
|
||||
|
||||
current_block = {
|
||||
"type": "list",
|
||||
"text": list_text,
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": reading_order * 30,
|
||||
"width": image_size[0],
|
||||
"height": 30
|
||||
},
|
||||
"reading_order": reading_order
|
||||
}
|
||||
reading_order += 1
|
||||
continue
|
||||
|
||||
# Regular paragraph
|
||||
if current_block and current_block["type"] == "paragraph":
|
||||
current_block["text"] += " " + line
|
||||
else:
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
|
||||
current_block = {
|
||||
"type": "paragraph",
|
||||
"text": line,
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": reading_order * 30,
|
||||
"width": image_size[0],
|
||||
"height": 30
|
||||
},
|
||||
"reading_order": reading_order
|
||||
}
|
||||
reading_order += 1
|
||||
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
|
||||
return blocks
|
||||
|
||||
|
||||
def _parse_plain_text(
|
||||
text: str,
|
||||
image_size: tuple[int, int],
|
||||
page_num: int
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Parse plain text as single paragraph"""
|
||||
if not text or not text.strip():
|
||||
return []
|
||||
|
||||
return [{
|
||||
"type": "paragraph",
|
||||
"text": text.strip(),
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"width": image_size[0],
|
||||
"height": image_size[1]
|
||||
},
|
||||
"reading_order": 1
|
||||
}]
|
||||
|
||||
|
||||
def _create_fallback_block(
|
||||
text: str,
|
||||
image_size: tuple[int, int],
|
||||
page_num: int
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Create fallback block when parsing fails"""
|
||||
return [{
|
||||
"type": "paragraph",
|
||||
"text": text.strip() if text else f"Page {page_num} (parsing failed)",
|
||||
"bbox": {
|
||||
"x": 0,
|
||||
"y": 0,
|
||||
"width": image_size[0],
|
||||
"height": image_size[1]
|
||||
},
|
||||
"reading_order": 1,
|
||||
"metadata": {"parsing_error": True}
|
||||
}]
|
||||
|
||||
|
||||
def extract_layout_info(model_output: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Extract layout information from model output (if available)
|
||||
|
||||
Reference in New Issue
Block a user