feat: enhance model output parser and add integration guide

Model Output Parser:
- Support multiple dots.ocr output formats (JSON, structured text, plain text)
- Normalize all formats to standard ParsedBlock structure
- Handle JSON with blocks/pages arrays
- Parse markdown-like structured text
- Fallback to plain text parsing
- Better error handling and logging

Schemas:
- Document must-have fields for RAG (doc_id, pages, metadata.dao_id)
- ParsedChunk must-have fields (text, metadata.dao_id, metadata.doc_id)
- Add detailed field descriptions for RAG integration

Integration Guide:
- Create INTEGRATION.md with complete integration guide
- Document dots.ocr output formats
- Show ParsedDocument → Haystack Documents conversion
- Provide DAGI Router integration examples
- RAG pipeline integration with filters
- Complete workflow examples
- RBAC integration recommendations
This commit is contained in:
Apple
2025-11-16 03:02:42 -08:00
parent ca05c91799
commit 7251e519d6
3 changed files with 753 additions and 108 deletions

View File

@@ -1,11 +1,19 @@
"""
Parser for dots.ocr model output
Converts model output to structured blocks
Expected dots.ocr output formats:
1. JSON with structured blocks (preferred)
2. Plain text with layout hints
3. Markdown-like structure
This parser handles all formats and normalizes to ParsedBlock structure.
"""
import logging
import json
from typing import List, Dict, Any, Optional
import re
from typing import List, Dict, Any, Optional, Tuple
from PIL import Image
logger = logging.getLogger(__name__)
@@ -19,121 +27,311 @@ def parse_model_output_to_blocks(
"""
Parse dots.ocr model output into structured blocks
Handles multiple output formats:
1. JSON with "blocks" array (preferred)
2. JSON with "pages" array
3. Plain text with layout hints
4. Markdown-like structure
Args:
model_output: Raw text output from model (may be JSON or plain text)
model_output: Raw text output from model
image_size: (width, height) of the image
page_num: Page number
Returns:
List of block dictionaries
List of block dictionaries with normalized structure
"""
blocks = []
try:
# Try to parse as JSON first (if model outputs structured JSON)
try:
output_data = json.loads(model_output)
if isinstance(output_data, dict) and "blocks" in output_data:
# Model outputs structured format
return output_data["blocks"]
elif isinstance(output_data, list):
# Model outputs list of blocks
return output_data
except (json.JSONDecodeError, KeyError):
# Not JSON, treat as plain text
pass
# Format 1: Try to parse as JSON (structured output)
parsed_json = _try_parse_json(model_output)
if parsed_json:
blocks = _extract_blocks_from_json(parsed_json, image_size, page_num)
if blocks:
logger.debug(f"Parsed {len(blocks)} blocks from JSON output")
return blocks
# Parse plain text output
# This is a simple heuristic - adjust based on actual dots.ocr output format
lines = model_output.strip().split('\n')
# Format 2: Try to parse as structured text (markdown-like)
blocks = _parse_structured_text(model_output, image_size, page_num)
if blocks:
logger.debug(f"Parsed {len(blocks)} blocks from structured text")
return blocks
current_block = None
reading_order = 1
for line in lines:
line = line.strip()
if not line:
continue
# Heuristic: lines starting with # are headings
if line.startswith('#'):
# Save previous block
if current_block:
blocks.append(current_block)
# New heading block
current_block = {
"type": "heading",
"text": line.lstrip('#').strip(),
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
else:
# Regular paragraph
if current_block and current_block["type"] == "paragraph":
# Append to existing paragraph
current_block["text"] += " " + line
else:
# Save previous block
if current_block:
blocks.append(current_block)
# New paragraph block
current_block = {
"type": "paragraph",
"text": line,
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
# Save last block
if current_block:
blocks.append(current_block)
# If no blocks were created, create a single paragraph with all text
if not blocks:
blocks.append({
"type": "paragraph",
"text": model_output.strip(),
"bbox": {
"x": 0,
"y": 0,
"width": image_size[0],
"height": image_size[1]
},
"reading_order": 1
})
# Format 3: Fallback - plain text as single paragraph
blocks = _parse_plain_text(model_output, image_size, page_num)
logger.debug(f"Parsed {len(blocks)} blocks from plain text")
except Exception as e:
logger.error(f"Error parsing model output: {e}", exc_info=True)
# Fallback: create single block with raw output
blocks = [{
"type": "paragraph",
"text": model_output.strip() if model_output else "",
"bbox": {
"x": 0,
"y": 0,
"width": image_size[0],
"height": image_size[1]
},
"reading_order": 1
}]
blocks = _create_fallback_block(model_output, image_size, page_num)
return blocks
def _try_parse_json(text: str) -> Optional[Dict[str, Any]]:
"""Try to parse text as JSON"""
try:
# Try to find JSON in text (might be wrapped in markdown code blocks)
json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
if json_match:
return json.loads(json_match.group(1))
# Try direct JSON parse
return json.loads(text)
except (json.JSONDecodeError, ValueError):
return None
def _extract_blocks_from_json(
data: Dict[str, Any],
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Extract blocks from JSON structure"""
blocks = []
# Format: {"blocks": [...]}
if "blocks" in data and isinstance(data["blocks"], list):
for idx, block_data in enumerate(data["blocks"], start=1):
block = _normalize_block(block_data, image_size, idx)
if block:
blocks.append(block)
# Format: {"pages": [{"blocks": [...]}]}
elif "pages" in data and isinstance(data["pages"], list):
for page_data in data["pages"]:
if isinstance(page_data, dict) and "blocks" in page_data:
for idx, block_data in enumerate(page_data["blocks"], start=1):
block = _normalize_block(block_data, image_size, idx)
if block:
blocks.append(block)
# Format: Direct array of blocks
elif isinstance(data, list):
for idx, block_data in enumerate(data, start=1):
block = _normalize_block(block_data, image_size, idx)
if block:
blocks.append(block)
return blocks
def _normalize_block(
block_data: Dict[str, Any],
image_size: tuple[int, int],
reading_order: int
) -> Optional[Dict[str, Any]]:
"""Normalize block data to standard format"""
if not isinstance(block_data, dict):
return None
# Extract text
text = block_data.get("text") or block_data.get("content") or ""
if not text or not text.strip():
return None
# Extract type
block_type = block_data.get("type") or block_data.get("block_type") or "paragraph"
# Normalize type
type_mapping = {
"heading": "heading",
"title": "heading",
"h1": "heading",
"h2": "heading",
"h3": "heading",
"paragraph": "paragraph",
"p": "paragraph",
"text": "paragraph",
"table": "table",
"formula": "formula",
"figure": "figure_caption",
"caption": "figure_caption",
"list": "list",
"li": "list"
}
block_type = type_mapping.get(block_type.lower(), "paragraph")
# Extract bbox
bbox = block_data.get("bbox") or block_data.get("bounding_box") or {}
if isinstance(bbox, list) and len(bbox) >= 4:
# Format: [x, y, width, height]
bbox = {
"x": float(bbox[0]),
"y": float(bbox[1]),
"width": float(bbox[2]),
"height": float(bbox[3])
}
elif isinstance(bbox, dict):
# Ensure all fields are present
bbox = {
"x": float(bbox.get("x", 0)),
"y": float(bbox.get("y", 0)),
"width": float(bbox.get("width", image_size[0])),
"height": float(bbox.get("height", 30))
}
else:
# Default bbox
bbox = {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
}
# Build normalized block
normalized = {
"type": block_type,
"text": text.strip(),
"bbox": bbox,
"reading_order": block_data.get("reading_order") or reading_order
}
# Add table data if present
if block_type == "table" and "table_data" in block_data:
normalized["table_data"] = block_data["table_data"]
# Add metadata if present
if "metadata" in block_data:
normalized["metadata"] = block_data["metadata"]
return normalized
def _parse_structured_text(
text: str,
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Parse structured text (markdown-like) into blocks"""
blocks = []
lines = text.strip().split('\n')
current_block = None
reading_order = 1
for line in lines:
line = line.strip()
if not line:
if current_block:
blocks.append(current_block)
current_block = None
continue
# Detect heading (markdown style)
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
if heading_match:
if current_block:
blocks.append(current_block)
level = len(heading_match.group(1))
heading_text = heading_match.group(2)
current_block = {
"type": "heading",
"text": heading_text,
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
continue
# Detect list item
if re.match(r'^[-*+]\s+', line) or re.match(r'^\d+\.\s+', line):
if current_block and current_block["type"] != "list":
blocks.append(current_block)
list_text = re.sub(r'^[-*+]\s+', '', line)
list_text = re.sub(r'^\d+\.\s+', '', list_text)
current_block = {
"type": "list",
"text": list_text,
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
continue
# Regular paragraph
if current_block and current_block["type"] == "paragraph":
current_block["text"] += " " + line
else:
if current_block:
blocks.append(current_block)
current_block = {
"type": "paragraph",
"text": line,
"bbox": {
"x": 0,
"y": reading_order * 30,
"width": image_size[0],
"height": 30
},
"reading_order": reading_order
}
reading_order += 1
if current_block:
blocks.append(current_block)
return blocks
def _parse_plain_text(
text: str,
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Parse plain text as single paragraph"""
if not text or not text.strip():
return []
return [{
"type": "paragraph",
"text": text.strip(),
"bbox": {
"x": 0,
"y": 0,
"width": image_size[0],
"height": image_size[1]
},
"reading_order": 1
}]
def _create_fallback_block(
text: str,
image_size: tuple[int, int],
page_num: int
) -> List[Dict[str, Any]]:
"""Create fallback block when parsing fails"""
return [{
"type": "paragraph",
"text": text.strip() if text else f"Page {page_num} (parsing failed)",
"bbox": {
"x": 0,
"y": 0,
"width": image_size[0],
"height": image_size[1]
},
"reading_order": 1,
"metadata": {"parsing_error": True}
}]
def extract_layout_info(model_output: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Extract layout information from model output (if available)