feat: enhance model output parser and add integration guide

Model Output Parser:
- Support multiple dots.ocr output formats (JSON, structured text, plain text)
- Normalize all formats to the standard ParsedBlock structure
- Handle JSON with blocks/pages arrays
- Parse markdown-like structured text
- Fall back to plain text parsing
- Better error handling and logging

Schemas:
- Document must-have fields for RAG (doc_id, pages, metadata.dao_id)
- ParsedChunk must-have fields (text, metadata.dao_id, metadata.doc_id)
- Add detailed field descriptions for RAG integration

Integration Guide:
- Create INTEGRATION.md with a complete integration guide
- Document dots.ocr output formats
- Show ParsedDocument → Haystack Documents conversion
- Provide DAGI Router integration examples
- RAG pipeline integration with filters
- Complete workflow examples
- RBAC integration recommendations
2025-11-16 03:02:42 -08:00
parent ca05c91799
commit 7251e519d6
3 changed files with 753 additions and 108 deletions
--- a/services/parser-service/app/runtime/model_output_parser.py
+++ b/services/parser-service/app/runtime/model_output_parser.py
@@ -1,11 +1,19 @@
 """
 Parser for dots.ocr model output
 Converts model output to structured blocks
+
+Expected dots.ocr output formats:
+1. JSON with structured blocks (preferred)
+2. Plain text with layout hints
+3. Markdown-like structure
+
+This parser handles all formats and normalizes to ParsedBlock structure.
 """

 import logging
 import json
-from typing import List, Dict, Any, Optional
+import re
+from typing import List, Dict, Any, Optional, Tuple
 from PIL import Image

 logger = logging.getLogger(__name__)
@@ -19,121 +27,311 @@ def parse_model_output_to_blocks(
    """
    Parse dots.ocr model output into structured blocks
    
+    Handles multiple output formats:
+    1. JSON with "blocks" array (preferred)
+    2. JSON with "pages" array
+    3. Plain text with layout hints
+    4. Markdown-like structure
+    
    Args:
-        model_output: Raw text output from model (may be JSON or plain text)
+        model_output: Raw text output from model
        image_size: (width, height) of the image
        page_num: Page number
    
    Returns:
-        List of block dictionaries
+        List of block dictionaries with normalized structure
    """
    blocks = []
    
    try:
-        # Try to parse as JSON first (if model outputs structured JSON)
-        try:
-            output_data = json.loads(model_output)
-            if isinstance(output_data, dict) and "blocks" in output_data:
-                # Model outputs structured format
-                return output_data["blocks"]
-            elif isinstance(output_data, list):
-                # Model outputs list of blocks
-                return output_data
-        except (json.JSONDecodeError, KeyError):
-            # Not JSON, treat as plain text
-            pass
+        # Format 1: Try to parse as JSON (structured output)
+        parsed_json = _try_parse_json(model_output)
+        if parsed_json:
+            blocks = _extract_blocks_from_json(parsed_json, image_size, page_num)
+            if blocks:
+                logger.debug(f"Parsed {len(blocks)} blocks from JSON output")
+                return blocks
        
-        # Parse plain text output
-        # This is a simple heuristic - adjust based on actual dots.ocr output format
-        lines = model_output.strip().split('\n')
+        # Format 2: Try to parse as structured text (markdown-like)
+        blocks = _parse_structured_text(model_output, image_size, page_num)
+        if blocks:
+            logger.debug(f"Parsed {len(blocks)} blocks from structured text")
+            return blocks
        
-        current_block = None
-        reading_order = 1
-        
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-            
-            # Heuristic: lines starting with # are headings
-            if line.startswith('#'):
-                # Save previous block
-                if current_block:
-                    blocks.append(current_block)
-                
-                # New heading block
-                current_block = {
-                    "type": "heading",
-                    "text": line.lstrip('#').strip(),
-                    "bbox": {
-                        "x": 0,
-                        "y": reading_order * 30,
-                        "width": image_size[0],
-                        "height": 30
-                    },
-                    "reading_order": reading_order
-                }
-                reading_order += 1
-            else:
-                # Regular paragraph
-                if current_block and current_block["type"] == "paragraph":
-                    # Append to existing paragraph
-                    current_block["text"] += " " + line
-                else:
-                    # Save previous block
-                    if current_block:
-                        blocks.append(current_block)
-                    
-                    # New paragraph block
-                    current_block = {
-                        "type": "paragraph",
-                        "text": line,
-                        "bbox": {
-                            "x": 0,
-                            "y": reading_order * 30,
-                            "width": image_size[0],
-                            "height": 30
-                        },
-                        "reading_order": reading_order
-                    }
-                    reading_order += 1
-        
-        # Save last block
-        if current_block:
-            blocks.append(current_block)
-        
-        # If no blocks were created, create a single paragraph with all text
-        if not blocks:
-            blocks.append({
-                "type": "paragraph",
-                "text": model_output.strip(),
-                "bbox": {
-                    "x": 0,
-                    "y": 0,
-                    "width": image_size[0],
-                    "height": image_size[1]
-                },
-                "reading_order": 1
-            })
+        # Format 3: Fallback - plain text as single paragraph
+        blocks = _parse_plain_text(model_output, image_size, page_num)
+        logger.debug(f"Parsed {len(blocks)} blocks from plain text")
        
    except Exception as e:
        logger.error(f"Error parsing model output: {e}", exc_info=True)
-        # Fallback: create single block with raw output
-        blocks = [{
-            "type": "paragraph",
-            "text": model_output.strip() if model_output else "",
-            "bbox": {
-                "x": 0,
-                "y": 0,
-                "width": image_size[0],
-                "height": image_size[1]
-            },
-            "reading_order": 1
-        }]
+        blocks = _create_fallback_block(model_output, image_size, page_num)
    
    return blocks


+def _try_parse_json(text: str) -> Optional[Any]:
+    """Try to interpret model output as a JSON container.
+
+    Two candidates are tried in order: the contents of a markdown code fence
+    (``` or ```json) holding an object or array, then the whole text.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The decoded dict or list, or None when the input is not a string, is
+        blank, is not valid JSON, or decodes to a scalar (number, string,
+        boolean, null). Scalars are rejected because plain text such as "123"
+        is valid JSON but carries no block structure.
+    """
+    if not isinstance(text, str) or not text.strip():
+        return None
+
+    candidates = []
+
+    # Model output may wrap the JSON payload in a markdown code fence.
+    fenced = re.search(r'```(?:json)?\s*([\[{].*?[\]}])\s*```', text, re.DOTALL)
+    if fenced:
+        candidates.append(fenced.group(1))
+
+    # Always fall back to the full text so a broken fenced snippet does not
+    # prevent a direct parse.
+    candidates.append(text)
+
+    for candidate in candidates:
+        try:
+            parsed = json.loads(candidate)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            continue
+        if isinstance(parsed, (dict, list)):
+            return parsed
+
+    return None
+
+
+def _extract_blocks_from_json(
+    data: Dict[str, Any],
+    image_size: tuple[int, int],
+    page_num: int
+) -> List[Dict[str, Any]]:
+    """Collect and normalize the blocks contained in decoded JSON output.
+
+    Supported shapes, checked in this order:
+        {"blocks": [...]}
+        {"pages": [{"blocks": [...]}, ...]}
+        [...]  (a bare array of blocks)
+
+    Args:
+        data: Decoded JSON value (dict or list); any other type yields [].
+        image_size: (width, height) of the page image, forwarded to
+            _normalize_block for default bounding boxes.
+        page_num: Page number; accepted for signature parity with the other
+            parsers and not used here.
+
+    Returns:
+        Normalized blocks. Entries rejected by _normalize_block are dropped.
+    """
+    raw_blocks: List[Any] = []
+
+    if isinstance(data, dict):
+        if isinstance(data.get("blocks"), list):
+            raw_blocks = data["blocks"]
+        elif isinstance(data.get("pages"), list):
+            # Flatten all pages into one sequence so the positional reading
+            # order keeps increasing instead of restarting at 1 per page.
+            for page_data in data["pages"]:
+                if isinstance(page_data, dict) and isinstance(page_data.get("blocks"), list):
+                    raw_blocks.extend(page_data["blocks"])
+    elif isinstance(data, list):
+        raw_blocks = data
+
+    blocks = []
+    for idx, block_data in enumerate(raw_blocks, start=1):
+        block = _normalize_block(block_data, image_size, idx)
+        if block:
+            blocks.append(block)
+
+    return blocks
+
+
+def _normalize_block(
+    block_data: Dict[str, Any],
+    image_size: tuple[int, int],
+    reading_order: int
+) -> Optional[Dict[str, Any]]:
+    """Normalize one raw JSON block into the standard block dictionary.
+
+    Args:
+        block_data: Raw block. Text is read from "text" or "content", the type
+            from "type" or "block_type", the box from "bbox" or "bounding_box".
+        image_size: (width, height) of the page image, used for default boxes.
+        reading_order: Positional fallback used when the block carries no
+            "reading_order" of its own; also positions the default box.
+
+    Returns:
+        Dict with "type", "text", "bbox" and "reading_order" (plus "table_data"
+        for tables and "metadata" when supplied), or None when the entry is
+        not a dict or has no usable text.
+    """
+    if not isinstance(block_data, dict):
+        return None
+
+    # Extract text; numeric content is accepted and stringified, any other
+    # non-string payload is treated as "no text".
+    text = block_data.get("text") or block_data.get("content") or ""
+    if isinstance(text, (int, float)) and not isinstance(text, bool):
+        text = str(text)
+    if not isinstance(text, str) or not text.strip():
+        return None
+
+    # Extract and normalize type; unknown or non-string types become paragraphs.
+    type_mapping = {
+        "heading": "heading",
+        "title": "heading",
+        "h1": "heading",
+        "h2": "heading",
+        "h3": "heading",
+        "paragraph": "paragraph",
+        "p": "paragraph",
+        "text": "paragraph",
+        "table": "table",
+        "formula": "formula",
+        "figure": "figure_caption",
+        "caption": "figure_caption",
+        "list": "list",
+        "li": "list"
+    }
+    raw_type = block_data.get("type") or block_data.get("block_type") or "paragraph"
+    type_key = raw_type.strip().lower() if isinstance(raw_type, str) else ""
+    block_type = type_mapping.get(type_key, "paragraph")
+
+    # Synthetic box used when the model gives no usable geometry: full width,
+    # stacked vertically by reading order.
+    default_bbox = {
+        "x": 0,
+        "y": reading_order * 30,
+        "width": image_size[0],
+        "height": 30
+    }
+
+    # A missing or empty bbox stays None here so it reaches the default box
+    # instead of being treated as an (empty) dict.
+    raw_bbox = block_data.get("bbox") or block_data.get("bounding_box")
+    try:
+        if isinstance(raw_bbox, (list, tuple)) and len(raw_bbox) >= 4:
+            # NOTE(review): interpreted as [x, y, width, height]; confirm the
+            # model does not emit [x1, y1, x2, y2] corner coordinates.
+            bbox = {
+                "x": float(raw_bbox[0]),
+                "y": float(raw_bbox[1]),
+                "width": float(raw_bbox[2]),
+                "height": float(raw_bbox[3])
+            }
+        elif isinstance(raw_bbox, dict):
+            # Fill in any missing fields
+            bbox = {
+                "x": float(raw_bbox.get("x", 0)),
+                "y": float(raw_bbox.get("y", 0)),
+                "width": float(raw_bbox.get("width", image_size[0])),
+                "height": float(raw_bbox.get("height", 30))
+            }
+        else:
+            bbox = default_bbox
+    except (TypeError, ValueError):
+        # Non-numeric coordinates: keep the block, drop the bad geometry.
+        bbox = default_bbox
+
+    normalized = {
+        "type": block_type,
+        "text": text.strip(),
+        "bbox": bbox,
+        "reading_order": block_data.get("reading_order") or reading_order
+    }
+
+    # Pass through table data for table blocks
+    if block_type == "table" and "table_data" in block_data:
+        normalized["table_data"] = block_data["table_data"]
+
+    # Pass through metadata if present
+    if "metadata" in block_data:
+        normalized["metadata"] = block_data["metadata"]
+
+    return normalized
+
+
+def _make_line_block(
+    block_type: str,
+    text: str,
+    image_size: tuple[int, int],
+    reading_order: int
+) -> Dict[str, Any]:
+    """Build a text block with a synthetic full-width box stacked by reading order."""
+    return {
+        "type": block_type,
+        "text": text,
+        "bbox": {
+            "x": 0,
+            "y": reading_order * 30,
+            "width": image_size[0],
+            "height": 30
+        },
+        "reading_order": reading_order
+    }
+
+
+def _parse_structured_text(
+    text: str,
+    image_size: tuple[int, int],
+    page_num: int
+) -> List[Dict[str, Any]]:
+    """Parse markdown-like text into heading, list and paragraph blocks.
+
+    Rules, applied line by line after stripping whitespace:
+        - A blank line closes the current block.
+        - "#" to "######" followed by text starts a heading block.
+        - "-", "*", "+" or "N." followed by text is a list item; consecutive
+          items are merged into one list block, one item per line.
+        - Any other line is paragraph text; consecutive lines are joined with
+          a single space.
+
+    Args:
+        text: Raw model output.
+        image_size: (width, height) of the page image; the width sizes the
+            synthetic bounding boxes.
+        page_num: Page number; accepted for signature parity with the other
+            parsers and not used here.
+
+    Returns:
+        Blocks in document order; an empty list for empty input.
+    """
+    if not text:
+        return []
+
+    blocks = []
+    current_block = None
+    reading_order = 1
+
+    for raw_line in text.strip().split('\n'):
+        line = raw_line.strip()
+
+        if not line:
+            # Blank line: paragraph/list boundary
+            if current_block:
+                blocks.append(current_block)
+                current_block = None
+            continue
+
+        # Classify the line
+        heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
+        if heading_match:
+            block_type = "heading"
+            content = heading_match.group(2)
+        elif re.match(r'^[-*+]\s+', line) or re.match(r'^\d+\.\s+', line):
+            block_type = "list"
+            content = re.sub(r'^[-*+]\s+', '', line)
+            content = re.sub(r'^\d+\.\s+', '', content)
+        else:
+            block_type = "paragraph"
+            content = line
+
+        # Lists and paragraphs continue the open block of the same type.
+        # Without this merge, each new list item would replace the previous
+        # one and only the last item of a run would be kept.
+        if (
+            block_type != "heading"
+            and current_block is not None
+            and current_block["type"] == block_type
+        ):
+            separator = "\n" if block_type == "list" else " "
+            current_block["text"] += separator + content
+            continue
+
+        if current_block:
+            blocks.append(current_block)
+
+        current_block = _make_line_block(block_type, content, image_size, reading_order)
+        reading_order += 1
+
+    if current_block:
+        blocks.append(current_block)
+
+    return blocks
+
+
+def _parse_plain_text(
+    text: str,
+    image_size: tuple[int, int],
+    page_num: int
+) -> List[Dict[str, Any]]:
+    """Wrap the whole text in one page-sized paragraph block.
+
+    Returns an empty list when the text is empty or whitespace-only.
+    page_num is accepted for signature parity and not used.
+    """
+    content = text.strip() if text else ""
+    if not content:
+        return []
+
+    # The single block covers the entire page area.
+    page_bbox = {
+        "x": 0,
+        "y": 0,
+        "width": image_size[0],
+        "height": image_size[1]
+    }
+    paragraph = {
+        "type": "paragraph",
+        "text": content,
+        "bbox": page_bbox,
+        "reading_order": 1
+    }
+    return [paragraph]
+
+
+def _create_fallback_block(
+    text: str,
+    image_size: tuple[int, int],
+    page_num: int
+) -> List[Dict[str, Any]]:
+    """Create the single page-sized block returned when parsing fails.
+
+    This runs inside the caller's exception handler, so it must not raise on
+    odd input: non-string values are stringified, and empty or whitespace-only
+    text is replaced by a placeholder naming the page.
+
+    Args:
+        text: Raw model output (may be None or a non-string value).
+        image_size: (width, height) of the page image.
+        page_num: Page number, used in the placeholder text.
+
+    Returns:
+        A one-element list whose block is flagged with
+        metadata["parsing_error"] = True.
+    """
+    if isinstance(text, str):
+        content = text.strip()
+    elif text is None:
+        content = ""
+    else:
+        content = str(text).strip()
+
+    if not content:
+        content = f"Page {page_num} (parsing failed)"
+
+    return [{
+        "type": "paragraph",
+        "text": content,
+        "bbox": {
+            "x": 0,
+            "y": 0,
+            "width": image_size[0],
+            "height": image_size[1]
+        },
+        "reading_order": 1,
+        "metadata": {"parsing_error": True}
+    }]
+
+
 def extract_layout_info(model_output: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Extract layout information from model output (if available)