feat: integrate dots.ocr native prompt modes and 2-stage qa_pairs pipeline
Prompt Modes Integration: - Create local_runtime.py with DOTS_PROMPT_MAP - Map OutputMode to native dots.ocr prompt modes (prompt_layout_all_en, prompt_ocr, etc.) - Support dict_promptmode_to_prompt from dots.ocr with fallback prompts - Add layout_only and region modes to OutputMode enum 2-Stage Q&A Pipeline: - Create qa_builder.py for 2-stage qa_pairs generation - Stage 1: PARSER (dots.ocr) → raw JSON via prompt_layout_all_en - Stage 2: LLM (DAGI Router) → Q&A pairs via mode=qa_build - Update endpoints.py to use 2-stage pipeline for qa_pairs mode - Add ROUTER_BASE_URL and ROUTER_TIMEOUT to config Updates: - Update inference.py to use local_runtime with native prompts - Update ollama_client.py to use same prompt map - Add PROMPT_MODES.md documentation
This commit is contained in:
@@ -11,6 +11,7 @@ from PIL import Image
|
||||
|
||||
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
|
||||
from app.runtime.model_loader import get_model
|
||||
from app.runtime.local_runtime import parse_document_with_local
|
||||
from app.runtime.preprocessing import (
|
||||
convert_pdf_to_images, load_image, prepare_images_for_model
|
||||
)
|
||||
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
async def parse_document_with_ollama(
|
||||
images: List[Image.Image],
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
|
||||
doc_id: Optional[str] = None,
|
||||
doc_type: Literal["pdf", "image"] = "image"
|
||||
) -> ParsedDocument:
|
||||
@@ -106,7 +107,7 @@ async def parse_document_with_ollama(
|
||||
|
||||
def parse_document_from_images(
|
||||
images: List[Image.Image],
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
|
||||
doc_id: Optional[str] = None,
|
||||
doc_type: Literal["pdf", "image"] = "image"
|
||||
) -> ParsedDocument:
|
||||
@@ -146,33 +147,19 @@ def parse_document_from_images(
|
||||
if not prepared_images:
|
||||
raise ValueError("No valid images to process")
|
||||
|
||||
# Process with model
|
||||
# Process with model using local_runtime (with native dots.ocr prompts)
|
||||
pages_data = []
|
||||
|
||||
for idx, image in enumerate(prepared_images, start=1):
|
||||
try:
|
||||
# Prepare inputs for model
|
||||
inputs = model["processor"](images=image, return_tensors="pt")
|
||||
# Convert image to bytes for local_runtime
|
||||
import io
|
||||
buf = io.BytesIO()
|
||||
image.convert("RGB").save(buf, format="PNG")
|
||||
image_bytes = buf.getvalue()
|
||||
|
||||
# Move inputs to device
|
||||
device = model["device"]
|
||||
if device != "cpu":
|
||||
inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
|
||||
for k, v in inputs.items()}
|
||||
|
||||
# Generate output
|
||||
with torch.no_grad():
|
||||
outputs = model["model"].generate(
|
||||
**inputs,
|
||||
max_new_tokens=2048, # Adjust based on model capabilities
|
||||
do_sample=False # Deterministic output
|
||||
)
|
||||
|
||||
# Decode output
|
||||
generated_text = model["processor"].decode(
|
||||
outputs[0],
|
||||
skip_special_tokens=True
|
||||
)
|
||||
# Use local_runtime with native prompt modes
|
||||
generated_text = parse_document_with_local(image_bytes, output_mode)
|
||||
|
||||
logger.debug(f"Model output for page {idx}: {generated_text[:100]}...")
|
||||
|
||||
|
||||
273
services/parser-service/app/runtime/local_runtime.py
Normal file
273
services/parser-service/app/runtime/local_runtime.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Local runtime for dots.ocr model with native prompt modes
|
||||
Maps OutputMode to dots.ocr prompt modes using dict_promptmode_to_prompt
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
from typing import Literal, Optional
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForVision2Seq, AutoProcessor
|
||||
from qwen_vl_utils import process_vision_info
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import dots.ocr prompt dictionary.
# The dots.ocr package is an optional dependency: when present we use its
# native prompt texts; otherwise FALLBACK_PROMPTS below are used.
try:
    from dots_ocr.utils.prompts import dict_promptmode_to_prompt
    DOTS_PROMPTS_AVAILABLE = True
except ImportError:
    logger.warning(
        "dots_ocr.utils.prompts not available. "
        "Using fallback prompts. Install dots.ocr package for native prompt modes."
    )
    DOTS_PROMPTS_AVAILABLE = False
    dict_promptmode_to_prompt = {}

# Map OutputMode values to dots.ocr native prompt-mode keys.
# NOTE: several output modes intentionally share prompt_layout_all_en —
# qa_pairs/chunks post-process the full-JSON parse downstream.
DOTS_PROMPT_MAP = {
    "raw_json": "prompt_layout_all_en",      # Full JSON (layout + content)
    "markdown": "prompt_ocr",                # Content-oriented OCR (Markdown)
    "qa_pairs": "prompt_layout_all_en",      # Full JSON, then 2nd step LLM
    "chunks": "prompt_layout_all_en",        # Full JSON for chunking
    "layout_only": "prompt_layout_only_en",  # Layout only (bbox + categories, no text)
    "region": "prompt_grounding_ocr",        # Targeted region parsing (grounding)
}

# Fallback prompts used when dict_promptmode_to_prompt is not available.
# Keys mirror the dots.ocr native prompt-mode names.
FALLBACK_PROMPTS = {
    "prompt_layout_all_en": (
        "You are a document OCR and layout parser. "
        "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
        "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
        "Respond with JSON only, no explanations."
    ),
    "prompt_ocr": (
        "You are a document OCR and layout parser. "
        "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
        "Tables should be proper GitHub-flavored Markdown tables. "
        "Respond with Markdown as plain text."
    ),
    "prompt_layout_only_en": (
        "You are a document layout parser. "
        "Extract only the layout structure (bounding boxes, block types, reading order) "
        "without the text content. "
        "Respond with JSON containing only layout information (bbox, type, reading_order)."
    ),
    "prompt_grounding_ocr": (
        "You are a document OCR assistant for targeted region parsing. "
        "Extract text and layout for the specified region of the document. "
        "Respond with JSON containing the parsed content for the region."
    ),
}

# Lazily-initialized model/processor cache (filled by load_model()).
# _model holds the HF model object, _processor the matching processor.
_model: Optional[object] = None
_processor: Optional[object] = None

# Model configuration, read once at import time from service settings / env.
MODEL_PATH = settings.PARSER_MODEL_NAME
DEVICE = settings.PARSER_DEVICE
# bfloat16 on accelerators, float32 on CPU (CPU bfloat16 support is spotty).
DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
# Generation cap; dots.ocr full-page JSON output can be very long.
MAX_NEW_TOKENS = int(os.getenv("DOTS_OCR_MAX_NEW_TOKENS", "24000"))
|
||||
|
||||
|
||||
def load_model():
    """
    Load the dots.ocr model and processor with lazy initialization.

    Returns:
        Tuple of (model, processor). The pair is cached in module globals
        after the first successful call; later calls return the cache.

    Raises:
        Exception: re-raises any loading failure after logging it.
    """
    global _model, _processor

    # Already loaded — return the cached pair.
    if _model is not None and _processor is not None:
        return _model, _processor

    logger.info(f"Loading dots.ocr model: {MODEL_PATH}")
    logger.info(f"Device: {DEVICE}")

    try:
        common_kwargs = dict(
            torch_dtype=DTYPE,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        try:
            # Prefer FlashAttention 2 when the flash-attn package is installed.
            model = AutoModelForVision2Seq.from_pretrained(
                MODEL_PATH,
                attn_implementation="flash_attention_2",
                **common_kwargs,
            )
        except (ImportError, ValueError) as attn_err:
            # flash-attn is optional; fall back to the default attention
            # implementation instead of failing the whole service.
            logger.warning(
                "flash_attention_2 unavailable (%s); falling back to default attention",
                attn_err,
            )
            model = AutoModelForVision2Seq.from_pretrained(
                MODEL_PATH,
                **common_kwargs,
            )

        processor = AutoProcessor.from_pretrained(
            MODEL_PATH,
            trust_remote_code=True
        )

        # NOTE: device_map="auto" already dispatches the model onto the
        # available device(s) via accelerate. Calling model.to("cuda"/"mps")
        # afterwards is redundant and raises a RuntimeError on recent
        # transformers versions, so no manual move is performed here.

        _model = model
        _processor = processor

        logger.info(f"Model loaded successfully on {DEVICE}")
        return _model, _processor

    except Exception as e:
        logger.error(f"Failed to load model: {e}", exc_info=True)
        raise
|
||||
|
||||
|
||||
def get_model():
    """Return the cached (model, processor) pair, loading it on first use."""
    already_loaded = _model is not None and _processor is not None
    return (_model, _processor) if already_loaded else load_model()
|
||||
|
||||
|
||||
def _build_prompt(output_mode: str) -> str:
    """
    Resolve the dots.ocr prompt text for a given output mode.

    Args:
        output_mode: One of "raw_json", "markdown", "qa_pairs", "chunks",
            "layout_only", "region". Unknown values fall back to the
            full-layout prompt.

    Returns:
        Prompt string to send to dots.ocr.
    """
    key = DOTS_PROMPT_MAP.get(output_mode, "prompt_layout_all_en")

    # Prefer the native dots.ocr prompt when the package is installed.
    if DOTS_PROMPTS_AVAILABLE and key in dict_promptmode_to_prompt:
        logger.debug(f"Using native dots.ocr prompt: {key}")
        return dict_promptmode_to_prompt[key]

    # Otherwise use our bundled fallback text for that key.
    if key in FALLBACK_PROMPTS:
        logger.debug(f"Using fallback prompt: {key}")
        return FALLBACK_PROMPTS[key]

    # Should not happen for known modes; use the most general prompt.
    logger.warning(f"Unknown prompt key: {key}, using default")
    return FALLBACK_PROMPTS["prompt_layout_all_en"]
|
||||
|
||||
|
||||
def _build_messages(image_path: str, prompt: str) -> list:
|
||||
"""Build messages for dots.ocr model"""
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "image": image_path},
|
||||
{"type": "text", "text": prompt},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _generate_from_path(
    image_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"]
) -> str:
    """
    Generate output from an image path using the dots.ocr model.

    Args:
        image_path: Path to an image file on disk.
        output_mode: Output mode (mapped to a dots.ocr prompt mode by
            _build_prompt).

    Returns:
        Decoded model output for the single image (JSON or Markdown text,
        depending on the prompt mode).
    """
    # Lazily load the cached model/processor pair and build the chat input.
    model, processor = get_model()
    prompt = _build_prompt(output_mode)
    messages = _build_messages(image_path, prompt)

    # Render the chat template to the raw prompt string (not tokenized yet).
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Extract image (and, unused here, video) inputs from the messages.
    image_inputs, video_inputs = process_vision_info(messages)

    # Tokenize text and preprocess images into model tensors.
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Move tensor inputs to the configured accelerator; non-tensor entries
    # are passed through unchanged.
    device = DEVICE
    if device == "cuda" and torch.cuda.is_available():
        inputs = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}
    elif device == "mps" and hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        inputs = {k: v.to("mps") if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}

    # Greedy-by-default generation; inference_mode disables autograd tracking.
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
        )

    # generate() returns prompt + completion; slice off the prompt tokens.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]

    # Decode the completion tokens back to text.
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Single image in, single decoded string out.
    return output_text[0]
|
||||
|
||||
|
||||
def parse_document_with_local(
    image_bytes: bytes,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json"
) -> str:
    """
    Parse a document image with the local dots.ocr model.

    Args:
        image_bytes: Raw image bytes (PNG/JPEG).
        output_mode: Output mode, mapped to a dots.ocr prompt mode:
            - raw_json: full JSON (layout + content) via prompt_layout_all_en
            - markdown: Markdown text via prompt_ocr
            - qa_pairs: full JSON (same as raw_json); Q&A built in a 2nd step
            - chunks: full JSON for downstream chunking
            - layout_only: layout only (bbox + categories) via prompt_layout_only_en
            - region: targeted region parsing via prompt_grounding_ocr

    Returns:
        Raw model output (JSON or Markdown depending on mode).

    Note:
        For "qa_pairs" this returns the full JSON; the LLM Q&A-generation
        step happens elsewhere.
    """
    # The generation path works on file paths, so spill the bytes to a
    # temporary PNG first (delete=False: we remove it ourselves below).
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as spool:
        spool.write(image_bytes)
        temp_path = spool.name

    try:
        return _generate_from_path(temp_path, output_mode)
    finally:
        # Best-effort cleanup; a failed unlink must not mask the result.
        try:
            os.remove(temp_path)
        except OSError:
            pass
|
||||
|
||||
@@ -20,32 +20,56 @@ class OutputMode(str, Enum):
|
||||
    # Values map to dots.ocr native prompt modes (see DOTS_PROMPT_MAP in
    # app.runtime.local_runtime).
    raw_json = "raw_json"        # full JSON: layout + content
    markdown = "markdown"        # content-oriented Markdown OCR
    qa_pairs = "qa_pairs"        # full JSON, then 2nd-stage LLM builds Q&A
    chunks = "chunks"            # full JSON, chunked downstream
    layout_only = "layout_only"  # bboxes + block categories only, no text
    region = "region"            # targeted region parsing (grounding)
|
||||
|
||||
|
||||
def build_prompt(mode: OutputMode) -> str:
    """
    Build prompt for Ollama based on output mode.

    Maps each OutputMode to the corresponding dots.ocr native prompt mode
    (the same mapping used by app.runtime.local_runtime) and returns the
    matching fallback prompt text.

    Args:
        mode: Requested output mode.

    Returns:
        Prompt string; unknown modes fall back to the full-layout prompt.
    """
    # Map to dots.ocr prompt modes (same as local_runtime)
    prompt_map = {
        OutputMode.raw_json: "prompt_layout_all_en",
        OutputMode.markdown: "prompt_ocr",
        OutputMode.qa_pairs: "prompt_layout_all_en",  # Full JSON, then 2nd step LLM
        OutputMode.chunks: "prompt_layout_all_en",
        OutputMode.layout_only: "prompt_layout_only_en",
        OutputMode.region: "prompt_grounding_ocr",
    }

    prompt_key = prompt_map.get(mode, "prompt_layout_all_en")

    # Fallback prompts (same as local_runtime)
    fallback_prompts = {
        "prompt_layout_all_en": (
            "You are a document OCR and layout parser. "
            "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
            "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
            "Respond with JSON only, no explanations."
        ),
        "prompt_ocr": (
            "You are a document OCR and layout parser. "
            "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
            "Tables should be proper GitHub-flavored Markdown tables. "
            "Respond with Markdown as plain text."
        ),
        "prompt_layout_only_en": (
            "You are a document layout parser. "
            "Extract only the layout structure (bounding boxes, block types, reading order) "
            "without the text content. "
            "Respond with JSON containing only layout information (bbox, type, reading_order)."
        ),
        "prompt_grounding_ocr": (
            "You are a document OCR assistant for targeted region parsing. "
            "Extract text and layout for the specified region of the document. "
            "Respond with JSON containing the parsed content for the region."
        ),
    }

    return fallback_prompts.get(prompt_key, fallback_prompts["prompt_layout_all_en"])
|
||||
|
||||
|
||||
async def call_ollama_vision(
|
||||
|
||||
198
services/parser-service/app/runtime/qa_builder.py
Normal file
198
services/parser-service/app/runtime/qa_builder.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
Q&A Builder - 2-stage pipeline for qa_pairs mode
|
||||
Stage 1: PARSER (dots.ocr) → raw JSON
|
||||
Stage 2: LLM (DAGI Router) → Q&A pairs
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from app.schemas import QAPair, ParsedDocument
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def build_qa_pairs_via_router(
    parsed_doc: ParsedDocument,
    dao_id: str = "daarion"
) -> List[QAPair]:
    """
    Stage 2 of the qa_pairs pipeline: turn a parsed document into Q&A pairs.

    Sends the dots.ocr output (stage 1) to the DAGI Router with
    mode="qa_build" and parses the returned JSON into QAPair objects.

    Args:
        parsed_doc: ParsedDocument produced by dots.ocr (stage 1).
        dao_id: DAO identifier forwarded to the Router.

    Returns:
        List of QAPair objects (empty when the Router returns nothing usable).

    Raises:
        RuntimeError: when the Router call or response handling fails.
    """
    instruction = _build_qa_prompt(parsed_doc)

    request_body = {
        "mode": "qa_build",  # New mode in Router
        "dao_id": dao_id,
        "user_id": "parser-service",
        "payload": {
            "instruction": instruction,
            "parsed_document": parsed_doc.model_dump(mode="json"),
        },
    }

    endpoint = f"{settings.ROUTER_BASE_URL.rstrip('/')}/route"
    logger.info(f"Calling DAGI Router for Q&A generation: {endpoint}")

    try:
        async with httpx.AsyncClient(timeout=settings.ROUTER_TIMEOUT) as client:
            response = await client.post(endpoint, json=request_body)
            response.raise_for_status()
            body = response.json()

        # The Router wraps the LLM answer in {"data": {"text": ...}}.
        answer_text = body.get("data", {}).get("text", "")
        if not answer_text:
            logger.warning("Empty response from DAGI Router")
            return []

        pairs = _parse_qa_response(answer_text, parsed_doc)
        logger.info(f"Generated {len(pairs)} Q&A pairs")
        return pairs

    except httpx.HTTPError as e:
        # Transport / HTTP-status failures get a dedicated message.
        logger.error(f"DAGI Router HTTP error: {e}")
        raise RuntimeError(f"DAGI Router API error: {e}") from e
    except Exception as e:
        # Anything else (bad JSON, schema surprises) becomes a RuntimeError
        # with the original cause chained for debugging.
        logger.error(f"Failed to build Q&A pairs: {e}", exc_info=True)
        raise RuntimeError(f"Q&A generation failed: {e}") from e
|
||||
|
||||
|
||||
def _build_qa_prompt(parsed_doc: ParsedDocument) -> str:
|
||||
"""
|
||||
Build prompt for Q&A generation from parsed document
|
||||
|
||||
Args:
|
||||
parsed_doc: ParsedDocument with structured content
|
||||
|
||||
Returns:
|
||||
Prompt string for LLM
|
||||
"""
|
||||
# Extract text content from document (first 5000 chars to avoid token limits)
|
||||
text_content = []
|
||||
for page in parsed_doc.pages:
|
||||
for block in page.blocks:
|
||||
if block.text:
|
||||
text_content.append(f"[Page {page.page_num}] {block.text}")
|
||||
|
||||
document_text = "\n\n".join(text_content[:50]) # Limit to first 50 blocks
|
||||
if len(document_text) > 5000:
|
||||
document_text = document_text[:5000] + "..."
|
||||
|
||||
prompt = (
|
||||
"Тобі дається результат OCR-документу у JSON-форматі (layout + текст).\n"
|
||||
"Твоє завдання: побудувати список запитань/відповідей, які покривають ключову "
|
||||
"інформацію цього документу.\n\n"
|
||||
"Формат відповіді — СУВОРО JSON-масив об'єктів:\n"
|
||||
"[\n"
|
||||
' {"question": "...", "answer": "...", "source_page": <int|null>, "confidence": <float|null>},\n'
|
||||
" ...\n"
|
||||
"]\n\n"
|
||||
"Вимоги:\n"
|
||||
"- Формулюй питання українською.\n"
|
||||
"- Відповіді мають базуватись на тексті документа (не вигадуй).\n"
|
||||
"- Якщо можна визначити номер сторінки — заповни поле source_page.\n"
|
||||
"- Не додавай ніякого пояснення поза JSON.\n"
|
||||
"- Мінімум 5-10 Q&A пар, максимум 20.\n\n"
|
||||
f"Документ:\n{document_text}\n\n"
|
||||
"Відповідь (тільки JSON):"
|
||||
)
|
||||
|
||||
return prompt
|
||||
|
||||
|
||||
def _parse_qa_response(text: str, parsed_doc: ParsedDocument) -> List[QAPair]:
    """
    Parse the LLM response into QAPair objects.

    Args:
        text: Response text from the LLM (expected to be a JSON array).
        parsed_doc: Original parsed document (used to infer page numbers).

    Returns:
        List of QAPair objects; an empty list when the response is not a
        usable JSON array.
    """
    # Try to extract JSON from the response.
    text_clean = text.strip()

    # Remove surrounding markdown code fences (```json ... ```) if present.
    if text_clean.startswith("```"):
        lines = text_clean.split("\n")
        text_clean = "\n".join(lines[1:-1]) if len(lines) > 2 else text_clean

    try:
        qa_data = json.loads(text_clean)
        if not isinstance(qa_data, list):
            logger.warning(f"Expected list, got {type(qa_data)}")
            return []

        # Convert to QAPair objects, skipping malformed entries.
        qa_pairs = []
        for item in qa_data:
            if not isinstance(item, dict):
                continue

            # Guard against null / non-string values from the LLM:
            # calling .strip() on None or a number would raise AttributeError.
            raw_question = item.get("question")
            raw_answer = item.get("answer")
            question = raw_question.strip() if isinstance(raw_question, str) else ""
            answer = raw_answer.strip() if isinstance(raw_answer, str) else ""

            if not question or not answer:
                continue

            # Prefer the page number the LLM supplied; otherwise try to
            # locate the answer text inside the document.
            source_page = item.get("source_page")
            if source_page is None:
                source_page = _infer_page_number(answer, parsed_doc)

            qa_pairs.append(QAPair(
                question=question,
                answer=answer,
                source_page=source_page or 1,  # default to page 1 when unknown
                confidence=item.get("confidence")
            ))

        return qa_pairs

    except json.JSONDecodeError as e:
        logger.warning(f"Failed to parse JSON response: {e}")
        logger.debug(f"Response text: {text_clean[:500]}")
        return []
|
||||
|
||||
|
||||
def _infer_page_number(text: str, parsed_doc: ParsedDocument) -> Optional[int]:
|
||||
"""
|
||||
Try to infer page number from text content
|
||||
|
||||
Args:
|
||||
text: Answer text
|
||||
parsed_doc: Parsed document
|
||||
|
||||
Returns:
|
||||
Page number or None
|
||||
"""
|
||||
# Simple heuristic: check if text appears in any page
|
||||
text_lower = text.lower()
|
||||
|
||||
for page in parsed_doc.pages:
|
||||
for block in page.blocks:
|
||||
if block.text and text_lower in block.text.lower():
|
||||
return page.page_num
|
||||
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user