feat: integrate dots.ocr native prompt modes and 2-stage qa_pairs pipeline

Prompt Modes Integration:
- Create local_runtime.py with DOTS_PROMPT_MAP (see the sketch below)
- Map OutputMode to native dots.ocr prompt modes (prompt_layout_all_en, prompt_ocr, etc.)
- Use dict_promptmode_to_prompt from dots.ocr when available, falling back to bundled prompts otherwise
- Add layout_only and region modes to the OutputMode enum
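
A minimal sketch of the DOTS_PROMPT_MAP lookup in local_runtime.py, assuming the names below. Only DOTS_PROMPT_MAP, OutputMode, and dict_promptmode_to_prompt appear in this commit; the mode-to-prompt assignments, the FALLBACK_PROMPTS texts, and resolve_prompt are illustrative:

# Sketch only: the fallback prompt texts and per-mode assignments
# here are illustrative, not the committed values.
from enum import Enum

class OutputMode(str, Enum):
    RAW_JSON = "raw_json"
    MARKDOWN = "markdown"
    QA_PAIRS = "qa_pairs"
    CHUNKS = "chunks"
    LAYOUT_ONLY = "layout_only"
    REGION = "region"

# Map each OutputMode to a native dots.ocr prompt-mode name.
DOTS_PROMPT_MAP = {
    OutputMode.RAW_JSON: "prompt_layout_all_en",
    OutputMode.MARKDOWN: "prompt_layout_all_en",
    OutputMode.QA_PAIRS: "prompt_layout_all_en",
    OutputMode.CHUNKS: "prompt_layout_all_en",
    OutputMode.LAYOUT_ONLY: "prompt_layout_only_en",
    OutputMode.REGION: "prompt_grounding_ocr",
}

# Illustrative fallbacks used when dots.ocr is not importable.
FALLBACK_PROMPTS = {
    "prompt_layout_all_en": "Extract the layout and text of this document as JSON.",
    "prompt_layout_only_en": "Extract only the layout (bboxes and categories) as JSON.",
    "prompt_grounding_ocr": "Extract the text inside the given bounding box.",
}

def resolve_prompt(mode: OutputMode) -> str:
    """Prefer the native dots.ocr prompt text; fall back to a local copy."""
    prompt_mode = DOTS_PROMPT_MAP[mode]
    try:
        # Import path as used by the dots.ocr repo; adjust if vendored.
        from dots_ocr.utils.prompts import dict_promptmode_to_prompt
        return dict_promptmode_to_prompt[prompt_mode]
    except ImportError:
        return FALLBACK_PROMPTS[prompt_mode]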

2-Stage Q&A Pipeline:
- Create qa_builder.py for 2-stage qa_pairs generation (see the sketch below)
- Stage 1: PARSER (dots.ocr) → raw layout JSON via prompt_layout_all_en
- Stage 2: LLM (DAGI Router) → Q&A pairs via mode=qa_build
- Update endpoints.py to route qa_pairs mode through the 2-stage pipeline
- Add ROUTER_BASE_URL and ROUTER_TIMEOUT to config
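
A minimal sketch of the two-stage flow in qa_builder.py, assuming names from this commit (parse_document_with_local, ROUTER_BASE_URL, ROUTER_TIMEOUT); the router route path and request/response fields are assumptions, not the committed API contract:

# Sketch of the 2-stage qa_pairs flow. The "/generate" route and the
# "document"/"qa_pairs" payload fields are assumptions.
import httpx

from app.config import ROUTER_BASE_URL, ROUTER_TIMEOUT
from app.runtime.local_runtime import parse_document_with_local

async def build_qa_pairs(image_bytes: bytes) -> list[dict]:
    # Stage 1: dots.ocr parser produces raw layout JSON
    # (prompt_layout_all_en under the hood).
    raw_json = parse_document_with_local(image_bytes, "raw_json")

    # Stage 2: DAGI Router turns the layout JSON into Q&A pairs.
    async with httpx.AsyncClient(
        base_url=ROUTER_BASE_URL, timeout=ROUTER_TIMEOUT
    ) as client:
        resp = await client.post(
            "/generate",  # hypothetical route
            json={"mode": "qa_build", "document": raw_json},
        )
        resp.raise_for_status()
        return resp.json()["qa_pairs"]  # hypothetical response field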

Updates:
- Update inference.py to use local_runtime with native prompts
- Update ollama_client.py to use the same prompt map
- Add PROMPT_MODES.md documentation
Author: Apple
Date: 2025-11-16 04:24:03 -08:00
Commit: be22752590 (parent: d474a085c3)
8 changed files with 714 additions and 44 deletions


@@ -11,6 +11,7 @@ from PIL import Image
 from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
 from app.runtime.model_loader import get_model
+from app.runtime.local_runtime import parse_document_with_local
 from app.runtime.preprocessing import (
     convert_pdf_to_images, load_image, prepare_images_for_model
 )
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
 async def parse_document_with_ollama(
     images: List[Image.Image],
-    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
+    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
     doc_id: Optional[str] = None,
     doc_type: Literal["pdf", "image"] = "image"
 ) -> ParsedDocument:
@@ -106,7 +107,7 @@ async def parse_document_with_ollama(
 def parse_document_from_images(
     images: List[Image.Image],
-    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
+    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
     doc_id: Optional[str] = None,
     doc_type: Literal["pdf", "image"] = "image"
 ) -> ParsedDocument:
@@ -146,33 +147,19 @@ def parse_document_from_images(
     if not prepared_images:
         raise ValueError("No valid images to process")

-    # Process with model
+    # Process with model using local_runtime (with native dots.ocr prompts)
     pages_data = []
     for idx, image in enumerate(prepared_images, start=1):
         try:
-            # Prepare inputs for model
-            inputs = model["processor"](images=image, return_tensors="pt")
-
-            # Move inputs to device
-            device = model["device"]
-            if device != "cpu":
-                inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
-                          for k, v in inputs.items()}
-
-            # Generate output
-            with torch.no_grad():
-                outputs = model["model"].generate(
-                    **inputs,
-                    max_new_tokens=2048,  # Adjust based on model capabilities
-                    do_sample=False  # Deterministic output
-                )
-
-            # Decode output
-            generated_text = model["processor"].decode(
-                outputs[0],
-                skip_special_tokens=True
-            )
+            # Convert image to bytes for local_runtime
+            import io
+            buf = io.BytesIO()
+            image.convert("RGB").save(buf, format="PNG")
+            image_bytes = buf.getvalue()
+
+            # Use local_runtime with native prompt modes
+            generated_text = parse_document_with_local(image_bytes, output_mode)

             logger.debug(f"Model output for page {idx}: {generated_text[:100]}...")
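
For reference, a plausible shape for the parse_document_with_local call the new loop relies on; only the name and the (image_bytes, output_mode) call signature appear in the diff above, so the body here is an assumption (it reuses resolve_prompt and OutputMode from the earlier sketch, and generate_text is a hypothetical stand-in for the real generation code behind model_loader):

import io
from PIL import Image

def parse_document_with_local(image_bytes: bytes, output_mode: str) -> str:
    # Pick the native dots.ocr prompt for the requested mode.
    prompt = resolve_prompt(OutputMode(output_mode))
    # Decode the PNG bytes produced by the caller back into an image.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    # Run the loaded model with the chosen prompt; generate_text is a
    # hypothetical helper, not part of this commit.
    return generate_text(image, prompt)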