feat: integrate dots.ocr native prompt modes and 2-stage qa_pairs pipeline

Prompt Modes Integration:
- Create local_runtime.py with DOTS_PROMPT_MAP (see the sketch below)
- Map OutputMode to native dots.ocr prompt modes (prompt_layout_all_en, prompt_ocr, etc.)
- Use dict_promptmode_to_prompt from dots.ocr when available, falling back to bundled prompts otherwise
- Add layout_only and region modes to the OutputMode enum
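
A minimal sketch of the DOTS_PROMPT_MAP lookup in local_runtime.py, assuming the names below. Only DOTS_PROMPT_MAP, OutputMode, and dict_promptmode_to_prompt appear in this commit; the mode-to-prompt assignments, the FALLBACK_PROMPTS texts, and resolve_prompt are illustrative:

# Sketch only: the fallback prompt texts and per-mode assignments
# here are illustrative, not the committed values.
from enum import Enum

class OutputMode(str, Enum):
    RAW_JSON = "raw_json"
    MARKDOWN = "markdown"
    QA_PAIRS = "qa_pairs"
    CHUNKS = "chunks"
    LAYOUT_ONLY = "layout_only"
    REGION = "region"

# Map each OutputMode to a native dots.ocr prompt-mode name.
DOTS_PROMPT_MAP = {
    OutputMode.RAW_JSON: "prompt_layout_all_en",
    OutputMode.MARKDOWN: "prompt_layout_all_en",
    OutputMode.QA_PAIRS: "prompt_layout_all_en",
    OutputMode.CHUNKS: "prompt_layout_all_en",
    OutputMode.LAYOUT_ONLY: "prompt_layout_only_en",
    OutputMode.REGION: "prompt_grounding_ocr",
}

# Illustrative fallbacks used when dots.ocr is not importable.
FALLBACK_PROMPTS = {
    "prompt_layout_all_en": "Extract the layout and text of this document as JSON.",
    "prompt_layout_only_en": "Extract only the layout (bboxes and categories) as JSON.",
    "prompt_grounding_ocr": "Extract the text inside the given bounding box.",
}

def resolve_prompt(mode: OutputMode) -> str:
    """Prefer the native dots.ocr prompt text; fall back to a local copy."""
    prompt_mode = DOTS_PROMPT_MAP[mode]
    try:
        # Import path as used by the dots.ocr repo; adjust if vendored.
        from dots_ocr.utils.prompts import dict_promptmode_to_prompt
        return dict_promptmode_to_prompt[prompt_mode]
    except ImportError:
        return FALLBACK_PROMPTS[prompt_mode]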

2-Stage Q&A Pipeline:
- Create qa_builder.py for 2-stage qa_pairs generation (see the sketch below)
- Stage 1: PARSER (dots.ocr) → raw layout JSON via prompt_layout_all_en
- Stage 2: LLM (DAGI Router) → Q&A pairs via mode=qa_build
- Update endpoints.py to route qa_pairs mode through the 2-stage pipeline
- Add ROUTER_BASE_URL and ROUTER_TIMEOUT to config
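
A minimal sketch of the two-stage flow in qa_builder.py, assuming names from this commit (parse_document_with_local, ROUTER_BASE_URL, ROUTER_TIMEOUT); the router route path and request/response fields are assumptions, not the committed API contract:

# Sketch of the 2-stage qa_pairs flow. The "/generate" route and the
# "document"/"qa_pairs" payload fields are assumptions.
import httpx

from app.config import ROUTER_BASE_URL, ROUTER_TIMEOUT
from app.runtime.local_runtime import parse_document_with_local

async def build_qa_pairs(image_bytes: bytes) -> list[dict]:
    # Stage 1: dots.ocr parser produces raw layout JSON
    # (prompt_layout_all_en under the hood).
    raw_json = parse_document_with_local(image_bytes, "raw_json")

    # Stage 2: DAGI Router turns the layout JSON into Q&A pairs.
    async with httpx.AsyncClient(
        base_url=ROUTER_BASE_URL, timeout=ROUTER_TIMEOUT
    ) as client:
        resp = await client.post(
            "/generate",  # hypothetical route
            json={"mode": "qa_build", "document": raw_json},
        )
        resp.raise_for_status()
        return resp.json()["qa_pairs"]  # hypothetical response field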

Updates:
- Update inference.py to use local_runtime with native prompts
- Update ollama_client.py to use the same prompt map
- Add PROMPT_MODES.md documentation
Author: Apple
Date: 2025-11-16 04:24:03 -08:00
Commit: be22752590 (parent: d474a085c3)
8 changed files with 714 additions and 44 deletions


@@ -11,6 +11,7 @@ from PIL import Image
 from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
 from app.runtime.model_loader import get_model
+from app.runtime.local_runtime import parse_document_with_local
 from app.runtime.preprocessing import (
     convert_pdf_to_images, load_image, prepare_images_for_model
 )
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
 async def parse_document_with_ollama(
     images: List[Image.Image],
-    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
+    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
     doc_id: Optional[str] = None,
     doc_type: Literal["pdf", "image"] = "image"
 ) -> ParsedDocument:
@@ -106,7 +107,7 @@ async def parse_document_with_ollama(
 def parse_document_from_images(
     images: List[Image.Image],
-    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
+    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
     doc_id: Optional[str] = None,
     doc_type: Literal["pdf", "image"] = "image"
 ) -> ParsedDocument:
@@ -146,33 +147,19 @@ def parse_document_from_images(
     if not prepared_images:
         raise ValueError("No valid images to process")

-    # Process with model
+    # Process with model using local_runtime (with native dots.ocr prompts)
     pages_data = []
     for idx, image in enumerate(prepared_images, start=1):
         try:
-            # Prepare inputs for model
-            inputs = model["processor"](images=image, return_tensors="pt")
-
-            # Move inputs to device
-            device = model["device"]
-            if device != "cpu":
-                inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
-                          for k, v in inputs.items()}
-
-            # Generate output
-            with torch.no_grad():
-                outputs = model["model"].generate(
-                    **inputs,
-                    max_new_tokens=2048,  # Adjust based on model capabilities
-                    do_sample=False  # Deterministic output
-                )
-
-            # Decode output
-            generated_text = model["processor"].decode(
-                outputs[0],
-                skip_special_tokens=True
-            )
+            # Convert image to bytes for local_runtime
+            import io
+            buf = io.BytesIO()
+            image.convert("RGB").save(buf, format="PNG")
+            image_bytes = buf.getvalue()
+
+            # Use local_runtime with native prompt modes
+            generated_text = parse_document_with_local(image_bytes, output_mode)

             logger.debug(f"Model output for page {idx}: {generated_text[:100]}...")
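
For reference, a plausible shape for the parse_document_with_local call the new loop relies on; only the name and the (image_bytes, output_mode) call signature appear in the diff above, so the body here is an assumption (it reuses resolve_prompt and OutputMode from the earlier sketch, and generate_text is a hypothetical stand-in for the real generation code behind model_loader):

import io
from PIL import Image

def parse_document_with_local(image_bytes: bytes, output_mode: str) -> str:
    # Pick the native dots.ocr prompt for the requested mode.
    prompt = resolve_prompt(OutputMode(output_mode))
    # Decode the PNG bytes produced by the caller back into an image.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    # Run the loaded model with the chosen prompt; generate_text is a
    # hypothetical helper, not part of this commit.
    return generate_text(image, prompt)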