feat: integrate dots.ocr native prompt modes and 2-stage qa_pairs pipeline
Prompt Modes Integration:
- Create local_runtime.py with DOTS_PROMPT_MAP
- Map OutputMode to native dots.ocr prompt modes (prompt_layout_all_en, prompt_ocr, etc.)
- Support dict_promptmode_to_prompt from dots.ocr with fallback prompts
- Add layout_only and region modes to OutputMode enum

2-Stage Q&A Pipeline:
- Create qa_builder.py for 2-stage qa_pairs generation
- Stage 1: PARSER (dots.ocr) → raw JSON via prompt_layout_all_en
- Stage 2: LLM (DAGI Router) → Q&A pairs via mode=qa_build
- Update endpoints.py to use 2-stage pipeline for qa_pairs mode
- Add ROUTER_BASE_URL and ROUTER_TIMEOUT to config

Updates:
- Update inference.py to use local_runtime with native prompts
- Update ollama_client.py to use same prompt map
- Add PROMPT_MODES.md documentation
This commit is contained in:
@@ -11,6 +11,7 @@ from PIL import Image
|
||||
|
||||
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
|
||||
from app.runtime.model_loader import get_model
|
||||
from app.runtime.local_runtime import parse_document_with_local
|
||||
from app.runtime.preprocessing import (
|
||||
convert_pdf_to_images, load_image, prepare_images_for_model
|
||||
)
|
||||
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
async def parse_document_with_ollama(
|
||||
images: List[Image.Image],
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
|
||||
doc_id: Optional[str] = None,
|
||||
doc_type: Literal["pdf", "image"] = "image"
|
||||
) -> ParsedDocument:
|
||||
@@ -106,7 +107,7 @@ async def parse_document_with_ollama(
|
||||
|
||||
def parse_document_from_images(
|
||||
images: List[Image.Image],
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
|
||||
doc_id: Optional[str] = None,
|
||||
doc_type: Literal["pdf", "image"] = "image"
|
||||
) -> ParsedDocument:
|
||||
@@ -146,33 +147,19 @@ def parse_document_from_images(
|
||||
if not prepared_images:
|
||||
raise ValueError("No valid images to process")
|
||||
|
||||
# Process with model
|
||||
# Process with model using local_runtime (with native dots.ocr prompts)
|
||||
pages_data = []
|
||||
|
||||
for idx, image in enumerate(prepared_images, start=1):
|
||||
try:
|
||||
# Prepare inputs for model
|
||||
inputs = model["processor"](images=image, return_tensors="pt")
|
||||
# Convert image to bytes for local_runtime
|
||||
import io
|
||||
buf = io.BytesIO()
|
||||
image.convert("RGB").save(buf, format="PNG")
|
||||
image_bytes = buf.getvalue()
|
||||
|
||||
# Move inputs to device
|
||||
device = model["device"]
|
||||
if device != "cpu":
|
||||
inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
|
||||
for k, v in inputs.items()}
|
||||
|
||||
# Generate output
|
||||
with torch.no_grad():
|
||||
outputs = model["model"].generate(
|
||||
**inputs,
|
||||
max_new_tokens=2048, # Adjust based on model capabilities
|
||||
do_sample=False # Deterministic output
|
||||
)
|
||||
|
||||
# Decode output
|
||||
generated_text = model["processor"].decode(
|
||||
outputs[0],
|
||||
skip_special_tokens=True
|
||||
)
|
||||
# Use local_runtime with native prompt modes
|
||||
generated_text = parse_document_with_local(image_bytes, output_mode)
|
||||
|
||||
logger.debug(f"Model output for page {idx}: {generated_text[:100]}...")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user