feat: integrate dots.ocr native prompt modes and 2-stage qa_pairs pipeline
Prompt Modes Integration: - Create local_runtime.py with DOTS_PROMPT_MAP - Map OutputMode to native dots.ocr prompt modes (prompt_layout_all_en, prompt_ocr, etc.) - Support dict_promptmode_to_prompt from dots.ocr with fallback prompts - Add layout_only and region modes to OutputMode enum 2-Stage Q&A Pipeline: - Create qa_builder.py for 2-stage qa_pairs generation - Stage 1: PARSER (dots.ocr) → raw JSON via prompt_layout_all_en - Stage 2: LLM (DAGI Router) → Q&A pairs via mode=qa_build - Update endpoints.py to use 2-stage pipeline for qa_pairs mode - Add ROUTER_BASE_URL and ROUTER_TIMEOUT to config Updates: - Update inference.py to use local_runtime with native prompts - Update ollama_client.py to use same prompt map - Add PROMPT_MODES.md documentation
This commit is contained in:
@@ -11,6 +11,7 @@ from PIL import Image
|
||||
|
||||
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
|
||||
from app.runtime.model_loader import get_model
|
||||
from app.runtime.local_runtime import parse_document_with_local
|
||||
from app.runtime.preprocessing import (
|
||||
convert_pdf_to_images, load_image, prepare_images_for_model
|
||||
)
|
||||
@@ -26,7 +27,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
async def parse_document_with_ollama(
|
||||
images: List[Image.Image],
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
|
||||
doc_id: Optional[str] = None,
|
||||
doc_type: Literal["pdf", "image"] = "image"
|
||||
) -> ParsedDocument:
|
||||
@@ -106,7 +107,7 @@ async def parse_document_with_ollama(
|
||||
|
||||
def parse_document_from_images(
|
||||
images: List[Image.Image],
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks"] = "raw_json",
|
||||
output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json",
|
||||
doc_id: Optional[str] = None,
|
||||
doc_type: Literal["pdf", "image"] = "image"
|
||||
) -> ParsedDocument:
|
||||
@@ -146,33 +147,19 @@ def parse_document_from_images(
|
||||
if not prepared_images:
|
||||
raise ValueError("No valid images to process")
|
||||
|
||||
# Process with model
|
||||
# Process with model using local_runtime (with native dots.ocr prompts)
|
||||
pages_data = []
|
||||
|
||||
for idx, image in enumerate(prepared_images, start=1):
|
||||
try:
|
||||
# Prepare inputs for model
|
||||
inputs = model["processor"](images=image, return_tensors="pt")
|
||||
# Convert image to bytes for local_runtime
|
||||
import io
|
||||
buf = io.BytesIO()
|
||||
image.convert("RGB").save(buf, format="PNG")
|
||||
image_bytes = buf.getvalue()
|
||||
|
||||
# Move inputs to device
|
||||
device = model["device"]
|
||||
if device != "cpu":
|
||||
inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v
|
||||
for k, v in inputs.items()}
|
||||
|
||||
# Generate output
|
||||
with torch.no_grad():
|
||||
outputs = model["model"].generate(
|
||||
**inputs,
|
||||
max_new_tokens=2048, # Adjust based on model capabilities
|
||||
do_sample=False # Deterministic output
|
||||
)
|
||||
|
||||
# Decode output
|
||||
generated_text = model["processor"].decode(
|
||||
outputs[0],
|
||||
skip_special_tokens=True
|
||||
)
|
||||
# Use local_runtime with native prompt modes
|
||||
generated_text = parse_document_with_local(image_bytes, output_mode)
|
||||
|
||||
logger.debug(f"Model output for page {idx}: {generated_text[:100]}...")
|
||||
|
||||
|
||||
273
services/parser-service/app/runtime/local_runtime.py
Normal file
273
services/parser-service/app/runtime/local_runtime.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Local runtime for dots.ocr model with native prompt modes
|
||||
Maps OutputMode to dots.ocr prompt modes using dict_promptmode_to_prompt
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
from typing import Literal, Optional
|
||||
|
||||
import torch
|
||||
from transformers import AutoModelForVision2Seq, AutoProcessor
|
||||
from qwen_vl_utils import process_vision_info
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import dots.ocr prompt dictionary.
# The dots.ocr package is an optional dependency: when present we use its
# native prompt texts; otherwise FALLBACK_PROMPTS below are used.
try:
    from dots_ocr.utils.prompts import dict_promptmode_to_prompt
    DOTS_PROMPTS_AVAILABLE = True
except ImportError:
    logger.warning(
        "dots_ocr.utils.prompts not available. "
        "Using fallback prompts. Install dots.ocr package for native prompt modes."
    )
    DOTS_PROMPTS_AVAILABLE = False
    dict_promptmode_to_prompt = {}

# Map OutputMode values to dots.ocr native prompt-mode keys.
# NOTE: several output modes intentionally share prompt_layout_all_en —
# qa_pairs/chunks post-process the full-JSON parse downstream.
DOTS_PROMPT_MAP = {
    "raw_json": "prompt_layout_all_en",      # Full JSON (layout + content)
    "markdown": "prompt_ocr",                # Content-oriented OCR (Markdown)
    "qa_pairs": "prompt_layout_all_en",      # Full JSON, then 2nd step LLM
    "chunks": "prompt_layout_all_en",        # Full JSON for chunking
    "layout_only": "prompt_layout_only_en",  # Layout only (bbox + categories, no text)
    "region": "prompt_grounding_ocr",        # Targeted region parsing (grounding)
}

# Fallback prompts used when dict_promptmode_to_prompt is not available.
# Keys mirror the dots.ocr native prompt-mode names.
FALLBACK_PROMPTS = {
    "prompt_layout_all_en": (
        "You are a document OCR and layout parser. "
        "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
        "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
        "Respond with JSON only, no explanations."
    ),
    "prompt_ocr": (
        "You are a document OCR and layout parser. "
        "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
        "Tables should be proper GitHub-flavored Markdown tables. "
        "Respond with Markdown as plain text."
    ),
    "prompt_layout_only_en": (
        "You are a document layout parser. "
        "Extract only the layout structure (bounding boxes, block types, reading order) "
        "without the text content. "
        "Respond with JSON containing only layout information (bbox, type, reading_order)."
    ),
    "prompt_grounding_ocr": (
        "You are a document OCR assistant for targeted region parsing. "
        "Extract text and layout for the specified region of the document. "
        "Respond with JSON containing the parsed content for the region."
    ),
}

# Lazily-initialized model/processor cache (filled by load_model()).
# _model holds the HF model object, _processor the matching processor.
_model: Optional[object] = None
_processor: Optional[object] = None

# Model configuration, read once at import time from service settings / env.
MODEL_PATH = settings.PARSER_MODEL_NAME
DEVICE = settings.PARSER_DEVICE
# bfloat16 on accelerators, float32 on CPU (CPU bfloat16 support is spotty).
DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
# Generation cap; dots.ocr full-page JSON output can be very long.
MAX_NEW_TOKENS = int(os.getenv("DOTS_OCR_MAX_NEW_TOKENS", "24000"))
|
||||
|
||||
|
||||
def load_model():
    """
    Load the dots.ocr model and processor with lazy initialization.

    Returns:
        Tuple of (model, processor). The pair is cached in module globals
        after the first successful call; later calls return the cache.

    Raises:
        Exception: re-raises any loading failure after logging it.
    """
    global _model, _processor

    # Already loaded — return the cached pair.
    if _model is not None and _processor is not None:
        return _model, _processor

    logger.info(f"Loading dots.ocr model: {MODEL_PATH}")
    logger.info(f"Device: {DEVICE}")

    try:
        common_kwargs = dict(
            torch_dtype=DTYPE,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        try:
            # Prefer FlashAttention 2 when the flash-attn package is installed.
            model = AutoModelForVision2Seq.from_pretrained(
                MODEL_PATH,
                attn_implementation="flash_attention_2",
                **common_kwargs,
            )
        except (ImportError, ValueError) as attn_err:
            # flash-attn is optional; fall back to the default attention
            # implementation instead of failing the whole service.
            logger.warning(
                "flash_attention_2 unavailable (%s); falling back to default attention",
                attn_err,
            )
            model = AutoModelForVision2Seq.from_pretrained(
                MODEL_PATH,
                **common_kwargs,
            )

        processor = AutoProcessor.from_pretrained(
            MODEL_PATH,
            trust_remote_code=True
        )

        # NOTE: device_map="auto" already dispatches the model onto the
        # available device(s) via accelerate. Calling model.to("cuda"/"mps")
        # afterwards is redundant and raises a RuntimeError on recent
        # transformers versions, so no manual move is performed here.

        _model = model
        _processor = processor

        logger.info(f"Model loaded successfully on {DEVICE}")
        return _model, _processor

    except Exception as e:
        logger.error(f"Failed to load model: {e}", exc_info=True)
        raise
|
||||
|
||||
|
||||
def get_model():
    """Return the cached (model, processor) pair, loading it on first use."""
    already_loaded = _model is not None and _processor is not None
    return (_model, _processor) if already_loaded else load_model()
|
||||
|
||||
|
||||
def _build_prompt(output_mode: str) -> str:
    """
    Resolve the dots.ocr prompt text for a given output mode.

    Args:
        output_mode: One of "raw_json", "markdown", "qa_pairs", "chunks",
            "layout_only", "region". Unknown values fall back to the
            full-layout prompt.

    Returns:
        Prompt string to send to dots.ocr.
    """
    key = DOTS_PROMPT_MAP.get(output_mode, "prompt_layout_all_en")

    # Prefer the native dots.ocr prompt when the package is installed.
    if DOTS_PROMPTS_AVAILABLE and key in dict_promptmode_to_prompt:
        logger.debug(f"Using native dots.ocr prompt: {key}")
        return dict_promptmode_to_prompt[key]

    # Otherwise use our bundled fallback text for that key.
    if key in FALLBACK_PROMPTS:
        logger.debug(f"Using fallback prompt: {key}")
        return FALLBACK_PROMPTS[key]

    # Should not happen for known modes; use the most general prompt.
    logger.warning(f"Unknown prompt key: {key}, using default")
    return FALLBACK_PROMPTS["prompt_layout_all_en"]
|
||||
|
||||
|
||||
def _build_messages(image_path: str, prompt: str) -> list:
|
||||
"""Build messages for dots.ocr model"""
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "image": image_path},
|
||||
{"type": "text", "text": prompt},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _generate_from_path(
    image_path: str,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"]
) -> str:
    """
    Generate output from an image path using the dots.ocr model.

    Args:
        image_path: Path to an image file on disk.
        output_mode: Output mode (mapped to a dots.ocr prompt mode by
            _build_prompt).

    Returns:
        Decoded model output for the single image (JSON or Markdown text,
        depending on the prompt mode).
    """
    # Lazily load the cached model/processor pair and build the chat input.
    model, processor = get_model()
    prompt = _build_prompt(output_mode)
    messages = _build_messages(image_path, prompt)

    # Render the chat template to the raw prompt string (not tokenized yet).
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Extract image (and, unused here, video) inputs from the messages.
    image_inputs, video_inputs = process_vision_info(messages)

    # Tokenize text and preprocess images into model tensors.
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Move tensor inputs to the configured accelerator; non-tensor entries
    # are passed through unchanged.
    device = DEVICE
    if device == "cuda" and torch.cuda.is_available():
        inputs = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}
    elif device == "mps" and hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        inputs = {k: v.to("mps") if isinstance(v, torch.Tensor) else v
                  for k, v in inputs.items()}

    # Greedy-by-default generation; inference_mode disables autograd tracking.
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
        )

    # generate() returns prompt + completion; slice off the prompt tokens.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]

    # Decode the completion tokens back to text.
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Single image in, single decoded string out.
    return output_text[0]
|
||||
|
||||
|
||||
def parse_document_with_local(
    image_bytes: bytes,
    output_mode: Literal["raw_json", "markdown", "qa_pairs", "chunks", "layout_only", "region"] = "raw_json"
) -> str:
    """
    Parse a document image with the local dots.ocr model.

    Args:
        image_bytes: Raw image bytes (PNG/JPEG).
        output_mode: Output mode, mapped to a dots.ocr prompt mode:
            - raw_json: full JSON (layout + content) via prompt_layout_all_en
            - markdown: Markdown text via prompt_ocr
            - qa_pairs: full JSON (same as raw_json); Q&A built in a 2nd step
            - chunks: full JSON for downstream chunking
            - layout_only: layout only (bbox + categories) via prompt_layout_only_en
            - region: targeted region parsing via prompt_grounding_ocr

    Returns:
        Raw model output (JSON or Markdown depending on mode).

    Note:
        For "qa_pairs" this returns the full JSON; the LLM Q&A-generation
        step happens elsewhere.
    """
    # The generation path works on file paths, so spill the bytes to a
    # temporary PNG first (delete=False: we remove it ourselves below).
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as spool:
        spool.write(image_bytes)
        temp_path = spool.name

    try:
        return _generate_from_path(temp_path, output_mode)
    finally:
        # Best-effort cleanup; a failed unlink must not mask the result.
        try:
            os.remove(temp_path)
        except OSError:
            pass
|
||||
|
||||
@@ -20,32 +20,56 @@ class OutputMode(str, Enum):
|
||||
    # Values map to dots.ocr native prompt modes (see DOTS_PROMPT_MAP in
    # app.runtime.local_runtime).
    raw_json = "raw_json"        # full JSON: layout + content
    markdown = "markdown"        # content-oriented Markdown OCR
    qa_pairs = "qa_pairs"        # full JSON, then 2nd-stage LLM builds Q&A
    chunks = "chunks"            # full JSON, chunked downstream
    layout_only = "layout_only"  # bboxes + block categories only, no text
    region = "region"            # targeted region parsing (grounding)
|
||||
|
||||
|
||||
def build_prompt(mode: OutputMode) -> str:
    """
    Build prompt for Ollama based on output mode.

    Maps each OutputMode to the corresponding dots.ocr native prompt mode
    (the same mapping used by app.runtime.local_runtime) and returns the
    matching fallback prompt text.

    Args:
        mode: Requested output mode.

    Returns:
        Prompt string; unknown modes fall back to the full-layout prompt.
    """
    # Map to dots.ocr prompt modes (same as local_runtime)
    prompt_map = {
        OutputMode.raw_json: "prompt_layout_all_en",
        OutputMode.markdown: "prompt_ocr",
        OutputMode.qa_pairs: "prompt_layout_all_en",  # Full JSON, then 2nd step LLM
        OutputMode.chunks: "prompt_layout_all_en",
        OutputMode.layout_only: "prompt_layout_only_en",
        OutputMode.region: "prompt_grounding_ocr",
    }

    prompt_key = prompt_map.get(mode, "prompt_layout_all_en")

    # Fallback prompts (same as local_runtime)
    fallback_prompts = {
        "prompt_layout_all_en": (
            "You are a document OCR and layout parser. "
            "Extract all text, tables, formulas, and layout into a clean JSON structure with fields like "
            "`blocks`, `tables`, `reading_order`, including bounding boxes and page numbers. "
            "Respond with JSON only, no explanations."
        ),
        "prompt_ocr": (
            "You are a document OCR and layout parser. "
            "Extract the document as Markdown, preserving headings, paragraphs, and tables. "
            "Tables should be proper GitHub-flavored Markdown tables. "
            "Respond with Markdown as plain text."
        ),
        "prompt_layout_only_en": (
            "You are a document layout parser. "
            "Extract only the layout structure (bounding boxes, block types, reading order) "
            "without the text content. "
            "Respond with JSON containing only layout information (bbox, type, reading_order)."
        ),
        "prompt_grounding_ocr": (
            "You are a document OCR assistant for targeted region parsing. "
            "Extract text and layout for the specified region of the document. "
            "Respond with JSON containing the parsed content for the region."
        ),
    }

    return fallback_prompts.get(prompt_key, fallback_prompts["prompt_layout_all_en"])
|
||||
|
||||
|
||||
async def call_ollama_vision(
|
||||
|
||||
198
services/parser-service/app/runtime/qa_builder.py
Normal file
198
services/parser-service/app/runtime/qa_builder.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
Q&A Builder - 2-stage pipeline for qa_pairs mode
|
||||
Stage 1: PARSER (dots.ocr) → raw JSON
|
||||
Stage 2: LLM (DAGI Router) → Q&A pairs
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from app.schemas import QAPair, ParsedDocument
|
||||
from app.core.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def build_qa_pairs_via_router(
    parsed_doc: ParsedDocument,
    dao_id: str = "daarion"
) -> List[QAPair]:
    """
    Stage 2 of the qa_pairs pipeline: turn a parsed document into Q&A pairs.

    Sends the dots.ocr output (stage 1) to the DAGI Router with
    mode="qa_build" and parses the returned JSON into QAPair objects.

    Args:
        parsed_doc: ParsedDocument produced by dots.ocr (stage 1).
        dao_id: DAO identifier forwarded to the Router.

    Returns:
        List of QAPair objects (empty when the Router returns nothing usable).

    Raises:
        RuntimeError: when the Router call or response handling fails.
    """
    instruction = _build_qa_prompt(parsed_doc)

    request_body = {
        "mode": "qa_build",  # New mode in Router
        "dao_id": dao_id,
        "user_id": "parser-service",
        "payload": {
            "instruction": instruction,
            "parsed_document": parsed_doc.model_dump(mode="json"),
        },
    }

    endpoint = f"{settings.ROUTER_BASE_URL.rstrip('/')}/route"
    logger.info(f"Calling DAGI Router for Q&A generation: {endpoint}")

    try:
        async with httpx.AsyncClient(timeout=settings.ROUTER_TIMEOUT) as client:
            response = await client.post(endpoint, json=request_body)
            response.raise_for_status()
            body = response.json()

        # The Router wraps the LLM answer in {"data": {"text": ...}}.
        answer_text = body.get("data", {}).get("text", "")
        if not answer_text:
            logger.warning("Empty response from DAGI Router")
            return []

        pairs = _parse_qa_response(answer_text, parsed_doc)
        logger.info(f"Generated {len(pairs)} Q&A pairs")
        return pairs

    except httpx.HTTPError as e:
        # Transport / HTTP-status failures get a dedicated message.
        logger.error(f"DAGI Router HTTP error: {e}")
        raise RuntimeError(f"DAGI Router API error: {e}") from e
    except Exception as e:
        # Anything else (bad JSON, schema surprises) becomes a RuntimeError
        # with the original cause chained for debugging.
        logger.error(f"Failed to build Q&A pairs: {e}", exc_info=True)
        raise RuntimeError(f"Q&A generation failed: {e}") from e
|
||||
|
||||
|
||||
def _build_qa_prompt(parsed_doc: ParsedDocument) -> str:
|
||||
"""
|
||||
Build prompt for Q&A generation from parsed document
|
||||
|
||||
Args:
|
||||
parsed_doc: ParsedDocument with structured content
|
||||
|
||||
Returns:
|
||||
Prompt string for LLM
|
||||
"""
|
||||
# Extract text content from document (first 5000 chars to avoid token limits)
|
||||
text_content = []
|
||||
for page in parsed_doc.pages:
|
||||
for block in page.blocks:
|
||||
if block.text:
|
||||
text_content.append(f"[Page {page.page_num}] {block.text}")
|
||||
|
||||
document_text = "\n\n".join(text_content[:50]) # Limit to first 50 blocks
|
||||
if len(document_text) > 5000:
|
||||
document_text = document_text[:5000] + "..."
|
||||
|
||||
prompt = (
|
||||
"Тобі дається результат OCR-документу у JSON-форматі (layout + текст).\n"
|
||||
"Твоє завдання: побудувати список запитань/відповідей, які покривають ключову "
|
||||
"інформацію цього документу.\n\n"
|
||||
"Формат відповіді — СУВОРО JSON-масив об'єктів:\n"
|
||||
"[\n"
|
||||
' {"question": "...", "answer": "...", "source_page": <int|null>, "confidence": <float|null>},\n'
|
||||
" ...\n"
|
||||
"]\n\n"
|
||||
"Вимоги:\n"
|
||||
"- Формулюй питання українською.\n"
|
||||
"- Відповіді мають базуватись на тексті документа (не вигадуй).\n"
|
||||
"- Якщо можна визначити номер сторінки — заповни поле source_page.\n"
|
||||
"- Не додавай ніякого пояснення поза JSON.\n"
|
||||
"- Мінімум 5-10 Q&A пар, максимум 20.\n\n"
|
||||
f"Документ:\n{document_text}\n\n"
|
||||
"Відповідь (тільки JSON):"
|
||||
)
|
||||
|
||||
return prompt
|
||||
|
||||
|
||||
def _parse_qa_response(text: str, parsed_doc: ParsedDocument) -> List[QAPair]:
    """
    Parse the LLM response into QAPair objects.

    Args:
        text: Response text from the LLM (expected to be a JSON array).
        parsed_doc: Original parsed document (used to infer page numbers).

    Returns:
        List of QAPair objects; an empty list when the response is not a
        usable JSON array.
    """
    # Try to extract JSON from the response.
    text_clean = text.strip()

    # Remove surrounding markdown code fences (```json ... ```) if present.
    if text_clean.startswith("```"):
        lines = text_clean.split("\n")
        text_clean = "\n".join(lines[1:-1]) if len(lines) > 2 else text_clean

    try:
        qa_data = json.loads(text_clean)
        if not isinstance(qa_data, list):
            logger.warning(f"Expected list, got {type(qa_data)}")
            return []

        # Convert to QAPair objects, skipping malformed entries.
        qa_pairs = []
        for item in qa_data:
            if not isinstance(item, dict):
                continue

            # Guard against null / non-string values from the LLM:
            # calling .strip() on None or a number would raise AttributeError.
            raw_question = item.get("question")
            raw_answer = item.get("answer")
            question = raw_question.strip() if isinstance(raw_question, str) else ""
            answer = raw_answer.strip() if isinstance(raw_answer, str) else ""

            if not question or not answer:
                continue

            # Prefer the page number the LLM supplied; otherwise try to
            # locate the answer text inside the document.
            source_page = item.get("source_page")
            if source_page is None:
                source_page = _infer_page_number(answer, parsed_doc)

            qa_pairs.append(QAPair(
                question=question,
                answer=answer,
                source_page=source_page or 1,  # default to page 1 when unknown
                confidence=item.get("confidence")
            ))

        return qa_pairs

    except json.JSONDecodeError as e:
        logger.warning(f"Failed to parse JSON response: {e}")
        logger.debug(f"Response text: {text_clean[:500]}")
        return []
|
||||
|
||||
|
||||
def _infer_page_number(text: str, parsed_doc: ParsedDocument) -> Optional[int]:
|
||||
"""
|
||||
Try to infer page number from text content
|
||||
|
||||
Args:
|
||||
text: Answer text
|
||||
parsed_doc: Parsed document
|
||||
|
||||
Returns:
|
||||
Page number or None
|
||||
"""
|
||||
# Simple heuristic: check if text appears in any page
|
||||
text_lower = text.lower()
|
||||
|
||||
for page in parsed_doc.pages:
|
||||
for block in page.blocks:
|
||||
if block.text and text_lower in block.text.lower():
|
||||
return page.page_num
|
||||
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user