feat: add qa_build mode, tests, and region mode support
Router Configuration: - Add mode='qa_build' routing rule in router-config.yml - Priority 8, uses local_qwen3_8b for Q&A generation 2-Stage Q&A Pipeline Tests: - Create test_qa_pipeline.py with comprehensive tests - Test prompt building, JSON parsing, router integration - Mock DAGI Router responses for testing Region Mode (Grounding OCR): - Add region_bbox and region_page parameters to ParseRequest - Support region mode in local_runtime with bbox in prompt - Update endpoints to accept region parameters (x, y, width, height, page) - Validate region parameters and filter pages for region mode - Pass region_bbox through inference pipeline Updates: - Update local_runtime to support region_bbox in prompts - Update inference.py to pass region_bbox to local_runtime - Update endpoints.py to handle region mode parameters
This commit is contained in:
197
services/parser-service/tests/test_qa_pipeline.py
Normal file
197
services/parser-service/tests/test_qa_pipeline.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""
|
||||
Tests for 2-stage Q&A pipeline (PARSER → LLM)
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, patch, MagicMock
|
||||
from PIL import Image
|
||||
import io
|
||||
import json
|
||||
|
||||
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
|
||||
from app.runtime.qa_builder import build_qa_pairs_via_router, _build_qa_prompt, _parse_qa_response
|
||||
|
||||
|
||||
class TestQABuilder:
|
||||
"""Tests for Q&A builder (2-stage pipeline)"""
|
||||
|
||||
def test_build_qa_prompt(self):
|
||||
"""Test prompt building for Q&A generation"""
|
||||
# Create mock parsed document
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[
|
||||
ParsedPage(
|
||||
page_num=1,
|
||||
blocks=[
|
||||
ParsedBlock(
|
||||
type="heading",
|
||||
text="Test Document",
|
||||
bbox=BBox(x=0, y=0, width=800, height=50),
|
||||
reading_order=1,
|
||||
page_num=1
|
||||
),
|
||||
ParsedBlock(
|
||||
type="paragraph",
|
||||
text="This is a test document with some content.",
|
||||
bbox=BBox(x=0, y=60, width=800, height=100),
|
||||
reading_order=2,
|
||||
page_num=1
|
||||
)
|
||||
],
|
||||
width=800,
|
||||
height=600
|
||||
)
|
||||
],
|
||||
metadata={}
|
||||
)
|
||||
|
||||
prompt = _build_qa_prompt(doc)
|
||||
|
||||
# Check prompt structure
|
||||
assert "OCR-документу" in prompt
|
||||
assert "JSON-масив" in prompt
|
||||
assert "question" in prompt
|
||||
assert "answer" in prompt
|
||||
assert "source_page" in prompt
|
||||
assert "Test Document" in prompt or "test document" in prompt.lower()
|
||||
|
||||
def test_parse_qa_response_valid_json(self):
|
||||
"""Test parsing valid JSON response from LLM"""
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[ParsedPage(page_num=1, blocks=[], width=800, height=600)],
|
||||
metadata={}
|
||||
)
|
||||
|
||||
response_text = json.dumps([
|
||||
{
|
||||
"question": "Що це за документ?",
|
||||
"answer": "Це тестовий документ",
|
||||
"source_page": 1,
|
||||
"confidence": 0.9
|
||||
},
|
||||
{
|
||||
"question": "Який контент?",
|
||||
"answer": "Тестовий контент",
|
||||
"source_page": 1
|
||||
}
|
||||
])
|
||||
|
||||
qa_pairs = _parse_qa_response(response_text, doc)
|
||||
|
||||
assert len(qa_pairs) == 2
|
||||
assert qa_pairs[0].question == "Що це за документ?"
|
||||
assert qa_pairs[0].answer == "Це тестовий документ"
|
||||
assert qa_pairs[0].source_page == 1
|
||||
assert qa_pairs[0].confidence == 0.9
|
||||
|
||||
def test_parse_qa_response_markdown_code_block(self):
|
||||
"""Test parsing JSON from markdown code block"""
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[ParsedPage(page_num=1, blocks=[], width=800, height=600)],
|
||||
metadata={}
|
||||
)
|
||||
|
||||
response_text = "```json\n" + json.dumps([
|
||||
{
|
||||
"question": "Тест?",
|
||||
"answer": "Відповідь"
|
||||
}
|
||||
]) + "\n```"
|
||||
|
||||
qa_pairs = _parse_qa_response(response_text, doc)
|
||||
|
||||
assert len(qa_pairs) == 1
|
||||
assert qa_pairs[0].question == "Тест?"
|
||||
|
||||
def test_parse_qa_response_invalid_json(self):
|
||||
"""Test parsing invalid JSON (should return empty list)"""
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[ParsedPage(page_num=1, blocks=[], width=800, height=600)],
|
||||
metadata={}
|
||||
)
|
||||
|
||||
response_text = "This is not JSON"
|
||||
|
||||
qa_pairs = _parse_qa_response(response_text, doc)
|
||||
|
||||
assert len(qa_pairs) == 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_build_qa_pairs_via_router_success(self):
|
||||
"""Test successful Q&A generation via DAGI Router"""
|
||||
# Create mock parsed document
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[
|
||||
ParsedPage(
|
||||
page_num=1,
|
||||
blocks=[
|
||||
ParsedBlock(
|
||||
type="paragraph",
|
||||
text="Test content",
|
||||
bbox=BBox(x=0, y=0, width=800, height=100),
|
||||
reading_order=1,
|
||||
page_num=1
|
||||
)
|
||||
],
|
||||
width=800,
|
||||
height=600
|
||||
)
|
||||
],
|
||||
metadata={}
|
||||
)
|
||||
|
||||
# Mock router response
|
||||
mock_response = {
|
||||
"ok": True,
|
||||
"data": {
|
||||
"text": json.dumps([
|
||||
{
|
||||
"question": "Що це?",
|
||||
"answer": "Тест",
|
||||
"source_page": 1
|
||||
}
|
||||
])
|
||||
}
|
||||
}
|
||||
|
||||
with patch("app.runtime.qa_builder.httpx.AsyncClient") as mock_client:
|
||||
mock_client.return_value.__aenter__.return_value.post = AsyncMock(
|
||||
return_value=MagicMock(
|
||||
raise_for_status=MagicMock(),
|
||||
json=lambda: mock_response
|
||||
)
|
||||
)
|
||||
|
||||
qa_pairs = await build_qa_pairs_via_router(doc, dao_id="test-dao")
|
||||
|
||||
assert len(qa_pairs) == 1
|
||||
assert qa_pairs[0].question == "Що це?"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_build_qa_pairs_via_router_failure(self):
|
||||
"""Test Q&A generation failure (should raise exception)"""
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[ParsedPage(page_num=1, blocks=[], width=800, height=600)],
|
||||
metadata={}
|
||||
)
|
||||
|
||||
with patch("app.runtime.qa_builder.httpx.AsyncClient") as mock_client:
|
||||
mock_client.return_value.__aenter__.return_value.post = AsyncMock(
|
||||
side_effect=Exception("Router error")
|
||||
)
|
||||
|
||||
with pytest.raises(RuntimeError):
|
||||
await build_qa_pairs_via_router(doc, dao_id="test-dao")
|
||||
|
||||
Reference in New Issue
Block a user