Files
microdao-daarion/services/parser-service/tests/test_qa_pipeline.py
Apple d3c701f3ff feat: add qa_build mode, tests, and region mode support
Router Configuration:
- Add mode='qa_build' routing rule in router-config.yml
- Priority 8, uses local_qwen3_8b for Q&A generation

2-Stage Q&A Pipeline Tests:
- Create test_qa_pipeline.py with comprehensive tests
- Test prompt building, JSON parsing, router integration
- Mock DAGI Router responses for testing

Region Mode (Grounding OCR):
- Add region_bbox and region_page parameters to ParseRequest
- Support region mode in local_runtime with bbox in prompt
- Update endpoints to accept region parameters (x, y, width, height, page)
- Validate region parameters and filter pages for region mode
- Pass region_bbox through inference pipeline

Updates:
- Update local_runtime to support region_bbox in prompts
- Update inference.py to pass region_bbox to local_runtime
- Update endpoints.py to handle region mode parameters
2025-11-16 04:26:35 -08:00

198 lines
6.5 KiB
Python

"""
Tests for 2-stage Q&A pipeline (PARSER → LLM)
"""
import pytest
from unittest.mock import AsyncMock, patch, MagicMock
from PIL import Image
import io
import json
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
from app.runtime.qa_builder import build_qa_pairs_via_router, _build_qa_prompt, _parse_qa_response
class TestQABuilder:
    """Exercise the Q&A builder helpers and the router-backed entry point."""

    @staticmethod
    def _one_page_doc(blocks=None):
        """Return a single-page PDF ``ParsedDocument`` holding ``blocks``.

        Every test in this class uses the same document shell (id
        ``test-doc``, one 800x600 page numbered 1, empty metadata); only the
        blocks placed on that page differ. Omitting ``blocks`` gives an
        empty page.
        """
        page = ParsedPage(
            page_num=1,
            blocks=[] if blocks is None else blocks,
            width=800,
            height=600,
        )
        return ParsedDocument(
            doc_id="test-doc",
            doc_type="pdf",
            pages=[page],
            metadata={},
        )

    def test_build_qa_prompt(self):
        """The prompt contains the expected instructions, keys and page text."""
        heading = ParsedBlock(
            type="heading",
            text="Test Document",
            bbox=BBox(x=0, y=0, width=800, height=50),
            reading_order=1,
            page_num=1,
        )
        paragraph = ParsedBlock(
            type="paragraph",
            text="This is a test document with some content.",
            bbox=BBox(x=0, y=60, width=800, height=100),
            reading_order=2,
            page_num=1,
        )

        prompt = _build_qa_prompt(self._one_page_doc([heading, paragraph]))

        # Instruction wording and the JSON field names requested from the LLM.
        for fragment in (
            "OCR-документу",
            "JSON-масив",
            "question",
            "answer",
            "source_page",
        ):
            assert fragment in prompt

        # The heading text must appear in the prompt, in any letter case.
        assert "Test Document" in prompt or "test document" in prompt.lower()

    def test_parse_qa_response_valid_json(self):
        """A plain JSON array is parsed into one Q&A pair per item."""
        llm_items = [
            {
                "question": "Що це за документ?",
                "answer": "Це тестовий документ",
                "source_page": 1,
                "confidence": 0.9,
            },
            {
                "question": "Який контент?",
                "answer": "Тестовий контент",
                "source_page": 1,
            },
        ]

        parsed = _parse_qa_response(json.dumps(llm_items), self._one_page_doc())

        assert len(parsed) == 2
        first = parsed[0]
        assert first.question == "Що це за документ?"
        assert first.answer == "Це тестовий документ"
        assert first.source_page == 1
        assert first.confidence == 0.9

    def test_parse_qa_response_markdown_code_block(self):
        """JSON wrapped in a Markdown ``json`` code fence is still parsed."""
        payload = json.dumps([
            {
                "question": "Тест?",
                "answer": "Відповідь",
            }
        ])
        fenced = f"```json\n{payload}\n```"

        parsed = _parse_qa_response(fenced, self._one_page_doc())

        assert len(parsed) == 1
        assert parsed[0].question == "Тест?"

    def test_parse_qa_response_invalid_json(self):
        """Text that is not JSON produces an empty result."""
        parsed = _parse_qa_response("This is not JSON", self._one_page_doc())

        assert len(parsed) == 0

    @pytest.mark.asyncio
    async def test_build_qa_pairs_via_router_success(self):
        """A successful (mocked) router reply is turned into Q&A pairs."""
        content_block = ParsedBlock(
            type="paragraph",
            text="Test content",
            bbox=BBox(x=0, y=0, width=800, height=100),
            reading_order=1,
            page_num=1,
        )
        document = self._one_page_doc([content_block])

        # Canned router reply: the generated pairs arrive as a JSON string
        # under data.text.
        router_payload = {
            "ok": True,
            "data": {
                "text": json.dumps([
                    {
                        "question": "Що це?",
                        "answer": "Тест",
                        "source_page": 1,
                    }
                ])
            },
        }
        http_response = MagicMock()
        http_response.raise_for_status = MagicMock()
        http_response.json = lambda: router_payload

        with patch("app.runtime.qa_builder.httpx.AsyncClient") as client_cls:
            # The object yielded by ``async with AsyncClient(...)`` gets a
            # stubbed ``post`` that resolves to the canned response.
            session = client_cls.return_value.__aenter__.return_value
            session.post = AsyncMock(return_value=http_response)

            parsed = await build_qa_pairs_via_router(document, dao_id="test-dao")

        assert len(parsed) == 1
        assert parsed[0].question == "Що це?"

    @pytest.mark.asyncio
    async def test_build_qa_pairs_via_router_failure(self):
        """An exception raised by the router POST surfaces as RuntimeError."""
        document = self._one_page_doc()

        with patch("app.runtime.qa_builder.httpx.AsyncClient") as client_cls:
            session = client_cls.return_value.__aenter__.return_value
            session.post = AsyncMock(side_effect=Exception("Router error"))

            with pytest.raises(RuntimeError):
                await build_qa_pairs_via_router(document, dao_id="test-dao")