feat: add tests and integrate dots.ocr model
G.2.5 - Tests: - Add pytest test suite with fixtures - test_preprocessing.py - PDF/image loading, normalization, validation - test_postprocessing.py - chunks, QA pairs, markdown generation - test_inference.py - dummy parser and inference functions - test_api.py - API endpoint tests - Add pytest.ini configuration G.1.3 - dots.ocr Integration: - Update model_loader.py with real model loading code - Support for AutoModelForVision2Seq and AutoProcessor - Device handling (CUDA/CPU/MPS) with fallback - Error handling with dummy fallback option - Update inference.py with real model inference - Process images through model - Generate and decode outputs - Parse model output to blocks - Add model_output_parser.py - Parse JSON or plain text model output - Convert to structured blocks - Layout detection support (placeholder) Dependencies: - Add pytest, pytest-asyncio, httpx for testing
This commit is contained in:
193
services/parser-service/tests/test_postprocessing.py
Normal file
193
services/parser-service/tests/test_postprocessing.py
Normal file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Tests for postprocessing functions
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from app.runtime.postprocessing import (
|
||||
normalize_text,
|
||||
build_parsed_document,
|
||||
build_chunks,
|
||||
build_qa_pairs,
|
||||
build_markdown
|
||||
)
|
||||
from app.schemas import ParsedDocument, ParsedPage, ParsedBlock, BBox
|
||||
|
||||
|
||||
class TestTextNormalization:
|
||||
"""Tests for text normalization"""
|
||||
|
||||
def test_normalize_text_whitespace(self):
|
||||
"""Test removing extra whitespace"""
|
||||
text = " hello world "
|
||||
assert normalize_text(text) == "hello world"
|
||||
|
||||
def test_normalize_text_newlines(self):
|
||||
"""Test removing newlines"""
|
||||
text = "hello\n\nworld"
|
||||
assert normalize_text(text) == "hello world"
|
||||
|
||||
def test_normalize_text_empty(self):
|
||||
"""Test empty text"""
|
||||
assert normalize_text("") == ""
|
||||
assert normalize_text(" ") == ""
|
||||
|
||||
|
||||
class TestBuildParsedDocument:
|
||||
"""Tests for building ParsedDocument"""
|
||||
|
||||
def test_build_parsed_document(self):
|
||||
"""Test building ParsedDocument from model output"""
|
||||
pages_data = [
|
||||
{
|
||||
"blocks": [
|
||||
{
|
||||
"type": "heading",
|
||||
"text": " Title ",
|
||||
"bbox": {"x": 0, "y": 0, "width": 100, "height": 20},
|
||||
"reading_order": 1
|
||||
},
|
||||
{
|
||||
"type": "paragraph",
|
||||
"text": " Content ",
|
||||
"bbox": {"x": 0, "y": 30, "width": 100, "height": 50},
|
||||
"reading_order": 2
|
||||
}
|
||||
],
|
||||
"width": 800,
|
||||
"height": 1200
|
||||
}
|
||||
]
|
||||
|
||||
doc = build_parsed_document(pages_data, "test-doc", "pdf")
|
||||
|
||||
assert doc.doc_id == "test-doc"
|
||||
assert doc.doc_type == "pdf"
|
||||
assert len(doc.pages) == 1
|
||||
assert len(doc.pages[0].blocks) == 2
|
||||
assert doc.pages[0].blocks[0].text == "Title" # Normalized
|
||||
assert doc.pages[0].blocks[0].type == "heading"
|
||||
|
||||
|
||||
class TestBuildChunks:
|
||||
"""Tests for building chunks"""
|
||||
|
||||
def test_build_chunks(self):
|
||||
"""Test building chunks from ParsedDocument"""
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[
|
||||
ParsedPage(
|
||||
page_num=1,
|
||||
blocks=[
|
||||
ParsedBlock(
|
||||
type="heading",
|
||||
text="Section 1",
|
||||
bbox=BBox(x=0, y=0, width=100, height=20),
|
||||
reading_order=1,
|
||||
page_num=1
|
||||
),
|
||||
ParsedBlock(
|
||||
type="paragraph",
|
||||
text="Content of section 1",
|
||||
bbox=BBox(x=0, y=30, width=100, height=50),
|
||||
reading_order=2,
|
||||
page_num=1
|
||||
)
|
||||
],
|
||||
width=800,
|
||||
height=1200
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
chunks = build_chunks(doc, dao_id="test-dao")
|
||||
|
||||
assert len(chunks) > 0
|
||||
assert all(chunk.page == 1 for chunk in chunks)
|
||||
assert all(chunk.metadata.get("dao_id") == "test-dao" for chunk in chunks)
|
||||
assert all(chunk.metadata.get("doc_id") == "test-doc" for chunk in chunks)
|
||||
|
||||
|
||||
class TestBuildQAPairs:
|
||||
"""Tests for building Q&A pairs"""
|
||||
|
||||
def test_build_qa_pairs(self):
|
||||
"""Test building Q&A pairs from ParsedDocument"""
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[
|
||||
ParsedPage(
|
||||
page_num=1,
|
||||
blocks=[
|
||||
ParsedBlock(
|
||||
type="heading",
|
||||
text="What is X?",
|
||||
bbox=BBox(x=0, y=0, width=100, height=20),
|
||||
reading_order=1,
|
||||
page_num=1
|
||||
),
|
||||
ParsedBlock(
|
||||
type="paragraph",
|
||||
text="X is a test",
|
||||
bbox=BBox(x=0, y=30, width=100, height=50),
|
||||
reading_order=2,
|
||||
page_num=1
|
||||
)
|
||||
],
|
||||
width=800,
|
||||
height=1200
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
qa_pairs = build_qa_pairs(doc, max_pairs=5)
|
||||
|
||||
assert len(qa_pairs) > 0
|
||||
assert all(isinstance(qa.question, str) for qa in qa_pairs)
|
||||
assert all(isinstance(qa.answer, str) for qa in qa_pairs)
|
||||
assert all(qa.source_page == 1 for qa in qa_pairs)
|
||||
|
||||
|
||||
class TestBuildMarkdown:
|
||||
"""Tests for building Markdown"""
|
||||
|
||||
def test_build_markdown(self):
|
||||
"""Test building Markdown from ParsedDocument"""
|
||||
doc = ParsedDocument(
|
||||
doc_id="test-doc",
|
||||
doc_type="pdf",
|
||||
pages=[
|
||||
ParsedPage(
|
||||
page_num=1,
|
||||
blocks=[
|
||||
ParsedBlock(
|
||||
type="heading",
|
||||
text="Title",
|
||||
bbox=BBox(x=0, y=0, width=100, height=20),
|
||||
reading_order=1,
|
||||
page_num=1
|
||||
),
|
||||
ParsedBlock(
|
||||
type="paragraph",
|
||||
text="Content",
|
||||
bbox=BBox(x=0, y=30, width=100, height=50),
|
||||
reading_order=2,
|
||||
page_num=1
|
||||
)
|
||||
],
|
||||
width=800,
|
||||
height=1200
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
markdown = build_markdown(doc)
|
||||
|
||||
assert isinstance(markdown, str)
|
||||
assert "Title" in markdown
|
||||
assert "Content" in markdown
|
||||
assert "###" in markdown or "####" in markdown # Heading markers
|
||||
|
||||
Reference in New Issue
Block a user