Services: Admin-Lehrer, Backend-Lehrer, Studio v2, Website, Klausur-Service, School-Service, Voice-Service, Geo-Service, BreakPilot Drive, Agent-Core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
570 lines
22 KiB
Python
570 lines
22 KiB
Python
"""
|
|
Unit Tests for CV Vocab Pipeline (cv_vocab_pipeline.py)

Tests cover:
- Data classes (PageRegion, VocabRow, PipelineResult)
- Stage 2: Deskew image
- Stage 3: Dewarp (pass-through)
- Stage 4: Image preparation (OCR + Layout images)
- Stage 5: Layout analysis (content bounds, projection profiles, column detection)
- Stage 6: Multi-pass OCR region handling
- Stage 7: Line grouping and vocabulary matching
- Orchestrator (run_cv_pipeline)

DSGVO (GDPR) note: All tests run locally with synthetic data. No external API calls.
|
|
"""
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from unittest.mock import AsyncMock, MagicMock, patch, PropertyMock
|
|
from dataclasses import asdict
|
|
|
|
# Import module under test
|
|
from cv_vocab_pipeline import (
|
|
PageRegion,
|
|
VocabRow,
|
|
PipelineResult,
|
|
deskew_image,
|
|
dewarp_image,
|
|
create_ocr_image,
|
|
create_layout_image,
|
|
_find_content_bounds,
|
|
analyze_layout,
|
|
_group_words_into_lines,
|
|
match_lines_to_vocab,
|
|
run_cv_pipeline,
|
|
CV2_AVAILABLE,
|
|
TESSERACT_AVAILABLE,
|
|
CV_PIPELINE_AVAILABLE,
|
|
)
|
|
|
|
|
|
# =============================================
|
|
# FIXTURES
|
|
# =============================================
|
|
|
|
@pytest.fixture
def white_image():
    """Provide a blank white BGR canvas, 300 px wide and 200 px tall."""
    height, width, channels = 200, 300, 3
    return np.full((height, width, channels), 255, dtype=np.uint8)
|
|
|
|
|
|
@pytest.fixture
def text_like_image():
    """Provide a white 600x400 (width x height) BGR page with three columns of dark bars.

    Each column holds 15 px tall dark bars (value 30) starting every 30 px
    from y=50 up to (but excluding) y=350. The strips between the columns
    stay white so they can act as column gaps.
    """
    page = np.full((400, 600, 3), 255, dtype=np.uint8)

    # Horizontal extent (x_start, x_stop) of the dark bars in each column,
    # left to right: EN, DE, example.
    column_spans = ((30, 160), (220, 360), (420, 570))

    for x_start, x_stop in column_spans:
        for bar_top in range(50, 350, 30):
            page[bar_top:bar_top + 15, x_start:x_stop, :] = 30  # dark text line

    return page
|
|
|
|
|
|
@pytest.fixture
def binary_image():
    """Provide a single-channel 600x400 (width x height) image for OCR tests.

    The background is white (255); black (0) bands, 15 px tall, are drawn
    every 30 px between y=50 and y=350 across x=30..570.
    """
    canvas = np.full((400, 600), 255, dtype=np.uint8)
    for band_top in range(50, 350, 30):
        canvas[band_top:band_top + 15, 30:570] = 0
    return canvas
|
|
|
|
|
|
@pytest.fixture
def sample_words_column_en():
    """Provide three OCR word dicts tagged as the English column."""
    # (text, top, conf) -- the remaining geometry is shared by all entries.
    entries = (
        ('achieve', 50, 90),
        ('improve', 80, 85),
        ('success', 110, 92),
    )
    return [
        {
            'text': text,
            'left': 30,
            'top': top,
            'width': 80,
            'height': 15,
            'conf': conf,
            'region_type': 'column_en',
        }
        for text, top, conf in entries
    ]
|
|
|
|
|
|
@pytest.fixture
def sample_words_column_de():
    """Provide three OCR word dicts tagged as the German column."""
    # (text, top, conf) -- the remaining geometry is shared by all entries.
    entries = (
        ('erreichen', 52, 88),
        ('verbessern', 82, 80),
        ('Erfolg', 112, 95),
    )
    return [
        {
            'text': text,
            'left': 220,
            'top': top,
            'width': 100,
            'height': 15,
            'conf': conf,
            'region_type': 'column_de',
        }
        for text, top, conf in entries
    ]
|
|
|
|
|
|
@pytest.fixture
def sample_words_column_ex():
    """Provide four OCR word dicts tagged as the example-sentence column."""
    # (text, left, top, width, conf) -- height and region type are shared.
    entries = (
        ('She', 420, 50, 30, 85),
        ('achieved', 455, 50, 70, 80),
        ('her', 530, 50, 30, 90),
        ('goals.', 420, 52, 50, 75),
    )
    return [
        {
            'text': text,
            'left': left,
            'top': top,
            'width': width,
            'height': 15,
            'conf': conf,
            'region_type': 'column_example',
        }
        for text, left, top, width, conf in entries
    ]
|
|
|
|
|
|
@pytest.fixture
def sample_regions():
    """Provide a three-column PageRegion layout (EN, DE, example)."""
    # (region type, x, width) -- every region starts at y=50 and is 300 px tall.
    layout = (
        ('column_en', 0, 190),
        ('column_de', 210, 160),
        ('column_example', 410, 190),
    )
    return [
        PageRegion(type=kind, x=x_start, y=50, width=span, height=300)
        for kind, x_start, span in layout
    ]
|
|
|
|
|
|
# =============================================
|
|
# DATA CLASS TESTS
|
|
# =============================================
|
|
|
|
class TestDataClasses:
    """Check construction and default values of the pipeline data classes."""

    def test_page_region_creation(self):
        """Constructor arguments are stored on the matching attributes."""
        expected = {'type': 'column_en', 'x': 10, 'y': 20, 'width': 100, 'height': 200}
        region = PageRegion(**expected)
        for field_name, value in expected.items():
            assert getattr(region, field_name) == value

    def test_vocab_row_defaults(self):
        """A VocabRow built without arguments has empty text and zero numbers."""
        blank = VocabRow()
        for text_field in (blank.english, blank.german, blank.example):
            assert text_field == ""
        assert blank.confidence == 0.0
        assert blank.y_position == 0

    def test_vocab_row_with_values(self):
        """Explicit values passed to VocabRow are readable afterwards."""
        filled = VocabRow(
            english="test",
            german="Test",
            example="A test.",
            confidence=85.5,
            y_position=100,
        )
        assert filled.english == "test"
        assert filled.german == "Test"
        assert filled.confidence == 85.5

    def test_pipeline_result_defaults(self):
        """A default PipelineResult is empty and carries no error."""
        empty = PipelineResult()
        assert empty.vocabulary == []
        assert empty.word_count == 0
        assert empty.columns_detected == 0
        assert empty.duration_seconds == 0.0
        assert empty.stages == {}
        assert empty.error is None

    def test_pipeline_result_error(self):
        """The error message given at construction is kept verbatim."""
        message = "Something went wrong"
        failed = PipelineResult(error=message)
        assert failed.error == "Something went wrong"
|
|
|
|
|
|
# =============================================
|
|
# STAGE 2: DESKEW TESTS
|
|
# =============================================
|
|
|
|
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDeskew:
    """Exercise the deskew (rotation correction) stage."""

    def test_deskew_straight_image(self, white_image):
        """A blank, unrotated image yields a near-zero angle and keeps its shape."""
        straightened, detected_angle = deskew_image(white_image)
        assert -0.1 < detected_angle < 0.1
        assert straightened.shape == white_image.shape

    def test_deskew_returns_tuple(self, white_image):
        """deskew_image returns a 2-tuple of (ndarray image, float angle)."""
        outcome = deskew_image(white_image)
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        # Check each slot against its expected type, in positional order.
        for position, expected_type in enumerate((np.ndarray, float)):
            assert isinstance(outcome[position], expected_type)

    def test_deskew_preserves_shape(self, text_like_image):
        """The corrected image has the same shape as the input image."""
        straightened = deskew_image(text_like_image)[0]
        assert straightened.shape == text_like_image.shape
|
|
|
|
|
|
# =============================================
|
|
# STAGE 3: DEWARP TESTS
|
|
# =============================================
|
|
|
|
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestDewarp:
    """Exercise the dewarp stage, which is expected to be a pass-through."""

    def test_dewarp_passthrough(self, white_image):
        """The dewarped output equals the input image element for element."""
        dewarped = dewarp_image(white_image)
        np.testing.assert_array_equal(dewarped, white_image)

    def test_dewarp_preserves_shape(self, text_like_image):
        """The dewarped output has the same shape as the input image."""
        expected_shape = text_like_image.shape
        assert dewarp_image(text_like_image).shape == expected_shape
|
|
|
|
|
|
# =============================================
|
|
# STAGE 4: IMAGE PREPARATION TESTS
|
|
# =============================================
|
|
|
|
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestImagePreparation:
    """Exercise creation of the OCR image and the layout image."""

    def test_create_ocr_image_returns_grayscale(self, text_like_image):
        """The OCR image is a single-channel uint8 array."""
        prepared = create_ocr_image(text_like_image)
        assert len(prepared.shape) == 2  # two dimensions => one channel
        assert prepared.dtype == np.uint8

    def test_create_ocr_image_is_binary(self, text_like_image):
        """The OCR image contains no pixel values other than 0 and 255."""
        prepared = create_ocr_image(text_like_image)
        allowed = [0, 255]
        for pixel_value in np.unique(prepared):
            assert pixel_value in allowed

    def test_create_layout_image_returns_grayscale(self, text_like_image):
        """The layout image is a single-channel uint8 array."""
        prepared = create_layout_image(text_like_image)
        assert len(prepared.shape) == 2
        assert prepared.dtype == np.uint8

    def test_create_layout_image_enhanced_contrast(self, text_like_image):
        """The layout image has the same shape as a plain grayscale conversion.

        NOTE(review): only the shape is compared here; the pixel values
        (histogram) of the two images are not checked.
        """
        import cv2

        plain_gray = cv2.cvtColor(text_like_image, cv2.COLOR_BGR2GRAY)
        enhanced = create_layout_image(text_like_image)
        assert enhanced.shape == plain_gray.shape
|
|
|
|
|
|
# =============================================
|
|
# STAGE 5: LAYOUT ANALYSIS TESTS
|
|
# =============================================
|
|
|
|
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestContentBounds:
    """Exercise the _find_content_bounds helper."""

    def test_empty_image(self):
        """With an all-zero inverted image the bounds stay inside the image."""
        img_height, img_width = 200, 300
        blank_inverted = np.zeros((img_height, img_width), dtype=np.uint8)

        left, right, top, bottom = _find_content_bounds(blank_inverted)

        # No content present: only require the bounds to lie within the canvas.
        assert left >= 0
        assert right <= img_width
        assert top >= 0
        assert bottom <= img_height

    def test_centered_content(self):
        """A filled block is enclosed by the bounds, allowing 2 px of slack."""
        inverted = np.zeros((400, 600), dtype=np.uint8)
        # Content block covering rows 100..300 and columns 50..550.
        inverted[100:300, 50:550] = 255

        left, right, top, bottom = _find_content_bounds(inverted)

        assert left <= 52     # block starts at x=50
        assert right >= 548   # block ends at x=550
        assert top <= 102     # block starts at y=100
        assert bottom >= 298  # block ends at y=300
|
|
|
|
|
|
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestLayoutAnalysis:
    """Exercise analyze_layout and its column detection."""

    @staticmethod
    def _regions_for(bgr_image):
        """Prepare the OCR and layout images, then return the analyzed regions."""
        ocr_img = create_ocr_image(bgr_image)
        layout_img = create_layout_image(bgr_image)
        return analyze_layout(layout_img, ocr_img)

    @staticmethod
    def _count_columns(regions):
        """Count the regions whose type starts with 'column'."""
        return len([region for region in regions if region.type.startswith('column')])

    def test_returns_list_of_regions(self, text_like_image):
        """analyze_layout returns a list containing only PageRegion objects."""
        regions = self._regions_for(text_like_image)
        assert isinstance(regions, list)
        for region in regions:
            assert isinstance(region, PageRegion)

    def test_detects_columns(self, text_like_image):
        """A clear three-column image yields at least one column region."""
        regions = self._regions_for(text_like_image)
        assert self._count_columns(regions) >= 1

    def test_single_column_fallback(self):
        """An image without clear column gaps still yields at least one column."""
        # Dark bars spanning almost the full width, so no gap separates columns.
        page = np.ones((400, 600, 3), dtype=np.uint8) * 255
        for bar_top in range(50, 350, 20):
            page[bar_top:bar_top + 10, 20:580, :] = 30

        regions = self._regions_for(page)

        # At minimum one column is expected (full page fallback).
        assert self._count_columns(regions) >= 1

    def test_region_types_are_valid(self, text_like_image):
        """Every returned region type belongs to the expected set."""
        valid_types = {'column_en', 'column_de', 'column_example', 'header', 'footer'}
        for r in self._regions_for(text_like_image):
            assert r.type in valid_types, f"Unexpected region type: {r.type}"
|
|
|
|
|
|
# =============================================
|
|
# STAGE 7: LINE GROUPING TESTS
|
|
# =============================================
|
|
|
|
class TestLineGrouping:
    """Exercise the _group_words_into_lines function."""

    @staticmethod
    def _word(text, left, top, conf):
        """Build an OCR word dict with the fixed 50x15 box used in these tests."""
        return {'text': text, 'left': left, 'top': top, 'width': 50, 'height': 15, 'conf': conf}

    def test_empty_input(self):
        """An empty word list produces an empty list of lines."""
        grouped = _group_words_into_lines([])
        assert grouped == []

    def test_single_word(self):
        """One word produces one line that holds exactly that word."""
        grouped = _group_words_into_lines([self._word('hello', 10, 50, 90)])
        assert len(grouped) == 1
        assert len(grouped[0]) == 1
        assert grouped[0][0]['text'] == 'hello'

    def test_words_on_same_line(self):
        """Words whose tops differ by 2 px share a line at a 10 px tolerance."""
        pair = [
            self._word('hello', 10, 50, 90),
            self._word('world', 70, 52, 85),
        ]
        grouped = _group_words_into_lines(pair, y_tolerance_px=10)
        assert len(grouped) == 1
        assert len(grouped[0]) == 2

    def test_words_on_different_lines(self):
        """Words 50 px apart vertically land on separate lines at a 20 px tolerance."""
        stacked = [
            self._word(text, 10, top, conf)
            for text, top, conf in (('line1', 50, 90), ('line2', 100, 85), ('line3', 150, 88))
        ]
        grouped = _group_words_into_lines(stacked, y_tolerance_px=20)
        assert len(grouped) == 3

    def test_words_sorted_by_x_within_line(self):
        """Within one line the words come back ordered by their left edge."""
        # Deliberately supplied right-to-left.
        reversed_pair = [
            self._word('world', 100, 50, 85),
            self._word('hello', 10, 52, 90),
        ]
        grouped = _group_words_into_lines(reversed_pair, y_tolerance_px=10)
        assert len(grouped) == 1
        only_line = grouped[0]
        assert only_line[0]['text'] == 'hello'
        assert only_line[1]['text'] == 'world'
|
|
|
|
|
|
# =============================================
|
|
# STAGE 7: VOCABULARY MATCHING TESTS
|
|
# =============================================
|
|
|
|
class TestVocabMatching:
    """Exercise the match_lines_to_vocab function."""

    def test_empty_results(self, sample_regions):
        """An empty OCR result mapping produces an empty vocabulary."""
        assert match_lines_to_vocab({}, sample_regions) == []

    def test_en_only(self, sample_words_column_en, sample_regions):
        """With English words only, each row has English text and no German text."""
        rows = match_lines_to_vocab({'column_en': sample_words_column_en}, sample_regions)
        assert len(rows) == 3
        for entry in rows:
            assert entry.english != ""
            assert entry.german == ""

    def test_en_de_matching(self, sample_words_column_en, sample_words_column_de, sample_regions):
        """English and German words at (nearly) the same Y end up in one row."""
        per_column = {}
        per_column['column_en'] = sample_words_column_en
        per_column['column_de'] = sample_words_column_de

        rows = match_lines_to_vocab(per_column, sample_regions, y_tolerance_px=25)

        assert len(rows) == 3
        # The first row pairs achieve <-> erreichen.
        first = rows[0]
        assert first.english == 'achieve'
        assert first.german == 'erreichen'

    def test_full_3_column_matching(self, sample_words_column_en, sample_words_column_de,
                                    sample_words_column_ex, sample_regions):
        """With all three columns present, the first row also carries example text."""
        per_column = {}
        per_column['column_en'] = sample_words_column_en
        per_column['column_de'] = sample_words_column_de
        per_column['column_example'] = sample_words_column_ex

        rows = match_lines_to_vocab(per_column, sample_regions, y_tolerance_px=25)

        assert len(rows) >= 1
        first = rows[0]
        assert first.english == 'achieve'
        assert first.example != ""

    def test_sorted_by_y_position(self, sample_words_column_en, sample_regions):
        """Rows come back in ascending y_position order."""
        rows = match_lines_to_vocab({'column_en': sample_words_column_en}, sample_regions)
        y_values = [entry.y_position for entry in rows]
        assert y_values == sorted(y_values)

    def test_skips_short_entries(self, sample_regions):
        """A one-character English entry is dropped; the longer one is kept."""
        too_short = {'text': 'a', 'left': 30, 'top': 50, 'width': 10, 'height': 15,
                     'conf': 90, 'region_type': 'column_en'}
        long_enough = {'text': 'valid', 'left': 30, 'top': 80, 'width': 50, 'height': 15,
                       'conf': 90, 'region_type': 'column_en'}

        rows = match_lines_to_vocab({'column_en': [too_short, long_enough]}, sample_regions)

        assert len(rows) == 1
        assert rows[0].english == 'valid'

    def test_confidence_calculation(self, sample_words_column_en, sample_words_column_de, sample_regions):
        """Row confidence is about the mean of the matched column confidences."""
        per_column = {}
        per_column['column_en'] = sample_words_column_en
        per_column['column_de'] = sample_words_column_de

        rows = match_lines_to_vocab(per_column, sample_regions, y_tolerance_px=25)

        # First row: EN conf=90 and DE conf=88, so the expected mean is 89.
        first_confidence = rows[0].confidence
        assert first_confidence > 0
        assert first_confidence == pytest.approx(89.0, abs=1.0)
|
|
|
|
|
|
# =============================================
|
|
# ORCHESTRATOR TESTS
|
|
# =============================================
|
|
|
|
class TestOrchestrator:
    """Exercise the run_cv_pipeline orchestrator."""

    @staticmethod
    def _stub_empty_ocr(mock_tess):
        """Configure the patched pytesseract so image_to_data reports no words."""
        columns = ('text', 'conf', 'left', 'top', 'width', 'height')
        mock_tess.image_to_data.return_value = {column: [] for column in columns}
        mock_tess.Output.DICT = 'dict'

    @pytest.mark.asyncio
    async def test_no_input_returns_error(self):
        """Calling the pipeline without any input yields a 'No input data' error."""
        outcome = await run_cv_pipeline()
        assert outcome.error is not None
        assert "No input data" in outcome.error

    @pytest.mark.asyncio
    async def test_pipeline_unavailable(self):
        """With CV_PIPELINE_AVAILABLE patched to False an error is returned."""
        with patch('cv_vocab_pipeline.CV_PIPELINE_AVAILABLE', False):
            outcome = await run_cv_pipeline(pdf_data=b"fake")

        assert outcome.error is not None
        assert "not available" in outcome.error

    @pytest.mark.asyncio
    @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
    async def test_pipeline_with_image_data(self):
        """A PNG-encoded synthetic image runs through the pipeline without error."""
        import cv2

        # White 300x200 canvas with dark, text-like horizontal bars.
        canvas = np.ones((200, 300, 3), dtype=np.uint8) * 255
        for bar_top in range(30, 170, 25):
            canvas[bar_top:bar_top + 12, 20:280, :] = 30
        _, encoded = cv2.imencode('.png', canvas)
        png_bytes = encoded.tobytes()

        with patch('cv_vocab_pipeline.pytesseract') as mock_tess:
            # Tesseract is mocked and returns empty results.
            self._stub_empty_ocr(mock_tess)
            outcome = await run_cv_pipeline(image_data=png_bytes)

        assert outcome.error is None
        assert outcome.image_width == 300
        assert outcome.image_height == 200
        for stage_name in ('render', 'deskew'):
            assert stage_name in outcome.stages

    @pytest.mark.asyncio
    @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
    async def test_pipeline_records_timing(self):
        """The total duration and every recorded stage value are non-negative."""
        import cv2

        canvas = np.ones((100, 150, 3), dtype=np.uint8) * 255
        _, encoded = cv2.imencode('.png', canvas)

        with patch('cv_vocab_pipeline.pytesseract') as mock_tess:
            self._stub_empty_ocr(mock_tess)
            outcome = await run_cv_pipeline(image_data=encoded.tobytes())

        assert outcome.duration_seconds >= 0
        for stage_value in outcome.stages.values():
            assert stage_value >= 0

    @pytest.mark.asyncio
    async def test_pipeline_result_format(self):
        """A vocabulary entry stored on PipelineResult exposes the expected keys."""
        outcome = PipelineResult()
        outcome.vocabulary = [
            {"english": "test", "german": "Test", "example": "A test.", "confidence": 90.0}
        ]

        assert len(outcome.vocabulary) == 1
        entry = outcome.vocabulary[0]
        for key in ("english", "german", "example", "confidence"):
            assert key in entry
|
|
|
|
|
|
# =============================================
|
|
# INTEGRATION-STYLE TESTS (with mocked Tesseract)
|
|
# =============================================
|
|
|
|
@pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available")
class TestStageIntegration:
    """Chain several pipeline stages together without running real OCR."""

    def test_image_prep_to_layout(self, text_like_image):
        """Stages 4 -> 5: prepared images keep the input size and feed layout analysis."""
        ocr_img = create_ocr_image(text_like_image)
        layout_img = create_layout_image(text_like_image)

        source_size = text_like_image.shape[:2]
        assert ocr_img.shape[:2] == source_size
        assert layout_img.shape[:2] == source_size

        detected = analyze_layout(layout_img, ocr_img)
        assert len(detected) >= 1

    def test_deskew_to_image_prep(self, text_like_image):
        """Stages 2 -> 4: the deskewed image can be passed on to image preparation."""
        straightened, _angle = deskew_image(text_like_image)

        ocr_img = create_ocr_image(straightened)
        layout_img = create_layout_image(straightened)

        straightened_size = straightened.shape[:2]
        assert ocr_img.shape[:2] == straightened_size
        assert layout_img.shape[:2] == straightened_size
|
|
|
|
|
|
# =============================================
|
|
# RUN TESTS
|
|
# =============================================
|
|
|
|
if __name__ == "__main__":
    # Allow running this test module directly: hand it to pytest in verbose mode.
    _pytest_args = [__file__, "-v"]
    pytest.main(_pytest_args)
|