""" Unit Tests for CV Vocab Pipeline (cv_vocab_pipeline.py) Tests cover: - Data classes (PageRegion, VocabRow, PipelineResult) - Stage 2: Deskew image - Stage 3: Dewarp (pass-through) - Stage 4: Image preparation (OCR + Layout images) - Stage 5: Layout analysis (content bounds, projection profiles, column detection) - Stage 6: Multi-pass OCR region handling - Stage 7: Line grouping and vocabulary matching - Orchestrator (run_cv_pipeline) DSGVO Note: All tests run locally with synthetic data. No external API calls. """ import pytest import numpy as np from unittest.mock import AsyncMock, MagicMock, patch, PropertyMock from dataclasses import asdict # Import module under test from cv_vocab_pipeline import ( PageRegion, VocabRow, PipelineResult, deskew_image, dewarp_image, create_ocr_image, create_layout_image, _find_content_bounds, analyze_layout, _group_words_into_lines, match_lines_to_vocab, run_cv_pipeline, CV2_AVAILABLE, TESSERACT_AVAILABLE, CV_PIPELINE_AVAILABLE, ) # ============================================= # FIXTURES # ============================================= @pytest.fixture def white_image(): """Create a simple 300x200 white BGR image.""" return np.ones((200, 300, 3), dtype=np.uint8) * 255 @pytest.fixture def text_like_image(): """Create a 600x400 image with dark text-like regions simulating 3 columns.""" img = np.ones((400, 600, 3), dtype=np.uint8) * 255 # Column 1 (EN): x=20..170 for y in range(50, 350, 30): img[y:y+15, 30:160, :] = 30 # Dark text lines # Gap between col1 and col2: x=170..210 (white) # Column 2 (DE): x=210..370 for y in range(50, 350, 30): img[y:y+15, 220:360, :] = 30 # Gap between col2 and col3: x=370..410 (white) # Column 3 (Example): x=410..580 for y in range(50, 350, 30): img[y:y+15, 420:570, :] = 30 return img @pytest.fixture def binary_image(): """Create a binary (single-channel) image for OCR tests.""" # White background (255) with some black text-like areas img = np.ones((400, 600), dtype=np.uint8) * 255 # Add text-like dark bands for y in range(50, 350, 30): img[y:y+15, 30:570] = 0 return img @pytest.fixture def sample_words_column_en(): """Sample OCR word dicts for English column.""" return [ {'text': 'achieve', 'left': 30, 'top': 50, 'width': 80, 'height': 15, 'conf': 90, 'region_type': 'column_en'}, {'text': 'improve', 'left': 30, 'top': 80, 'width': 80, 'height': 15, 'conf': 85, 'region_type': 'column_en'}, {'text': 'success', 'left': 30, 'top': 110, 'width': 80, 'height': 15, 'conf': 92, 'region_type': 'column_en'}, ] @pytest.fixture def sample_words_column_de(): """Sample OCR word dicts for German column.""" return [ {'text': 'erreichen', 'left': 220, 'top': 52, 'width': 100, 'height': 15, 'conf': 88, 'region_type': 'column_de'}, {'text': 'verbessern', 'left': 220, 'top': 82, 'width': 100, 'height': 15, 'conf': 80, 'region_type': 'column_de'}, {'text': 'Erfolg', 'left': 220, 'top': 112, 'width': 100, 'height': 15, 'conf': 95, 'region_type': 'column_de'}, ] @pytest.fixture def sample_words_column_ex(): """Sample OCR word dicts for Example column.""" return [ {'text': 'She', 'left': 420, 'top': 50, 'width': 30, 'height': 15, 'conf': 85, 'region_type': 'column_example'}, {'text': 'achieved', 'left': 455, 'top': 50, 'width': 70, 'height': 15, 'conf': 80, 'region_type': 'column_example'}, {'text': 'her', 'left': 530, 'top': 50, 'width': 30, 'height': 15, 'conf': 90, 'region_type': 'column_example'}, {'text': 'goals.', 'left': 420, 'top': 52, 'width': 50, 'height': 15, 'conf': 75, 'region_type': 'column_example'}, ] @pytest.fixture def sample_regions(): """Sample 3-column PageRegion layout.""" return [ PageRegion(type='column_en', x=0, y=50, width=190, height=300), PageRegion(type='column_de', x=210, y=50, width=160, height=300), PageRegion(type='column_example', x=410, y=50, width=190, height=300), ] # ============================================= # DATA CLASS TESTS # ============================================= class TestDataClasses: """Test data classes for correct defaults and fields.""" def test_page_region_creation(self): region = PageRegion(type='column_en', x=10, y=20, width=100, height=200) assert region.type == 'column_en' assert region.x == 10 assert region.y == 20 assert region.width == 100 assert region.height == 200 def test_vocab_row_defaults(self): row = VocabRow() assert row.english == "" assert row.german == "" assert row.example == "" assert row.confidence == 0.0 assert row.y_position == 0 def test_vocab_row_with_values(self): row = VocabRow(english="test", german="Test", example="A test.", confidence=85.5, y_position=100) assert row.english == "test" assert row.german == "Test" assert row.confidence == 85.5 def test_pipeline_result_defaults(self): result = PipelineResult() assert result.vocabulary == [] assert result.word_count == 0 assert result.columns_detected == 0 assert result.duration_seconds == 0.0 assert result.stages == {} assert result.error is None def test_pipeline_result_error(self): result = PipelineResult(error="Something went wrong") assert result.error == "Something went wrong" # ============================================= # STAGE 2: DESKEW TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestDeskew: """Test deskew (rotation correction) stage.""" def test_deskew_straight_image(self, white_image): """A perfectly straight image should not be rotated.""" corrected, angle = deskew_image(white_image) assert abs(angle) < 0.1 assert corrected.shape == white_image.shape def test_deskew_returns_tuple(self, white_image): """deskew_image must return (image, angle) tuple.""" result = deskew_image(white_image) assert isinstance(result, tuple) assert len(result) == 2 assert isinstance(result[0], np.ndarray) assert isinstance(result[1], float) def test_deskew_preserves_shape(self, text_like_image): """Output image should have same shape as input.""" corrected, _ = deskew_image(text_like_image) assert corrected.shape == text_like_image.shape # ============================================= # STAGE 3: DEWARP TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestDewarp: """Test dewarp (pass-through) stage.""" def test_dewarp_passthrough(self, white_image): """Current dewarp should return the same image (pass-through).""" result = dewarp_image(white_image) np.testing.assert_array_equal(result, white_image) def test_dewarp_preserves_shape(self, text_like_image): result = dewarp_image(text_like_image) assert result.shape == text_like_image.shape # ============================================= # STAGE 4: IMAGE PREPARATION TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestImagePreparation: """Test OCR and layout image creation.""" def test_create_ocr_image_returns_grayscale(self, text_like_image): """OCR image should be single-channel (binarized).""" ocr_img = create_ocr_image(text_like_image) assert len(ocr_img.shape) == 2 # Single channel assert ocr_img.dtype == np.uint8 def test_create_ocr_image_is_binary(self, text_like_image): """OCR image should contain only 0 and 255 values.""" ocr_img = create_ocr_image(text_like_image) unique_vals = np.unique(ocr_img) assert all(v in [0, 255] for v in unique_vals) def test_create_layout_image_returns_grayscale(self, text_like_image): """Layout image should be single-channel (CLAHE enhanced).""" layout_img = create_layout_image(text_like_image) assert len(layout_img.shape) == 2 assert layout_img.dtype == np.uint8 def test_create_layout_image_enhanced_contrast(self, text_like_image): """Layout image should have different histogram than simple grayscale.""" import cv2 gray = cv2.cvtColor(text_like_image, cv2.COLOR_BGR2GRAY) layout_img = create_layout_image(text_like_image) # CLAHE should change the histogram assert layout_img.shape == gray.shape # ============================================= # STAGE 5: LAYOUT ANALYSIS TESTS # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestContentBounds: """Test _find_content_bounds helper.""" def test_empty_image(self): """Fully white (inverted = black) image should return full bounds.""" inv = np.zeros((200, 300), dtype=np.uint8) left, right, top, bottom = _find_content_bounds(inv) # With no content, bounds should span the image assert left >= 0 assert right <= 300 assert top >= 0 assert bottom <= 200 def test_centered_content(self): """Content in center should give tight bounds.""" inv = np.zeros((400, 600), dtype=np.uint8) # Add content block in center inv[100:300, 50:550] = 255 left, right, top, bottom = _find_content_bounds(inv) assert left <= 52 # ~50 with 2px margin assert right >= 548 # ~550 with 2px margin assert top <= 102 assert bottom >= 298 @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestLayoutAnalysis: """Test analyze_layout for column detection.""" def test_returns_list_of_regions(self, text_like_image): """analyze_layout should return a list of PageRegion.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) regions = analyze_layout(layout_img, ocr_img) assert isinstance(regions, list) assert all(isinstance(r, PageRegion) for r in regions) def test_detects_columns(self, text_like_image): """With clear 3-column image, should detect at least 1 column.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) regions = analyze_layout(layout_img, ocr_img) column_regions = [r for r in regions if r.type.startswith('column')] assert len(column_regions) >= 1 def test_single_column_fallback(self): """Image with no clear columns should fall back to single column.""" # Uniform text across full width img = np.ones((400, 600, 3), dtype=np.uint8) * 255 for y in range(50, 350, 20): img[y:y+10, 20:580, :] = 30 # Full-width text ocr_img = create_ocr_image(img) layout_img = create_layout_image(img) regions = analyze_layout(layout_img, ocr_img) column_regions = [r for r in regions if r.type.startswith('column')] # Should at least return 1 column (full page fallback) assert len(column_regions) >= 1 def test_region_types_are_valid(self, text_like_image): """All region types should be from the expected set.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) regions = analyze_layout(layout_img, ocr_img) valid_types = {'column_en', 'column_de', 'column_example', 'header', 'footer'} for r in regions: assert r.type in valid_types, f"Unexpected region type: {r.type}" # ============================================= # STAGE 7: LINE GROUPING TESTS # ============================================= class TestLineGrouping: """Test _group_words_into_lines function.""" def test_empty_input(self): """Empty word list should return empty lines.""" assert _group_words_into_lines([]) == [] def test_single_word(self): """Single word should return one line with one word.""" words = [{'text': 'hello', 'left': 10, 'top': 50, 'width': 50, 'height': 15, 'conf': 90}] lines = _group_words_into_lines(words) assert len(lines) == 1 assert len(lines[0]) == 1 assert lines[0][0]['text'] == 'hello' def test_words_on_same_line(self): """Words close in Y should be grouped into one line.""" words = [ {'text': 'hello', 'left': 10, 'top': 50, 'width': 50, 'height': 15, 'conf': 90}, {'text': 'world', 'left': 70, 'top': 52, 'width': 50, 'height': 15, 'conf': 85}, ] lines = _group_words_into_lines(words, y_tolerance_px=10) assert len(lines) == 1 assert len(lines[0]) == 2 def test_words_on_different_lines(self): """Words far apart in Y should be on different lines.""" words = [ {'text': 'line1', 'left': 10, 'top': 50, 'width': 50, 'height': 15, 'conf': 90}, {'text': 'line2', 'left': 10, 'top': 100, 'width': 50, 'height': 15, 'conf': 85}, {'text': 'line3', 'left': 10, 'top': 150, 'width': 50, 'height': 15, 'conf': 88}, ] lines = _group_words_into_lines(words, y_tolerance_px=20) assert len(lines) == 3 def test_words_sorted_by_x_within_line(self): """Words within a line should be sorted by X position.""" words = [ {'text': 'world', 'left': 100, 'top': 50, 'width': 50, 'height': 15, 'conf': 85}, {'text': 'hello', 'left': 10, 'top': 52, 'width': 50, 'height': 15, 'conf': 90}, ] lines = _group_words_into_lines(words, y_tolerance_px=10) assert len(lines) == 1 assert lines[0][0]['text'] == 'hello' assert lines[0][1]['text'] == 'world' # ============================================= # STAGE 7: VOCABULARY MATCHING TESTS # ============================================= class TestVocabMatching: """Test match_lines_to_vocab function.""" def test_empty_results(self, sample_regions): """Empty OCR results should return empty vocab.""" vocab = match_lines_to_vocab({}, sample_regions) assert vocab == [] def test_en_only(self, sample_words_column_en, sample_regions): """Only EN words should create entries with empty DE/example.""" ocr_results = {'column_en': sample_words_column_en} vocab = match_lines_to_vocab(ocr_results, sample_regions) assert len(vocab) == 3 for row in vocab: assert row.english != "" assert row.german == "" def test_en_de_matching(self, sample_words_column_en, sample_words_column_de, sample_regions): """EN and DE words on same Y should be matched.""" ocr_results = { 'column_en': sample_words_column_en, 'column_de': sample_words_column_de, } vocab = match_lines_to_vocab(ocr_results, sample_regions, y_tolerance_px=25) assert len(vocab) == 3 # First entry should match achieve <-> erreichen assert vocab[0].english == 'achieve' assert vocab[0].german == 'erreichen' def test_full_3_column_matching(self, sample_words_column_en, sample_words_column_de, sample_words_column_ex, sample_regions): """All 3 columns should be matched by Y coordinate.""" ocr_results = { 'column_en': sample_words_column_en, 'column_de': sample_words_column_de, 'column_example': sample_words_column_ex, } vocab = match_lines_to_vocab(ocr_results, sample_regions, y_tolerance_px=25) assert len(vocab) >= 1 # First entry should have example text assert vocab[0].english == 'achieve' assert vocab[0].example != "" def test_sorted_by_y_position(self, sample_words_column_en, sample_regions): """Result should be sorted by Y position.""" ocr_results = {'column_en': sample_words_column_en} vocab = match_lines_to_vocab(ocr_results, sample_regions) positions = [row.y_position for row in vocab] assert positions == sorted(positions) def test_skips_short_entries(self, sample_regions): """Very short text (< 2 chars) should be skipped.""" words = [ {'text': 'a', 'left': 30, 'top': 50, 'width': 10, 'height': 15, 'conf': 90, 'region_type': 'column_en'}, {'text': 'valid', 'left': 30, 'top': 80, 'width': 50, 'height': 15, 'conf': 90, 'region_type': 'column_en'}, ] ocr_results = {'column_en': words} vocab = match_lines_to_vocab(ocr_results, sample_regions) assert len(vocab) == 1 assert vocab[0].english == 'valid' def test_confidence_calculation(self, sample_words_column_en, sample_words_column_de, sample_regions): """Confidence should be the average of matched columns.""" ocr_results = { 'column_en': sample_words_column_en, 'column_de': sample_words_column_de, } vocab = match_lines_to_vocab(ocr_results, sample_regions, y_tolerance_px=25) # First entry: EN conf=90, DE conf=88 → avg=89 assert vocab[0].confidence > 0 assert vocab[0].confidence == pytest.approx(89.0, abs=1.0) # ============================================= # ORCHESTRATOR TESTS # ============================================= class TestOrchestrator: """Test run_cv_pipeline orchestrator.""" @pytest.mark.asyncio async def test_no_input_returns_error(self): """Pipeline without input should return error.""" result = await run_cv_pipeline() assert result.error is not None assert "No input data" in result.error @pytest.mark.asyncio async def test_pipeline_unavailable(self): """When CV_PIPELINE_AVAILABLE is False, should return error.""" with patch('cv_vocab_pipeline.CV_PIPELINE_AVAILABLE', False): result = await run_cv_pipeline(pdf_data=b"fake") assert result.error is not None assert "not available" in result.error @pytest.mark.asyncio @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") async def test_pipeline_with_image_data(self): """Pipeline with a real synthetic image should run without errors.""" import cv2 # Create a simple test image (white with some text-like black bars) img = np.ones((200, 300, 3), dtype=np.uint8) * 255 for y in range(30, 170, 25): img[y:y+12, 20:280, :] = 30 _, img_bytes = cv2.imencode('.png', img) image_data = img_bytes.tobytes() with patch('cv_vocab_pipeline.pytesseract') as mock_tess: # Mock Tesseract to return empty results mock_tess.image_to_data.return_value = { 'text': [], 'conf': [], 'left': [], 'top': [], 'width': [], 'height': [], } mock_tess.Output.DICT = 'dict' result = await run_cv_pipeline(image_data=image_data) assert result.error is None assert result.image_width == 300 assert result.image_height == 200 assert 'render' in result.stages assert 'deskew' in result.stages @pytest.mark.asyncio @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") async def test_pipeline_records_timing(self): """Pipeline should record timing for each stage.""" import cv2 img = np.ones((100, 150, 3), dtype=np.uint8) * 255 _, img_bytes = cv2.imencode('.png', img) with patch('cv_vocab_pipeline.pytesseract') as mock_tess: mock_tess.image_to_data.return_value = { 'text': [], 'conf': [], 'left': [], 'top': [], 'width': [], 'height': [], } mock_tess.Output.DICT = 'dict' result = await run_cv_pipeline(image_data=img_bytes.tobytes()) assert result.duration_seconds >= 0 assert all(v >= 0 for v in result.stages.values()) @pytest.mark.asyncio async def test_pipeline_result_format(self): """PipelineResult vocabulary should be list of dicts with expected keys.""" result = PipelineResult() result.vocabulary = [ {"english": "test", "german": "Test", "example": "A test.", "confidence": 90.0} ] assert len(result.vocabulary) == 1 entry = result.vocabulary[0] assert "english" in entry assert "german" in entry assert "example" in entry assert "confidence" in entry # ============================================= # INTEGRATION-STYLE TESTS (with mocked Tesseract) # ============================================= @pytest.mark.skipif(not CV2_AVAILABLE, reason="OpenCV not available") class TestStageIntegration: """Test multiple stages together (still unit-test level with mocked OCR).""" def test_image_prep_to_layout(self, text_like_image): """Stages 4→5: image prep feeds layout analysis correctly.""" ocr_img = create_ocr_image(text_like_image) layout_img = create_layout_image(text_like_image) assert ocr_img.shape[:2] == text_like_image.shape[:2] assert layout_img.shape[:2] == text_like_image.shape[:2] regions = analyze_layout(layout_img, ocr_img) assert len(regions) >= 1 def test_deskew_to_image_prep(self, text_like_image): """Stages 2→4: deskew output can be processed by image prep.""" corrected, angle = deskew_image(text_like_image) ocr_img = create_ocr_image(corrected) layout_img = create_layout_image(corrected) assert ocr_img.shape[:2] == corrected.shape[:2] assert layout_img.shape[:2] == corrected.shape[:2] # ============================================= # RUN TESTS # ============================================= if __name__ == "__main__": pytest.main([__file__, "-v"])